## Credit Score Classification

### Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Data exploration & cleaning

In [None]:
# Read the training data into a DataFrame.
# low_memory=False makes pandas parse each column as a whole instead of in
# chunks, which avoids mixed-dtype inference within a single column.
df = pd.read_csv('./Data/train.csv', low_memory=False)

In [None]:
# Preview the first five rows
df.head(n=5)

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes, memory usage)
df.info()

In [None]:
# Flag rows that exactly repeat an earlier row, then report whether any exist
duplicate_rows = df.duplicated()
duplicate_rows.any()

In [None]:
# Print the distinct values of each column to spot malformed entries

columns = df.columns
for column in columns:
    unique_values = df[column].unique()
    print(f'{column}: {unique_values}\n')

In [None]:
# Flag SSN values that do not follow the NNN-NN-NNNN format

SSN_regex_pattern = r'^\d{3}-\d{2}-\d{4}$'

# Missing SSNs are treated as matching (na=True), so they are not counted as invalid
matches_ssn_format = df['SSN'].str.match(SSN_regex_pattern, na=True)
invalid_ssns = ~matches_ssn_format
invalid_ssns.sum()

In [None]:
# Show the SSN values that failed the format check

df['SSN'][invalid_ssns]

In [None]:
# Collect the complete rows whose SSN failed the format check

invalid_ssns_info = df[invalid_ssns]
invalid_ssns_info

#### Removing errors in data

In [None]:
# Replace malformed SSNs with a well-formed SSN recorded for the same customer.
#
# Only SSNs that match SSN_regex_pattern are eligible as replacements. Dropping
# NaN alone is not enough: the malformed value itself is non-null, so it could
# be picked as the "valid" SSN and copied onto every row of that customer.
# Customers that have no well-formed SSN on any row are left unchanged.

invalid_ssn_customer_IDs = invalid_ssns_info['Customer_ID'].unique()

# Rows whose SSN is well-formed (missing values do not count as valid here)
has_valid_ssn = df['SSN'].str.match(pat=SSN_regex_pattern, na=False)

# First well-formed SSN per customer, in order of appearance
first_valid_ssn = df.loc[has_valid_ssn].groupby('Customer_ID')['SSN'].first()

# Look up each row's replacement; NaN where the customer has no well-formed SSN
replacement_ssns = df['Customer_ID'].map(first_valid_ssn)

# Update every row of the affected customers, as the per-customer loop did,
# but in a single vectorised pass instead of one full scan per customer
rows_to_fix = df['Customer_ID'].isin(invalid_ssn_customer_IDs) & replacement_ssns.notna()
df.loc[rows_to_fix, 'SSN'] = replacement_ssns[rows_to_fix]

In [None]:
# Check for rows whose SSN is the garbled value '#F%$D@*&8'
df.loc[df['SSN'].eq('#F%$D@*&8')]

In [None]:
# Inspect every record of a single customer
df.loc[df['Customer_ID'].eq('CUS_0x132f')]

In [None]:
# Strip leading and trailing underscores from the affected columns

underscore_padded_columns = [
    'Age',
    'Annual_Income',
    'Num_of_Loan',
    'Num_of_Delayed_Payment',
    'Outstanding_Debt',
]
for padded_column in underscore_padded_columns:
    df[padded_column] = df[padded_column].str.strip('_')

# Print the unique values again to check the result
for column in columns:
    unique_values = df[column].unique()
    print(f'{column}: {unique_values}\n')

In [None]:
# Remove the underscore entry from the Occupation column


# Remove the underscore entry from the Changed_Credit_Limit column


# Remove the underscore entry from the Credit_Mix column

In [None]:
# Remove leading 'and ' sequences from the Type_of_Load column

In [None]:
# Remove invalid entries (negative values, overestimated values)

In [None]:
# Remove incorrect entries from the Payment_Behaviour column