# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

# Import Data

In [4]:
# Load the raw credit training data; the path is relative to the
# directory the notebook is run from.
df = pd.read_csv("./data/credit_train.csv")

In [5]:
# Dimensions of the loaded frame as (rows, columns)
print(df.shape)

(100514, 19)


In [8]:
# Column dtypes and non-null counts: a first look at where data is missing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100514 entries, 0 to 100513
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Loan ID                       100000 non-null  object 
 1   Customer ID                   100000 non-null  object 
 2   Loan Status                   100000 non-null  object 
 3   Current Loan Amount           100000 non-null  float64
 4   Term                          100000 non-null  object 
 5   Credit Score                  80846 non-null   float64
 6   Annual Income                 80846 non-null   float64
 7   Years in current job          95778 non-null   object 
 8   Home Ownership                100000 non-null  object 
 9   Purpose                       100000 non-null  object 
 10  Monthly Debt                  100000 non-null  float64
 11  Years of Credit History       100000 non-null  float64
 12  Months since last delinquent  46859 non-null

In [9]:
# Summary statistics for the numeric columns
df.describe()

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
count,100000.0,80846.0,80846.0,100000.0,100000.0,46859.0,100000.0,100000.0,100000.0,99998.0,99796.0,99990.0
mean,11760450.0,1076.456089,1378277.0,18472.412336,18.199141,34.901321,11.12853,0.16831,294637.4,760798.4,0.11774,0.029313
std,31783940.0,1475.403791,1081360.0,12174.992609,7.015324,21.997829,5.00987,0.482705,376170.9,8384503.0,0.351424,0.258182
min,10802.0,585.0,76627.0,0.0,3.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,179652.0,705.0,848844.0,10214.1625,13.5,16.0,8.0,0.0,112670.0,273438.0,0.0,0.0
50%,312246.0,724.0,1174162.0,16220.3,16.9,32.0,10.0,0.0,209817.0,467874.0,0.0,0.0
75%,524942.0,741.0,1650663.0,24012.0575,21.7,51.0,14.0,0.0,367958.8,782958.0,0.0,0.0
max,100000000.0,7510.0,165557400.0,435843.28,70.5,176.0,76.0,15.0,32878970.0,1539738000.0,7.0,15.0


In [10]:
# Column labels of the loaded frame
df.columns

Index(['Loan ID', 'Customer ID', 'Loan Status', 'Current Loan Amount', 'Term',
       'Credit Score', 'Annual Income', 'Years in current job',
       'Home Ownership', 'Purpose', 'Monthly Debt', 'Years of Credit History',
       'Months since last delinquent', 'Number of Open Accounts',
       'Number of Credit Problems', 'Current Credit Balance',
       'Maximum Open Credit', 'Bankruptcies', 'Tax Liens'],
      dtype='object')

# Clean Data

In [11]:
# Preview the first five rows
df.head()

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,Fully Paid,445412.0,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,,6.0,1.0,228190.0,416746.0,1.0,0.0
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,262328.0,Short Term,,,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35.0,0.0,229976.0,850784.0,0.0,0.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,99999999.0,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,0.0,0.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,e777faab-98ae-45af-9a86-7ce5b33b1011,Fully Paid,347666.0,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,,9.0,0.0,256329.0,386958.0,0.0,0.0
4,d4062e70-befa-4995-8643-a0de73938182,81536ad9-5ccf-4eb8-befb-47a4d608658e,Fully Paid,176220.0,Short Term,,,5 years,Rent,Debt Consolidation,20639.7,6.1,,15.0,0.0,253460.0,427174.0,0.0,0.0


### Null Values

In [36]:
# Count the missing (NaN) values in each column
df.isna().sum()
        

Loan ID                           514
Customer ID                       514
Loan Status                       514
Current Loan Amount               514
Term                              514
Credit Score                    19668
Annual Income                   19668
Years in current job             4736
Home Ownership                    514
Purpose                           514
Monthly Debt                      514
Years of Credit History           514
Months since last delinquent    53655
Number of Open Accounts           514
Number of Credit Problems         514
Current Credit Balance            514
Maximum Open Credit               516
Bankruptcies                      718
Tax Liens                         524
dtype: int64

It seems that the last 514 rows are entirely NaN in every column. Let's drop those rows.

In [51]:
# Drop the rows that carry no data at all (every column is NaN).
# Selecting them by content rather than by a hard-coded position
# (the last 514 rows) keeps this cell idempotent: re-running the positional
# version would silently delete another 514 valid rows each time.
df = df.dropna(how='all').reset_index(drop=True)

In [52]:
# Re-check the per-column null counts now that the empty rows are gone
df.isna().sum()

Loan ID                             0
Customer ID                         0
Loan Status                         0
Current Loan Amount                 0
Term                                0
Credit Score                    19154
Annual Income                   19154
Years in current job             4222
Home Ownership                      0
Purpose                             0
Monthly Debt                        0
Years of Credit History             0
Months since last delinquent    53141
Number of Open Accounts             0
Number of Credit Problems           0
Current Credit Balance              0
Maximum Open Credit                 2
Bankruptcies                      204
Tax Liens                          10
dtype: int64

Credit Score

In [55]:
# Number of missing credit scores
df['Credit Score'].isna().sum()

19154

In [58]:
# Distribution of the credit scores that are present
df['Credit Score'].describe()

count    80846.000000
mean      1076.456089
std       1475.403791
min        585.000000
25%        705.000000
50%        724.000000
75%        741.000000
max       7510.000000
Name: Credit Score, dtype: float64

In [68]:
# Count the credit scores that exceed 850
is_above_850 = df['Credit Score'].gt(850)
n_high_credit_scores = int(is_above_850.sum())
n_high_credit_scores

4551

In [70]:
# Histogram restricted to the rows whose credit score is above 850
high_cs_df = df.loc[df['Credit Score'] > 850]

fig = px.histogram(data_frame=high_cs_df, x='Credit Score')
fig.show()

It seems these credit scores over 850 just had a 0 appended to them.

The range is 5850-7510. Drop the trailing 0, and we get 585-751, which seems like a reasonable credit score range

In [71]:
# Let's remove that trailing 0 by dividing the scores above 850 by 10.
# A boolean mask with .loc writes into `df` itself in one vectorised step.
# The previous per-row chained assignment (df['Credit Score'][i] = ...)
# triggered a SettingWithCopyWarning and is not guaranteed to modify `df`.
# The cell is also safe to re-run: once rescaled, no score exceeds 850.
too_high = df['Credit Score'] > 850
df.loc[too_high, 'Credit Score'] = df.loc[too_high, 'Credit Score'] / 10



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [72]:
# Re-check the distribution after rescaling; the max should no longer exceed 850
df['Credit Score'].describe()

count    80846.000000
mean       716.293447
std         28.297164
min        585.000000
25%        703.000000
50%        722.000000
75%        738.000000
max        751.000000
Name: Credit Score, dtype: float64

This is a more reasonable distribution

In [75]:
# Impute the mean credit score for the null values.
# Assign the filled Series back to the column: calling
# fillna(..., inplace=True) on df['Credit Score'] acts on an intermediate
# object and is not guaranteed to update `df` (it never does under
# pandas Copy-on-Write).
df['Credit Score'] = df['Credit Score'].fillna(df['Credit Score'].mean())

Annual Income

In [76]:
# Distribution of annual income (nulls are excluded from the statistics)
df['Annual Income'].describe()

count    8.084600e+04
mean     1.378277e+06
std      1.081360e+06
min      7.662700e+04
25%      8.488440e+05
50%      1.174162e+06
75%      1.650663e+06
max      1.655574e+08
Name: Annual Income, dtype: float64

# LEFT OFF HERE