In [112]:
# Martyna Jakubowska

In [113]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier

In [114]:
# Load the raw loan-payments table; the path is relative to the notebook's working directory.
data = pd.read_csv('Loan payments data.csv')

In [115]:
# Summarize column dtypes and non-null counts of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Loan_ID         500 non-null    object 
 1   loan_status     500 non-null    object 
 2   Principal       500 non-null    int64  
 3   terms           500 non-null    int64  
 4   effective_date  500 non-null    object 
 5   due_date        500 non-null    object 
 6   paid_off_time   400 non-null    object 
 7   past_due_days   200 non-null    float64
 8   age             500 non-null    int64  
 9   education       500 non-null    object 
 10  Gender          500 non-null    object 
dtypes: float64(1), int64(3), object(7)
memory usage: 43.1+ KB


In [116]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,Loan_ID,loan_status,Principal,terms,effective_date,due_date,paid_off_time,past_due_days,age,education,Gender
0,xqd20166231,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/14/2016 19:31,,45,High School or Below,male
1,xqd20168902,PAIDOFF,1000,30,9/8/2016,10/7/2016,10/7/2016 9:00,,50,Bechalor,female
2,xqd20160003,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/25/2016 16:58,,33,Bechalor,female
3,xqd20160004,PAIDOFF,1000,15,9/8/2016,9/22/2016,9/22/2016 20:00,,27,college,male
4,xqd20160005,PAIDOFF,1000,30,9/9/2016,10/8/2016,9/23/2016 21:36,,28,college,female


In [117]:
# Count missing values per column.
data.isna().sum()

Loan_ID             0
loan_status         0
Principal           0
terms               0
effective_date      0
due_date            0
paid_off_time     100
past_due_days     300
age                 0
education           0
Gender              0
dtype: int64

In [118]:
# List the distinct values of the target column.
data['loan_status'].unique()

array(['PAIDOFF', 'COLLECTION', 'COLLECTION_PAIDOFF'], dtype=object)

In [119]:
# Map each column name to the number of distinct values `unique()` reports for it.
{name: len(values.unique()) for name, values in data.items()}

{'Loan_ID': 500,
 'loan_status': 3,
 'Principal': 6,
 'terms': 3,
 'effective_date': 7,
 'due_date': 25,
 'paid_off_time': 321,
 'past_due_days': 34,
 'age': 33,
 'education': 4,
 'Gender': 2}

In [120]:
# Distinct values of the target column (this cell repeats an earlier one).
data['loan_status'].unique()

array(['PAIDOFF', 'COLLECTION', 'COLLECTION_PAIDOFF'], dtype=object)

In [121]:
# Number of distinct values per column (this cell repeats an earlier one).
{column: len(data[column].unique()) for column in data.columns}

{'Loan_ID': 500,
 'loan_status': 3,
 'Principal': 6,
 'terms': 3,
 'effective_date': 7,
 'due_date': 25,
 'paid_off_time': 321,
 'past_due_days': 34,
 'age': 33,
 'education': 4,
 'Gender': 2}

In [122]:
# Build the working frame as a new object without the Loan_ID column;
# the raw `data` frame is left untouched.
df = data.drop(columns='Loan_ID')

In [123]:
# Parse the three date-like text columns into datetimes.
date_columns = ['effective_date', 'due_date', 'paid_off_time']
for column in date_columns:
    df[column] = pd.to_datetime(df[column])

# Split each timestamp into numeric parts with the vectorized `.dt` accessor
# rather than calling a Python lambda once per row.
df['effective_day'] = df['effective_date'].dt.day

df['due_month'] = df['due_date'].dt.month
df['due_day'] = df['due_date'].dt.day

# paid_off_time has missing entries; their extracted parts are NaN and are
# filled below.
df['paid_off_month'] = df['paid_off_time'].dt.month
df['paid_off_day'] = df['paid_off_time'].dt.day
df['paid_off_hour'] = df['paid_off_time'].dt.hour

# The datetime columns themselves are dropped once their parts are extracted.
df = df.drop(columns=date_columns)

# Replace the remaining missing values with the mean of their own column.
# NOTE(review): past_due_days and paid_off_time are only partly populated, and
# the populated rows appear to coincide with particular loan_status values.
# Mean-imputing them may therefore leak the target into the features; confirm
# whether these columns should be used for prediction at all.
imputed_columns = ['past_due_days', 'paid_off_month', 'paid_off_day', 'paid_off_hour']
df[imputed_columns] = df[imputed_columns].fillna(df[imputed_columns].mean())

In [124]:
# Check the extracted date parts and the imputed values on the first rows.
df.head()

Unnamed: 0,loan_status,Principal,terms,past_due_days,age,education,Gender,effective_day,due_month,due_day,paid_off_month,paid_off_day,paid_off_hour
0,PAIDOFF,1000,30,36.01,45,High School or Below,male,8,10,7,9.0,14.0,19.0
1,PAIDOFF,1000,30,36.01,50,Bechalor,female,8,10,7,10.0,7.0,9.0
2,PAIDOFF,1000,30,36.01,33,Bechalor,female,8,10,7,9.0,25.0,16.0
3,PAIDOFF,1000,15,36.01,27,college,male,8,9,22,9.0,22.0,20.0
4,PAIDOFF,1000,30,36.01,28,college,female,9,10,8,9.0,23.0,21.0


In [125]:
# List the raw education labels (read from `data`, which still holds the original strings).
data['education'].unique()

array(['High School or Below', 'Bechalor', 'college', 'Master or Above'],
      dtype=object)

In [127]:
education_order = [
        'High School or Below',
        'college',
        'Bechalor',
        'Master or Above'
    ]
loan_order = ['COLLECTION','PAIDOFF','COLLECTION_PAIDOFF' ]

In [128]:
# Encode the categorical columns as integers, reading the original strings
# from the untouched raw frame `data` rather than from `df`. In the cells
# shown, `df` is derived from `data` without changing rows, so the two share
# the same row index. Encoding `df` in place made the cell unsafe to re-run:
# a second run turned every Gender into 0 and then failed on education.
df['Gender'] = (data['Gender'] == 'male').astype('int64')  # 'male' -> 1, anything else -> 0

# A label's position in its order list is its code; a label missing from the
# list raises ValueError.
df['education'] = data['education'].apply(education_order.index)
df['loan_status'] = data['loan_status'].apply(loan_order.index)

In [129]:
# Display the working frame after the categorical columns were encoded as integers.
df

Unnamed: 0,loan_status,Principal,terms,past_due_days,age,education,Gender,effective_day,due_month,due_day,paid_off_month,paid_off_day,paid_off_hour
0,1,1000,30,36.01,45,0,1,8,10,7,9.0,14.0,19.0
1,1,1000,30,36.01,50,2,0,8,10,7,10.0,7.0,9.0
2,1,1000,30,36.01,33,2,0,8,10,7,9.0,25.0,16.0
3,1,1000,15,36.01,27,1,1,8,9,22,9.0,22.0,20.0
4,1,1000,30,36.01,28,1,0,9,10,8,9.0,23.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,2,1000,30,3.00,28,0,1,12,10,11,10.0,14.0,19.0
496,2,1000,15,14.00,26,0,1,12,9,26,10.0,10.0,20.0
497,2,800,15,3.00,30,1,1,12,9,26,9.0,29.0,11.0
498,2,1000,30,1.00,38,1,0,12,11,10,11.0,11.0,22.0


In [130]:
# Display the working frame (this cell repeats the previous one).
df

Unnamed: 0,loan_status,Principal,terms,past_due_days,age,education,Gender,effective_day,due_month,due_day,paid_off_month,paid_off_day,paid_off_hour
0,1,1000,30,36.01,45,0,1,8,10,7,9.0,14.0,19.0
1,1,1000,30,36.01,50,2,0,8,10,7,10.0,7.0,9.0
2,1,1000,30,36.01,33,2,0,8,10,7,9.0,25.0,16.0
3,1,1000,15,36.01,27,1,1,8,9,22,9.0,22.0,20.0
4,1,1000,30,36.01,28,1,0,9,10,8,9.0,23.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,2,1000,30,3.00,28,0,1,12,10,11,10.0,14.0,19.0
496,2,1000,15,14.00,26,0,1,12,9,26,10.0,10.0,20.0
497,2,800,15,3.00,30,1,1,12,9,26,9.0,29.0,11.0
498,2,1000,30,1.00,38,1,0,12,11,10,11.0,11.0,22.0


In [131]:
# Separate the target column from the feature columns.
y = df['loan_status'].copy()
X = df.drop(columns='loan_status')

# Standardize every feature column. The scaled array is wrapped back into a
# DataFrame with the original column names AND the original row index, so X
# stays aligned with y even if df's index is not the default 0..n-1 range.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed later in the notebook, so test-set statistics influence the
# scaling. Consider fitting it on the training rows only.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [135]:
# Only the last expression of a cell is echoed, so this cell displays `y`;
# the bare `X` line produces no output.
X
y

0      1
1      1
2      1
3      1
4      1
      ..
495    2
496    2
497    2
498    2
499    2
Name: loan_status, Length: 500, dtype: int64

In [136]:
# Put 70% of the rows in the training set and the rest in the test set;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)

In [133]:
# One classifier per model family, otherwise with library defaults.
# The tree, neural-network and forest estimators draw random numbers while
# fitting; they get a fixed seed (the same value as the train/test split) so
# that re-running the notebook reproduces the same fitted models and scores.
models = [
    LogisticRegression(),
    SVC(),
    DecisionTreeClassifier(random_state=123),
    MLPClassifier(random_state=123),
    RandomForestClassifier(random_state=123),
]

# Fit every model in place on the training split.
for model in models:
    model.fit(X_train, y_train)



In [137]:
# Display labels, left-padded to a common width so the printed scores line up.
# The order must match the order of `models`.
model_names = [
    "   Logistic Regression",
    "Support Vector Machine",
    "         Decision Tree",
    "        Neural Network",
    "         Random Forest",
]

# Report each fitted model's test-set accuracy as a percentage.
for label, fitted in zip(model_names, models):
    accuracy_pct = fitted.score(X_test, y_test) * 100
    print(f"{label}: {accuracy_pct:.4f}%")

   Logistic Regression: 98.6667%
Support Vector Machine: 98.6667%
         Decision Tree: 100.0000%
        Neural Network: 100.0000%
         Random Forest: 100.0000%
