In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import precision_recall_fscore_support

In [2]:
df = pd.read_csv('../data/loan_data.csv')

# Preparing the data

In [3]:
df.columns = [x.replace('.','_') for x in df.columns]

In [4]:
df['fully_paid'] = 1 - df.not_fully_paid
df.drop('not_fully_paid',axis=1,inplace=True)

In [5]:
df = pd.get_dummies(df)

In [6]:
X = df.drop('fully_paid',axis=1)
y = df.fully_paid

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=675, test_size=0.2)

# Completing the models

I'm not sure what parameter to use so I will try with different parameters.  Whichever model has the highest f1 score on the test set, this is the baseline to compare other models to.

In [12]:
dummy_names = ['stratified','most_frequent','uniform','constant']
dummies = [DummyClassifier(strategy='stratified'), 
           DummyClassifier(strategy='most_frequent'),
           DummyClassifier(strategy='uniform')]
for i in range(3):
    dummies[i].fit(X_train,y_train)
    preds = dummies[i].predict(X_test)
    scores = precision_recall_fscore_support(preds, y_test, average='binary')
    print("precision score for",dummy_names[i],": ",scores[0])
    print("recall score for", dummy_names[i], ": ", scores[1])
    print("f1 score for", dummy_names[i], ": ", scores[2])
    print('-'*50)

precision score for stratified :  0.8410636982065554
recall score for stratified :  0.8431494110353379
f1 score for stratified :  0.8421052631578948
--------------------------------------------------
precision score for most_frequent :  1.0
recall score for most_frequent :  0.843945720250522
f1 score for most_frequent :  0.9153693744692896
--------------------------------------------------
precision score for uniform :  0.5120593692022264
recall score for uniform :  0.8440366972477065
f1 score for uniform :  0.6374133949191686
--------------------------------------------------


Most frequent has the highest f1 score of 0.91