In [7]:
# importing the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler

# reading in the csv file
data = pd.read_csv('final_data.csv', encoding = 'ISO-8859-1')
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,year,state,lp_amount,status,months,pay_method_ACH,pay_method_Other,pay_method_credit card,...,region_south,region_south_east,region_west,status_Active,status_Decline,status_Returned,status_Returned < 90 days,status_Returned_90,income,18_months
0,0,0,2014.0,north_east,34.95,Returned_90,0.87,0,0,1,...,0,0,0,0,0,0,0,1,0.465506,0
1,1,1,2015.0,north_east,29.95,Returned < 90 days,0.0,0,0,1,...,0,0,0,0,0,0,1,0,0.465506,0
2,2,2,2016.0,north_east,29.95,Switched to LW,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0.465506,0
3,3,3,2016.0,north_east,39.95,Returned_90,0.67,0,0,1,...,0,0,0,0,0,0,0,1,0.465506,0
4,4,5,2014.0,north_east,34.95,Decline,0.57,0,0,1,...,0,0,0,0,1,0,0,0,0.624609,0


In [8]:
# Dropping rows that will not be used in the logistic regression
data = data.drop(['Unnamed: 0', 'Unnamed: 0.1', 'months', 'lp_amount', 'state', 'status', 'pay_method_Other'], axis = 1)

In [9]:
# Splitting the dataset into the features and target variables
X = data.iloc[:,:-1]
y = data['18_months']

# Splitting the data using the train_test_split sklearn package
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2019)

In [10]:
# Using the RandomOverSampler package to deal with the imbalanced dataset
random = RandomOverSampler(random_state=2019)
X_train_ros, y_train_ros = random.fit_resample(X_train, y_train)

In [11]:
# Starting to build the parameters of the model
logreg = LogisticRegression(solver='liblinear')

# Fitting the model using the training data
logreg = logreg.fit(X_train_ros, y_train_ros)

# Creating the y2_pred variable
y_pred = logreg.predict(X_test)

# Creating the confusion matrix for the regression
cnf_mat = metrics.confusion_matrix(y_test, y_pred)
cnf_mat

array([[19011,  5904],
       [  250,  2543]])

In [12]:
# Printing the accuracy, precision, and recall for the balanced dataset
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
print("F1 Score:", metrics.f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.777898079976902
Precision: 0.30105362850716233
Recall: 0.9104905119942714
F1 Score: 0.8195462836775652


In [13]:
# Running the trained model on the training and test data to make sure the model is not overfitting

# Running accuracy on both sets
print("Train Accuracy:",metrics.accuracy_score(y_train, logreg.predict(X_train)))
print("Test Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Running precision on both sets
print("Train Precision:",metrics.precision_score(y_train, logreg.predict(X_train)))
print("Test Precision:",metrics.precision_score(y_test, y_pred))

# Running recall on both sets
print("Train Recall:",metrics.recall_score(y_train, logreg.predict(X_train)))
print("Test Recall:",metrics.recall_score(y_test, y_pred))

#Running f1 score on both sets
print("Train F1:",metrics.f1_score(y_train, logreg.predict(X_train), average='weighted'))
print("Test F1:",metrics.f1_score(y_test, y_pred, average='weighted'))

Train Accuracy: 0.7793510821312994
Test Accuracy: 0.777898079976902
Train Precision: 0.3080316697522354
Test Precision: 0.30105362850716233
Train Recall: 0.9099371654642774
Test Recall: 0.9104905119942714
Train F1: 0.8198647744893335
Test F1: 0.8195462836775652
