In [1]:
# importing the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler

# reading in the csv file
data = pd.read_csv('final_data1.csv', encoding = 'ISO-8859-1')
data.head()



Unnamed: 0.1,Unnamed: 0,year,state,months,pay_method_ACH,pay_method_credit card,pay_method_paper check,region_central,region_north_east,region_rocky,region_south,region_south_east,region_west,status_Active,status_Decline,status_Returned,status_Returned_90,income,18_months
0,0,2014.0,north_east,0.87,0,1,0,0,1,0,0,0,0,0,0,0,1,0.458461,0
1,1,2016.0,north_east,0.67,0,1,0,0,1,0,0,0,0,0,0,0,1,0.458461,0
2,2,2014.0,north_east,0.57,0,1,0,0,1,0,0,0,0,0,1,0,0,0.622086,0
3,3,2017.0,north_east,3.73,0,1,0,0,1,0,0,0,0,0,0,1,0,0.622086,0
4,4,2014.0,north_east,0.6,0,1,0,0,1,0,0,0,0,0,1,0,0,0.604332,0


In [2]:
# Dropping rows that will not be used in the logistic regression
data = data.drop(['Unnamed: 0', 'state', 'months'], axis = 1)

In [3]:
# Splitting the dataset into the features and target variables
X = data.iloc[:,:-1]
y = data['18_months']

# Splitting the data using the train_test_split sklearn package
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2019)

In [4]:
# Using the SMOTE package to deal with the imbalanced dataset
random = RandomOverSampler(random_state=2019)
X_train_ros, y_train_ros = random.fit_resample(X_train, y_train)

In [5]:
# Starting to build the parameters of the model
logreg = LogisticRegression(solver='liblinear')

# Fitting the model using the training data
logreg = logreg.fit(X_train_ros, y_train_ros)

# Creating the y2_pred variable
y_pred = logreg.predict(X_test)

# Creating the confusion matrix for the regression
cnf_mat = metrics.confusion_matrix(y_test, y_pred)
cnf_mat

array([[12283,  5817],
       [  242,  2635]])

In [6]:
# Printing the accuracy, precision, and recall for the balanced dataset
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
print("F1 Score:", metrics.f1_score(y_test, y_pred, average='weighted'))
print("AUC:",metrics.roc_auc_score(y_test, y_pred))
print("MCC:",metrics.matthews_corrcoef(y_test, y_pred))

Accuracy: 0.7111598417314201
Precision: 0.31176053005205867
Recall: 0.9158846020159889
F1 Score: 0.7559386021008159
AUC: 0.7972516932731879
MCC: 0.41696052399307765
