In [8]:
# importing the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler

# reading in the csv file
data = pd.read_csv('final_data.csv', encoding = 'ISO-8859-1')
data.head()

Unnamed: 0.1,Unnamed: 0,year,agent,pay_method,state,zip_code,status,months,pay_method_ACH,pay_method_credit card,...,region_rocky,region_south,region_south_east,region_west,status_Active,status_Decline,status_Returned,status_Returned_90,income,Y
0,0,2017.0,Yasha - Pers1 (264925),credit card,north_east,1002.0,Returned,3.73,0,1,...,0,0,0,0,0,0,1,0,0.620241,0
1,1,2015.0,Benefits and More 4 Paid (143595),credit card,north_east,1010.0,Returned,13.67,0,1,...,0,0,0,0,0,0,1,0,0.701398,1
2,2,2014.0,Newspaper (109455),credit card,north_east,1013.0,Order Cancelled,2.07,0,1,...,0,0,0,0,0,0,0,0,0.333751,0
3,3,2014.0,Hospital (109465),credit card,north_east,1013.0,Returned_90,3.0,0,1,...,0,0,0,0,0,0,0,1,0.333751,0
4,4,2015.0,Benefits and More 4 Paid (143595),credit card,north_east,1013.0,Returned,13.57,0,1,...,0,0,0,0,0,0,1,0,0.333751,1


In [9]:
# Dropping rows that will not be used in the logistic regression
data = data.drop(['Unnamed: 0', 'agent', 'pay_method', 'state', 'months', 'zip_code', 'status'], axis = 1)

In [10]:
# Splitting the dataset into the features and target variables
X = data.iloc[:,:-1]
y = data['Y']
print(data.Y.value_counts())

# Splitting the data using the train_test_split sklearn package
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2019)

0    35444
1    19121
Name: Y, dtype: int64


In [4]:
# Creating the RandomOverSampler function
random = RandomOverSampler(random_state=2019)
X_train_ros, y_train_ros = random.fit_resample(X_train, y_train)

In [11]:
# Starting to build the parameters of the model
logreg = LogisticRegression(solver='liblinear')

# Fitting the model using the training data
logreg = logreg.fit(X_train, y_train)

# Creating the y2_pred variable
y_pred = logreg.predict(X_test)

# Creating the confusion matrix for the regression
cnf_mat = metrics.confusion_matrix(y_test, y_pred)
cnf_mat

array([[8472,  431],
       [3372, 1367]])

In [12]:
# Printing the accuracy, precision, and recall for the balanced dataset
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
print("F1 Score:", metrics.f1_score(y_test, y_pred, average='weighted'))
print("AUC:",metrics.roc_auc_score(y_test, y_pred))
print("MCC:",metrics.matthews_corrcoef(y_test, y_pred))

Accuracy: 0.7212285588623369
Precision: 0.7602892102335929
Recall: 0.28845748048111414
F1 Score: 0.6782775276700741
AUC: 0.6200234161924835
MCC: 0.3378807717282003
