In [3]:
# importing the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler

# reading in the csv file
data = pd.read_csv('final_data.csv', encoding = 'ISO-8859-1')
data.head()

Unnamed: 0.1,Unnamed: 0,year,agent,pay_method,state,status,pay_method_ACH,pay_method_credit card,pay_method_paper check,region_central,region_east,region_south,region_west,status_Active,status_Decline,status_Returned,status_Returned_90,income,Y
0,0,2017.0,Yasha - Pers1 (264925),credit card,east,Returned,0,1,0,0,1,0,0,0,0,1,0,0.620241,0
1,1,2015.0,Benefits and More 4 Paid (143595),credit card,east,Returned,0,1,0,0,1,0,0,0,0,1,0,0.701398,1
2,2,2014.0,Newspaper (109455),credit card,east,Order Cancelled,0,1,0,0,1,0,0,0,0,0,0,0.333751,0
3,3,2014.0,Hospital (109465),credit card,east,Returned_90,0,1,0,0,1,0,0,0,0,0,1,0.333751,0
4,4,2015.0,Benefits and More 4 Paid (143595),credit card,east,Returned,0,1,0,0,1,0,0,0,0,1,0,0.333751,1


In [5]:
# Dropping rows that will not be used in the logistic regression
data = data.drop(['Unnamed: 0', 'agent', 'pay_method', 'state', 'status'], axis = 1)

In [6]:
# Splitting the dataset into the features and target variables
X = data.iloc[:,:-1]
y = data['Y']
print(data.Y.value_counts())

# Splitting the data using the train_test_split sklearn package
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2019)

0    35444
1    19121
Name: Y, dtype: int64


In [7]:
# Creating the RandomOverSampler function
random = RandomOverSampler(random_state=2019)
X_train_ros, y_train_ros = random.fit_resample(X_train, y_train)

In [8]:
# Starting to build the parameters of the model
logreg = LogisticRegression(solver='liblinear')

# Fitting the model using the training data
logreg = logreg.fit(X_train_ros, y_train_ros)

# Creating the y2_pred variable
y_pred = logreg.predict(X_test)

# Creating the confusion matrix for the regression
cnf_mat = metrics.confusion_matrix(y_test, y_pred)
cnf_mat

array([[4629, 4274],
       [ 375, 4364]])

In [9]:
# Printing the accuracy, precision, and recall for the data
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
print("F1 Score:", metrics.f1_score(y_test, y_pred, average='weighted'))
print("AUC:",metrics.roc_auc_score(y_test, y_pred))
print("MCC:",metrics.matthews_corrcoef(y_test, y_pred))

Accuracy: 0.6592141914675268
Precision: 0.5052095392451956
Recall: 0.9208693817261026
F1 Score: 0.6611069204328531
AUC: 0.7204032407900423
MCC: 0.43550612003043204
