In [69]:
# Load in relevant modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression

from src.data import prepare_train_valid_test

# Silence pandas' SettingWithCopyWarning for the rest of the session
pd.set_option('mode.chained_assignment', None)

In [3]:
# Read data/creditcard.csv (path is relative to the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='data/creditcard.csv')

In [4]:
# Split the full frame into train and test sets: 30% goes to test and the
# validation share is set to 0, so the middle return value is discarded.
df_train, _, df_test = prepare_train_valid_test(
    df,
    valid_prop=0,
    test_prop=0.3,
)

In [61]:
# Per-class weights, later handed to the model's `class_weight` argument to
# compensate for the class imbalance: errors on class 1 cost 9x those on class 0.
class_weights_dict = {
    0: 0.1,  # class 0 (non-fraud)
    1: 0.9,  # class 1 (fraud)
}

In [62]:
# Every column except the last is a feature; the last column is the target.
feature_columns = df_train.columns[:-1]
target_column = df_train.columns[-1]

# Build a class-weighted logistic regression and train it on the training split
lr_model = LogisticRegression(class_weight=class_weights_dict)
lr_model.fit(df_train[feature_columns], df_train[target_column])

In [68]:
# Evaluate performance on the test set.
# Positional (0-based, row-order) indices of the fraud (Class == 1) and
# non-fraud (Class == 0) test rows; positions line up with `preds` below.
test_labels = df_test['Class'].to_numpy()
test_fraud_indices = np.flatnonzero(test_labels == 1).tolist()
test_no_fraud_indices = np.flatnonzero(test_labels == 0).tolist()

# Predicted class label for every test row; features are all columns but the
# last (presumably the last column is 'Class', matching training -- confirm).
preds = lr_model.predict(df_test[df_test.columns[:-1]])

# Confusion-matrix counts. Summing predictions counts the rows predicted as
# class 1, which assumes the labels are encoded as 0/1.
tp = preds[test_fraud_indices].sum()        # fraud rows predicted as fraud
fn = len(test_fraud_indices) - tp           # fraud rows predicted as non-fraud
fp = preds[test_no_fraud_indices].sum()     # non-fraud rows predicted as fraud
# True negatives are the non-fraud rows that were NOT flagged, so subtract the
# false positives (the previous code subtracted `fn`, skewing the accuracy).
tn = len(test_no_fraud_indices) - fp

print(f'accuracy: {np.round(100 * (tp+tn) / (tp+fp+tn+fn), 2)}%')
print(f'precision: {np.round(100 * tp / (tp+fp), 2)}%')
print(f'recall: {np.round(100 * tp / (tp+fn), 2)}%')

accuracy: 99.93%
precision: 69.17%
recall: 76.85%


In [71]:
# Save model
filename = 'logistic_regression.pickle'
# Open the target inside a context manager so the handle is flushed and closed
# even if pickling fails. The 'model_files/' directory must already exist.
with open('model_files/' + filename, "wb") as model_file:
    pickle.dump(lr_model, model_file)

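**Takeaway**: Logistic regression does an incredible job of detecting fraud for this problem: on the held-out test set it reaches roughly 77% recall at 69% precision. Because of the class imbalance, the near-100% accuracy says little on its own, so precision and recall are the metrics to watch.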