In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [2]:
# Read the three dataset splits from comma-separated files in the working directory.
train, validate, test = [
    pd.read_csv(csv_name, sep=',')
    for csv_name in ('train.csv', 'validate.csv', 'test.csv')
]

# Report the row count of each split, in the order train, test, validate.
for frame in (train, test, validate):
    print(len(frame))


350000
100000
50000


<h3>Run Logistic Regression on Base Model</h3>

In [3]:
# Baseline model: logistic regression fitted on every column of `train` except the target.
# NOTE: feature_cols is derived from train.columns at run time, so re-running this cell
# after a later cell has added columns to `train` (e.g. `totalItems`) changes the feature set.
feature_cols = [col for col in train.columns if (col != 'fraud')]
target_col = ['fraud']
X_train = train[feature_cols].copy()
y_train = train[target_col].copy()
X_validate = validate[feature_cols].copy()
y_validate = validate[target_col].copy()
X_test  = test[feature_cols].copy()
y_test  = test[target_col].copy()

LogR = LogisticRegression(random_state=42, class_weight='balanced', max_iter=2000) # for imbalanced learning
# y_train is a single-column DataFrame; flatten it to the 1-D array that fit() expects
LogR.fit(X_train, y_train.values.ravel())

# Predict once per split and reuse the predictions for all three metrics
# (previously each metric call re-ran LogR.predict on the same data).
pred_train = LogR.predict(X_train)
pred_validate = LogR.predict(X_validate)
pred_test = LogR.predict(X_test)

# Calculate precision, recall and f1-score for the training dataset
precision_train = precision_score(y_train, pred_train)
recall_train = recall_score(y_train, pred_train)
f1_score_train = f1_score(y_train, pred_train)

# Calculate precision, recall and f1-score for the validation dataset
precision_validate = precision_score(y_validate, pred_validate)
recall_validate = recall_score(y_validate, pred_validate)
f1_score_validate = f1_score(y_validate, pred_validate)

# Calculate precision, recall and f1-score for the test dataset
precision_test = precision_score(y_test, pred_test)
recall_test = recall_score(y_test, pred_test)
f1_score_test = f1_score(y_test, pred_test)

# Report the metrics grouped by metric, each in the order train / validation / test
print('Precision over training dataset: {:0.4f}'.format(precision_train))
print('Precision over validation dataset: {:0.4f}'.format(precision_validate))
print('Precision over test dataset: {:0.4f}'.format(precision_test))
print('Recall over training dataset: {:0.4f}'.format(recall_train))
print('Recall over validation dataset: {:0.4f}'.format(recall_validate))
print('Recall over test dataset: {:0.4f}'.format(recall_test))
print('F1 score over training dataset: {:0.4f}'.format(f1_score_train))
print('F1 score over validation dataset: {:0.4f}'.format(f1_score_validate))
print('F1 score over test dataset: {:0.4f}'.format(f1_score_test))

Precision over training dataset: 0.5312
Precision over validation dataset: 0.5320
Precision over test dataset: 0.5124
Recall over training dataset: 0.9857
Recall over validation dataset: 0.9823
Recall over test dataset: 0.9840
F1 score over training dataset: 0.6904
F1 score over validation dataset: 0.6902
F1 score over test dataset: 0.6739


<h3>Create new feature <code>totalItems</code></h3>

In [4]:
# Add `totalItems` to every split: total scan time (seconds) times scanned line items per second.
for frame in (train, validate, test):
    frame['totalItems'] = frame['totalScanTimeInSeconds'] * frame['scannedLineItemsPerSecond']

<h3>Check for skewness of data</h3>

In [5]:
# Rebuild the feature/target frames from the current columns of each split,
# then print the per-column skewness of the training features.
target_col = ['fraud']
feature_cols = train.columns[train.columns != 'fraud'].tolist()

X_train, X_validate, X_test = (
    split[feature_cols].copy() for split in (train, validate, test)
)
y_train, y_validate, y_test = (
    split[target_col].copy() for split in (train, validate, test)
)

# axis=0 gives one skewness value per column; missing values are skipped
print(X_train.skew(skipna=True, axis=0))

trustLevel                   -0.000681
totalScanTimeInSeconds       -0.000135
grandTotal                   -0.002165
lineItemVoids                 0.001811
scansWithoutRegistration      0.001743
quantityModifications         0.000359
scannedLineItemsPerSecond    31.774127
valuePerSecond               31.442346
lineItemVoidsPerPosition      4.535840
totalItems                   -0.000936
dtype: float64


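Based on the results, `scannedLineItemsPerSecond`, `valuePerSecond`, and `lineItemVoidsPerPosition` are strongly positively skewed (skewness of roughly 31.8, 31.4, and 4.5, respectively), while the remaining features are nearly symmetric. We therefore apply a log transformation, `log(x + 1)`, to these three features.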

In [6]:
# Log-transform the positively skewed features on every split.
# np.log1p(x) computes log(1 + x) and stays accurate when x is close to zero,
# unlike the explicit np.log(x + 1).
# NOTE: the columns are overwritten in place, so re-running this cell without first
# rebuilding X_train / X_validate / X_test applies the transform a second time.
skewed_cols = ['scannedLineItemsPerSecond', 'valuePerSecond', 'lineItemVoidsPerPosition']
for features in (X_train, X_validate, X_test):
    for col in skewed_cols:
        features.loc[:, col] = np.log1p(features.loc[:, col])


In [7]:
# Refit the logistic regression on a hand-picked subset of features (including `totalItems`).
selected_features = ['trustLevel', 'totalScanTimeInSeconds', 'grandTotal', 'lineItemVoids', 'scansWithoutRegistration', 'totalItems']

feature_cols = list(selected_features)
target_col = ['fraud']
# NOTE: the X_* frames are rebuilt from train / validate / test here, so in-place edits made
# to earlier X_* copies (the log transforms) do not carry over. None of the selected
# features is one of the log-transformed columns.
X_train = train[feature_cols].copy()
y_train = train[target_col].copy()
X_validate = validate[feature_cols].copy()
y_validate = validate[target_col].copy()
X_test  = test[feature_cols].copy()
y_test  = test[target_col].copy()

LogR = LogisticRegression(random_state=42, class_weight='balanced', max_iter=2000) # for imbalanced learning
# y_train is a single-column DataFrame; flatten it to the 1-D array that fit() expects
LogR.fit(X_train, y_train.values.ravel())

# Predict once per split and reuse the predictions for all three metrics
# (previously each metric call re-ran LogR.predict on the same data).
pred_train = LogR.predict(X_train)
pred_validate = LogR.predict(X_validate)
pred_test = LogR.predict(X_test)

# Calculate precision, recall and f1-score for the training dataset
precision_train = precision_score(y_train, pred_train)
recall_train = recall_score(y_train, pred_train)
f1_score_train = f1_score(y_train, pred_train)

# Calculate precision, recall and f1-score for the validation dataset
precision_validate = precision_score(y_validate, pred_validate)
recall_validate = recall_score(y_validate, pred_validate)
f1_score_validate = f1_score(y_validate, pred_validate)

# Calculate precision, recall and f1-score for the test dataset
precision_test = precision_score(y_test, pred_test)
recall_test = recall_score(y_test, pred_test)
f1_score_test = f1_score(y_test, pred_test)

# Report the metrics grouped by metric, each in the order train / validation / test
print('Precision over training dataset: {:0.4f}'.format(precision_train))
print('Precision over validation dataset: {:0.4f}'.format(precision_validate))
print('Precision over test dataset: {:0.4f}'.format(precision_test))
print('Recall over training dataset: {:0.4f}'.format(recall_train))
print('Recall over validation dataset: {:0.4f}'.format(recall_validate))
print('Recall over test dataset: {:0.4f}'.format(recall_test))
print('F1 score over training dataset: {:0.4f}'.format(f1_score_train))
print('F1 score over validation dataset: {:0.4f}'.format(f1_score_validate))
print('F1 score over test dataset: {:0.4f}'.format(f1_score_test))

Precision over training dataset: 0.7327
Precision over validation dataset: 0.7317
Precision over test dataset: 0.7188
Recall over training dataset: 0.9944
Recall over validation dataset: 0.9951
Recall over test dataset: 0.9943
F1 score over training dataset: 0.8437
F1 score over validation dataset: 0.8433
F1 score over test dataset: 0.8344
