In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.feature_selection import RFE

In [9]:
train = pd.read_csv('train.csv', sep = ',')
validate = pd.read_csv('validate.csv', sep = ',')
test = pd.read_csv('test.csv', sep = ',')

print(len(train))
print(len(validate))
print(len(test))

350000
50000
100000


Run logistic regression on the base model

In [10]:
feature_cols = [col for col in train.columns if (col != 'fraud')]
target_col = ['fraud']
X_train = train[feature_cols].copy()
y_train = train[target_col].copy()
X_validate = validate[feature_cols].copy()
y_validate = validate[target_col].copy()
X_test  = test[feature_cols].copy()
y_test  = test[target_col].copy()

LogR = LogisticRegression(random_state=42, class_weight='balanced', max_iter=2000) # for imbalanced learning
LogR.fit(X_train, y_train.values.ravel())

# Calculate precision, recall and f1-score for the training dataset
precision_train = precision_score(y_train, LogR.predict(X_train))
recall_train = recall_score(y_train, LogR.predict(X_train))
f1_score_train = f1_score(y_train, LogR.predict(X_train))

# Calculate precision, recall and f1-score for the validation dataset
precision_validate = precision_score(y_validate, LogR.predict(X_validate))
recall_validate = recall_score(y_validate, LogR.predict(X_validate))
f1_score_validate = f1_score(y_validate, LogR.predict(X_validate))

# Calculate precision, recall and f1-score for the test dataset
precision_test = precision_score(y_test, LogR.predict(X_test))
recall_test = recall_score(y_test, LogR.predict(X_test))
f1_score_test = f1_score(y_test, LogR.predict(X_test))

print('Precision over training dataset: {:0.4f}'.format(precision_train))
print('Precision over validation dataset: {:0.4f}'.format(precision_validate))
print('Precision over test dataset: {:0.4f}'.format(precision_test))
print('Recall over training dataset: {:0.4f}'.format(recall_train))
print('Recall over validation dataset: {:0.4f}'.format(recall_validate))
print('Recall over test dataset: {:0.4f}'.format(recall_test))
print('F1 score over training dataset: {:0.4f}'.format(f1_score_train))
print('F1 score over validation dataset: {:0.4f}'.format(f1_score_validate))
print('F1 score over test dataset: {:0.4f}'.format(f1_score_test))

Precision over training dataset: 0.5312
Precision over validation dataset: 0.5320
Precision over test dataset: 0.5124
Recall over training dataset: 0.9857
Recall over validation dataset: 0.9823
Recall over test dataset: 0.9840
F1 score over training dataset: 0.6904
F1 score over validation dataset: 0.6902
F1 score over test dataset: 0.6739


Create new features that might be useful
- attempted_scan: attempted (1) or did not attempt (0)
- made_modification: made modification (1) or did not make any modification (0)
- no. of items scanned = totalScanTimeInSeconds * scannedLineItemsPerSecond
- avg value of each item = grandTotal / no. of items scanned

In [11]:
train['attempted_scan'] = np.where(train['scansWithoutRegistration'] > 0, 1, 0)
validate['attempted_scan'] = np.where(validate['scansWithoutRegistration'] > 0, 1, 0)
test['attempted_scan'] = np.where(test['scansWithoutRegistration'] > 0, 1, 0)

train['made_modification'] = np.where(train['quantityModifications'] > 0, 1, 0)
validate['made_modification'] = np.where(validate['quantityModifications'] > 0, 1, 0)
test['made_modification'] = np.where(test['quantityModifications'] > 0, 1, 0)

train['totalItems'] = train['totalScanTimeInSeconds'] * train['scannedLineItemsPerSecond']
validate['totalItems'] = validate['totalScanTimeInSeconds'] * validate['scannedLineItemsPerSecond']
test['totalItems'] = test['totalScanTimeInSeconds'] * test['scannedLineItemsPerSecond']

train['value_per_item'] = train['grandTotal'] / train['total_items_scanned']
validate['value_per_item'] = validate['grandTotal'] / validate['total_items_scanned']
test['value_per_item'] = test['grandTotal'] / test['total_items_scanned']

train["total_cost"] = train["totalScanTimeInSeconds"] * train["valuePerSecond"]
validate["total_cost"] = validate["totalScanTimeInSeconds"] * validate["valuePerSecond"]
test["total_cost"] = test["totalScanTimeInSeconds"] * test["valuePerSecond"]

In [12]:
# check skewness of data
feature_cols = [col for col in train.columns if (col != 'fraud')]
target_col = ['fraud']
X_train = train[feature_cols].copy()
y_train = train[target_col].copy()
X_validate = validate[feature_cols].copy()
y_validate = validate[target_col].copy()
X_test  = test[feature_cols].copy()
y_test  = test[target_col].copy()

print(X_train.skew(axis = 0, skipna = True))

trustLevel                   -0.000681
totalScanTimeInSeconds       -0.000135
grandTotal                   -0.002165
lineItemVoids                 0.001811
scansWithoutRegistration      0.001743
quantityModifications         0.000359
scannedLineItemsPerSecond    31.774127
valuePerSecond               31.442346
lineItemVoidsPerPosition      4.535840
attempted_scan               -2.852944
made_modification            -1.784765
total_items_scanned          -0.000936
value_per_item                4.467149
total_cost                   -0.002165
dtype: float64


Based on the results, we can see that `scannedLineItemsPerSecond`, `valuePerSecond`, and `lineItemVoidsPerPosition`, `value_per_item` are positively skewed. We can then do log transformation on these features

In [13]:
X_train.loc[:, 'scannedLineItemsPerSecond'] = np.log(X_train.loc[:, 'scannedLineItemsPerSecond'] + 1)
X_train.loc[:, 'valuePerSecond'] = np.log(X_train.loc[:, 'valuePerSecond'] + 1)
X_train.loc[:, 'lineItemVoidsPerPosition'] = np.log(X_train.loc[:, 'lineItemVoidsPerPosition'] + 1)
X_train.loc[:, 'value_per_item'] = np.log(X_train.loc[:, 'value_per_item'] + 1)

X_validate.loc[:, 'scannedLineItemsPerSecond'] = np.log(X_validate.loc[:, 'scannedLineItemsPerSecond'] + 1)
X_validate.loc[:, 'valuePerSecond'] = np.log(X_validate.loc[:, 'valuePerSecond'] + 1)
X_validate.loc[:, 'lineItemVoidsPerPosition'] = np.log(X_validate.loc[:, 'lineItemVoidsPerPosition'] + 1)
X_validate.loc[:, 'value_per_item'] = np.log(X_validate.loc[:, 'value_per_item'] + 1)

X_test.loc[:, 'scannedLineItemsPerSecond'] = np.log(X_test.loc[:, 'scannedLineItemsPerSecond'] + 1)
X_test.loc[:, 'valuePerSecond'] = np.log(X_test.loc[:, 'valuePerSecond'] + 1)
X_test.loc[:, 'lineItemVoidsPerPosition'] = np.log(X_test.loc[:, 'lineItemVoidsPerPosition'] + 1)
X_test.loc[:, 'value_per_item'] = np.log(X_test.loc[:, 'value_per_item'] + 1)

Perform min-max scaling on the other continuous features

In [14]:
all_cols = [col for col in train.columns if 
            (col != 'fraud') and (col != 'attempted_scan') and (col != 'made_modification') and (col != 'trustLevel') and
            (col != 'scannedLineItemsPerSecond') and (col != 'valuePerSecond') and (col != 'lineItemVoidsPerPosition') and
            (col != 'value_per_item')]

# Create a MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the selected features in the train, validate, and test sets
train[all_cols] = scaler.fit_transform(train[all_cols])
validate[all_cols] = scaler.transform(validate[all_cols])
test[all_cols] = scaler.transform(test[all_cols])

Run logistic regression again on ALL features

In [10]:
feature_cols = [col for col in train.columns if (col != 'fraud')]
target_col = ['fraud']
X_train = train[feature_cols].copy()
y_train = train[target_col].copy()
X_validate = validate[feature_cols].copy()
y_validate = validate[target_col].copy()
X_test  = test[feature_cols].copy()
y_test  = test[target_col].copy()

LogR = LogisticRegression(random_state=42, class_weight='balanced', max_iter=2000) # for imbalanced learning
LogR.fit(X_train, y_train.values.ravel())

# Calculate precision, recall and f1-score for the training dataset
precision_train = precision_score(y_train, LogR.predict(X_train))
recall_train = recall_score(y_train, LogR.predict(X_train))
f1_score_train = f1_score(y_train, LogR.predict(X_train))

# Calculate precision, recall and f1-score for the validation dataset
precision_validate = precision_score(y_validate, LogR.predict(X_validate))
recall_validate = recall_score(y_validate, LogR.predict(X_validate))
f1_score_validate = f1_score(y_validate, LogR.predict(X_validate))

# Calculate precision, recall and f1-score for the test dataset
precision_test = precision_score(y_test, LogR.predict(X_test))
recall_test = recall_score(y_test, LogR.predict(X_test))
f1_score_test = f1_score(y_test, LogR.predict(X_test))

print('Precision over training dataset: {:0.4f}'.format(precision_train))
print('Precision over validation dataset: {:0.4f}'.format(precision_validate))
print('Precision over test dataset: {:0.4f}'.format(precision_test))
print('Recall over training dataset: {:0.4f}'.format(recall_train))
print('Recall over validation dataset: {:0.4f}'.format(recall_validate))
print('Recall over test dataset: {:0.4f}'.format(recall_test))
print('F1 score over training dataset: {:0.4f}'.format(f1_score_train))
print('F1 score over validation dataset: {:0.4f}'.format(f1_score_validate))
print('F1 score over test dataset: {:0.4f}'.format(f1_score_test))

Precision over training dataset: 0.7247
Precision over validation dataset: 0.7268
Precision over test dataset: 0.7088
Recall over training dataset: 0.9958
Recall over validation dataset: 0.9971
Recall over test dataset: 0.9961
F1 score over training dataset: 0.8389
F1 score over validation dataset: 0.8408
F1 score over test dataset: 0.8282


Remove highly correlated features

In [15]:
# Assuming you have already calculated the correlation matrix
correlation_matrix_train = train.corr()

# Set a threshold for high correlation
threshold = 0.7  # You can adjust this threshold as needed

# Find highly correlated pairs of variables
highly_correlated_pairs = set()
for i in range(len(correlation_matrix_train.columns)):
    for j in range(i):
        if abs(correlation_matrix_train.iloc[i, j]) > threshold:
            col1 = correlation_matrix_train.columns[i]
            col2 = correlation_matrix_train.columns[j]
            highly_correlated_pairs.add((col1, col2))

# Print the highly correlated variable pairs
print("Highly correlated variable pairs:")
for pair in highly_correlated_pairs:
    print(f"{pair[0]} and {pair[1]} have a correlation of {correlation_matrix_train.loc[pair[0], pair[1]]}")

Highly correlated variable pairs:
valuePerSecond and scannedLineItemsPerSecond have a correlation of 0.7472137111808428
total_cost and grandTotal have a correlation of 0.9999999999999999


Remove each and see the difference it makes to the model

Removing valuePerSecond only:

In [14]:
feature_cols = [col for col in train.columns if (col != 'fraud') and (col != 'valuePerSecond')]
target_col = ['fraud']
X_train = train[feature_cols].copy()
y_train = train[target_col].copy()
X_validate = validate[feature_cols].copy()
y_validate = validate[target_col].copy()
X_test  = test[feature_cols].copy()
y_test  = test[target_col].copy()

LogR = LogisticRegression(random_state=42, class_weight='balanced', max_iter=2000) # for imbalanced learning
LogR.fit(X_train, y_train.values.ravel())

# Calculate precision, recall and f1-score for the training dataset
precision_train = precision_score(y_train, LogR.predict(X_train))
recall_train = recall_score(y_train, LogR.predict(X_train))
f1_score_train = f1_score(y_train, LogR.predict(X_train))

# Calculate precision, recall and f1-score for the validation dataset
precision_validate = precision_score(y_validate, LogR.predict(X_validate))
recall_validate = recall_score(y_validate, LogR.predict(X_validate))
f1_score_validate = f1_score(y_validate, LogR.predict(X_validate))

# Calculate precision, recall and f1-score for the test dataset
precision_test = precision_score(y_test, LogR.predict(X_test))
recall_test = recall_score(y_test, LogR.predict(X_test))
f1_score_test = f1_score(y_test, LogR.predict(X_test))

print('Precision over training dataset: {:0.4f}'.format(precision_train))
print('Precision over validation dataset: {:0.4f}'.format(precision_validate))
print('Precision over test dataset: {:0.4f}'.format(precision_test))
print('Recall over training dataset: {:0.4f}'.format(recall_train))
print('Recall over validation dataset: {:0.4f}'.format(recall_validate))
print('Recall over test dataset: {:0.4f}'.format(recall_test))
print('F1 score over training dataset: {:0.4f}'.format(f1_score_train))
print('F1 score over validation dataset: {:0.4f}'.format(f1_score_validate))
print('F1 score over test dataset: {:0.4f}'.format(f1_score_test))


Precision over training dataset: 0.7246
Precision over validation dataset: 0.7270
Precision over test dataset: 0.7091
Recall over training dataset: 0.9958
Recall over validation dataset: 0.9971
Recall over test dataset: 0.9961
F1 score over training dataset: 0.8388
F1 score over validation dataset: 0.8409
F1 score over test dataset: 0.8284


Removing scannedLineItemsPerSecond only:

In [16]:
feature_cols = [col for col in train.columns if (col != 'fraud') and (col != 'scannedLineItemsPerSecond')]
target_col = ['fraud']
X_train = train[feature_cols].copy()
y_train = train[target_col].copy()
X_validate = validate[feature_cols].copy()
y_validate = validate[target_col].copy()
X_test  = test[feature_cols].copy()
y_test  = test[target_col].copy()

LogR = LogisticRegression(random_state=42, class_weight='balanced', max_iter=2000) # for imbalanced learning
LogR.fit(X_train, y_train.values.ravel())

# Calculate precision, recall and f1-score for the training dataset
precision_train = precision_score(y_train, LogR.predict(X_train))
recall_train = recall_score(y_train, LogR.predict(X_train))
f1_score_train = f1_score(y_train, LogR.predict(X_train))

# Calculate precision, recall and f1-score for the validation dataset
precision_validate = precision_score(y_validate, LogR.predict(X_validate))
recall_validate = recall_score(y_validate, LogR.predict(X_validate))
f1_score_validate = f1_score(y_validate, LogR.predict(X_validate))

# Calculate precision, recall and f1-score for the test dataset
precision_test = precision_score(y_test, LogR.predict(X_test))
recall_test = recall_score(y_test, LogR.predict(X_test))
f1_score_test = f1_score(y_test, LogR.predict(X_test))

print('Precision over training dataset: {:0.4f}'.format(precision_train))
print('Precision over validation dataset: {:0.4f}'.format(precision_validate))
print('Precision over test dataset: {:0.4f}'.format(precision_test))
print('Recall over training dataset: {:0.4f}'.format(recall_train))
print('Recall over validation dataset: {:0.4f}'.format(recall_validate))
print('Recall over test dataset: {:0.4f}'.format(recall_test))
print('F1 score over training dataset: {:0.4f}'.format(f1_score_train))
print('F1 score over validation dataset: {:0.4f}'.format(f1_score_validate))
print('F1 score over test dataset: {:0.4f}'.format(f1_score_test))

Precision over training dataset: 0.7241
Precision over validation dataset: 0.7255
Precision over test dataset: 0.7082
Recall over training dataset: 0.9958
Recall over validation dataset: 0.9971
Recall over test dataset: 0.9961
F1 score over training dataset: 0.8385
F1 score over validation dataset: 0.8399
F1 score over test dataset: 0.8278


Run RFE to choose the best features

In [18]:
feature_cols = [col for col in train.columns if (col != 'fraud') and (col != 'valuePerSecond')]
target_col = ['fraud']
X_train = train[feature_cols].copy()
y_train = train[target_col].copy()
X_validate = validate[feature_cols].copy()
y_validate = validate[target_col].copy()
X_test  = test[feature_cols].copy()
y_test  = test[target_col].copy()

LogR = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000) # for imbalanced learning

# Use RFE to select the top 10 features
rfe = RFE(LogR, n_features_to_select=11)
rfe.fit(X_train, y_train.values.ravel())

# Print the selected features
print(rfe.support_)

# Get the selected feature indexes
selected_feature_indexes = [i for i, selected in enumerate(rfe.support_) if selected]
selected_features = [feature_cols[i] for i in selected_feature_indexes]
print(selected_features)


[ True  True  True  True  True False  True  True  True  True  True  True]
['trustLevel', 'totalScanTimeInSeconds', 'grandTotal', 'lineItemVoids', 'scansWithoutRegistration', 'scannedLineItemsPerSecond', 'lineItemVoidsPerPosition', 'attempted_scan', 'made_modification', 'total_items_scanned', 'value_per_item']


Based on the selected features, run the logistic regression

In [16]:
selected_features = ['trustLevel', 'lineItemVoids', 'scansWithoutRegistration', 'totalScanTimeInSeconds', 'grandTotal', 'total_items_scanned']

feature_cols = [col for col in selected_features]
target_col = ['fraud']
X_train = train[feature_cols].copy()
y_train = train[target_col].copy()
X_validate = validate[feature_cols].copy()
y_validate = validate[target_col].copy()
X_test  = test[feature_cols].copy()
y_test  = test[target_col].copy()

LogR = LogisticRegression(random_state=42, class_weight='balanced', max_iter=2000) # for imbalanced learning
LogR.fit(X_train, y_train.values.ravel())

# Calculate precision, recall and f1-score for the training dataset
precision_train = precision_score(y_train, LogR.predict(X_train))
recall_train = recall_score(y_train, LogR.predict(X_train))
f1_score_train = f1_score(y_train, LogR.predict(X_train))

# Calculate precision, recall and f1-score for the validation dataset
precision_validate = precision_score(y_validate, LogR.predict(X_validate))
recall_validate = recall_score(y_validate, LogR.predict(X_validate))
f1_score_validate = f1_score(y_validate, LogR.predict(X_validate))

# Calculate precision, recall and f1-score for the test dataset
precision_test = precision_score(y_test, LogR.predict(X_test))
recall_test = recall_score(y_test, LogR.predict(X_test))
f1_score_test = f1_score(y_test, LogR.predict(X_test))

print('Precision over training dataset: {:0.4f}'.format(precision_train))
print('Precision over validation dataset: {:0.4f}'.format(precision_validate))
print('Precision over test dataset: {:0.4f}'.format(precision_test))
print('Recall over training dataset: {:0.4f}'.format(recall_train))
print('Recall over validation dataset: {:0.4f}'.format(recall_validate))
print('Recall over test dataset: {:0.4f}'.format(recall_test))
print('F1 score over training dataset: {:0.4f}'.format(f1_score_train))
print('F1 score over validation dataset: {:0.4f}'.format(f1_score_validate))
print('F1 score over test dataset: {:0.4f}'.format(f1_score_test))


Precision over training dataset: 0.7183
Precision over validation dataset: 0.7184
Precision over test dataset: 0.7054
Recall over training dataset: 0.9955
Recall over validation dataset: 0.9979
Recall over test dataset: 0.9954
F1 score over training dataset: 0.8345
F1 score over validation dataset: 0.8354
F1 score over test dataset: 0.8257
