<h3>Import Packages</h3>

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score, accuracy_score

<h3>Data Loading</h3>

In [None]:
# Directory holding the train/validate/test CSV splits. It is defined once so
# the notebook can be pointed at another copy of the data by editing one line.
DATA_DIR = "C:/Users/Kaavya/OneDrive - National University of Singapore/Y3S1/BT4012/Final Project"

# Read CSV files into pandas DataFrames
test = pd.read_csv(f"{DATA_DIR}/test.csv")
train = pd.read_csv(f"{DATA_DIR}/train.csv")
validate = pd.read_csv(f"{DATA_DIR}/validate.csv")

<h3>Logistic Regression on Base Model</h3>

In [None]:
# Label column, kept as a one-element list so that every y_* frame below is a
# single-column DataFrame.
target_col = ['fraud']

# Every column of `train` other than the label is used as a predictor.
# NOTE: this reads `train.columns` when the cell runs. A later cell adds
# `totalItems` to `train` in place, so re-running this cell after that one
# would change the base feature list.
feature_cols = [name for name in train.columns if name not in target_col]

# Independent copies of the predictor / label frames for each split.
X_train, y_train = train[feature_cols].copy(), train[target_col].copy()
X_validate, y_validate = validate[feature_cols].copy(), validate[target_col].copy()
X_test, y_test = test[feature_cols].copy(), test[target_col].copy()

In [None]:
# Base model: logistic regression fitted on every non-label column.
LogR_1 = LogisticRegression(
    class_weight='balanced',  # for imbalanced learning
    max_iter=2000,            # iteration cap passed to the solver
    random_state=42,          # fixed seed
)
# The single-column label frame is flattened to a 1-D array before fitting.
LogR_1.fit(X_train, y_train.values.ravel())

In [None]:
def _print_scores(header, y_true, y_pred, labelled_metrics):
    """Print a score report for one data split.

    Parameters
    ----------
    header : str
        Line printed before the scores.
    y_true, y_pred
        True and predicted labels, passed unchanged to each metric.
    labelled_metrics : iterable of (str, callable)
        Pairs of label text and a metric invoked as ``metric(y_true, y_pred)``.
        One ``"<label>: <score>"`` line is printed per pair, in the order
        given, with the score formatted to four decimal places.

    A horizontal rule is printed after the last score.
    """
    print(header)
    for label, metric in labelled_metrics:
        print('{}: {:0.4f}'.format(label, metric(y_true, y_pred)))
    print("_________________________________________________________")


# Train split: short labels, accuracy listed first.
train_predict = LogR_1.predict(X_train)
_print_scores("Train Data", y_train, train_predict, [
    ('Accuracy score', accuracy_score),
    ('F1 score', f1_score),
    ('Precision score', precision_score),
    ('Recall score', recall_score),
])

# Validation split: labels name the dataset, F1 listed first.
validation_predict = LogR_1.predict(X_validate)
_print_scores("Validate Data", y_validate, validation_predict, [
    ('F1 score over validation dataset', f1_score),
    ('Accuracy score over validation dataset', accuracy_score),
    ('Precision score over validation dataset', precision_score),
    ('Recall score over validation dataset', recall_score),
])

# Test split: same layout as the validation report.
test_predict = LogR_1.predict(X_test)
_print_scores("Test Data", y_test, test_predict, [
    ('F1 score over test dataset', f1_score),
    ('Accuracy score over test dataset', accuracy_score),
    ('Precision score over test dataset', precision_score),
    ('Recall score over test dataset', recall_score),
])

Train Data
Accuracy score: 0.9572
F1 score: 0.6891
Precision score: 0.5296
Recall score: 0.9859
_________________________________________________________
Validate Data
F1 score over validation dataset: 0.6890
Accuracy score over validation dataset: 0.9569
Precision score over validation dataset: 0.5303
Recall score over validation dataset: 0.9831
_________________________________________________________
Test Data
F1 score over test dataset: 0.6729
Accuracy score over test dataset: 0.9563
Precision score over test dataset: 0.5111
Recall score over test dataset: 0.9844
_________________________________________________________


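The evaluation metrics are consistent across the train, validation and test sets, which suggests that the model is not overfitting. However, Recall is very high (≈0.98–0.99) while Precision is quite low (≈0.51–0.53): the model catches almost every fraud case, but nearly half of the transactions it flags as fraud are false positives.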

Let's add our engineered feature `totalItems` and reduce model complexity by fitting the logistic regression on only the selected features.

<h3>Feature Engineering</h3>

<h5>Total Number of Items Scanned</h5>

In [None]:
# Add `totalItems` (total scan time in seconds × scanned line items per second)
# as a new column on each split, in place.
for split in (train, validate, test):
    split['totalItems'] = split['totalScanTimeInSeconds'] * split['scannedLineItemsPerSecond']

<h3>Logistic Regression on Selected Features</h3>

In [None]:
# Columns fed to the second model, including the engineered `totalItems`.
selected_features = [
    'trustLevel',
    'totalScanTimeInSeconds',
    'grandTotal',
    'lineItemVoids',
    'scansWithoutRegistration',
    'totalItems',
]

# `feature_cols` is rebound to a fresh copy of the selected list; from here on
# it no longer refers to the base model's feature list.
feature_cols = list(selected_features)

# Independent copies of the selected columns for each split.
X_train_selected_features = train.loc[:, feature_cols].copy()
X_validate_selected_features = validate.loc[:, feature_cols].copy()
X_test_selected_features = test.loc[:, feature_cols].copy()

In [None]:
# Second model: same estimator settings as LogR_1, fitted on the selected
# feature columns only.
LogR_2 = LogisticRegression(
    class_weight='balanced',  # for imbalanced learning
    max_iter=2000,            # iteration cap passed to the solver
    random_state=42,          # fixed seed
)
# The single-column label frame is flattened to a 1-D array before fitting.
LogR_2.fit(X_train_selected_features, y_train.values.ravel())

In [None]:
# Train split: short labels, accuracy listed first.
train_predict = LogR_2.predict(X_train_selected_features)
print("Train Data")
for label, metric in [
    ('Accuracy score', accuracy_score),
    ('F1 score', f1_score),
    ('Precision score', precision_score),
    ('Recall score', recall_score),
]:
    print('{}: {:0.4f}'.format(label, metric(y_train, train_predict)))
print("_________________________________________________________")

# Validation split: labels name the dataset, F1 listed first.
validation_predict = LogR_2.predict(X_validate_selected_features)
print("Validate Data")
for label, metric in [
    ('F1', f1_score),
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
]:
    print('{} score over validation dataset: {:0.4f}'.format(label, metric(y_validate, validation_predict)))
print("_________________________________________________________")

# Test split: same layout as the validation report.
test_predict = LogR_2.predict(X_test_selected_features)
print("Test Data")
for label, metric in [
    ('F1', f1_score),
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
]:
    print('{} score over test dataset: {:0.4f}'.format(label, metric(y_test, test_predict)))
print("_________________________________________________________")

Train Data
Accuracy score: 0.9823
F1 score: 0.8437
Precision score: 0.7327
Recall score: 0.9944
_________________________________________________________
Validate Data
F1 score over validation dataset: 0.8433
Accuracy score over validation dataset: 0.9820
Precision score over validation dataset: 0.7317
Recall score over validation dataset: 0.9951
_________________________________________________________
Test Data
F1 score over test dataset: 0.8344
Accuracy score over test dataset: 0.9820
Precision score over test dataset: 0.7188
Recall score over test dataset: 0.9943
_________________________________________________________


By creating a new column `totalItems` and selecting our best features, we increased Precision (from 0.5111 to 0.7188 on the test set) and consequently the F1-Score (from 0.6729 to 0.8344), while Recall stayed at about 0.99.