In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score

In [2]:
# Read the three pre-split datasets (comma-separated) from the working directory.
train, validate, test = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('train.csv', 'validate.csv', 'test.csv')
)

In [3]:
# Row counts of the train, validation and test splits, one per line.
for split_frame in (train, validate, test):
    print(len(split_frame))

350000
50000
100000


In [4]:
# Baseline: class-weighted logistic regression on the raw columns, evaluated on
# the train / validation / test splits with precision, recall and F1, plus a
# confusion matrix for the test split.

# Every column except the label is used as a feature.
feature_cols = [col for col in train.columns if (col != 'fraud')]
target_col = ['fraud']
X_train = train[feature_cols].copy()
y_train = train[target_col].copy()
X_validate = validate[feature_cols].copy()
y_validate = validate[target_col].copy()
X_test  = test[feature_cols].copy()
y_test  = test[target_col].copy()

LogR = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000) # for imbalanced learning
# y_train is a single-column DataFrame; flatten it to shape (n_samples,) so that
# scikit-learn does not emit the `column_or_1d` DataConversionWarning on fit.
LogR.fit(X_train, np.ravel(y_train))

# Predict once per split and reuse the result for every metric below
# (the predictions are deterministic, so recomputing them was pure overhead).
pred_train = LogR.predict(X_train)
pred_validate = LogR.predict(X_validate)
pred_test = LogR.predict(X_test)

# Calculate precision and recall for the training dataset
precision_train = precision_score(y_train, pred_train)
recall_train = recall_score(y_train, pred_train)

# Calculate precision and recall for the validation dataset
precision_validate = precision_score(y_validate, pred_validate)
recall_validate = recall_score(y_validate, pred_validate)

# Calculate precision and recall for the test dataset
precision_test = precision_score(y_test, pred_test)
recall_test = recall_score(y_test, pred_test)

print('Precision over training dataset: {:0.4f}'.format(precision_train))
print('Precision over validation dataset: {:0.4f}'.format(precision_validate))
print('Precision over test dataset: {:0.4f}'.format(precision_test))
print('Recall over training dataset: {:0.4f}'.format(recall_train))
print('Recall over validation dataset: {:0.4f}'.format(recall_validate))
print('Recall over test dataset: {:0.4f}'.format(recall_test))
print('F1 score over training dataset: {:0.4f}'.format(f1_score(y_train, pred_train)))
print('F1 score over validation dataset: {:0.4f}'.format(f1_score(y_validate, pred_validate)))
print('F1 score over test dataset: {:0.4f}'.format(f1_score(y_test, pred_test)))

# Confusion matrix on the test split: rows are true labels, columns are predictions.
temp = pred_test
c_mat = confusion_matrix(y_test, temp)
print(c_mat)

  y = column_or_1d(y, warn=True)


Precision over training dataset: 0.5312
Precision over validation dataset: 0.5320
Precision over test dataset: 0.5124
Recall over training dataset: 0.9857
Recall over validation dataset: 0.9823
Recall over test dataset: 0.9840
F1 score over training dataset: 0.6904
F1 score over validation dataset: 0.6902
F1 score over test dataset: 0.6739
[[91163  4273]
 [   73  4491]]


Create new features that might be useful
- attempted_scan: attempted (1) or did not attempt (0)
- made_modification: made modification (1) or did not make any modification (0)
- no. of items scanned = totalScanTimeInSeconds * scannedLineItemsPerSecond
- avg value of each item = grandTotal / no. of items scanned

In [5]:
# Binary indicator features: 1 when the source count is strictly positive, else 0.
#   attempted_scan    <- scansWithoutRegistration
#   made_modification <- quantityModifications
flag_sources = {
    'attempted_scan': 'scansWithoutRegistration',
    'made_modification': 'quantityModifications',
}
for flag_name, count_col in flag_sources.items():
    for split_frame in (train, validate, test):
        split_frame[flag_name] = np.where(split_frame[count_col] > 0, 1, 0)

In [6]:
# Derived features, added to each split in the same order:
#   total_items_scanned = totalScanTimeInSeconds * scannedLineItemsPerSecond
#   value_per_item      = grandTotal / total_items_scanned
# NOTE(review): a row whose total_items_scanned is 0 would yield inf/NaN here;
# presumably the data contains no such rows — confirm.
for split_frame in (train, validate, test):
    split_frame['total_items_scanned'] = (
        split_frame['totalScanTimeInSeconds'] * split_frame['scannedLineItemsPerSecond']
    )
    split_frame['value_per_item'] = (
        split_frame['grandTotal'] / split_frame['total_items_scanned']
    )

**Logistic regression before anything**

Logistic Regression before scaling and binning

In [7]:
# Rebuild the feature matrices so they pick up the newly engineered columns.
# Labels are kept as single-column DataFrames (target_col is a list).
target_col = ['fraud']
feature_cols = [name for name in train.columns if name != 'fraud']

X_train, y_train = train[feature_cols].copy(), train[target_col].copy()
X_validate, y_validate = validate[feature_cols].copy(), validate[target_col].copy()
X_test, y_test = test[feature_cols].copy(), test[target_col].copy()

In [8]:
# Class-weighted logistic regression on the raw + engineered (still unscaled)
# columns, evaluated on train / validation / test with precision, recall and F1.

# Every column except the label is used as a feature.
feature_cols = [col for col in train.columns if (col != 'fraud')]
target_col = ['fraud']
X_train = train[feature_cols].copy()
y_train = train[target_col].copy()
X_validate = validate[feature_cols].copy()
y_validate = validate[target_col].copy()
X_test  = test[feature_cols].copy()
y_test  = test[target_col].copy()

# NOTE(review): the recorded run of this cell stopped at the iteration limit
# (ConvergenceWarning), so the metrics below come from a non-converged fit;
# the warning itself suggests scaling the data or raising max_iter.
LogR = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000) # for imbalanced learning
# y_train is a single-column DataFrame; flatten it to shape (n_samples,) so that
# scikit-learn does not emit the `column_or_1d` DataConversionWarning on fit.
LogR.fit(X_train, np.ravel(y_train))

# Predict once per split and reuse the result for every metric below.
pred_train = LogR.predict(X_train)
pred_validate = LogR.predict(X_validate)
pred_test = LogR.predict(X_test)

# Calculate precision and recall for the training dataset
precision_train = precision_score(y_train, pred_train)
recall_train = recall_score(y_train, pred_train)

# Calculate precision and recall for the validation dataset
precision_validate = precision_score(y_validate, pred_validate)
recall_validate = recall_score(y_validate, pred_validate)

# Calculate precision and recall for the test dataset
precision_test = precision_score(y_test, pred_test)
recall_test = recall_score(y_test, pred_test)

print('Precision over training dataset: {:0.4f}'.format(precision_train))
print('Precision over validation dataset: {:0.4f}'.format(precision_validate))
print('Precision over test dataset: {:0.4f}'.format(precision_test))
print('Recall over training dataset: {:0.4f}'.format(recall_train))
print('Recall over validation dataset: {:0.4f}'.format(recall_validate))
print('Recall over test dataset: {:0.4f}'.format(recall_test))
print('F1 score over training dataset: {:0.4f}'.format(f1_score(y_train, pred_train)))
print('F1 score over validation dataset: {:0.4f}'.format(f1_score(y_validate, pred_validate)))
print('F1 score over test dataset: {:0.4f}'.format(f1_score(y_test, pred_test)))


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Precision over training dataset: 0.7232
Precision over validation dataset: 0.7284
Precision over test dataset: 0.7098
Recall over training dataset: 0.9941
Recall over validation dataset: 0.9942
Recall over test dataset: 0.9934
F1 score over training dataset: 0.8373
F1 score over validation dataset: 0.8408
F1 score over test dataset: 0.8280


Make trust levels as factors
- don't need?

Log scaling for scannedLineItemsPerSecond, lineItemVoidsPerPosition, total_items_scanned, totalScanTimeInSeconds


In [8]:
# Build log-transformed variants on deep copies so train / test / validate
# themselves are left untouched.
train_log = train.copy(deep=True)
test_log = test.copy(deep=True)
validate_log = validate.copy(deep=True)

# Natural log of the selected columns; 1e-10 is added first, presumably to
# avoid log(0) = -inf on zero entries.
log_cols = ['scannedLineItemsPerSecond', 'valuePerSecond', 'lineItemVoidsPerPosition']
for frame_log in (train_log, test_log, validate_log):
    frame_log[log_cols] = np.log(frame_log[log_cols] + 1e-10)

# NOTE(review): none of the *_log frames is read by the other cells visible in
# this notebook, and the column list here differs from the one named in the
# markdown heading above this cell — confirm which is intended.

Scaling all continuous variables

In [9]:
# Standardise the continuous columns with a scaler fitted on the training split
# only, store the results as `scaled_<name>` and drop the unscaled originals.

# Columns that are never passed to the scaler: the label, the binary flags,
# trustLevel, and the three per-second / per-position columns.
unscaled_cols = {
    'fraud', 'attempted_scan', 'made_modification', 'trustLevel',
    'scannedLineItemsPerSecond', 'valuePerSecond', 'lineItemVoidsPerPosition',
}
# Also skip columns that already carry the `scaled_` prefix: this cell replaces
# columns of train/validate/test in place, so without this check a second run
# would scale the scaled columns again and create `scaled_scaled_*` columns.
all_cols = [col for col in train.columns
            if col not in unscaled_cols and not str(col).startswith('scaled_')]
new_col_names = [f'scaled_{col}' for col in all_cols]

if all_cols:  # empty on a re-run: nothing left to scale
    # Fit on train only, then apply the same transform to every split.
    scaler = StandardScaler().fit(train[all_cols])
    train[new_col_names] = scaler.transform(train[all_cols])
    validate[new_col_names] = scaler.transform(validate[all_cols])
    test[new_col_names] = scaler.transform(test[all_cols])

    # drop original columns
    train = train.drop(columns = all_cols)
    validate = validate.drop(columns = all_cols)
    test = test.drop(columns = all_cols)

In [11]:
# Preview the first rows of the training frame after the scaling step
train.head()

Unnamed: 0,trustLevel,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud,attempted_scan,made_modification,scaled_totalScanTimeInSeconds,scaled_grandTotal,scaled_lineItemVoids,scaled_scansWithoutRegistration,scaled_quantityModifications,scaled_total_items_scanned,scaled_value_per_item
0,2,0.012834,0.046995,0.166667,0,1,1,0.035712,-0.209627,-1.014426,1.266427,1.464869,-0.404018,-0.258825
1,4,0.008403,0.078887,2.5,0,1,1,-0.832083,-0.430823,1.305782,0.31733,-0.877377,-1.327436,0.231571
2,5,0.016279,0.029093,0.571429,0,1,1,-0.919052,-1.29761,-0.434374,0.000964,0.293746,-0.981154,-0.419371
3,1,0.027344,0.141328,0.714286,0,1,1,-1.24802,-0.478247,-0.144348,-0.948134,0.879307,-0.981154,-0.129764
4,2,0.017785,0.078868,0.363636,0,1,1,0.60668,1.646488,0.72573,0.950061,0.293746,0.750256,-0.192631


In [10]:
# Rebuild the feature matrices from the current columns of each split.
# Labels are kept as single-column DataFrames (target_col is a list).
target_col = ['fraud']
feature_cols = [name for name in train.columns if name != 'fraud']

X_train, y_train = train[feature_cols].copy(), train[target_col].copy()
X_validate, y_validate = validate[feature_cols].copy(), validate[target_col].copy()
X_test, y_test = test[feature_cols].copy(), test[target_col].copy()

In [11]:
# Class-weighted logistic regression on the current X_train / y_train,
# evaluated on train / validation / test with precision, recall and F1.
LogR = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000) # for imbalanced learning
# Flatten the label to shape (n_samples,): passing a single-column DataFrame
# makes scikit-learn emit the `column_or_1d` DataConversionWarning on fit.
LogR.fit(X_train, np.ravel(y_train))

# Predict once per split and reuse the result for every metric below.
pred_train = LogR.predict(X_train)
pred_validate = LogR.predict(X_validate)
pred_test = LogR.predict(X_test)

# Calculate precision and recall for the training dataset
precision_train = precision_score(y_train, pred_train)
recall_train = recall_score(y_train, pred_train)

# Calculate precision and recall for the validation dataset
precision_validate = precision_score(y_validate, pred_validate)
recall_validate = recall_score(y_validate, pred_validate)

# Calculate precision and recall for the test dataset
precision_test = precision_score(y_test, pred_test)
recall_test = recall_score(y_test, pred_test)

print('Precision over training dataset: {:0.4f}'.format(precision_train))
print('Precision over validation dataset: {:0.4f}'.format(precision_validate))
print('Precision over test dataset: {:0.4f}'.format(precision_test))
print('Recall over training dataset: {:0.4f}'.format(recall_train))
print('Recall over validation dataset: {:0.4f}'.format(recall_validate))
print('Recall over test dataset: {:0.4f}'.format(recall_test))
print('F1 score over training dataset: {:0.4f}'.format(f1_score(y_train, pred_train)))
print('F1 score over validation dataset: {:0.4f}'.format(f1_score(y_validate, pred_validate)))
print('F1 score over test dataset: {:0.4f}'.format(f1_score(y_test, pred_test)))

  y = column_or_1d(y, warn=True)


Precision over training dataset: 0.7347
Precision over validation dataset: 0.7353
Precision over test dataset: 0.7216
Recall over training dataset: 0.9946
Recall over validation dataset: 0.9963
Recall over test dataset: 0.9947
F1 score over training dataset: 0.8451
F1 score over validation dataset: 0.8461
F1 score over test dataset: 0.8364


In [None]:
# Preview the first rows of the training frame
train.head()

Remove some features that are highly correlated to one another

In [17]:

# Pairwise correlations between all columns of the training frame
# (the `fraud` label is part of `train`, so it is included as well).
correlation_matrix_train = train.corr()

# Set a threshold for high correlation
threshold = 0.7  # You can adjust this threshold as needed

# Find highly correlated pairs of variables.
# Only the strictly-lower triangle (j < i) is visited, so each unordered pair
# is considered once and the diagonal (self-correlation) is skipped.
highly_correlated_pairs = set()
for i in range(len(correlation_matrix_train.columns)):
    for j in range(i):
        if abs(correlation_matrix_train.iloc[i, j]) > threshold:
            col1 = correlation_matrix_train.columns[i]
            col2 = correlation_matrix_train.columns[j]
            highly_correlated_pairs.add((col1, col2))

# Print the highly correlated variable pairs.
# Iterate in sorted order: a set of string tuples has no stable iteration
# order across kernel restarts, which made the printed order non-reproducible.
print("Highly correlated variable pairs:")
for pair in sorted(highly_correlated_pairs):
    print(f"{pair[0]} and {pair[1]} have a correlation of {correlation_matrix_train.loc[pair[0], pair[1]]}")


Highly correlated variable pairs:
valuePerSecond and scannedLineItemsPerSecond have a correlation of 0.7472137111808428


Remove those highly correlated
- scannedLineItemsPerSecond
- totalScanTimeInSeconds
- lineItemVoidsPerPosition

In [13]:
# Feature set with the label and three further columns left out.
# NOTE(review): the columns excluded here are not the ones listed in the
# markdown cell above ("Remove those highly correlated") — confirm which list
# is the intended one.
excluded_cols = ('fraud', 'scaled_quantityModifications', 'scaled_value_per_item', 'valuePerSecond')
target_col = ['fraud']
feature_cols = [name for name in train.columns if name not in excluded_cols]

X_train, y_train = train[feature_cols].copy(), train[target_col].copy()
X_validate, y_validate = validate[feature_cols].copy(), validate[target_col].copy()
X_test, y_test = test[feature_cols].copy(), test[target_col].copy()

In [16]:
# Show the list of feature names currently selected for the model
print(feature_cols)

['trustLevel', 'scannedLineItemsPerSecond', 'lineItemVoidsPerPosition', 'attempted_scan', 'made_modification', 'scaled_totalScanTimeInSeconds', 'scaled_grandTotal', 'scaled_lineItemVoids', 'scaled_scansWithoutRegistration', 'scaled_total_items_scanned']


In [14]:
# Class-weighted logistic regression on the current X_train / y_train,
# evaluated on train / validation / test with precision, recall and F1.
LogR = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000) # for imbalanced learning
# Flatten the label to shape (n_samples,): passing a single-column DataFrame
# makes scikit-learn emit the `column_or_1d` DataConversionWarning on fit.
LogR.fit(X_train, np.ravel(y_train))

# Predict once per split and reuse the result for every metric below.
pred_train = LogR.predict(X_train)
pred_validate = LogR.predict(X_validate)
pred_test = LogR.predict(X_test)

# Calculate precision and recall for the training dataset
precision_train = precision_score(y_train, pred_train)
recall_train = recall_score(y_train, pred_train)

# Calculate precision and recall for the validation dataset
precision_validate = precision_score(y_validate, pred_validate)
recall_validate = recall_score(y_validate, pred_validate)

# Calculate precision and recall for the test dataset
precision_test = precision_score(y_test, pred_test)
recall_test = recall_score(y_test, pred_test)

print('Precision over training dataset: {:0.4f}'.format(precision_train))
print('Precision over validation dataset: {:0.4f}'.format(precision_validate))
print('Precision over test dataset: {:0.4f}'.format(precision_test))
print('Recall over training dataset: {:0.4f}'.format(recall_train))
print('Recall over validation dataset: {:0.4f}'.format(recall_validate))
print('Recall over test dataset: {:0.4f}'.format(recall_test))
print('F1 score over training dataset: {:0.4f}'.format(f1_score(y_train, pred_train)))
print('F1 score over validation dataset: {:0.4f}'.format(f1_score(y_validate, pred_validate)))
print('F1 score over test dataset: {:0.4f}'.format(f1_score(y_test, pred_test)))

  y = column_or_1d(y, warn=True)


Precision over training dataset: 0.7348
Precision over validation dataset: 0.7344
Precision over test dataset: 0.7212
Recall over training dataset: 0.9946
Recall over validation dataset: 0.9955
Recall over test dataset: 0.9947
F1 score over training dataset: 0.8452
F1 score over validation dataset: 0.8453
F1 score over test dataset: 0.8362
