<h3>Import Packages</h3>

In [None]:
import pandas as pd
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

<h3>Data Loading</h3>

In [None]:
# Load the three pre-split datasets from CSV files in the working directory.
# The files are read in the same order as before: test, train, validate.
test, train, validate = (
    pd.read_csv(csv_path)
    for csv_path in ("test.csv", "train.csv", "validate.csv")
)

<h3>Feature Engineering</h3>

In [None]:
# Add a `totalItems` column to every split, computed as the product of the
# `totalScanTimeInSeconds` and `scannedLineItemsPerSecond` columns.
for split_df in (train, test, validate):
    split_df["totalItems"] = (
        split_df["totalScanTimeInSeconds"] * split_df["scannedLineItemsPerSecond"]
    )

<h3>XGBoost on Selected Features</h3>

In [None]:
# Columns used as model inputs.
selected_features = [
    'trustLevel',
    'totalScanTimeInSeconds',
    'grandTotal',
    'lineItemVoids',
    'scansWithoutRegistration',
    'totalItems',
]

feature_cols = list(selected_features)
# Kept as a one-element list, so every y_* below is a single-column DataFrame.
target_col = ['fraud']

# Independent copies of the inputs and the target for each split.
X_train_selected_features, y_train = train[feature_cols].copy(), train[target_col].copy()
X_validate_selected_features, y_validate = validate[feature_cols].copy(), validate[target_col].copy()
X_test_selected_features, y_test = test[feature_cols].copy(), test[target_col].copy()

In [None]:
# Baseline classifier: every hyperparameter not listed here keeps the
# XGBoost default.
xgb_basic = XGBClassifier(
        objective='binary:logistic',  # Since we are dealing with binary classification
        # 1 is XGBoost's default and applies NO re-weighting of the positive
        # class. To actually compensate for class imbalance, the XGBoost docs
        # suggest a value around sum(negative instances) / sum(positive instances).
        scale_pos_weight=1,
        # `random_state` is the documented scikit-learn API parameter; `seed`
        # only reached the booster through the **kwargs passthrough.
        random_state=27)
xgb_basic.fit(X_train_selected_features, y_train)

# Class predictions consumed by the metric report in the next cell.
y_pred_validate = xgb_basic.predict(X_validate_selected_features)
y_pred_train = xgb_basic.predict(X_train_selected_features)

In [None]:
# Print accuracy, F1, precision and recall of the current predictions for the
# training split and then for the validation split.
metric_reports = [
    ('Accuracy score: {:0.4f}', accuracy_score),
    ('F1 score: {:0.4f}', f1_score),
    ('Precision score: {:0.4f}', precision_score),
    ('Recall score: {:0.4f}', recall_score),
]

for split_title, y_true, y_hat in [
    ("Train Data", y_train, y_pred_train),
    ("Validate Data", y_validate, y_pred_validate),
]:
    print(split_title)
    for line_template, metric_fn in metric_reports:
        print(line_template.format(metric_fn(y_true, y_hat)))
    print("_________________________________________________________")


Train Data
Accuracy score: 0.9998
F1 score: 0.9979
Precision score: 0.9970
Recall score: 0.9988
_________________________________________________________
Validate Data
Accuracy score: 0.9992
F1 score: 0.9914
Precision score: 0.9905
Recall score: 0.9922
_________________________________________________________


Check the importance of each feature in the baseline model (`xgb_basic`).

In [None]:
# Pair every input column with the importance score reported by `xgb_basic`.
importance = xgb_basic.feature_importances_
feature_names = X_train_selected_features.columns
feature_importance_dict = {
    name: score for name, score in zip(feature_names, importance)
}

# Rank the (feature, score) pairs from most to least important.
sorted_feature_importance = sorted(
    feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

for feature, importance_score in sorted_feature_importance:
    print(f"{feature}: {importance_score}")

trustLevel: 0.4074535369873047
totalItems: 0.3231649398803711
lineItemVoids: 0.09191131591796875
scansWithoutRegistration: 0.08899376541376114
totalScanTimeInSeconds: 0.07254774868488312
grandTotal: 0.015928681939840317


<h3>Hyperparameter Tuning</h3>

In [None]:
# Manually configured starting point for the tuning that follows.
xgb1 = XGBClassifier(
        learning_rate=0.1,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,          # each tree is fit on 80% of the rows
        colsample_bytree=0.8,   # each tree is fit on 80% of the columns
        objective='binary:logistic',  # Since we are dealing with binary classification
        # 1 is XGBoost's default and applies NO re-weighting of the positive
        # class. To actually compensate for class imbalance, the XGBoost docs
        # suggest a value around sum(negative instances) / sum(positive instances).
        scale_pos_weight=1,
        # `random_state` is the documented scikit-learn API parameter; `seed`
        # only reached the booster through the **kwargs passthrough.
        random_state=27)
xgb1.fit(X_train_selected_features, y_train)

# Class predictions consumed by the metric report in the next cell.
y_pred_validate = xgb1.predict(X_validate_selected_features)
y_pred_train = xgb1.predict(X_train_selected_features)


In [None]:
# Print accuracy, F1, precision and recall of the current predictions for the
# training split and then for the validation split.
score_lines = (
    ('Accuracy score: {:0.4f}', accuracy_score),
    ('F1 score: {:0.4f}', f1_score),
    ('Precision score: {:0.4f}', precision_score),
    ('Recall score: {:0.4f}', recall_score),
)
evaluated_splits = (
    ("Train Data", y_train, y_pred_train),
    ("Validate Data", y_validate, y_pred_validate),
)

for split_title, y_true, y_hat in evaluated_splits:
    print(split_title)
    for line_template, metric_fn in score_lines:
        print(line_template.format(metric_fn(y_true, y_hat)))
    print("_________________________________________________________")


Train Data
Accuracy score: 0.9980
F1 score: 0.9795
Precision score: 0.9803
Recall score: 0.9787
_________________________________________________________
Validate Data
Accuracy score: 0.9977
F1 score: 0.9759
Precision score: 0.9777
Recall score: 0.9741
_________________________________________________________


Check the importance of each feature in the manually configured model (`xgb1`).

In [None]:
# Map each input column to the importance score reported by `xgb1`.
importance = xgb1.feature_importances_
feature_names = X_train_selected_features.columns
feature_importance_dict = dict(zip(feature_names, importance))

# Order the (feature, score) pairs by score, highest first.
sorted_feature_importance = list(feature_importance_dict.items())
sorted_feature_importance.sort(key=lambda item: item[1], reverse=True)

for feature, importance_score in sorted_feature_importance:
    print(f"{feature}: {importance_score}")

trustLevel: 0.4817792773246765
totalItems: 0.3538365960121155
totalScanTimeInSeconds: 0.05783690884709358
scansWithoutRegistration: 0.052156370133161545
lineItemVoids: 0.04597991332411766
grandTotal: 0.008410969749093056


<h4>Grid Search</h4>

Use grid search to tune `max_depth` and `min_child_weight`, the two hyperparameters expected to affect the XGBoost model the most.

In [None]:
# Search grid: max_depth in 3..9 and min_child_weight in 1..5 (35 combinations).
param_test1 = {
    'max_depth': range(3, 10),
    'min_child_weight': range(1, 6),
}

# Exhaustive search scored by F1 with 5-fold cross-validation on the training
# split only.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        max_depth=5,          # placeholder, overridden by the grid
        min_child_weight=1,   # placeholder, overridden by the grid
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        scale_pos_weight=1,   # XGBoost default: no class re-weighting
        # `random_state` is the documented scikit-learn API parameter. `seed`
        # only worked through the **kwargs passthrough, which XGBoost does not
        # guarantee to interact properly with scikit-learn (GridSearchCV
        # clones this estimator for every candidate).
        random_state=27,
    ),
    param_grid=param_test1,
    scoring='f1',
    cv=5,
)
gsearch1.fit(X_train_selected_features, y_train)

In [None]:
# Pull the winning cross-validation score, its parameter combination and the
# corresponding estimator out of the fitted grid search, then display them.
best_score, best_params, best_xgboost = (
    gsearch1.best_score_,
    gsearch1.best_params_,
    gsearch1.best_estimator_,
)

print("Best Score:", best_score)
print("Best Parameters:", best_params)
print(best_xgboost)

Best Score: 0.9825041948585677
Best Parameters: {'max_depth': 7, 'min_child_weight': 3}
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=7, max_leaves=None,
              min_child_weight=3, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)


<h5>Best Model from Grid Search</h5>

In [None]:
# Predict classes for all three splits with the estimator chosen by the grid
# search (validation, training, test - in that order).
y_pred_validate, y_pred_train, y_pred_test = map(
    best_xgboost.predict,
    (
        X_validate_selected_features,
        X_train_selected_features,
        X_test_selected_features,
    ),
)

In [None]:
# Print accuracy, F1, precision and recall of the current predictions for the
# training, validation and test splits, in that order.
metric_reports = [
    ('Accuracy score: {:0.4f}', accuracy_score),
    ('F1 score: {:0.4f}', f1_score),
    ('Precision score: {:0.4f}', precision_score),
    ('Recall score: {:0.4f}', recall_score),
]

for split_title, y_true, y_hat in [
    ("Train Data", y_train, y_pred_train),
    ("Validate Data", y_validate, y_pred_validate),
    ("Test Data", y_test, y_pred_test),
]:
    print(split_title)
    for line_template, metric_fn in metric_reports:
        print(line_template.format(metric_fn(y_true, y_hat)))
    print("_________________________________________________________")

Train Data
Accuracy score: 0.9992
F1 score: 0.9917
Precision score: 0.9898
Recall score: 0.9935
_________________________________________________________
Validate Data
Accuracy score: 0.9983
F1 score: 0.9828
Precision score: 0.9799
Recall score: 0.9856
_________________________________________________________
Test Data
Accuracy score: 0.9983
F1 score: 0.9813
Precision score: 0.9805
Recall score: 0.9820
_________________________________________________________


Check the importance of each feature in the best model found by the grid search (`best_xgboost`).

In [None]:
# Associate each input column with the importance score reported by
# `best_xgboost`.
importance = best_xgboost.feature_importances_
feature_names = X_train_selected_features.columns
feature_importance_dict = {
    column: weight for column, weight in zip(feature_names, importance)
}

# Highest-scoring features first.
sorted_feature_importance = sorted(
    feature_importance_dict.items(),
    key=lambda entry: entry[1],
    reverse=True,
)

for feature, importance_score in sorted_feature_importance:
    print(f"{feature}: {importance_score}")

trustLevel: 0.7196536660194397
totalItems: 0.1943889856338501
scansWithoutRegistration: 0.029879899695515633
totalScanTimeInSeconds: 0.027823973447084427
lineItemVoids: 0.024705473333597183
grandTotal: 0.0035480051301419735
