In [14]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

train_df = pd.read_csv("./data/train_p.csv")
test_df = pd.read_csv("./data/test_p.csv")

# scaled_cols = ['city_pop']
# # Initialize the StandardScaler
# scaler = StandardScaler()
# # Fit on the training set and transform both training and validation sets
# train_df[scaled_cols] = scaler.fit_transform(train_df[scaled_cols])

# Assuming 'train_df' includes both features and the target ('is_fraud')
X = train_df.drop(['is_fraud', 'Id', 'city_pop'], axis=1)
y = train_df['is_fraud']

num_cols = ['amt', 'Time_Delta', 'distance_to_prev', 'location_consistency', 'dist_to_home', 'amt_anomaly_score_cat', 'amt_anomaly_score_merch', 'amt_relative_avg', 'fraud_rate_cat', 'fraud_rate_merch', 'fraud_similarity', 'normal_similarity']

scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [15]:
print(X_train.head(20))

        merchant  category       amt  gender  city  state  job  Time_Delta  \
303198       398         7 -0.134199       1   233     11  463   -0.506999   
204087       476         0 -0.145077       0    98     33  279   -0.376639   
196192       371        10 -0.025994       0   437     16  138   -0.266217   
101664       671         7  0.090209       1   723     21  148   -0.329096   
251609        61         7 -0.098493       0   454     49  139   -0.049974   
73482        316         4  0.754216       0   603      1   67    0.192341   
366791       506         6 -0.431105       0   373     33  466   -0.608219   
212430       129         1 -0.268958       1   571     28  425    1.646231   
335143       361         1 -0.086399       0   342     42  104   -0.579080   
259371       284        12 -0.430273       0   279      8   34    0.680038   
405806        53        12  6.567689       1   200     21  213    0.542011   
56169        667         7 -0.303256       1   795     21  128  

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import pickle

# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(criterion='entropy' ,random_state=42)

# Parameters for GridSearchCV
param_grid = {
    'max_depth': [None, 7, 10, 15],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [2, 4, 8, 10]
}

# GridSearchCV
grid_search = GridSearchCV(dt_classifier, param_grid, cv=5, scoring='f1_micro', n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

# Best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Validation
best_dt = grid_search.best_estimator_

# with open('dt_model4.obj', 'wb') as f:
#     pickle.dump(best_dt, f)


y_pred_dt = best_dt.predict(X_val)

val_f1 = f1_score(y_val, y_pred_dt)

print(best_params, best_score, val_f1)
print(classification_report(y_val, y_pred_dt))

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV 2/5] END max_depth=None, min_samples_leaf=2, min_samples_split=2;, score=0.998 total time=   4.5s
[CV 3/5] END max_depth=None, min_samples_leaf=2, min_samples_split=2;, score=0.998 total time=   4.5s
[CV 4/5] END max_depth=None, min_samples_leaf=2, min_samples_split=2;, score=0.998 total time=   4.5s
[CV 2/5] END max_depth=None, min_samples_leaf=2, min_samples_split=5;, score=0.998 total time=   4.6s
[CV 1/5] END max_depth=None, min_samples_leaf=2, min_samples_split=2;, score=0.998 total time=   4.6s
[CV 3/5] END max_depth=None, min_samples_leaf=2, min_samples_split=5;, score=0.998 total time=   4.6s
[CV 1/5] END max_depth=None, min_samples_leaf=2, min_samples_split=5;, score=0.998 total time=   4.6s
[CV 4/5] END max_depth=None, min_samples_leaf=2, min_samples_split=5;, score=0.998 total time=   4.6s
[CV 5/5] END max_depth=None, min_samples_leaf=2, min_samples_split=5;, score=0.998 total time=   4.7s
[CV 5/5] END max_dep



[CV 4/5] END max_depth=None, min_samples_leaf=10, min_samples_split=10;, score=0.998 total time=   4.4s
[CV 3/5] END max_depth=None, min_samples_leaf=10, min_samples_split=10;, score=0.998 total time=   4.4s
[CV 2/5] END max_depth=None, min_samples_leaf=10, min_samples_split=15;, score=0.999 total time=   4.3s
[CV 5/5] END max_depth=None, min_samples_leaf=10, min_samples_split=10;, score=0.998 total time=   4.4s
[CV 1/5] END max_depth=None, min_samples_leaf=10, min_samples_split=15;, score=0.998 total time=   4.4s
[CV 4/5] END max_depth=None, min_samples_leaf=10, min_samples_split=15;, score=0.998 total time=   4.4s
[CV 3/5] END max_depth=None, min_samples_leaf=10, min_samples_split=15;, score=0.998 total time=   4.4s
[CV 5/5] END max_depth=None, min_samples_leaf=10, min_samples_split=15;, score=0.998 total time=   4.4s
[CV 1/5] END max_depth=7, min_samples_leaf=2, min_samples_split=2;, score=0.998 total time=   3.6s
[CV 2/5] END max_depth=7, min_samples_leaf=2, min_samples_split=2;, s

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score
from sklearn.decomposition import PCA

param_grid_knn = {
    'n_neighbors': range(1, 11),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn = KNeighborsClassifier()

grid_search_knn = GridSearchCV(knn, param_grid_knn, cv=5, scoring='f1_micro', n_jobs=-1, verbose=3)

grid_search_knn.fit(X_train, y_train)

best_params_knn = grid_search_knn.best_params_
best_score_knn = grid_search_knn.best_score_

best_knn = grid_search_knn.best_estimator_

# with open('knn_model.obj', 'wb') as f:
#         pickle.dump(best_knn, f)

y_pred_knn = best_knn.predict(X_val)

print(f1_score(y_val, y_pred_knn))
print(classification_report(y_val, y_pred_knn))

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END metric=euclidean, n_neighbors=1, weights=distance;, score=0.995 total time= 1.2min
[CV 2/5] END metric=euclidean, n_neighbors=1, weights=distance;, score=0.996 total time= 1.2min
[CV 3/5] END metric=euclidean, n_neighbors=1, weights=distance;, score=0.995 total time= 1.2min
[CV 4/5] END metric=euclidean, n_neighbors=1, weights=distance;, score=0.996 total time= 1.2min
[CV 5/5] END metric=euclidean, n_neighbors=1, weights=distance;, score=0.996 total time= 1.2min
[CV 4/5] END metric=euclidean, n_neighbors=1, weights=uniform;, score=0.996 total time= 1.2min
[CV 5/5] END metric=euclidean, n_neighbors=1, weights=uniform;, score=0.996 total time= 1.2min
[CV 3/5] END metric=euclidean, n_neighbors=1, weights=uniform;, score=0.995 total time= 1.2min
[CV 1/5] END metric=euclidean, n_neighbors=1, weights=uniform;, score=0.995 total time= 1.2min
[CV 2/5] END metric=euclidean, n_neighbors=1, weights=uniform;, score=0.996 to



[CV 5/5] END metric=euclidean, n_neighbors=9, weights=distance;, score=0.996 total time= 1.1min
[CV 1/5] END metric=euclidean, n_neighbors=10, weights=uniform;, score=0.996 total time= 1.1min
[CV 2/5] END metric=euclidean, n_neighbors=10, weights=uniform;, score=0.996 total time= 1.1min
[CV 3/5] END metric=euclidean, n_neighbors=10, weights=uniform;, score=0.996 total time= 1.1min
[CV 4/5] END metric=euclidean, n_neighbors=10, weights=uniform;, score=0.996 total time= 1.4min
[CV 2/5] END metric=euclidean, n_neighbors=10, weights=distance;, score=0.996 total time= 1.1min
[CV 3/5] END metric=euclidean, n_neighbors=10, weights=distance;, score=0.996 total time= 1.1min
[CV 5/5] END metric=euclidean, n_neighbors=10, weights=distance;, score=0.996 total time= 1.1min
[CV 1/5] END metric=euclidean, n_neighbors=10, weights=distance;, score=0.996 total time= 1.3min
[CV 5/5] END metric=euclidean, n_neighbors=10, weights=uniform;, score=0.996 total time= 1.3min
[CV 4/5] END metric=euclidean, n_nei