In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

train_df = pd.read_csv("./data/train_processed.csv")
test_df = pd.read_csv("./data/test_processed.csv")



In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

scaled_cols = ['amt', 'city_pop', 'trans_dist']
# Assuming 'train_df' includes both features and the target ('is_fraud')
X = train_df.drop(['is_fraud', 'Id'], axis=1)  # Dropping the target to isolate features
y = train_df['is_fraud']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [3]:
# # Initialize the StandardScaler
# scaler = StandardScaler()
# # Fit on the training set and transform both training and validation sets
# X_train[scaled_cols] = scaler.fit_transform(X_train[scaled_cols])
# X_val[scaled_cols] = scaler.transform(X_val[scaled_cols])

In [5]:
import pickle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

param_grid_knn = {
    'n_neighbors': range(1, 21),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn = KNeighborsClassifier()

grid_search_knn = GridSearchCV(knn, param_grid_knn, cv=5, scoring='f1_micro', n_jobs=-1, verbose=3)

grid_search_knn.fit(X_train, y_train)

best_params_knn = grid_search_knn.best_params_
best_score_knn = grid_search_knn.best_score_

best_knn = grid_search_knn.best_estimator_
with open('best_knn.obj', 'wb') as f:
    pickle.dump(best_knn, f)
y_pred_knn = best_knn.predict(X_val)
f1_score_knn = f1_score(y_val, y_pred_knn)
print(best_params_knn, best_score_knn, f1_score_knn)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV 3/5] END metric=euclidean, n_neighbors=1, weights=distance;, score=0.995 total time=   1.4s
[CV 4/5] END metric=euclidean, n_neighbors=1, weights=distance;, score=0.996 total time=   1.4s




[CV 1/5] END metric=euclidean, n_neighbors=2, weights=distance;, score=0.995 total time=   1.6s
[CV 1/5] END metric=euclidean, n_neighbors=1, weights=distance;, score=0.995 total time=   1.3s
[CV 2/5] END metric=euclidean, n_neighbors=1, weights=distance;, score=0.996 total time=   1.6s
[CV 4/5] END metric=euclidean, n_neighbors=1, weights=uniform;, score=0.996 total time=   2.3s
[CV 1/5] END metric=euclidean, n_neighbors=1, weights=uniform;, score=0.995 total time=   2.4s
[CV 5/5] END metric=euclidean, n_neighbors=1, weights=distance;, score=0.996 total time=   1.6s
[CV 2/5] END metric=euclidean, n_neighbors=1, weights=uniform;, score=0.996 total time=   2.5s
[CV 3/5] END metric=euclidean, n_neighbors=1, weights=uniform;, score=0.995 total time=   2.6s
[CV 1/5] END metric=euclidean, n_neighbors=2, weights=uniform;, score=0.996 total time=   2.6s
[CV 4/5] END metric=euclidean, n_neighbors=2, weights=uniform;, score=0.997 total time=   2.6s
[CV 3/5] END metric=euclidean, n_neighbors=2, 

In [19]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

with open('best_bagging.obj', 'rb') as f:
    bag_clf = pickle.load(f)

with open('best_knn.obj', 'rb') as f:
    knn = pickle.load(f)

with open('best_dt.obj', 'rb') as f:
    dt = pickle.load(f)

# Create a VotingClassifier
voting_clf = VotingClassifier(
    estimators=[('bag_clf', bag_clf), ('knn', knn), ('dt', dt)],
    voting='soft',
    n_jobs=-1,
    verbose=3
)

# Train the ensemble model
vot_clf = voting_clf.fit(X_train, y_train)

# # Evaluate the model
# scores = cross_val_score(voting_clf, X, y, cv=5, scoring='f1_micro', n_jobs=-1, verbose=3)
# print("F1 Score: ", scores.mean())
y_pred_vote = vot_clf.predict(X_val)
f1_score_vote = f1_score(y_val, y_pred_vote)
print(f1_score_vote)


[Voting] ...................... (2 of 3) Processing knn, total=   0.2s
[Voting] ....................... (3 of 3) Processing dt, total=   2.2s
[Voting] .................. (1 of 3) Processing bag_clf, total=  21.7s
0.8006134969325154


In [17]:
from sklearn.naive_bayes import GaussianNB
naive_bayes = GaussianNB()

nb = naive_bayes.fit(X_train, y_train)

y_pred_nb = nb.predict(X_val)

f1_score_nb = f1_score(y_val, y_pred_nb)

print(f1_score_nb)

0.33463796477495106


In [16]:
pred = test_df.drop(['is_fraud', 'Id'], axis=1)
pred2 = test_df.drop(['is_fraud'], axis=1)

pred2['is_fraud'] = bag_clf.predict(pred)
pred2.is_fraud = pred2.is_fraud.astype(int)
submission = pred2[['Id', 'is_fraud']]
submission.to_csv("./data/submission.csv", index=False)

In [14]:
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier

# Reuse the previously defined and trained models: log_reg, rf, svm

# Create a stacking ensemble
stacking_clf = StackingClassifier(
    estimators=[('bag_clf', bag_clf), ('knn', knn), ('dt', dt)],
    final_estimator=DecisionTreeClassifier(),
    n_jobs=-1,
    verbose=3
)

# Train the ensemble model
st_clf = stacking_clf.fit(X_train, y_train)

# Evaluate the model
y_pred_st = st_clf.predict(X_val)
f1_score_st = f1_score(y_val, y_pred_st)
print(f1_score_st)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.7s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.8s finished
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.8s remaining:    4.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.9s finished
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  2.9min remaining:  4.4min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.0min finished


0.7384615384615385
