In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from geopy.distance import geodesic
from sklearn.cluster import KMeans

# Calculate distance from home function
def calculate_distance(row):
    """Return the geodesic distance, in miles, between a row's two locations.

    The first point is read from ``row['lat']`` / ``row['long']`` and the
    second from ``row['merch_lat']`` / ``row['merch_long']``.
    """
    home_point = row['lat'], row['long']
    merchant_point = row['merch_lat'], row['merch_long']
    separation = geodesic(home_point, merchant_point)
    return separation.miles

# Function to calculate distance between two points
def calculate_distance2(row1, row2):
    """Return the geodesic distance, in miles, between two mappings.

    Each argument must provide ``'lat'`` and ``'long'`` entries.
    """
    first, second = (
        (entry['lat'], entry['long']) for entry in (row1, row2)
    )
    return geodesic(first, second).miles

def calculate_similarity_score(amount, fraud_mean, fraud_std, normal_mean, normal_std):
    """Score how close ``amount`` is to the fraud and to the normal distribution.

    For each distribution the absolute z-score of ``amount`` is mapped through
    ``1 / (1 + z)``, so a score of 1 means "exactly at the mean" and the score
    falls towards 0 as the amount moves away from it.

    Returns:
        A ``(fraud_similarity, normal_similarity)`` tuple.
    """
    def _closeness(center, spread):
        # Inverted absolute z-score relative to one distribution.
        return 1 / (1 + abs((amount - center) / spread))

    return _closeness(fraud_mean, fraud_std), _closeness(normal_mean, normal_std)

def process(df):
    """Engineer the fraud-detection features and label-encode categoricals.

    The frame must contain the raw columns read below:
    'trans_date_trans_time', 'dob', 'cc_num', 'lat', 'long', 'merch_lat',
    'merch_long', 'category', 'merchant', 'amt', 'city_pop', 'gender',
    'city', 'state', 'job' and 'is_fraud'.

    Columns added: 'Time_Delta', 'distance_to_prev', 'location_consistency',
    'hour', 'day_of_week', 'age', 'dist_to_home', 'amt_anomaly_score_cat',
    'amt_anomaly_score_merch', 'amt_relative_avg', 'city_pop_cluster',
    'fraud_rate_cat', 'fraud_rate_merch', 'fraud_similarity' and
    'normal_similarity'.

    Columns removed: 'trans_date_trans_time', 'lat', 'long', 'merch_lat',
    'merch_long', 'prev_lat', 'prev_long' and 'cc_num'.

    The input frame is copied first, so the caller's object is not modified.

    Returns:
        (df, mappings): the processed frame, with rows back in the order they
        arrived in, and a dict mapping each encoded categorical column name
        to its ``{label: integer code}`` table.
    """
    # Work on a copy: the in-place operations below would otherwise leave the
    # caller's frame sorted and carrying only some of the new columns.
    df = df.copy()

    # Remember the incoming row order; it is restored just before returning.
    df['original_order'] = range(df.shape[0])

    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], format='%d/%m/%Y %H:%M')
    df['dob'] = pd.to_datetime(df['dob'], format='%d/%m/%Y')

    # Per-card chronological order is required by the diff/shift features.
    df.sort_values(by=['cc_num', 'trans_date_trans_time'], inplace=True)

    # Minutes since the same card's previous transaction (0 for the first one).
    df['Time_Delta'] = df.groupby('cc_num')['trans_date_trans_time'].diff().dt.total_seconds() / 60.0
    df['Time_Delta'] = df['Time_Delta'].fillna(value=0)

    # Merchant location of the same card's previous transaction.
    df['prev_lat'] = df.groupby('cc_num')['merch_lat'].shift(1)
    df['prev_long'] = df.groupby('cc_num')['merch_long'].shift(1)

    # Miles between this merchant and the previous one; the first transaction
    # of each card has no predecessor and ends up as 0 after the fillna.
    df['distance_to_prev'] = df.apply(
        lambda row: calculate_distance2(
            {'lat': row['merch_lat'], 'long': row['merch_long']},
            {'lat': row['prev_lat'], 'long': row['prev_long']}
        ) if not pd.isnull(row['prev_lat']) else None,
        axis=1
    )
    df['distance_to_prev'] = df['distance_to_prev'].fillna(value=0)

    # Location consistency is the inverse of the card's mean hop distance
    # (higher means more consistent). A mean of 0 (e.g. a card with a single
    # transaction) would divide to +inf, which downstream scalers reject, so
    # it is recorded as NaN instead.
    mean_hop_distance = df.groupby('cc_num')['distance_to_prev'].transform('mean')
    df['location_consistency'] = (100 / mean_hop_distance).replace([np.inf, -np.inf], np.nan)

    # Time-based features
    df['hour'] = df['trans_date_trans_time'].dt.hour
    df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek

    # Age of the account holder at transaction time, in whole 365-day years.
    df['age'] = (df['trans_date_trans_time'] - df['dob']).dt.days // 365

    df['dist_to_home'] = df.apply(calculate_distance, axis=1)

    # Z-score of the amount within its category. The temporary 'mean'/'std'
    # columns are dropped immediately so the merchant merge can reuse the names.
    category_stats = df.groupby('category')['amt'].agg(['mean', 'std']).reset_index()
    df = df.merge(category_stats, on='category', how='left')
    df['amt_anomaly_score_cat'] = ((df['amt'] - df['mean']) / df['std'])
    df.drop(columns=['mean', 'std'], inplace=True)

    # Z-score of the amount within its merchant.
    merchant_stats = df.groupby('merchant')['amt'].agg(['mean', 'std']).reset_index()
    df = df.merge(merchant_stats, on='merchant', how='left')
    df['amt_anomaly_score_merch'] = ((df['amt'] - df['mean']) / df['std'])
    df.drop(columns=['mean', 'std'], inplace=True)

    # Relative deviation of the amount from the card's own average amount.
    # A zero average would produce +/-inf; store NaN for the same reason as
    # in 'location_consistency'.
    avg_amt_per_user = df.groupby('cc_num')['amt'].transform('mean')
    df['amt_relative_avg'] = (
        abs(df['amt'] - avg_amt_per_user) / avg_amt_per_user
    ).replace([np.inf, -np.inf], np.nan)

    # Bucket city population into 12 clusters.
    kmeans = KMeans(n_clusters=12, random_state=42)
    df['city_pop_cluster'] = kmeans.fit_predict(df[['city_pop']])

    df.drop(columns=['trans_date_trans_time', 'lat', 'long', 'merch_lat', 'merch_long'], inplace=True)
    df.drop(columns=['prev_lat', 'prev_long', 'cc_num'], inplace=True)

    # Label-encode the categorical columns and keep each label -> code table.
    categorical_cols = ['merchant', 'category', 'gender', 'city', 'state', 'job']
    mappings = {}
    label_encoder = LabelEncoder()
    for col in categorical_cols:
        # fit_transform refits the shared encoder, so classes_ is per-column.
        df[col] = label_encoder.fit_transform(df[col])
        mappings[col] = {label: index for index, label in enumerate(label_encoder.classes_)}

    # NOTE(review): the fraud-rate and similarity features below are computed
    # from 'is_fraud' over every row passed in, so each labelled row
    # contributes its own label to its features - confirm this target leakage
    # is acceptable for the validation scores reported later.

    # Fraud rate by category
    fraud_rate_by_category = df.groupby('category')['is_fraud'].mean().reset_index()
    fraud_rate_by_category.rename(columns={'is_fraud': 'fraud_rate_cat'}, inplace=True)
    df = pd.merge(df, fraud_rate_by_category[['category', 'fraud_rate_cat']], on='category', how='left')

    # Fraud rate by merchant
    fraud_rate_by_merchant = df.groupby('merchant')['is_fraud'].mean().reset_index()
    fraud_rate_by_merchant.rename(columns={'is_fraud': 'fraud_rate_merch'}, inplace=True)
    df = pd.merge(df, fraud_rate_by_merchant[['merchant', 'fraud_rate_merch']], on='merchant', how='left')

    # Amount statistics of the fraudulent and of the normal transactions.
    fraud_trans = df[df['is_fraud'] == 1]['amt']
    normal_trans = df[df['is_fraud'] == 0]['amt']
    fraud_mean, fraud_std = fraud_trans.mean(), fraud_trans.std()
    normal_mean, normal_std = normal_trans.mean(), normal_trans.std()

    # The scoring function is plain arithmetic, so it is applied to the whole
    # column at once rather than through a per-row np.vectorize loop.
    fraud_similarity, normal_similarity = calculate_similarity_score(
        df['amt'],
        fraud_mean, fraud_std,
        normal_mean, normal_std
    )
    df['fraud_similarity'] = fraud_similarity
    df['normal_similarity'] = normal_similarity

    # Sort the dataset back to its original order
    df.sort_values(by='original_order', inplace=True)
    df.drop(columns='original_order', inplace=True)

    return df, mappings

# Load the raw CSV files.
trainingSet = pd.read_csv("./data/train.csv")
submissionSet = pd.read_csv("./data/test.csv")

# Engineer features, then discard the raw columns that are not used as features.
train_processed, cat_map = process(trainingSet)
train_processed.drop(
    columns=['first', 'last', 'street', 'dob', 'zip', 'trans_num', 'unix_time'],
    inplace=True,
)

# Join on Id so the submission rows pick up the engineered feature columns;
# the label column that survives is the one coming from the submission file.
test_df = (
    pd.merge(train_processed, submissionSet, left_on='Id', right_on='Id')
    .drop(columns=['is_fraud_x'])
    .rename(columns={'is_fraud_y': 'is_fraud'})
)

# Training rows are the ones that carry a label.
has_label = train_processed['is_fraud'].notnull()
train_df = train_processed[has_label]

# Persist both feature sets for easy access later.
test_df.to_csv("./data/processed_test.csv", index=False)
train_df.to_csv("./data/processed_train.csv", index=False)

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# 'train_df' holds the features together with the target ('is_fraud').
# The row id and the population cluster label are not used as features.
X = train_df.drop(columns=['is_fraud', 'Id', 'city_pop_cluster'])
y = train_df['is_fraud']

# Continuous columns that get standardised.
num_cols = [
    'amt',
    'Time_Delta',
    'distance_to_prev',
    'location_consistency',
    'dist_to_home',
    'amt_anomaly_score_cat',
    'amt_anomaly_score_merch',
    'amt_relative_avg',
    'fraud_rate_cat',
    'fraud_rate_merch',
    'fraud_similarity',
    'normal_similarity',
]

# Fit the scaler on the labelled rows and reuse it for the submission rows.
scaler = StandardScaler()
scaler.fit(X[num_cols])
X[num_cols] = scaler.transform(X[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

# Stratified 80/20 split into training and validation sets.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [3]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, make_scorer
# Base XGBoost classifier whose hyperparameters are tuned below.
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Hyperparameter grid: 5 * 4 * 3 * 4 = 240 candidates.
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'learning_rate': [0.1, 0.15, 0.2, 0.3],
    'n_estimators': [300, 500, 900],
    'colsample_bytree': [0.1, 0.3, 0.7, 0.9]
}

# 5-fold grid search optimising the F1 score of the positive (fraud) class.
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, scoring='f1', cv=5, n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)
clf = grid_search.best_estimator_

# Report the winning parameters, their mean cross-validated F1, and -
# separately - the F1 of the refit best estimator on the hold-out split.
# (Previously the hold-out score was printed under the "average" label.)
print("Best parameters found: ", grid_search.best_params_)
print("Best average F1 score found: ", grid_search.best_score_)
print("Validation F1 score of best estimator: ", f1_score(y_val, clf.predict(X_val)))

# Refit with the optimal parameters on all labelled rows (training and
# validation together), using the same seed as the searched estimator.
optimal_clf = XGBClassifier(**grid_search.best_params_, use_label_encoder=False, eval_metric='logloss', random_state=42)
optimal_clf.fit(X, y)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
[CV 3/5] END colsample_bytree=0.1, learning_rate=0.1, max_depth=3, n_estimators=300;, score=0.772 total time=   4.3s
[CV 2/5] END colsample_bytree=0.1, learning_rate=0.1, max_depth=3, n_estimators=300;, score=0.784 total time=   4.4s
[CV 4/5] END colsample_bytree=0.1, learning_rate=0.1, max_depth=3, n_estimators=300;, score=0.793 total time=   4.4s
[CV 5/5] END colsample_bytree=0.1, learning_rate=0.1, max_depth=3, n_estimators=300;, score=0.774 total time=   4.4s
[CV 1/5] END colsample_bytree=0.1, learning_rate=0.1, max_depth=3, n_estimators=300;, score=0.751 total time=   4.6s
[CV 1/5] END colsample_bytree=0.1, learning_rate=0.1, max_depth=3, n_estimators=500;, score=0.783 total time=   6.8s
[CV 5/5] END colsample_bytree=0.1, learning_rate=0.1, max_depth=3, n_estimators=500;, score=0.802 total time=   6.8s
[CV 3/5] END colsample_bytree=0.1, learning_rate=0.1, max_depth=3, n_estimators=500;, score=0.780 total time=   6.8s




[CV 1/5] END colsample_bytree=0.1, learning_rate=0.1, max_depth=5, n_estimators=900;, score=0.835 total time=  16.4s
[CV 2/5] END colsample_bytree=0.1, learning_rate=0.1, max_depth=5, n_estimators=900;, score=0.849 total time=  16.5s
[CV 4/5] END colsample_bytree=0.1, learning_rate=0.1, max_depth=5, n_estimators=900;, score=0.846 total time=  16.6s
[CV 5/5] END colsample_bytree=0.1, learning_rate=0.1, max_depth=5, n_estimators=900;, score=0.846 total time=  16.7s
[CV 3/5] END colsample_bytree=0.1, learning_rate=0.1, max_depth=5, n_estimators=900;, score=0.825 total time=  16.8s
[CV 1/5] END colsample_bytree=0.1, learning_rate=0.1, max_depth=7, n_estimators=500;, score=0.822 total time=  12.4s
[CV 2/5] END colsample_bytree=0.1, learning_rate=0.1, max_depth=7, n_estimators=500;, score=0.834 total time=  12.2s
[CV 3/5] END colsample_bytree=0.1, learning_rate=0.1, max_depth=7, n_estimators=500;, score=0.800 total time=  12.3s
[CV 4/5] END colsample_bytree=0.1, learning_rate=0.1, max_depth=

In [5]:
import pickle

# Serialise the refit XGBoost model to disk.
with open('model_final_xgb.obj', mode='wb') as model_file:
    pickle.dump(optimal_clf, model_file)

In [6]:
# Build the Kaggle submission file.
# 'pred' is the model input (same columns dropped as for X);
# 'pred2' keeps 'Id' so predictions can be written next to it.
pred = test_df.drop(columns=['is_fraud', 'city_pop_cluster', 'Id'])
pred2 = test_df.drop(columns=['is_fraud'])

predicted_labels = optimal_clf.predict(pred)
pred2['is_fraud'] = predicted_labels
pred2['is_fraud'] = pred2['is_fraud'].astype(int)

submission = pred2[['Id', 'is_fraud']]
submission.to_csv("./data/submission6.csv", index=False)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV
import pickle
from xgboost import XGBClassifier
import numpy as np
from sklearn.ensemble import BaggingClassifier
# Create a VotingClassifier
from sklearn.ensemble import VotingClassifier

# Two class-weighted random forests that differ in size and leaf constraint.
rf = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=4,
    class_weight='balanced',
    n_jobs=-1,
    verbose=3,
)

rf2 = RandomForestClassifier(
    n_estimators=900,
    criterion='entropy',
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=1,
    class_weight='balanced',
    n_jobs=-1,
    verbose=3,
)

# Fit both forests on the training split before scoring either of them.
for forest in (rf, rf2):
    forest.fit(X_train, y_train)

# Validation F1 and full report for the 300-tree forest.
y_pred_rf = rf.predict(X_val)
print(f1_score(y_val, y_pred_rf))
print(classification_report(y_val, y_pred_rf))

# Validation F1 and full report for the 900-tree forest.
y_pred_rf2 = rf2.predict(X_val)
print(f1_score(y_val, y_pred_rf2))
print(classification_report(y_val, y_pred_rf2))


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.


building tree 1 of 300
building tree 2 of 300
building tree 3 of 300
building tree 4 of 300
building tree 5 of 300
building tree 6 of 300
building tree 7 of 300
building tree 8 of 300
building tree 9 of 300
building tree 10 of 300
building tree 11 of 300
building tree 12 of 300
building tree 13 of 300
building tree 14 of 300
building tree 15 of 300
building tree 16 of 300
building tree 17 of 300
building tree 18 of 300
building tree 19 of 300
building tree 20 of 300
building tree 21 of 300
building tree 22 of 300


[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    1.2s


building tree 23 of 300
building tree 24 of 300
building tree 25 of 300
building tree 26 of 300
building tree 27 of 300
building tree 28 of 300
building tree 29 of 300building tree 30 of 300

building tree 31 of 300
building tree 32 of 300
building tree 33 of 300
building tree 34 of 300
building tree 35 of 300
building tree 36 of 300
building tree 37 of 300
building tree 38 of 300
building tree 39 of 300
building tree 40 of 300
building tree 41 of 300
building tree 42 of 300
building tree 43 of 300
building tree 44 of 300
building tree 45 of 300
building tree 46 of 300building tree 47 of 300

building tree 48 of 300
building tree 49 of 300
building tree 50 of 300
building tree 51 of 300
building tree 52 of 300
building tree 53 of 300
building tree 54 of 300
building tree 55 of 300
building tree 56 of 300
building tree 57 of 300
building tree 58 of 300
building tree 59 of 300
building tree 60 of 300
building tree 61 of 300
building tree 62 of 300
building tree 63 of 300
building tree 64

[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:    8.0s


building tree 120 of 300
building tree 121 of 300
building tree 122 of 300
building tree 123 of 300
building tree 124 of 300
building tree 125 of 300
building tree 126 of 300
building tree 127 of 300
building tree 128 of 300
building tree 129 of 300
building tree 130 of 300
building tree 131 of 300
building tree 132 of 300
building tree 133 of 300
building tree 134 of 300
building tree 135 of 300
building tree 136 of 300
building tree 137 of 300
building tree 138 of 300
building tree 139 of 300
building tree 140 of 300
building tree 141 of 300
building tree 142 of 300
building tree 143 of 300
building tree 144 of 300
building tree 145 of 300
building tree 146 of 300
building tree 147 of 300
building tree 148 of 300
building tree 149 of 300
building tree 150 of 300
building tree 151 of 300
building tree 152 of 300
building tree 153 of 300
building tree 154 of 300
building tree 155 of 300
building tree 156 of 300
building tree 157 of 300
building tree 158 of 300
building tree 159 of 300


[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:   18.9s


building tree 280 of 300
building tree 281 of 300
building tree 282 of 300
building tree 283 of 300
building tree 284 of 300
building tree 285 of 300
building tree 286 of 300
building tree 287 of 300
building tree 288 of 300
building tree 289 of 300
building tree 290 of 300
building tree 291 of 300
building tree 292 of 300
building tree 293 of 300
building tree 294 of 300
building tree 295 of 300
building tree 296 of 300
building tree 297 of 300
building tree 298 of 300
building tree 299 of 300
building tree 300 of 300


[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   21.1s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.


building tree 1 of 900
building tree 2 of 900
building tree 3 of 900
building tree 4 of 900
building tree 5 of 900
building tree 6 of 900
building tree 7 of 900
building tree 8 of 900
building tree 9 of 900
building tree 10 of 900
building tree 11 of 900
building tree 12 of 900
building tree 13 of 900
building tree 14 of 900
building tree 15 of 900
building tree 16 of 900
building tree 17 of 900
building tree 18 of 900
building tree 19 of 900
building tree 20 of 900
building tree 21 of 900
building tree 22 of 900
building tree 23 of 900
building tree 24 of 900
building tree 25 of 900
building tree 26 of 900
building tree 27 of 900


[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    1.2s


building tree 28 of 900
building tree 29 of 900
building tree 30 of 900
building tree 31 of 900
building tree 32 of 900
building tree 33 of 900
building tree 34 of 900
building tree 35 of 900
building tree 36 of 900
building tree 37 of 900
building tree 38 of 900
building tree 39 of 900
building tree 40 of 900
building tree 41 of 900
building tree 42 of 900
building tree 43 of 900
building tree 44 of 900
building tree 45 of 900
building tree 46 of 900
building tree 47 of 900
building tree 48 of 900
building tree 49 of 900
building tree 50 of 900
building tree 51 of 900
building tree 52 of 900
building tree 53 of 900
building tree 54 of 900
building tree 55 of 900
building tree 56 of 900
building tree 57 of 900
building tree 58 of 900
building tree 59 of 900
building tree 60 of 900
building tree 61 of 900
building tree 62 of 900
building tree 63 of 900
building tree 64 of 900
building tree 65 of 900
building tree 66 of 900
building tree 67 of 900
building tree 68 of 900
building tree 69

[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:    7.6s


building tree 122 of 900
building tree 123 of 900
building tree 124 of 900
building tree 125 of 900
building tree 126 of 900
building tree 127 of 900
building tree 128 of 900
building tree 129 of 900
building tree 130 of 900
building tree 131 of 900
building tree 132 of 900
building tree 133 of 900
building tree 134 of 900
building tree 135 of 900
building tree 136 of 900
building tree 137 of 900
building tree 138 of 900
building tree 139 of 900
building tree 140 of 900
building tree 141 of 900
building tree 142 of 900
building tree 143 of 900
building tree 144 of 900
building tree 145 of 900
building tree 146 of 900
building tree 147 of 900
building tree 148 of 900
building tree 149 of 900
building tree 150 of 900
building tree 151 of 900
building tree 152 of 900
building tree 153 of 900
building tree 154 of 900
building tree 155 of 900
building tree 156 of 900
building tree 157 of 900
building tree 158 of 900
building tree 159 of 900
building tree 160 of 900
building tree 161 of 900


[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:   18.8s


building tree 282 of 900
building tree 283 of 900
building tree 284 of 900
building tree 285 of 900
building tree 286 of 900
building tree 287 of 900
building tree 288 of 900
building tree 289 of 900
building tree 290 of 900
building tree 291 of 900
building tree 292 of 900
building tree 293 of 900
building tree 294 of 900
building tree 295 of 900
building tree 296 of 900
building tree 297 of 900
building tree 298 of 900
building tree 299 of 900
building tree 300 of 900
building tree 301 of 900
building tree 302 of 900
building tree 303 of 900
building tree 304 of 900
building tree 305 of 900
building tree 306 of 900
building tree 307 of 900
building tree 308 of 900
building tree 309 of 900
building tree 310 of 900
building tree 311 of 900
building tree 312 of 900
building tree 313 of 900
building tree 314 of 900
building tree 315 of 900
building tree 316 of 900
building tree 317 of 900
building tree 318 of 900
building tree 319 of 900
building tree 320 of 900
building tree 321 of 900


[Parallel(n_jobs=-1)]: Done 492 tasks      | elapsed:   34.3s


building tree 505 of 900
building tree 506 of 900
building tree 507 of 900
building tree 508 of 900
building tree 509 of 900
building tree 510 of 900
building tree 511 of 900
building tree 512 of 900
building tree 513 of 900
building tree 514 of 900
building tree 515 of 900
building tree 516 of 900
building tree 517 of 900
building tree 518 of 900
building tree 519 of 900
building tree 520 of 900
building tree 521 of 900
building tree 522 of 900
building tree 523 of 900
building tree 524 of 900
building tree 525 of 900
building tree 526 of 900
building tree 527 of 900
building tree 528 of 900
building tree 529 of 900
building tree 530 of 900
building tree 531 of 900
building tree 532 of 900
building tree 533 of 900
building tree 534 of 900
building tree 535 of 900
building tree 536 of 900
building tree 537 of 900
building tree 538 of 900
building tree 539 of 900
building tree 540 of 900
building tree 541 of 900
building tree 542 of 900
building tree 543 of 900
building tree 544 of 900


[Parallel(n_jobs=-1)]: Done 780 tasks      | elapsed:   54.4s


building tree 791 of 900
building tree 792 of 900
building tree 793 of 900
building tree 794 of 900
building tree 795 of 900
building tree 796 of 900
building tree 797 of 900
building tree 798 of 900
building tree 799 of 900
building tree 800 of 900
building tree 801 of 900
building tree 802 of 900
building tree 803 of 900
building tree 804 of 900
building tree 805 of 900
building tree 806 of 900
building tree 807 of 900
building tree 808 of 900
building tree 809 of 900
building tree 810 of 900
building tree 811 of 900
building tree 812 of 900
building tree 813 of 900
building tree 814 of 900
building tree 815 of 900
building tree 816 of 900
building tree 817 of 900
building tree 818 of 900
building tree 819 of 900
building tree 820 of 900
building tree 821 of 900
building tree 822 of 900
building tree 823 of 900
building tree 824 of 900
building tree 825 of 900
building tree 826 of 900
building tree 827 of 900
building tree 828 of 900
building tree 829 of 900
building tree 830 of 900


[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  1.0min finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 108 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 268 tasks      | elapsed:    0.2s
[Parallel(n_jobs=10)]: Done 300 out of 300 | elapsed:    0.2s finished


0.8097982708933718
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     96876
         1.0       0.88      0.75      0.81       375

    accuracy                           1.00     97251
   macro avg       0.94      0.87      0.90     97251
weighted avg       1.00      1.00      1.00     97251



[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 108 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 268 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 492 tasks      | elapsed:    0.2s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:    0.4s
[Parallel(n_jobs=10)]: Done 900 out of 900 | elapsed:    0.4s finished


0.8055130168453293
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     96876
         1.0       0.95      0.70      0.81       375

    accuracy                           1.00     97251
   macro avg       0.97      0.85      0.90     97251
weighted avg       1.00      1.00      1.00     97251



In [None]:
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Bag 500 copies of the XGBoost classifier defined in an earlier cell.
bagging_clf = BaggingClassifier(
    estimator=xgb_clf,
    n_estimators=500,  # Number of base estimators in the ensemble
    n_jobs=-1,  # Use parallel processing
    random_state=42,
    verbose=3,
)

# Fit on the training split and report the validation F1 score.
bagging_clf.fit(X_train, y_train)
y_pred_bag = bagging_clf.predict(X_val)
print(f1_score(y_val, y_pred_bag))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import pickle

# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(criterion='entropy', random_state=42)

# Parameters for GridSearchCV (4 * 4 * 4 = 64 candidates)
param_grid = {
    'max_depth': [7, 10, 20, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8]
}

# GridSearchCV
# Select on the F1 of the positive (fraud) class. Micro-averaged F1 equals
# plain accuracy for a binary target, which the majority class dominates,
# and it would not match the binary f1_score used for validation below.
grid_search = GridSearchCV(dt_classifier, param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

# Best parameters found by the search
best_params_dt = grid_search.best_params_

# Validation: score the refit best tree on the hold-out split
best_dt = grid_search.best_estimator_

y_pred_dt = best_dt.predict(X_val)

val_f1_dt = f1_score(y_val, y_pred_dt)

# Displayed as the cell's result
(best_params_dt, val_f1_dt)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV
import pickle
from xgboost import XGBClassifier
import numpy as np
from sklearn.ensemble import BaggingClassifier
# Create a VotingClassifier
from sklearn.ensemble import VotingClassifier


# Base learners: two class-weighted random forests and one XGBoost model.
rf = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=4,
    class_weight='balanced',
)

rf2 = RandomForestClassifier(
    n_estimators=900,
    criterion='entropy',
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=1,
    class_weight='balanced',
)

xgb_clf = XGBClassifier(
    n_estimators=300,
    max_depth=7,
    learning_rate=0.1,
    colsample_bytree=0.7,
    use_label_encoder=False,
    eval_metric='logloss',
)

# Bagged variants of the XGBoost model and of the first forest,
# each built from 200 base estimators in parallel.
bagging_xgb_clf = BaggingClassifier(estimator=xgb_clf, n_estimators=200, n_jobs=-1)
bagging_rf_clf = BaggingClassifier(estimator=rf, n_estimators=200, n_jobs=-1)

# Soft-voting ensemble over all five learners.
ensemble_members = [
    ('rf', rf),
    ('rf2', rf2),
    ('xgb', xgb_clf),
    ('bag_rf', bagging_rf_clf),
    ('bag_xgb', bagging_xgb_clf),
]
voting_clf = VotingClassifier(
    estimators=ensemble_members,
    voting='soft',
    n_jobs=-1,
    verbose=3,
)

# Train the ensemble on the training split (fit returns the fitted estimator).
vot_clf = voting_clf.fit(X_train, y_train)

# Score on the validation split. A 5-fold cross_val_score over (X, y) was
# tried here previously and is currently disabled.
y_pred_vote = vot_clf.predict(X_val)
f1_score_vote = f1_score(y_val, y_pred_vote)
print(f1_score_vote)

# Persist the fitted ensemble.
with open('best_vote_model.obj', mode='wb') as model_file:
    pickle.dump(vot_clf, model_file)