In [44]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from geopy.distance import geodesic
from sklearn.cluster import KMeans

# Calculate distance from home function
def calculate_distance(row):
    home_location = (row['lat'], row['long'])
    merch_location = (row['merch_lat'], row['merch_long'])
    return geodesic(home_location, merch_location).miles

# Function to calculate distance between two points
def calculate_distance2(row1, row2):
    point1 = (row1['lat'], row1['long'])
    point2 = (row2['lat'], row2['long'])
    return geodesic(point1, point2).miles

def calculate_similarity_score(amount, fraud_mean, fraud_std, normal_mean, normal_std):
    # Calculate Z-scores for fraud and normal
    z_score_fraud = abs((amount - fraud_mean) / fraud_std)
    z_score_normal = abs((amount - normal_mean) / normal_std)
    
    # Invert the Z-scores to get similarity scores
    fraud_similarity = 1 / (1 + z_score_fraud)
    normal_similarity = 1 / (1 + z_score_normal)
    
    return fraud_similarity, normal_similarity

def process(df):
    # Add new features
    # Rearrange the rows
    df['original_order'] = range(df.shape[0])

    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], format='%d/%m/%Y %H:%M')
    df['dob'] = pd.to_datetime(df['dob'], format='%d/%m/%Y')

    df.sort_values(by=['cc_num', 'trans_date_trans_time'], inplace=True)
    
    # Calculate the time difference between transactions
    df['Time_Delta'] = df.groupby('cc_num')['trans_date_trans_time'].diff().dt.total_seconds() / 60.0  # Time delta in minutes
    df['Time_Delta'] = df['Time_Delta'].fillna(value=0)

    # Calculate the rolling count of transactions for each card
    #df['timestamp'] = pd.to_datetime(df['trans_date_trans_time'], format='%d/%m/%Y %H:%M')
    # df.set_index('timestamp', inplace=True)
    # df['rolling_trans_freq'] = df.groupby('cc_num')['trans_date_trans_time'].rolling(window='24h').count().reset_index(0, drop=True)
    # df['rolling_trans_freq'] = df['rolling_trans_freq'].fillna(value=0)
    # df.reset_index(inplace=True)
    
    # Shift the latitude and longitude to get the previous transaction's location
    df['prev_lat'] = df.groupby('cc_num')['merch_lat'].shift(1)
    df['prev_long'] = df.groupby('cc_num')['merch_long'].shift(1)

    # Calculate the distance to the previous transaction
    df['distance_to_prev'] = df.apply(
        lambda row: calculate_distance2(
            {'lat': row['merch_lat'], 'long': row['merch_long']},
            {'lat': row['prev_lat'], 'long': row['prev_long']}
        ) if not pd.isnull(row['prev_lat']) else None,
        axis=1
    )
    
    df['distance_to_prev'] = df['distance_to_prev'].fillna(value=0)

    # Calculate location consistency as the inverse of the average distance to previous transactions (higher value means more consistency)
    df['location_consistency'] = 100 / df.groupby('cc_num')['distance_to_prev'].transform('mean')

    # Time-based features
    df['hour'] = df['trans_date_trans_time'].dt.hour
    df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
    df['month'] = df['trans_date_trans_time'].dt.month
    df['day_of_month'] = df['trans_date_trans_time'].dt.day
   
    # Age of the account holder
    df['age'] = (df['trans_date_trans_time'] - df['dob']).dt.days // 365

    df['dist_to_home'] = df.apply(calculate_distance, axis=1)

    # Simple approach: Mark transactions as recurring if the same user has made transactions with the same merchant at least 3 times
    recurring_trans = df.groupby(['cc_num', 'merchant']).filter(lambda x: len(x) >= 3)
    recurring_trans_ids = recurring_trans['Id'].unique()
    df['recurring_trans_indicator'] = df['Id'].apply(lambda x: 1 if x in recurring_trans_ids else 0)

    # Group by category and calculate the mean and standard deviation of transaction amounts
    category_stats = df.groupby('category')['amt'].agg(['mean', 'std']).reset_index()
    # Merge these stats back into the main dataframe
    df = df.merge(category_stats, on='category', how='left')
    # Calculate z-score for each transaction amount within its category
    df['amt_anomaly_score_cat'] = ((df['amt'] - df['mean']) / df['std'])
    df.drop(columns=['mean', 'std'], inplace=True)
    # Group by merchant and calculate the mean and standard deviation of transaction amounts
    merchant_stats = df.groupby('merchant')['amt'].agg(['mean', 'std']).reset_index()
    # Merge these stats back into the main dataframe
    df = df.merge(merchant_stats, on='merchant', how='left')
    # Calculate z-score for each transaction amount within its merchant
    df['amt_anomaly_score_merch'] = ((df['amt'] - df['mean']) / df['std'])
    df.drop(columns=['mean', 'std'], inplace=True)

    # Calculate the historical average transaction amount for each user
    avg_amt_per_user = df.groupby('cc_num')['amt'].transform('mean').rename('avg_amt_per_user')

    # Append this feature to the dataset
    df['amt_relative_avg'] = (abs(df['amt'] - avg_amt_per_user) / avg_amt_per_user)

    user_avg_amt = df.groupby('cc_num')['amt'].mean().reset_index(name='Avg_Amt')

    df = df.merge(user_avg_amt, on='cc_num')

    kmeans = KMeans(n_clusters=12, random_state=42)

    # Create a new column for the cluster labels
    df['city_pop_cluster'] = kmeans.fit_predict(df[['city_pop']])

    # Transaction frequency per card number
    freq_per_cc = df.groupby('cc_num').size().reset_index().rename(columns={0: 'freq_per_cc'})
    df = df.merge(freq_per_cc, on='cc_num', how='left')

    # Calculate the total number of transactions per merchant per card
    merchant_freq = df.groupby(['cc_num', 'merchant']).size().reset_index(name='merchant_trans_count')
    # Merge this back into the main dataframe
    df = df.merge(merchant_freq, on=['cc_num', 'merchant'], how='left')
    df['merchant_trans_count'] = df['merchant_trans_count'].fillna(value=0)

    df.drop(columns=['trans_date_trans_time', 'lat', 'long', 'merch_lat', 'merch_long'], inplace=True)
    df.drop(columns=['prev_lat', 'prev_long'], inplace=True)

    # Calculate the fraud rate by category
    fraud_rate_by_category = df.groupby('category')['is_fraud'].mean().reset_index()
    fraud_rate_by_category.rename(columns={'is_fraud': 'fraud_rate_cat'}, inplace=True)

    # Merge the fraud rate back into the main DataFrame
    df = pd.merge(df, fraud_rate_by_category[['category', 'fraud_rate_cat']], on='category', how='left')

    # Calculate the fraud rate by merchant
    fraud_rate_by_merchant = df.groupby('merchant')['is_fraud'].mean().reset_index()
    fraud_rate_by_merchant.rename(columns={'is_fraud': 'fraud_rate_merch'}, inplace=True)

    # Merge the fraud rate back into the main DataFrame
    df = pd.merge(df, fraud_rate_by_merchant[['merchant', 'fraud_rate_merch']], on='merchant', how='left')

    # Separate the transactions
    fraud_trans = df[df['is_fraud'] == 1]['amt']
    normal_trans = df[df['is_fraud'] == 0]['amt']

    # Calculate statistics
    fraud_mean, fraud_std = fraud_trans.mean(), fraud_trans.std()
    normal_mean, normal_std = normal_trans.mean(), normal_trans.std()

    v_calculate_similarity_score = np.vectorize(calculate_similarity_score)

    # Apply the function
    df['fraud_similarity'], df['normal_similarity'] = v_calculate_similarity_score(
        df['amt'],
        fraud_mean, fraud_std,
        normal_mean, normal_std
    )

    # Identify categorical columns to encode
    categorical_cols = ['merchant', 'category', 'gender', 'city', 'state', 'job', 'cc_num']

    mappings = {}

    label_encoder = LabelEncoder()
    for col in categorical_cols:
        df[col] = label_encoder.fit_transform(df[col])
        mappings[col] = {label: index for index, label in enumerate(label_encoder.classes_)}

    # Sort the dataset back to its original order
    df.sort_values(by='original_order', inplace=True)
    df.drop(columns='original_order', inplace=True)

    return df, mappings

trainingSet = pd.read_csv("../data/train.csv")
submissionSet = pd.read_csv("../data/test.csv")
train_processed, cat_map = process(trainingSet)
train_processed.drop(columns=['first', 'last', 'street', 'dob', 'zip', 'trans_num', 'unix_time'], inplace=True)

# Merge on Id so that the test set can have feature columns as well
test_df = pd.merge(train_processed, submissionSet, left_on='Id', right_on='Id')
test_df = test_df.drop(columns=['is_fraud_x'])
test_df = test_df.rename(columns={'is_fraud_y': 'is_fraud'})

# The training set is where the score is not null
train_df = train_processed[train_processed['is_fraud'].notnull()]

# Save the datasets with the new features for easy access later
test_df.to_csv("./processed_test3.csv", index=False)
train_df.to_csv("./processed_train3.csv", index=False)

In [52]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Assuming 'train_df' includes both features and the target ('is_fraud')
#X = train_df.drop(['is_fraud', 'Id', 'city_pop_cluster', 'job', 'city', 'state'], axis=1)
X = train_df.drop(['is_fraud', 'Id', 'city_pop_cluster', 'cc_num', 'recurring_trans_indicator'], axis=1)
y = train_df['is_fraud']
X_test = test_df.drop(columns=['city_pop_cluster', 'cc_num', 'recurring_trans_indicator'])
# num_cols = ['amt', 'Time_Delta', 'distance_to_prev', 'location_consistency', 'dist_to_home', 'amt_anomaly_score_cat', 'amt_anomaly_score_merch', 'amt_relative_avg', 'fraud_rate_cat', 'fraud_rate_merch', 'fraud_similarity', 'normal_similarity', 'Avg_Amt', 'age', 'freq_per_cc', 'merchant_trans_count', 'city_pop']
num_cols = ['amt', 'Time_Delta', 'distance_to_prev', 'location_consistency', 'dist_to_home', 'amt_anomaly_score_cat', 'amt_anomaly_score_merch', 'amt_relative_avg', 'fraud_rate_cat', 'fraud_rate_merch', 'fraud_similarity', 'normal_similarity', 'Avg_Amt']

scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

X_num = X[num_cols]
test_num = test_df[num_cols]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

X_train_num, X_val_num, y_train_num, y_val_num = train_test_split(X_num, y, test_size=0.2, stratify=y, random_state=30)

In [46]:
print(X.head())

        cc_num  merchant  category        amt  gender  city  state  city_pop  \
459661     750        43         2   0.018542       1   380     17     68211   
517764     844       153         2  -0.208745       1   354      5       648   
236842     385         9        13  -0.432193       1   684     42   1595797   
227585     368       572         4   0.458846       0   385      9      1293   
285211     459       189        11  10.576690       0   102     39      1725   

        job  Time_Delta  ...  dist_to_home  amt_anomaly_score_cat  \
459661  163   -0.232477  ...     -0.876110               0.545540   
517764   92   -0.542272  ...      0.968382              -1.702511   
236842  248   -0.531537  ...      0.843043              -0.184942   
227585  296    1.178471  ...     -0.216913               0.488897   
285211  410   -0.382774  ...      0.567199               6.921294   

        amt_anomaly_score_merch  amt_relative_avg   Avg_Amt  \
459661                 0.515202         -

In [4]:
print(X_num.head())

              amt  Time_Delta  distance_to_prev  location_consistency  \
459661   0.018542   -0.232477          0.154200             -0.009151   
517764  -0.208745   -0.542272          0.061411              0.297805   
236842  -0.432193   -0.531537         -0.029324             -0.098808   
227585   0.458846    1.178471         -0.362111             -0.173969   
285211  10.576690   -0.382774          0.069489             -0.058705   

        dist_to_home  amt_anomaly_score_cat  amt_anomaly_score_merch  \
459661     -0.876110               0.545540                 0.515202   
517764      0.968382              -1.702511                -1.644960   
236842      0.843043              -0.184942                -0.222472   
227585     -0.216913               0.488897                 0.528154   
285211      0.567199               6.921294                 8.076743   

        amt_relative_avg  fraud_rate_cat  fraud_rate_merch  fraud_similarity  \
459661         -0.302675       -0.315181        

In [7]:
import pickle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

knn = KNeighborsClassifier(metric='manhattan', weights='distance', n_neighbors=10, n_jobs=-1)

knn.fit(X_train_num, y_train_num)

# with open('best_knn.obj', 'wb') as f:
#     pickle.dump(best_knn, f)
y_pred_knn = knn.predict(X_val_num)
f1_score_knn = f1_score(y_val_num, y_pred_knn)
print(f1_score_knn)

0.7275494672754946


In [22]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, make_scorer
# Initialize the XGBClassifier
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Define the parameter grid
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'learning_rate': [0.05, 0.1, 0.15, 0.2],
    'n_estimators': [300, 500, 900],
    'colsample_bytree': [0.3, 0.7, 0.9]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='f1_micro', cv=5, n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)
clf = grid_search.best_estimator_
# Best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best average F1 score found: ", f1_score(y_val, clf.predict(X_val)))

# Train the model with the optimal parameters
optimal_clf = XGBClassifier(**grid_search.best_params_, use_label_encoder=False, eval_metric='logloss')
optimal_clf.fit(X, y)

Fitting 5 folds for each of 180 candidates, totalling 900 fits
[CV 1/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=300;, score=0.999 total time=   4.2s
[CV 2/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=300;, score=0.999 total time=   4.2s
[CV 4/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=300;, score=0.999 total time=   4.2s
[CV 5/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=300;, score=0.999 total time=   4.3s
[CV 3/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=300;, score=0.999 total time=   4.4s
[CV 1/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=5, n_estimators=300;, score=0.999 total time=   5.6s
[CV 3/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=500;, score=0.999 total time=   6.7s
[CV 2/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=500;, score=0.999 total time= 



[CV 1/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=7, n_estimators=900;, score=0.999 total time=  29.6s
[CV 2/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=7, n_estimators=900;, score=0.999 total time=  28.9s
[CV 5/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=7, n_estimators=900;, score=0.999 total time=  27.8s
[CV 3/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=7, n_estimators=900;, score=0.999 total time=  28.6s
[CV 4/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=7, n_estimators=900;, score=0.999 total time=  28.6s
[CV 1/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=10, n_estimators=500;, score=0.999 total time=  19.6s
[CV 3/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=10, n_estimators=500;, score=0.999 total time=  19.5s
[CV 2/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=10, n_estimators=500;, score=0.999 total time=  19.8s
[CV 1/5] END colsample_bytree=0.3, learning_rate=0.05

In [53]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, make_scorer
# Initialize the XGBClassifier
xgb_clf = XGBClassifier(colsample_bytree=0.9, learning_rate=0.1, max_depth=5, n_estimators=900, use_label_encoder=False, eval_metric='logloss')

xgb_clf.fit(X_train, y_train)

y_pred_xgb = xgb_clf.predict(X_val)
print(f1_score(y_val, y_pred_xgb))
print(classification_report(y_val, y_pred_xgb))

0.9028571428571428
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     96876
         1.0       0.97      0.84      0.90       375

    accuracy                           1.00     97251
   macro avg       0.99      0.92      0.95     97251
weighted avg       1.00      1.00      1.00     97251



In [48]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, make_scorer
# Initialize the XGBClassifier
xgb_clf2 = XGBClassifier(colsample_bytree=0.9, learning_rate=0.15, max_depth=None, n_estimators=900, use_label_encoder=False, eval_metric='logloss')

xgb_clf2.fit(X_train, y_train)

y_pred_xgb = xgb_clf2.predict(X_val)
print(f1_score(y_val, y_pred_xgb))
print(classification_report(y_val, y_pred_xgb))

0.8968481375358166
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     96876
         1.0       0.97      0.83      0.90       375

    accuracy                           1.00     97251
   macro avg       0.98      0.92      0.95     97251
weighted avg       1.00      1.00      1.00     97251



In [33]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, make_scorer
# Initialize the XGBClassifier
xgb_clf2 = XGBClassifier(colsample_bytree=0.9, learning_rate=0.15, max_depth=None, n_estimators=900, use_label_encoder=False, eval_metric='logloss')

xgb_clf2.fit(X_train, y_train)

y_pred_xgb = xgb_clf2.predict(X_val)
print(f1_score(y_val, y_pred_xgb))
print(classification_report(y_val, y_pred_xgb))

0.9090909090909091
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     96876
         1.0       0.97      0.85      0.91       375

    accuracy                           1.00     97251
   macro avg       0.99      0.93      0.95     97251
weighted avg       1.00      1.00      1.00     97251



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV
import pickle
from xgboost import XGBClassifier
import numpy as np
from sklearn.ensemble import BaggingClassifier
# Create a VotingClassifier
from sklearn.ensemble import VotingClassifier

# Initialize the Bagging Classifier
bagging_clf = BaggingClassifier(estimator=xgb_clf, n_estimators=100, random_state=42, n_jobs=-1, verbose=3)

bagging_clf.fit(X_train, y_train)
y_pred_bag = bagging_clf.predict(X_val)
print(f1_score(y_val, y_pred_bag))

In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV
import pickle
from xgboost import XGBClassifier
import numpy as np
from sklearn.ensemble import BaggingClassifier
# Create a VotingClassifier
from sklearn.ensemble import VotingClassifier

rf1 = RandomForestClassifier(class_weight='balanced_subsample', n_estimators=300, max_depth=None, min_samples_split=10, min_samples_leaf=4, criterion='entropy', n_jobs=-1)

rf1.fit(X_train, y_train)

y_pred_rf1 = rf1.predict(X_val)
print(f1_score(y_val, y_pred_rf1))
print(classification_report(y_val, y_pred_rf1))

0.8168604651162791
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     96876
         1.0       0.90      0.75      0.82       375

    accuracy                           1.00     97251
   macro avg       0.95      0.87      0.91     97251
weighted avg       1.00      1.00      1.00     97251



In [34]:
rf2 = RandomForestClassifier(class_weight='balanced_subsample', n_estimators=900, max_depth=None, min_samples_split=10, min_samples_leaf=1, criterion='entropy', n_jobs=-1)

rf2.fit(X_train, y_train)

y_pred_rf2 = rf2.predict(X_val)
print(f1_score(y_val, y_pred_rf2))
print(classification_report(y_val, y_pred_rf2))

0.8170731707317073
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     96876
         1.0       0.95      0.71      0.82       375

    accuracy                           1.00     97251
   macro avg       0.98      0.86      0.91     97251
weighted avg       1.00      1.00      1.00     97251



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV
import pickle
from xgboost import XGBClassifier
import numpy as np
from sklearn.ensemble import BaggingClassifier
# Create a VotingClassifier
from sklearn.ensemble import VotingClassifier

# Initialize the Bagging Classifier
bagging_clf2 = BaggingClassifier(estimator=rf1, n_estimators=100, random_state=42, n_jobs=-1, verbose=3)

bagging_clf2.fit(X_train, y_train)
y_pred_bag2 = bagging_clf2.predict(X_val)
print(f1_score(y_val, y_pred_bag2))

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV
import pickle
from xgboost import XGBClassifier
import numpy as np
from sklearn.ensemble import BaggingClassifier
# Create a VotingClassifier
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
    estimators=[('xgb', xgb_clf), ('rf1', rf1), ('xgb2', xgb_clf2)],
    voting='soft',
    n_jobs=-1
)

# Train the ensemble model
vot_clf = voting_clf.fit(X, y)

# # Evaluate the model
# scores = cross_val_score(voting_clf, X, y, cv=5, scoring='f1_micro', n_jobs=-1, verbose=3)
# print("F1 Score: ", scores.mean())
y_pred_vote = vot_clf.predict(X_val)
f1_score_vote = f1_score(y_val, y_pred_vote)
print(f1_score_vote)

# with open('best_vote_model.obj', 'wb') as f:
#     pickle.dump(vot_clf, f)

1.0


In [51]:
# Create Kaggle Submission
pred = X_test.drop(['is_fraud', 'Id'], axis=1)
pred2 = X_test.drop(['is_fraud'], axis=1)

pred2['is_fraud'] = xgb_clf.predict(pred)
pred2.is_fraud = pred2.is_fraud.astype(int)
submission = pred2[['Id', 'is_fraud']]
submission.to_csv("./submission13.csv", index=False)