In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from geopy.distance import geodesic
from sklearn.cluster import KMeans

# Calculate distance from home function
def calculate_distance(row):
    home_location = (row['lat'], row['long'])
    merch_location = (row['merch_lat'], row['merch_long'])
    return geodesic(home_location, merch_location).miles

# Function to calculate distance between two points
def calculate_distance2(row1, row2):
    point1 = (row1['lat'], row1['long'])
    point2 = (row2['lat'], row2['long'])
    return geodesic(point1, point2).miles

def calculate_similarity_score(amount, fraud_mean, fraud_std, normal_mean, normal_std):
    # Calculate Z-scores for fraud and normal
    z_score_fraud = abs((amount - fraud_mean) / fraud_std)
    z_score_normal = abs((amount - normal_mean) / normal_std)
    
    # Invert the Z-scores to get similarity scores
    fraud_similarity = 1 / (1 + z_score_fraud)
    normal_similarity = 1 / (1 + z_score_normal)
    
    return fraud_similarity, normal_similarity

def process(df):
    # Add new features
    # Rearrange the rows
    df['original_order'] = range(df.shape[0])

    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], format='%d/%m/%Y %H:%M')
    df['dob'] = pd.to_datetime(df['dob'], format='%d/%m/%Y')

    # df['timestamp'] = pd.to_datetime(df['trans_date_trans_time'], format='%d/%m/%Y %H:%M')
    # df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], format='%d/%m/%Y %H:%M')

    df.sort_values(by=['cc_num', 'trans_date_trans_time'], inplace=True)
    # Calculate time difference in hours
    # df['time_since_last_trans'] = df.groupby('cc_num')['trans_date_trans_time'].diff().apply(lambda x: x.total_seconds() / 3600)
    # # Fill NaN values for each user's first transaction
    # df['time_since_last_trans'] = df['time_since_last_trans'].fillna(value=0)
    # Calculate the time difference between transactions
    df['Time_Delta'] = df.groupby('cc_num')['trans_date_trans_time'].diff().dt.total_seconds() / 60.0  # Time delta in minutes
    df['Time_Delta'] = df['Time_Delta'].fillna(value=0)
    
    # Shift the latitude and longitude to get the previous transaction's location
    df['prev_lat'] = df.groupby('cc_num')['merch_lat'].shift(1)
    df['prev_long'] = df.groupby('cc_num')['merch_long'].shift(1)

    # Calculate the distance to the previous transaction
    df['distance_to_prev'] = df.apply(
        lambda row: calculate_distance2(
            {'lat': row['merch_lat'], 'long': row['merch_long']},
            {'lat': row['prev_lat'], 'long': row['prev_long']}
        ) if not pd.isnull(row['prev_lat']) else None,
        axis=1
    )
    
    df['distance_to_prev'] = df['distance_to_prev'].fillna(value=0)

    # Calculate location consistency as the inverse of the average distance to previous transactions (higher value means more consistency)
    df['location_consistency'] = 100 / df.groupby('cc_num')['distance_to_prev'].transform('mean')

    # Time-based features
    df['hour'] = df['trans_date_trans_time'].dt.hour
    df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
   
    # Age of the account holder
    df['age'] = (df['trans_date_trans_time'] - df['dob']).dt.days // 365

    # df['trans_dist'] = haversine_distance(df['lat'], df['long'], df['merch_lat'], df['merch_long'])
    df['dist_to_home'] = df.apply(calculate_distance, axis=1)

    # Group by category and calculate the mean and standard deviation of transaction amounts
    category_stats = df.groupby('category')['amt'].agg(['mean', 'std']).reset_index()
    # Merge these stats back into the main dataframe
    df = df.merge(category_stats, on='category', how='left')
    # Calculate z-score for each transaction amount within its category
    df['amt_anomaly_score_cat'] = ((df['amt'] - df['mean']) / df['std'])
    df.drop(columns=['mean', 'std'], inplace=True)
    # Group by merchant and calculate the mean and standard deviation of transaction amounts
    merchant_stats = df.groupby('merchant')['amt'].agg(['mean', 'std']).reset_index()
    # Merge these stats back into the main dataframe
    df = df.merge(merchant_stats, on='merchant', how='left')
    # Calculate z-score for each transaction amount within its merchant
    df['amt_anomaly_score_merch'] = ((df['amt'] - df['mean']) / df['std'])
    df.drop(columns=['mean', 'std'], inplace=True)

    # merch_stats = df.groupby('merchant')['amt'].agg(['mean', 'std']).reset_index(names=['merch_mean', 'merch_std'])
    # # Merge these stats back into the main dataframe
    # df = df.merge(merch_stats, on='merchant', how='left')
    # # Calculate z-score for each transaction amount within its merchant
    # df['amt_anomaly_score_merch'] = (df['amt'] - df['merch_mean']) / df['merch_std']

    # user_avg_amt = df.groupby('cc_num')['amt'].mean().reset_index(name='Avg_Amt')
    # df = df.merge(user_avg_amt, on='cc_num')
    # df['Relative_Amt'] = abs(df['amt'] - df['Avg_Amt']) / df['Avg_Amt']
    # Calculate the historical average transaction amount for each user
    avg_amt_per_user = df.groupby('cc_num')['amt'].transform('mean').rename('avg_amt_per_user')

    # Append this feature to the dataset
    df['amt_relative_avg'] = (abs(df['amt'] - avg_amt_per_user) / avg_amt_per_user)
    #df['relative_amt'] = abs(df['amt'] - avg_amt_per_user) / avg_amt_per_user

    kmeans = KMeans(n_clusters=12, random_state=42)

    # Create a new column for the cluster labels
    df['city_pop_cluster'] = kmeans.fit_predict(df[['city_pop']])


    df.drop(columns=['trans_date_trans_time', 'lat', 'long', 'merch_lat', 'merch_long'], inplace=True)
    df.drop(columns=['prev_lat', 'prev_long', 'cc_num'], inplace=True)

    # Identify categorical columns to encode
    categorical_cols = ['merchant', 'category', 'gender', 'city', 'state', 'job']

    mappings = {}

    label_encoder = LabelEncoder()
    for col in categorical_cols:
        df[col] = label_encoder.fit_transform(df[col])
        mappings[col] = {label: index for index, label in enumerate(label_encoder.classes_)}


    # Calculate the fraud rate by category
    fraud_rate_by_category = df.groupby('category')['is_fraud'].mean().reset_index()
    fraud_rate_by_category.rename(columns={'is_fraud': 'fraud_rate_cat'}, inplace=True)

    # Merge the normalized fraud rate back into the main DataFrame
    df = pd.merge(df, fraud_rate_by_category[['category', 'fraud_rate_cat']], on='category', how='left')

    # # Determine the threshold for high fraud risk categories
    # threshold = fraud_rate_by_category['fraud_rate'].quantile(0.75)

    # # Identify high fraud risk categories
    # high_risk_categories = fraud_rate_by_category[fraud_rate_by_category['fraud_rate'] >= threshold]['category']

    # # Display high-risk categories
    # print('High-risk categories:')
    # print(high_risk_categories)

    # Calculate the fraud rate by merchant
    fraud_rate_by_merchant = df.groupby('merchant')['is_fraud'].mean().reset_index()
    fraud_rate_by_merchant.rename(columns={'is_fraud': 'fraud_rate_merch'}, inplace=True)

    # Merge the normalized fraud rate back into the main DataFrame
    df = pd.merge(df, fraud_rate_by_merchant[['merchant', 'fraud_rate_merch']], on='merchant', how='left')

    # Separate the transactions
    fraud_trans = df[df['is_fraud'] == 1]['amt']
    normal_trans = df[df['is_fraud'] == 0]['amt']

    # Calculate statistics
    fraud_mean, fraud_std = fraud_trans.mean(), fraud_trans.std()
    normal_mean, normal_std = normal_trans.mean(), normal_trans.std()

    v_calculate_similarity_score = np.vectorize(calculate_similarity_score)

    # Apply the function
    df['fraud_similarity'], df['normal_similarity'] = v_calculate_similarity_score(
        df['amt'],
        fraud_mean, fraud_std,
        normal_mean, normal_std
    )

    # # Determine the merch_threshold for high fraud risk categories
    # merch_threshold = fraud_rate_by_merchant['fraud_rate'].quantile(0.75)

    # # Identify high fraud risk categories
    # high_risk_merchants = fraud_rate_by_merchant[fraud_rate_by_merchant['fraud_rate'] >= merch_threshold]['merchant']

    # # Display high-risk merchants
    # print('High-risk merchants:')
    # print(high_risk_merchants)
    # # Initialize the new feature with 0
    # df['high_risk_category'] = 0
    # df['high_risk_merchant'] = 0

    # # Flag transactions in high-risk categories
    # df.loc[df['category'].isin(high_risk_categories), 'high_risk_category'] = 1
    # df.loc[df['merchant'].isin(high_risk_merchants), 'high_risk_merchant'] = 1

    # Sort the dataset back to its original order
    df.sort_values(by='original_order', inplace=True)
    df.drop(columns='original_order', inplace=True)

    return df, mappings

trainingSet = pd.read_csv("./data/train.csv")
submissionSet = pd.read_csv("./data/test.csv")
train_processed, cat_map = process(trainingSet)
train_processed.drop(columns=['first', 'last', 'street', 'dob', 'zip', 'trans_num', 'unix_time'], inplace=True)

# Merge on Id so that the test set can have feature columns as well
test_df = pd.merge(train_processed, submissionSet, left_on='Id', right_on='Id')
test_df = test_df.drop(columns=['is_fraud_x'])
test_df = test_df.rename(columns={'is_fraud_y': 'is_fraud'})

# The training set is where the score is not null
train_df = train_processed[train_processed['is_fraud'].notnull()]

# Save the datasets with the new features for easy access later
test_df.to_csv("./data/test_p2.csv", index=False)
train_df.to_csv("./data/train_p2.csv", index=False)

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Assuming 'train_df' includes both features and the target ('is_fraud')
X = train_df.drop(['is_fraud', 'Id', 'city_pop_cluster'], axis=1)
y = train_df['is_fraud']

num_cols = ['amt', 'Time_Delta', 'distance_to_prev', 'location_consistency', 'dist_to_home', 'amt_anomaly_score_cat', 'amt_anomaly_score_merch', 'amt_relative_avg', 'fraud_rate_cat', 'fraud_rate_merch', 'fraud_similarity', 'normal_similarity']

# scaler = StandardScaler()
# X[num_cols] = scaler.fit_transform(X[num_cols])
# test_df[num_cols] = scaler.transform(test_df[num_cols])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [3]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, make_scorer
# Initialize the XGBClassifier
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Define the parameter grid
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'learning_rate': [0.05, 0.1, 0.15, 0.2],
    'n_estimators': [300, 500, 900],
    'colsample_bytree': [0.3, 0.7, 0.9]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, scoring='f1', cv=5, n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)
clf = grid_search.best_estimator_
# Best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best average F1 score found: ", f1_score(y_val, clf.predict(X_val)))

# Train the model with the optimal parameters
optimal_clf = XGBClassifier(**grid_search.best_params_, use_label_encoder=False, eval_metric='logloss')
optimal_clf.fit(X, y)

Fitting 5 folds for each of 180 candidates, totalling 900 fits
[CV 2/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=300;, score=0.786 total time=   4.2s
[CV 4/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=300;, score=0.803 total time=   4.2s
[CV 1/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=300;, score=0.765 total time=   4.3s
[CV 5/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=300;, score=0.784 total time=   4.3s
[CV 3/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=300;, score=0.773 total time=   4.3s
[CV 1/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=500;, score=0.795 total time=   6.4s
[CV 5/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=500;, score=0.795 total time=   6.4s
[CV 4/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=500;, score=0.813 total time= 



[CV 4/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=10, n_estimators=500;, score=0.843 total time=  15.9s
[CV 3/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=10, n_estimators=500;, score=0.833 total time=  16.1s
[CV 2/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=None, n_estimators=300;, score=0.862 total time=   6.6s
[CV 1/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=None, n_estimators=300;, score=0.842 total time=   6.7s
[CV 3/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=None, n_estimators=300;, score=0.834 total time=   6.5s
[CV 5/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=None, n_estimators=300;, score=0.846 total time=   6.5s
[CV 4/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=None, n_estimators=300;, score=0.849 total time=   6.6s
[CV 1/5] END colsample_bytree=0.3, learning_rate=0.05, max_depth=10, n_estimators=900;, score=0.852 total time=  26.6s
[CV 5/5] END colsample_bytree=0.3, lea

In [4]:
import pickle

with open('best_model_xgb.obj', 'wb') as f:
    pickle.dump(optimal_clf, f)

In [5]:
pred = test_df.drop(['is_fraud', 'city_pop_cluster', 'Id'], axis=1)
pred2 = test_df.drop(['is_fraud'], axis=1)

pred2['is_fraud'] = optimal_clf.predict(pred)
pred2.is_fraud = pred2.is_fraud.astype(int)
submission = pred2[['Id', 'is_fraud']]
submission.to_csv("./data/submission4.csv", index=False)

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Initialize a basic Random Forest model
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

param_grid = {
    'n_estimators': [300, 500, 900],
    'max_depth': [None, 20],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4],
    'criterion': ['entropy', 'log_loss']
}

# Setup GridSearchCV
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=4, scoring='f1_micro', n_jobs=-1, verbose=3)

# Fit GridSearchCV
grid_search_rf.fit(X_train, y_train)

# Best parameters and model
best_rf_params = grid_search_rf.best_params_
print(f"Best parameters: {best_rf_params}")

best_rf_model = grid_search_rf.best_estimator_

# Prediction and Evaluation
y_pred_rf = best_rf_model.predict(X_val)
print(f1_score(y_val, y_pred_rf))
print(classification_report(y_val, y_pred_rf))
# Train the model with the optimal parameters
optimal_rf = RandomForestClassifier(**grid_search_rf.best_params_, class_weight='balanced', random_state=42)
optimal_rf.fit(X, y)


Fitting 4 folds for each of 48 candidates, totalling 192 fits
[CV 4/4] END criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.999 total time= 2.4min
[CV 2/4] END criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.999 total time= 2.4min
[CV 1/4] END criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.999 total time= 2.4min
[CV 3/4] END criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.999 total time= 2.4min
[CV 4/4] END criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=500;, score=0.999 total time= 4.0min
[CV 3/4] END criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=500;, score=0.999 total time= 4.0min
[CV 2/4] END criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50



[CV 2/4] END criterion=log_loss, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=900;, score=0.999 total time= 7.2min
[CV 2/4] END criterion=log_loss, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=500;, score=0.999 total time= 3.9min
[CV 1/4] END criterion=log_loss, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=900;, score=0.999 total time= 7.3min
[CV 4/4] END criterion=log_loss, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=500;, score=0.999 total time= 3.9min
[CV 3/4] END criterion=log_loss, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=500;, score=0.999 total time= 4.0min
[CV 4/4] END criterion=log_loss, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=900;, score=0.999 total time= 7.1min
[CV 3/4] END criterion=log_loss, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=900;, score=0.999 total time= 7.3min
[CV 1/4] END cri

In [None]:
with open('best_rf_model.obj', 'wb') as f:
    pickle.dump(optimal_rf, f)
