In [1]:

### System
import os
import joblib

### Set seed
import random
random.seed(42)

### Mains
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
import seaborn as sns
%matplotlib inline 


# ### Models:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

### Ensemble Models:
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, VotingClassifier


### Dats Splits 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

### Pipelines
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline, make_pipeline 
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder

# # Sampling Methods
# from imblearn.over_sampling import SMOTE, RandomOverSampler
# from imblearn.under_sampling import NearMiss, RandomUnderSampler

### Metrics:
import sklearn.metrics as skm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
from vincenty import vincenty_inverse

def distance_from_home(trans_row, cardholder_row):
    coords_home = (cardholder_row["longitude"], cardholder_row["latitude"])
    coords_purchase = (trans_row["merchLongitude"], trans_row["merchLatitude"])

    return vincenty_inverse(coords_home, coords_purchase).km

def process_dataset(df):
    # Format dates
    df["transDate"] = pd.to_datetime(df["transDate"])
    df["dateOfBirth"] = pd.to_datetime(df["dateOfBirth"])

    # Format categories
    df["business"] = df["business"].astype("category")
    df["category"] = df["category"].astype("category")
    df["gender"] = df["gender"].astype("category")
    df["city"] = df["city"].astype("category")
    df["state"] = df["state"].astype("category")
    df["job"] = df["job"].astype("category")

    # Format strings
    df["creditCardNum"] = df["creditCardNum"].astype(str)
    df["firstName"] = df["firstName"].astype(str)
    df["lastName"] = df["lastName"].astype(str)
    df["street"] = df["street"].astype(str)
    df["zip"] = df["zip"].astype(str)
    df["transNum"] = df["transNum"].astype(str)

    # Format bool
    df["isFraud"] = df["isFraud"].astype(bool)

    df["distance_customer_merchant"] = df.apply(
        lambda row: distance_from_home(row, row), axis=1) 
    
    
    # encode city
    grouped_transactions = df.groupby("city")
    total_transactions = grouped_transactions.size()
    fraud_transactions = grouped_transactions["isFraud"].sum()
    fraud_rate = (fraud_transactions / total_transactions) * 100
    result_dict = fraud_rate.to_dict()

    df["city_fraudrate"] = df["city"].map(result_dict)

    # encode job
    grouped_transactions = df.groupby("job")
    total_transactions = grouped_transactions.size()
    fraud_transactions = grouped_transactions["isFraud"].sum()
    fraud_rate = (fraud_transactions / total_transactions) * 100
    result_dict = fraud_rate.to_dict()

    df["job_fraudrate"] = df["job"].map(result_dict)

    df.sort_values(by=['creditCardNum', 'transDate'], inplace=True)
    df['numOfPrevFraudTxns'] = df.groupby('creditCardNum')['isFraud'].cumsum() - df['isFraud']
    df['historyOfFraud'] = (df['numOfPrevFraudTxns'] > 0).astype(int)
    
    
    df[df['historyOfFraud'] > 0].iloc[0]
    df["transDate"] = pd.to_datetime(df["transDate"])
    df["dateOfBirth"] = pd.to_datetime(df["dateOfBirth"])
    df["trans_day"] = df["transDate"].dt.dayofyear
    df["trans_weekday"] = df["transDate"].dt.weekday
    df["trans_hour"] = df["transDate"].dt.hour
    df["age_at_transaction"] = df["transDate"].dt.year - df["dateOfBirth"].dt.year
    
    onehot_encoder = OneHotEncoder(sparse=False) 
    categories = df["category"].values.reshape(-1, 1)
    categories_encoded = onehot_encoder.fit_transform(categories)
    categories_encoded_df = pd.DataFrame(
        categories_encoded, columns=onehot_encoder.get_feature_names_out(["category"]))
    df.reset_index(drop=True, inplace=True)
    df = pd.concat([df, categories_encoded_df], axis=1)
    
    drop_columns=[
        "business",
        "firstName",
        "lastName",
        "gender",
        "street",
        "zip",
        "unixTime",
        "creditCardNum",
        "transNum",
        "merchLatitude",
        "merchLongitude",
        "latitude",
        "longitude",
        "dateOfBirth",
        "transDate",
        "job",
        "city",
        "state",
        "category"
    ]
    
    df = df.drop(columns=drop_columns)
    return df


In [3]:
df = pd.read_csv("workspace-michael/train.csv")
df = process_dataset(df)




In [4]:
df.head()

Unnamed: 0,amount,cityPop,isFraud,distance_customer_merchant,city_fraudrate,job_fraudrate,numOfPrevFraudTxns,historyOfFraud,trans_day,trans_weekday,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,473.36,1504,False,83.762096,0.0,1.392111,0,0,190,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,142.77,1504,False,27.199041,0.0,1.392111,0,0,191,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,49.01,1504,False,42.251942,0.0,1.392111,0,0,191,2,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,116.93,1504,False,12.615215,0.0,1.392111,0,0,192,3,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,75.75,1504,False,20.007615,0.0,1.392111,0,0,192,3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["isFraud"]),
    df["isFraud"],
    test_size=0.2,
)


rtc = RandomForestClassifier(
    max_features=0.4,
    max_depth=None,
    min_samples_leaf=2,
    criterion="gini",
    random_state=42,
    n_estimators=1000,
    n_jobs=-1,
)

rtc.fit(X_train, y_train)

y_pred = rtc.predict_proba(X_test)[:, 1] >= 0.479
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9989000412484532
Precision: 0.9839357429718876
Recall: 0.8718861209964412
F1 Score: 0.9245283018867925
Confusion Matrix:
 [[36080     4]
 [   36   245]]
