In [1]:

### System
import os
import joblib

### Set seed
import random
random.seed(42)

### Mains
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
import seaborn as sns
%matplotlib inline 


# ### Models:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

### Ensemble Models:
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, VotingClassifier


### Dats Splits 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

### Pipelines
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline, make_pipeline 
from sklearn.utils import shuffle

# # Sampling Methods
# from imblearn.over_sampling import SMOTE, RandomOverSampler
# from imblearn.under_sampling import NearMiss, RandomUnderSampler

### Metrics:
import sklearn.metrics as skm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix



### Helper functions

In [2]:
def process_datatypes(df):
    # Format dates
    df["transDate"] = pd.to_datetime(df["transDate"])
    df["dateOfBirth"] = pd.to_datetime(df["dateOfBirth"], format="%Y-%m-%d")
    df["unixTime"] = pd.to_datetime(df["unixTime"], unit="s")

    # Format categories
    df["business"] = df["business"].astype("category")
    df["category"] = df["category"].astype("category")
    df["gender"] = df["gender"].astype("category")
    df["city"] = df["city"].astype("category")
    df["state"] = df["state"].astype("category")
    df["job"] = df["job"].astype("category")

    # Format strings
    df["creditCardNum"] = df["creditCardNum"].astype(str)
    df["firstName"] = df["firstName"].astype(str)
    df["lastName"] = df["lastName"].astype(str)
    df["street"] = df["street"].astype(str)
    df["zip"] = df["zip"].astype(str)
    df["transNum"] = df["transNum"].astype(str)

    # Format bool
    df["isFraud"] = df["isFraud"].astype(bool)

    # Fix UNIX time
    df["transDate"] = df["unixTime"] + pd.DateOffset(years=7)
    df.drop(columns=["unixTime"], inplace=True)


from vincenty import vincenty_inverse


def distance_from_home(row):
    coords_home = (row["longitude"], row["latitude"])
    coords_purchase = (row["merchLongitude"], row["merchLatitude"])

    return vincenty_inverse(coords_home, coords_purchase).km


def calcuate_distances(df):
    df["distance_customer_merchant"] = df.apply(distance_from_home, axis=1)


import json
#? Import from file and map
def calculate_fraudrates(df):
    with open("kaggle-set/city_fraudrates.json", "r") as file:
        city_dict = json.load(file)
    with open("kaggle-set/job_fraudrates.json", "r") as file:
        job_dict = json.load(file)

    df["city_fraudrate"] = df["city"].map(city_dict)
    df.drop(columns=["city"], inplace=True)

    df["job_fraudrate"] = df["job"].map(job_dict)
    df.drop(columns=["job"], inplace=True)

def determine_history(df):
    df['numOfPrevFraudTxns'] = df.groupby('creditCardNum')['isFraud'].cumsum() - df['isFraud']
    df['historyOfFraud'] = (df['numOfPrevFraudTxns'] > 0).astype(int)


def process_dates(df):
    df["trans_day"] = df["transDate"].dt.dayofyear
    df["trans_weekday"] = df["transDate"].dt.weekday
    df["trans_hour"] = df["transDate"].dt.hour
    df["age_at_transaction"] = df["transDate"].dt.year - df["dateOfBirth"].dt.year
    
def drop_columns(df):
    new_df = df.drop(
        columns=[
            "creditCardNum",
            "business",
            "firstName",
            "lastName",
            "gender",
            "street",
            "zip",
            "state",
            "transNum",
            "merchLatitude",
            "merchLongitude",
            "latitude",
            "longitude",
            "dateOfBirth",
            "transDate",
        ]
    )
    return new_df

### Process The Dataset

In [3]:
df = pd.read_csv("workspace-michael/train.csv")
df_test = pd.read_csv("workspace-michael/test.csv")

def process_dataset(df):
    new_df = df.copy()

    process_datatypes(new_df)
    calcuate_distances(new_df)
    calculate_fraudrates(new_df)
    determine_history(new_df)
    process_dates(new_df)

    return new_df

In [4]:

df_train = drop_columns(process_dataset(df))
print(f"Train size {df_train.shape}")

df_test = drop_columns(process_dataset(df_test))
print(f"Test size {df_test.shape}")

Train size (181822, 13)
Test size (2251, 13)


### Encode categorical data



In [5]:
def encode_categorical(df):
    df = pd.get_dummies(df, columns=["category"])
    return df


df_train = encode_categorical(df_train)
df_test = encode_categorical(df_test)

In [6]:
df_train.head()

Unnamed: 0,amount,cityPop,isFraud,distance_customer_merchant,city_fraudrate,job_fraudrate,numOfPrevFraudTxns,historyOfFraud,trans_day,trans_weekday,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,4.1,6713,False,47.229861,0.455581,0.455581,0,0,1,1,...,False,False,False,False,True,False,False,False,False,False
1,2.67,798,False,27.264981,0.239398,0.23948,0,0,1,1,...,False,False,False,False,False,False,False,True,False,False
2,22.88,337,False,116.068684,0.342114,0.376197,0,0,1,1,...,False,False,False,False,False,True,False,False,False,False
3,9.59,3032,False,80.518979,0.364465,0.816327,0,0,1,1,...,False,False,False,False,False,True,False,False,False,False
4,129.06,1925,False,65.543964,0.443837,0.383142,0,0,1,1,...,True,False,False,False,False,False,False,False,False,False


In [7]:
df_test = df_test.drop(columns='isFraud')
df_test.head()

Unnamed: 0,amount,cityPop,distance_customer_merchant,city_fraudrate,job_fraudrate,numOfPrevFraudTxns,historyOfFraud,trans_day,trans_weekday,trans_hour,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,667.17,63,94.267392,0.478142,0.383142,0,0,179,5,19,...,False,False,False,False,True,False,False,False,False,False
1,132.4,1690,1.512518,1.085482,0.338116,0,0,351,2,6,...,False,False,False,False,False,False,False,False,True,False
2,10.45,222785,69.227369,1.042139,0.872172,0,0,221,5,22,...,False,False,False,False,False,False,False,False,False,True
3,8.89,218,89.741914,0.364299,0.816104,0,0,351,2,23,...,False,False,False,False,False,False,False,False,False,True
4,809.83,8874,89.250337,0.454959,0.659691,0,0,268,3,1,...,False,False,False,False,True,False,False,False,False,False


In [8]:
# X_train, X_test, y_train, y_test = train_test_split(
#     df_train.drop(columns=["isFraud"]),
#     df_train["isFraud"],
#     test_size=0.2,
# )

# rtc = RandomForestClassifier(
#     max_features=0.4,
#     max_depth=None,
#     min_samples_leaf=2,
#     criterion="gini",
#     random_state=42,
#     n_estimators=1000,
#     n_jobs=-1,
# )

# rtc.fit(X_train, y_train)

# y_pred = rtc.predict_proba(X_test)[:, 1] >= 0.479
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Precision:", precision_score(y_test, y_pred))
# print("Recall:", recall_score(y_test, y_pred))
# print("F1 Score:", f1_score(y_test, y_pred))
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [9]:
# y_pred = rtc.predict_proba(df_test)[:, 1] >= 0.479
# y_pred

In [10]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(sampling_strategy=0.5)
X = df_train.drop(columns=["isFraud"])
Y = df_train["isFraud"]
X_rus, y_rus = rus.fit_resample(X, Y)
y_rus.value_counts()


isFraud
False    2672
True     1336
Name: count, dtype: int64

Do Under and Over Sampling

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X_rus, y_rus, test_size=0.2, random_state=42, stratify=y_rus
)

rtc = RandomForestClassifier(
    max_features=0.4,
    max_depth=None,
    min_samples_leaf=2,
    criterion="gini",
    random_state=42,
    n_estimators=1000,
    n_jobs=-1,
)

rtc.fit(X_train, y_train)

y_pred = rtc.predict_proba(X_test)[:, 1] >= 0.479
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9750623441396509
Precision: 0.9490909090909091
Recall: 0.9775280898876404
F1 Score: 0.9630996309963098
Confusion Matrix:
 [[521  14]
 [  6 261]]


In [12]:
y_pred = rtc.predict_proba(df_test)[:, 1] >= 0.479
y_pred

array([ True, False, False, ..., False, False, False])

In [13]:
df_test = pd.read_csv("workspace-michael/test.csv")
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2251 entries, 0 to 2250
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   transDate       2251 non-null   object 
 1   creditCardNum   2251 non-null   int64  
 2   business        2251 non-null   object 
 3   category        2251 non-null   object 
 4   amount          2251 non-null   float64
 5   firstName       2251 non-null   object 
 6   lastName        2251 non-null   object 
 7   gender          2251 non-null   object 
 8   street          2251 non-null   object 
 9   city            2251 non-null   object 
 10  state           2251 non-null   object 
 11  zip             2251 non-null   int64  
 12  latitude        2251 non-null   float64
 13  longitude       2251 non-null   float64
 14  cityPop         2251 non-null   int64  
 15  job             2251 non-null   object 
 16  dateOfBirth     2251 non-null   object 
 17  transNum        2251 non-null   o

In [14]:
df_test["isFraud"] = y_pred

In [15]:
df_test.head()

Unnamed: 0,transDate,creditCardNum,business,category,amount,firstName,lastName,gender,street,city,...,latitude,longitude,cityPop,job,dateOfBirth,transNum,unixTime,merchLatitude,merchLongitude,isFraud
0,2020-06-27 19:47,3573390000000000,fraud_Volkman Ltd,misc_net,667.17,Stephanie,Murphy,F,526 Stacy Walks,Barnard,...,45.7205,-98.5534,63,Systems developer,1969-10-30,276495f58de3af46843647fdd00843eb,1372362479,46.0403,-97.71046,True
1,2020-12-16 06:29,3579200000000000,fraud_Pfeffer and Sons,shopping_pos,132.4,Tyler,Small,M,047 Kevin Haven,Boyd,...,44.9437,-91.0294,1690,IT trainer,1975-12-24,0bd7e2199cc5dbe847d90f1ca1d94974,1387175381,45.680527,-91.026459,False
2,2020-08-08 22:02,343819000000000,fraud_Kovacek Ltd,travel,10.45,Shelby,Cannon,F,1535 Ryan Burgs Suite 919,Des Moines,...,41.5855,-93.6719,222785,Wellsite geologist,1956-01-24,873e753bf1a897daf144b2fe9b039fd2,1375999332,41.593729,-94.291725,False
3,2020-12-16 23:41,4292900000000000000,fraud_Lemke and Sons,travel,8.89,Jeffrey,Smith,M,135 Joseph Mountains,Sula,...,45.8433,-113.8748,218,"Therapist, horticultural",1995-08-16,fdfa039f3d2678522f57003bf1c38a7f,1387237267,46.478129,-114.636119,False
4,2020-09-24 01:14,4427810000000,fraud_Kerluke Inc,misc_net,809.83,Michelle,Rodriguez,F,1742 Brandon Squares Apt. 461,Ruidoso,...,33.3305,-105.6933,8874,Licensed conveyancer,2000-08-16,f442042f8f14ec7cd9d5fddece9b75b2,1379985277,32.842035,-104.904134,True


In [16]:
df_test.to_csv("team_2.csv")
