In [10]:
import pandas as pd

# Load the transactions from test.csv; the path is relative to the notebook's
# working directory.
df = pd.read_csv("workspace-michael/test.csv")


def process_datatypes(df):
    """Cast the transaction columns to their working dtypes, in place.

    Args:
        df: DataFrame holding the transaction columns referenced below.
            It is modified in place.

    Returns:
        None. After the call:
          * ``dateOfBirth`` is datetime (parsed with ``%Y-%m-%d``),
          * ``business``, ``category``, ``gender``, ``city``, ``state`` and
            ``job`` are categoricals,
          * ``creditCardNum``, ``firstName``, ``lastName``, ``street``,
            ``zip`` and ``transNum`` are strings,
          * ``isFraud`` is bool,
          * ``transDate`` is ``unixTime`` (seconds since the epoch) shifted
            forward by seven years,
          * ``unixTime`` has been removed.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # Dates. The incoming ``transDate`` text is deliberately not parsed: the
    # column is rebuilt from ``unixTime`` at the end of this function, so
    # parsing it first was discarded work that could also fail on bad text.
    df["dateOfBirth"] = pd.to_datetime(df["dateOfBirth"], format="%Y-%m-%d")
    unix_time = pd.to_datetime(df["unixTime"], unit="s")

    # Categories
    for column in ("business", "category", "gender", "city", "state", "job"):
        df[column] = df[column].astype("category")

    # Strings
    for column in (
        "creditCardNum",
        "firstName",
        "lastName",
        "street",
        "zip",
        "transNum",
    ):
        df[column] = df[column].astype(str)

    # Bool
    df["isFraud"] = df["isFraud"].astype(bool)

    # Fix UNIX time: the transaction timestamp is the epoch time plus 7 years.
    df["transDate"] = unix_time + pd.DateOffset(years=7)
    df.drop(columns=["unixTime"], inplace=True)


from vincenty import vincenty_inverse


def distance_from_home(row):
    """Return the distance, in km, between the two coordinate pairs in ``row``.

    The first point is built from ``longitude``/``latitude`` and the second
    from ``merchLongitude``/``merchLatitude``; both are handed to
    ``vincenty_inverse`` and the ``.km`` attribute of its result is returned.

    NOTE(review): both points are passed longitude-first. Many geodesic
    helpers expect (latitude, longitude) -- confirm which order
    ``vincenty_inverse`` expects before trusting the values.
    """
    home = row["longitude"], row["latitude"]
    merchant = row["merchLongitude"], row["merchLatitude"]
    return vincenty_inverse(home, merchant).km


def calcuate_distances(df):
    """Add a ``distance_customer_merchant`` column to ``df``, in place.

    The value for each row is whatever ``distance_from_home`` returns for it.
    Returns None.
    """
    per_row_distance = df.apply(distance_from_home, axis="columns")
    df["distance_customer_merchant"] = per_row_distance


import json


def calculate_fraudrates(
    df,
    city_rates_path="kaggle-set/city_fraudrates.json",
    job_rates_path="kaggle-set/job_fraudrates.json",
):
    """Replace ``city`` and ``job`` with looked-up fraud-rate columns, in place.

    Args:
        df: DataFrame with ``city`` and ``job`` columns. It is modified in
            place: ``city_fraudrate`` and ``job_fraudrate`` are added and the
            two source columns are dropped.
        city_rates_path: JSON file holding an object that maps city name to
            a numeric rate. Defaults to the path the notebook has always used.
        job_rates_path: JSON file holding an object that maps job title to a
            numeric rate. Defaults to the path the notebook has always used.

    Returns:
        None.

    Raises:
        FileNotFoundError: if either JSON file is missing.
        KeyError: if ``city`` or ``job`` is not a column of ``df``.

    Values that have no entry in the corresponding mapping become NaN.
    """
    # Load both lookups before touching df so a missing file leaves it intact.
    with open(city_rates_path, "r", encoding="utf-8") as file:
        city_dict = json.load(file)
    with open(job_rates_path, "r", encoding="utf-8") as file:
        job_dict = json.load(file)

    lookups = (
        ("city", "city_fraudrate", city_dict),
        ("job", "job_fraudrate", job_dict),
    )
    for source_column, rate_column, rates in lookups:
        # Mapping a categorical column returns a categorical again whenever
        # the mapping happens to be one-to-one, so the resulting dtype would
        # depend on the data. Cast explicitly to keep the feature numeric.
        df[rate_column] = df[source_column].map(rates).astype("float64")
        df.drop(columns=[source_column], inplace=True)


def determine_history(df):
    """Add a boolean ``historyOfFraud`` column to ``df``, in place.

    A row is flagged when its ``creditCardNum`` occurs more than once in
    ``df`` AND that same row's ``isFraud`` is true. Returns None.

    NOTE(review): the flag is computed from the row's own ``isFraud`` value,
    so it cannot be true for a non-fraud row. If ``isFraud`` is the target
    being predicted, this feature leaks the label -- confirm this is intended.
    """
    card_repeats = df["creditCardNum"].duplicated(keep=False)
    is_fraud = df["isFraud"]
    df["historyOfFraud"] = card_repeats & is_fraud


def process_dates(df):
    """Derive calendar features from ``transDate`` and ``dateOfBirth``, in place.

    Adds, in this order:
      * ``trans_day``: day of the year of the transaction,
      * ``trans_weekday``: day of the week (Monday is 0),
      * ``trans_hour``: hour of the day,
      * ``age_at_transaction``: transaction year minus birth year. This is a
        plain difference of calendar years; it does not check whether the
        birthday has already passed in the transaction year.

    Both source columns must already be datetime typed. Returns None.
    """
    transaction = df["transDate"].dt
    df["trans_day"] = transaction.day_of_year
    df["trans_weekday"] = transaction.dayofweek
    df["trans_hour"] = transaction.hour

    birth_year = df["dateOfBirth"].dt.year
    df["age_at_transaction"] = transaction.year - birth_year

In [11]:
def drop_columns(df):
    """Return a copy of ``df`` without the columns listed below.

    ``df`` itself is left untouched. Every listed column must be present,
    otherwise ``DataFrame.drop`` raises ``KeyError``.
    """
    unused = (
        "creditCardNum",
        "business",
        "firstName",
        "lastName",
        "gender",
        "street",
        "zip",
        "state",
        "transNum",
        "merchLatitude",
        "merchLongitude",
        "latitude",
        "longitude",
        "dateOfBirth",
        "transDate",
    )
    return df.drop(columns=list(unused))


def process_dataset(df):
    """Run the feature-engineering steps on a copy of ``df`` and return it.

    The input frame is not modified. Each step mutates the working copy in
    place, and the steps run in the order listed.
    """
    prepared = df.copy()

    steps = (
        process_datatypes,
        calcuate_distances,
        calculate_fraudrates,
        determine_history,
        process_dates,
    )
    for step in steps:
        step(prepared)

    return prepared


# Build the engineered test frame, then remove the columns listed in drop_columns.
df_test = drop_columns(process_dataset(df))

In [12]:
def encode_categorical(df):
    """Return a copy of ``df`` with ``category`` one-hot encoded.

    The ``category`` column is replaced by one ``category_<value>`` indicator
    column per value; ``df`` itself is not modified.

    NOTE(review): the indicator columns are derived from the frame passed in,
    so they may differ from those a model was trained on -- confirm the column
    sets line up.
    """
    return pd.get_dummies(df, columns=["category"])


# Rebind df_test to its one-hot encoded version. This cell is not re-runnable
# on its own: after the first run the "category" column no longer exists, so a
# second run is expected to fail until the cells above are executed again.
df_test = encode_categorical(df_test)

In [13]:
# Inspect the final feature schema: column names, non-null counts and dtypes.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2251 entries, 0 to 2250
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   amount                      2251 non-null   float64
 1   cityPop                     2251 non-null   int64  
 2   isFraud                     2251 non-null   bool   
 3   distance_customer_merchant  2251 non-null   float64
 4   city_fraudrate              2251 non-null   float64
 5   job_fraudrate               2251 non-null   float64
 6   historyOfFraud              2251 non-null   bool   
 7   trans_day                   2251 non-null   int32  
 8   trans_weekday               2251 non-null   int32  
 9   trans_hour                  2251 non-null   int32  
 10  age_at_transaction          2251 non-null   int32  
 11  category_entertainment      2251 non-null   bool   
 12  category_food_dining        2251 non-null   bool   
 13  category_gas_transport      2251 

In [15]:
# Keep the label on df_test: the evaluation cell further down reads
# df_test["isFraud"] as y_true, which raised KeyError once this cell had
# overwritten df_test. Bind the feature-only frame to its own name instead.
test_set = df_test.drop(columns="isFraud")

In [None]:
# ADD MODEL CODE HERE
# Placeholder: bind `model` to a fitted classifier that provides predict_proba
# before running the evaluation cell below; while it is None that cell cannot run.
model = None

In [None]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from sklearn.ensemble import RandomForestClassifier

# RUN TEST SET
# df_test must still contain the "isFraud" label column when this cell runs.

test_set = df_test.drop(columns=["isFraud"])
y_true = df_test["isFraud"]

# A row is predicted as fraud when the second predict_proba column
# (presumably the fraud class -- confirm against model.classes_) reaches this
# cut-off.
FRAUD_THRESHOLD = 0.479
fraud_probability = model.predict_proba(test_set)[:, 1]
y_pred = fraud_probability >= FRAUD_THRESHOLD

for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(y_true, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))