In [2]:
import pandas as pd

df1 = pd.read_csv("fraudTest.csv")
df2 = pd.read_csv("fraudTrain.csv")

df = pd.concat([df1, df2])
df = df.drop(columns=["Unnamed: 0"])

new_column_names = [
    "transDate",
    "creditCardNum",
    "business",
    "category",
    "amount",
    "firstName",
    "lastName",
    "gender",
    "street",
    "city",
    "state",
    "zip",
    "latitude",
    "longitude",
    "cityPop",
    "job",
    "dateOfBirth",
    "transNum",
    "unixTime",
    "merchLatitude",
    "merchLongitude",
    "isFraud",
]

df = df.rename(columns=dict(zip(df.columns, new_column_names)))

In [3]:
def process_datatypes(df):
    # Format dates
    df["transDate"] = pd.to_datetime(df["transDate"])
    df["dateOfBirth"] = pd.to_datetime(df["dateOfBirth"], format="%Y-%m-%d")
    df["unixTime"] = pd.to_datetime(df["unixTime"], unit="s")

    # Format categories
    df["business"] = df["business"].astype("category")
    df["category"] = df["category"].astype("category")
    df["gender"] = df["gender"].astype("category")
    df["city"] = df["city"].astype("category")
    df["state"] = df["state"].astype("category")
    df["job"] = df["job"].astype("category")

    # Format strings
    df["creditCardNum"] = df["creditCardNum"].astype(str)
    df["firstName"] = df["firstName"].astype(str)
    df["lastName"] = df["lastName"].astype(str)
    df["street"] = df["street"].astype(str)
    df["zip"] = df["zip"].astype(str)
    df["transNum"] = df["transNum"].astype(str)

    # Format bool
    df["isFraud"] = df["isFraud"].astype(bool)

    # Fix UNIX time
    df["transDate"] = df["unixTime"] + pd.DateOffset(years=7)
    df.drop(columns=["unixTime"], inplace=True)


from vincenty import vincenty_inverse


def distance_from_home(row):
    coords_home = (row["longitude"], row["latitude"])
    coords_purchase = (row["merchLongitude"], row["merchLatitude"])

    return vincenty_inverse(coords_home, coords_purchase).km


def calcuate_distances(df):
    df["distance_customer_merchant"] = df.apply(distance_from_home, axis=1)


def calculate_fraudrates(df):
    # encode city
    grouped_transactions = df.groupby("city")
    total_transactions = grouped_transactions.size()
    fraud_transactions = grouped_transactions["isFraud"].sum()
    fraud_rate = (fraud_transactions / total_transactions) * 100
    result_dict = fraud_rate.to_dict()

    df["city_fraudrate"] = df["city"].map(result_dict)
    df.drop(columns=["city"], inplace=True)

    # encode job
    grouped_transactions = df.groupby("job")
    total_transactions = grouped_transactions.size()
    fraud_transactions = grouped_transactions["isFraud"].sum()
    fraud_rate = (fraud_transactions / total_transactions) * 100
    result_dict = fraud_rate.to_dict()

    df["job_fraudrate"] = df["job"].map(result_dict)
    df.drop(columns=["job"], inplace=True)


def determine_history(df):
    df["historyOfFraud"] = df["creditCardNum"].duplicated(keep=False) & df["isFraud"]


def process_dates(df):
    df["trans_day"] = df["transDate"].dt.dayofyear
    df["trans_weekday"] = df["transDate"].dt.weekday
    df["trans_hour"] = df["transDate"].dt.hour
    df["age_at_transaction"] = df["transDate"].dt.year - df["dateOfBirth"].dt.year

In [4]:
def drop_columns(df):
    new_df = df.drop(
        columns=[
            "creditCardNum",
            "business",
            "firstName",
            "lastName",
            "gender",
            "street",
            "zip",
            "state",
            "transNum",
            "merchLatitude",
            "merchLongitude",
            "latitude",
            "longitude",
            "dateOfBirth",
            "transDate",
        ]
    )
    return new_df

In [6]:
def process_dataset(df):
    new_df = df.copy()

    process_datatypes(new_df)
    calcuate_distances(new_df)
    calculate_fraudrates(new_df)
    determine_history(new_df)
    process_dates(new_df)

    return new_df


df_train = process_dataset(df)
df_train = drop_columns(df_train)

  grouped_transactions = df.groupby("city")
  grouped_transactions = df.groupby("job")


In [7]:
# encode city
grouped_transactions = df.groupby("city")
total_transactions = grouped_transactions.size()
fraud_transactions = grouped_transactions["isFraud"].sum()
fraud_rate = (fraud_transactions / total_transactions) * 100
city_result_dict = fraud_rate.to_dict()


# encode job
grouped_transactions = df.groupby("job")
total_transactions = grouped_transactions.size()
fraud_transactions = grouped_transactions["isFraud"].sum()
fraud_rate = (fraud_transactions / total_transactions) * 100
job_result_dict = fraud_rate.to_dict()

In [9]:
import json

with open("city_fraudrates.json", "w") as file:
    json.dump(city_result_dict, file)

with open("job_fraudrates.json", "w") as file:
    json.dump(job_result_dict, file)

with open("city_fraudrates.json", "r") as file:
    data = json.load(file)
with open("job_fraudrates.json", "r") as file:
    data = json.load(file)

df["city_fraudrate"] = df["city"].map(result_dict)
df.drop(columns=["city"], inplace=True)

In [None]:
def encode_categorical(df):
    df = pd.get_dummies(df, columns=["category"])
    return df


df_train = encode_categorical(df_train)

In [None]:
# OLD MODEL

from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split

model_data = pd.read_pickle("model_data.pkl")
model_data.drop(columns=["state"], inplace=True)

rus = RandomUnderSampler(sampling_strategy=0.5)
X = model_data.drop(columns=["isFraud"])
Y = model_data["isFraud"]
X_rus, y_rus = rus.fit_resample(X, Y)
y_rus.value_counts()

X_train, X_test, y_train, y_test = train_test_split(
    X_rus, y_rus, test_size=0.2, random_state=42, stratify=y_rus
)

rtc = RandomForestClassifier(
    max_features=0.4,
    max_depth=None,
    min_samples_leaf=2,
    criterion="gini",
    random_state=42,
    n_estimators=1000,
    n_jobs=-1,
)

rtc.fit(X_train, y_train)

y_pred = rtc.predict_proba(X_test)[:, 1] >= 0.479
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9750623441396509
Precision: 0.9490909090909091
Recall: 0.9775280898876404
F1 Score: 0.9630996309963098
Confusion Matrix:
 [[521  14]
 [  6 261]]


In [None]:
# NEW TEST SET

test_set = df_train.drop(columns=["isFraud"])
y_true = df_train["isFraud"]

y_pred = rtc.predict_proba(test_set)[:, 1] >= 0.479
print("Accuracy:", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred))
print("Recall:", recall_score(y_true, y_pred))
print("F1 Score:", f1_score(y_true, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

Accuracy: 0.9997606704107651
Precision: 1.0
Recall: 0.937995337995338
F1 Score: 0.9680057733942747
Confusion Matrix:
 [[553574      0]
 [   133   2012]]
