In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Import Modules
from pprint import pprint
import json
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import re

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.patches as patches
import mplcyberpunk
import seaborn as sns
import scipy as sp
import scipy.stats as stats
from typing import List, Dict, Callable

# plt.style.use('ggplot')
plt.style.use("cyberpunk")

import os
os.getcwd()

'/home/jovyan/work/Fraud_Detection'

In [3]:
# Import Custom Modules
from src.cleaner import *

In [None]:
# Load the raw dataset from the project-relative JSON file
data = pd.read_json(path_or_buf='data/data.json')

In [None]:
# Clean Data
# Hand the cleaner a copy so the raw `data` frame cannot be modified by it; the
# KNN section re-cleans from `data` and takes the same precaution.
cleaned_data = clean_with_target(data.copy())

# Preview Data
cleaned_data.head(2)

# Featurize for Visualization / KNN Model

In [None]:
def re_add_features(data:pd.DataFrame, data2:pd.DataFrame,
                    columns=('event_created', 'event_end', 'event_published', 'user_created')) -> pd.DataFrame:
    """Return a copy of ``data`` with selected columns copied over from ``data2``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to extend. It is copied, never modified.
    data2 : pd.DataFrame
        Frame to take the columns from. It is only read, so no copy is made.
    columns : iterable of str, optional
        Names of the columns to copy. Defaults to the four timestamp columns
        this function has always restored.

    Returns
    -------
    pd.DataFrame
        Copy of ``data`` with each requested column added (or overwritten).

    Raises
    ------
    KeyError
        If ``data2`` lacks any of the requested columns.

    Notes
    -----
    Column assignment aligns on the index: rows of ``data`` whose index label
    is absent from ``data2`` receive NaN in the copied columns.
    """
    columns = list(columns)

    # Fail once, up front, naming every missing column instead of only the first.
    missing = [col for col in columns if col not in data2.columns]
    if missing:
        raise KeyError(f"data2 is missing column(s): {missing}")

    res = data.copy()
    for col in columns:
        res[col] = data2[col]

    return res

def create_features(data:pd.DataFrame) -> pd.DataFrame:
    """Derive model-oriented features (geared towards the KNN model).

    Works on a copy of ``data``; the input frame is left untouched.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the columns ``listed``, ``country``, ``event_start``,
        ``event_end``, ``event_created``, ``event_published`` and
        ``user_created``.

    Returns
    -------
    pd.DataFrame
        Copy of ``data`` in which

        * ``listed`` is an integer flag: ``'y'`` -> 0, anything else -> 1;
        * missing ``country`` values are replaced by the string ``'None'``;
        * ``event_duration``, ``event_till_publish`` and
          ``user_event_lifespan`` hold the differences between the
          corresponding timestamp columns;
        * the four ``event_*`` timestamp columns are removed
          (``user_created`` is kept).

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    res = data.copy()

    # Vectorised equivalent of mapping 'y' -> 0 and every other value -> 1.
    # NOTE(review): 'y' -> 0 is the reverse of the usual "yes = 1" encoding;
    # kept as-is to preserve existing behaviour — confirm it is intended.
    res['listed'] = (res['listed'] != 'y').astype('int64')

    # Missing country becomes the literal string 'None', not a NaN.
    res['country'] = res['country'].fillna('None')

    # Durations between the event / user timestamps.
    res['event_duration'] = res['event_end'] - res['event_start']
    res['event_till_publish'] = res['event_published'] - res['event_created']
    res['user_event_lifespan'] = res['event_created'] - res['user_created']

    # The raw event timestamps are dropped now that the durations exist.
    return res.drop(columns=['event_created', 'event_published', 'event_start', 'event_end'])

In [None]:
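# NOTE(review): this step is disabled, yet the first density plot below reads
# `user_event_lifespan`, which in this notebook is only derived by
# `create_features`. Confirm that `clean_with_target` already supplies that
# column, or re-enable the two calls below.
# # Re-add Missing Features
# cleaned_data = re_add_features(cleaned_data, data)
# # Final Cleaning
# cleaned_data = create_features(cleaned_data)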

In [None]:
# Show the first two rows of the cleaned frame again before plotting
cleaned_data.head(2)

# Visualizations

In [None]:
# Module-level per-class views of the cleaned data, intended for the visualisations:
# rows flagged as fraud, rows flagged as legitimate, and a groupby over the flag.
fraud = cleaned_data.loc[cleaned_data['fraud'].eq(1)]
legit = cleaned_data.loc[cleaned_data['fraud'].eq(0)]
event_groups = cleaned_data.groupby(by='fraud')

In [None]:
# Density of `user_event_lifespan`, one shaded curve per fraud class

alpha_col = 0.7
colors = ["dodgerblue", "red"]
classification = sorted(cleaned_data["fraud"].unique())

# Open the figure the curves are drawn into
plt.figure(figsize=(14, 7), dpi=80)

# Colours are paired with the sorted class labels (first colour -> lowest label).
# NOTE(review): `shade=` is deprecated in newer seaborn releases in favour of
# `fill=`; switch once the installed seaborn version is confirmed.
for col, class_ in zip(colors, classification):
    sns.kdeplot(
        cleaned_data.loc[cleaned_data['fraud'] == class_, "user_event_lifespan"],
        shade=True,
        color=col,
        label=f"Fraud={class_}",
        alpha=alpha_col,
    )

# Title, axis labels and legend
plt.title('\nDensity Plot of User Lifespan Till Event Creation by Class\n', fontsize=22)
plt.xlabel("Duration (s)\n", fontsize=25)
plt.ylabel("Density\n", fontsize=25)
legend_ = plt.legend(
    loc="upper right",
    bbox_to_anchor=(0.45, 0.5, 0.5, 0.5),
    fontsize=20,
    labelspacing=0.6,
)

# Cyberpunk styling applied to the current axes.
# NOTE(review): `add_glow_effects()` is described upstream as already adding the
# underglow — confirm the extra `add_underglow()` call is intentional.
mplcyberpunk.add_glow_effects()
mplcyberpunk.add_underglow()

# Thicken the legend's line samples so they stay readable at this font size
for line in legend_.get_lines():
    line.set_linewidth(6)

# Stretch the axes over the figure area, then render
plt.subplots_adjust(left=0, bottom=0, right=1.2, top=1, wspace=1, hspace=0)
plt.show()

In [None]:
# Density of (scaled) `n_previous_payouts`, one shaded curve per fraud class

alpha_col = 0.7
# Same class -> colour mapping as the lifespan plot: lowest label (0) blue, fraud (1) red.
colors = ["dodgerblue", "red"]

# NOTE(review): the origin of the 2048 divisor is not documented anywhere in this
# notebook; it is named here only so the axis label can state the scaling.
PAYOUT_SCALE = 2048

# Scale just the plotted column instead of copying the whole frame.
payouts_scaled = cleaned_data['n_previous_payouts'] / PAYOUT_SCALE
payout_classes = sorted(cleaned_data["fraud"].unique())

# Draw Plot
plt.figure(figsize=(14, 7), dpi=80)

# NOTE(review): `shade=` is deprecated in newer seaborn releases in favour of
# `fill=`; switch once the installed seaborn version is confirmed.
for col, class_ in zip(colors, payout_classes):
    sns.kdeplot(
        payouts_scaled[cleaned_data['fraud'] == class_],
        shade=True,
        color=col,
        label=f"Fraud={class_}",
        alpha=alpha_col,
    )

# Decoration — the x axis is a scaled payout count, not a duration
plt.title('\nDensity Plot of n_previous_payouts by Class\n', fontsize=22)
plt.xlabel(f"n_previous_payouts / {PAYOUT_SCALE}\n", fontsize=25)
plt.ylabel("Density\n", fontsize=25)
legend_ = plt.legend(
    loc="upper right",
    bbox_to_anchor=(0.45, 0.5, 0.5, 0.5),
    fontsize=20,
    labelspacing=0.6,
)

# NOTE(review): `add_glow_effects()` is described upstream as already adding the
# underglow — confirm the extra `add_underglow()` call is intentional.
mplcyberpunk.add_glow_effects()
mplcyberpunk.add_underglow()

# Increase Label Line Thickness
for line in legend_.get_lines():
    line.set_linewidth(6)

# Show Plot
plt.subplots_adjust(left=0, bottom=0, right=1.2, top=1, wspace=1, hspace=0)
plt.show()

# KNN Modeling

In [None]:
# scikit-learn pieces used by the KNN section
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Clean a private copy of the raw frame (so `data` itself is not touched), then
# run the project's `ohe_existence` encoder over the two listed columns.
df = ohe_existence(
    clean_with_target(data.copy()),
    ['email_domain', 'payee_name'],
)
# df.drop(columns=['venue_latitude', 'venue_longitude'], inplace=True)

# Discard every row that still contains a NaN
df.dropna(inplace=True)

In [None]:
# Create X, y targets
# Select instead of `df.pop(...)`: popping removed 'fraud' from `df` in place, so
# running this cell a second time failed with a KeyError.
y = df['fraud']
X = df.drop(columns='fraud')

# Train Test Split Data
# A fixed seed makes the split — and the score computed from it — reproducible.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=RANDOM_STATE
)

In [None]:
# Print "<column> <count>" for every column of `df` that still holds NaN values
na_counts = df.isna().sum()
for col, nas in na_counts[na_counts > 0].items():
    print(col, nas)

In [None]:
# Inspect the modelling frame — a preview only, not the whole table
df.head()

In [None]:
# 5-nearest-neighbour classifier, using 4 parallel jobs
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=4)
# Train on the training split (the fitted estimator is the cell's output)
knn.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the fitted classifier on the held-out split via its `score` method
score = knn.score(X=X_test, y=y_test)
print(score)

# Check API Data

In [None]:
from src.api_client import *


def _call(client:object) -> pd.DataFrame:
    """Return whatever ``client.get_data()`` yields.

    NOTE(review): the result is passed to ``pd.DataFrame(...)`` right after the
    call, so the declared return type may not be accurate — confirm against
    the client's ``get_data``.
    """
    payload = client.get_data()
    return payload


# One client, one sample request
event_api = EventAPIClient()
sample_call = _call(event_api)

# pprint(sample_call[0])

# Tabular view of the sample payload
sample_df = pd.DataFrame(data=sample_call)

In [None]:
# (rows, features) of the training matrix the classifier was fitted on
X_train.shape

In [None]:
# Shape of the API sample before any cleaning
sample_df.shape

In [None]:
# Run the API sample through `clean_row`.
# NOTE(review): the meaning of the positional `False` is not visible from this
# notebook — confirm it against `clean_row`'s signature and consider passing it by keyword.
test = clean_row(sample_df, False)
# The column count here has to equal the feature count of `X_train` for the
# classifier to accept `test`.
test.shape

In [None]:
# Predicted class label for each cleaned API row
test_pred = knn.predict(test)

In [None]:
# Per-class probability estimates for each cleaned API row
test_proba = knn.predict_proba(test)

In [None]:
# Class labels known to the fitted classifier; the columns of `predict_proba` follow this order
classes = knn.classes_

In [None]:
# Display predictions, probabilities and the class order side by side
test_pred, test_proba, classes