In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Import Modules
from pprint import pprint
import json
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import re

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import os
os.getcwd()

'/home/jovyan/work/Fraud_Detection'

In [3]:
# Import Custom Modules
from src.cleaner import *

In [4]:
# Load the raw dataset from the project-relative JSON file into a DataFrame
data = pd.read_json('data/data.json')

In [8]:
# Clean the raw frame. clean_with_target is not defined in this notebook;
# presumably it arrives via the star import of src.cleaner — TODO confirm.
cleaned_data = clean_with_target(data)

# Preview the first two cleaned rows
cleaned_data.head(2)

Unnamed: 0,channels,country,currency,delivery_method,email_domain,event_start,fb_published,has_logo,listed,payee_name,previous_payouts,user_type,venue_country,venue_latitude,venue_longitude,fraud
0,5,US,USD,0.0,gmail.com,1265594400,0,0,y,,[],1,US,25.777471,-80.133433,1
1,0,US,USD,1.0,ruf.org,1296255600,0,1,n,RUF,"[{'name': 'RUF', 'created': '2010-10-01 01:10:...",3,US,32.776566,-79.930922,0


# Featurize for KNN Model

In [9]:
def readd_features(data:pd.DataFrame, data2:pd.DataFrame, columns=None) -> pd.DataFrame:
    """ Adds wanted features back from original dataframe

    Parameters
    ----------
    data : DataFrame to extend. It is copied, so the caller's frame is not modified.
    data2 : DataFrame to take the columns from. It is only read.
    columns : optional iterable of column names to copy from ``data2``.
        Defaults to 'event_created', 'event_end' and 'event_published'.

    Returns
    -------
    A copy of ``data`` with the requested columns added (or overwritten).
    Values are aligned on the index, as with any Series-to-column assignment.

    Raises
    ------
    KeyError : if any requested column is missing from ``data2``.
    """
    if columns is None:
        columns = ('event_created', 'event_end', 'event_published')
    columns = list(columns)

    # Fail before copying anything, and report every missing column at once
    missing = [col for col in columns if col not in data2.columns]
    if missing:
        raise KeyError(f"columns not found in source frame: {missing}")

    res = data.copy()
    # Column-by-column assignment keeps the index alignment of the original
    # implementation without duplicating the whole source frame first.
    for col in columns:
        res[col] = data2[col]

    return res

def create_features(data:pd.DataFrame) -> pd.DataFrame:
    """ Creates features from given data geared towards KNN Model

    Works on a copy of ``data``; the caller's frame is not modified.

    Steps:
      * 'listed' is encoded as an integer: 'y' -> 0, anything else -> 1.
      * 'previous_payouts' is replaced by 'n_previous_payouts', the length
        of each entry (0 when the entry has no length, e.g. None/NaN).
      * missing 'country' values become the string 'None'.
      * 'user_age' and the venue_* columns are dropped when present.

    Requires the columns 'listed', 'previous_payouts' and 'country';
    a KeyError is raised if one of them is absent.
    """
    res = data.copy()

    # 'listed' col to binary.
    # NOTE(review): 'y' maps to 0 and everything else to 1, which reads as
    # inverted; kept as-is so existing feature values do not change — confirm.
    res['listed'] = (res['listed'] != 'y').astype(int)

    # 'previous_payouts' to n_previous_payouts; entries without a length
    # (None / NaN) count as zero payouts instead of raising TypeError.
    res['n_previous_payouts'] = res['previous_payouts'].apply(
        lambda payouts: len(payouts) if hasattr(payouts, '__len__') else 0
    )

    # 'country' feature to 'None' instead of nan
    res['country'] = res['country'].fillna('None')

    # The original zeroed user_age values >= 100 and then dropped the column
    # on the next line, so the cleaning never reached the output; the column
    # is simply dropped here. errors='ignore' lets the function run on frames
    # that lack some of these columns.
    to_drop = [
        'previous_payouts', 'user_age',
        'venue_country', 'venue_latitude', 'venue_longitude',
        'venue_name', 'venue_state',
    ]
    return res.drop(columns=to_drop, errors='ignore')
    
    
    

In [11]:
# Re-attach the event_* columns from the raw frame. The result is rebound to
# cleaned_data; re-running this cell only overwrites the same three columns.
cleaned_data = readd_features(cleaned_data, data)

In [12]:
# Display the frame after the event_* columns were re-added
cleaned_data

Unnamed: 0,channels,country,currency,delivery_method,email_domain,event_start,fb_published,has_logo,listed,payee_name,previous_payouts,user_type,venue_country,venue_latitude,venue_longitude,fraud,event_created,event_end,event_published
0,5,US,USD,0.0,gmail.com,1265594400,0,0,y,,[],1,US,25.777471,-80.133433,1,1262739706,1265630400,1.263110e+09
1,0,US,USD,1.0,ruf.org,1296255600,0,1,n,RUF,"[{'name': 'RUF', 'created': '2010-10-01 01:10:...",3,US,32.776566,-79.930922,0,1293832670,1296288000,1.293833e+09
2,8,US,USD,1.0,pvsd.k12.ca.us,1295713800,0,0,y,University Preparation School,"[{'name': 'Danielle Severn', 'created': '2010-...",3,US,33.944201,-118.080419,0,1291090956,1295740800,1.291092e+09
3,6,IE,EUR,1.0,irishtabletennis.com,1360702800,0,1,y,,"[{'name': '', 'created': '2010-11-09 01:10:15'...",3,,,,0,1360681570,1388534400,1.360683e+09
4,11,US,USD,0.0,artsandbusinesscouncil.org,1297440000,1,0,y,Arts and Business Council or Greater Boston,[{'name': 'Arts and Business Council or Greate...,3,US,42.353848,-71.044276,0,1291994666,1297468800,1.291995e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14332,0,US,USD,0.0,yahoo.com,1361232000,0,1,n,,[],1,US,39.373780,-76.629921,1,1360297993,1361257200,1.360299e+09
14333,5,US,USD,1.0,me.com,1365123600,0,0,y,"ARCS Foundation Inc., San Diego Chapter","[{'name': 'ARCS Foundation Inc., San Diego Cha...",4,US,32.778906,-117.209791,0,1360367042,1365134400,1.360370e+09
14334,13,,USD,0.0,yahoo.com,1368327600,1,1,y,,"[{'name': '', 'created': '2013-05-16 03:26:11'...",4,US,30.041819,-89.957130,0,1360600330,1368349200,1.361337e+09
14335,8,US,USD,0.0,velvetlist.com,1360890000,0,1,y,,"[{'name': '', 'created': '2010-11-30 01:11:30'...",3,US,40.862283,-73.911363,1,1360376285,1360929600,1.360377e+09


# Check API Data

In [7]:
from src.api_client import *

# Client used for the sample fetch below; EventAPIClient presumably comes
# from the star import of src.api_client — TODO confirm.
event_api = EventAPIClient()

def _call(client:object) -> pd.DataFrame:
    return client.get_data()

# Fetch one sample payload through the _call wrapper
sample_call = _call(event_api)

# Uncomment to inspect the first fetched record:
# pprint(sample_call[0])

# Wrap the fetched payload in a DataFrame for inspection
sample_df = pd.DataFrame(sample_call)