In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import  RandomForestClassifier
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

import sys
sys.path.append("../")
from src import cleaner, model
from src.api_client import EventAPIClient

In [2]:
# Import data
data = pd.read_json('../data/data.json')

In [3]:
# Clean data and alter columns
cleaned_data = cleaner.clean_with_target(data)
cleaned_data

Unnamed: 0,channels,country,currency,delivery_method,email_domain,event_start,fb_published,has_logo,listed,payee_name,payout_type,user_type,venue_country,venue_latitude,venue_longitude,fraud,n_previous_payouts
0,5,US,USD,0.0,gmail.com,1265594400,0,0,y,,,1,US,25.777471,-80.133433,1,0
1,0,US,USD,1.0,ruf.org,1296255600,0,1,n,RUF,CHECK,3,US,32.776566,-79.930922,0,49
2,8,US,USD,1.0,pvsd.k12.ca.us,1295713800,0,0,y,University Preparation School,CHECK,3,US,33.944201,-118.080419,0,36
3,6,IE,EUR,1.0,irishtabletennis.com,1360702800,0,1,y,,ACH,3,,,,0,21
4,11,US,USD,0.0,artsandbusinesscouncil.org,1297440000,1,0,y,Arts and Business Council or Greater Boston,CHECK,3,US,42.353848,-71.044276,0,49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14332,0,US,USD,0.0,yahoo.com,1361232000,0,1,n,,,1,US,39.373780,-76.629921,1,0
14333,5,US,USD,1.0,me.com,1365123600,0,0,y,"ARCS Foundation Inc., San Diego Chapter",CHECK,4,US,32.778906,-117.209791,0,5
14334,13,,USD,0.0,yahoo.com,1368327600,1,1,y,,ACH,4,US,30.041819,-89.957130,0,1
14335,8,US,USD,0.0,velvetlist.com,1360890000,0,1,y,,ACH,3,US,40.862283,-73.911363,0,128


In [7]:
client = EventAPIClient()
new_data = client.get_data()
new_data
len(new_data[0]['previous_payouts'])

0

In [18]:
model.make_model(cleaned_data)

In [21]:
model.nuclear_option(new_data)

(1, 0.8109616514032922)

In [111]:
(cleaned_data['country'] == 'None').sum()
(cleaned_data['country'] == '').sum()
#cleaned_data.replace(to_replace='', value='None', inplace=True)
#cleaned_data['country'] = cleaned_data['country'].apply(lambda x: 'None' if x == '')

In [112]:
cleaned_data.head()

Unnamed: 0,channels,country,currency,delivery_method,email_domain,event_start,fb_published,has_logo,listed,payee_name,payout_type,user_type,venue_country,venue_latitude,venue_longitude,fraud,n_previous_payouts
0,5,US,USD,0,gmail.com,1265594400,0,0,y,,,1,US,25.777471,-80.133433,1,0
1,0,US,USD,1,ruf.org,1296255600,0,1,n,RUF,CHECK,3,US,32.776566,-79.930922,0,49
2,8,US,USD,1,pvsd.k12.ca.us,1295713800,0,0,y,University Preparation School,CHECK,3,US,33.944201,-118.080419,0,36
3,6,IE,EUR,1,irishtabletennis.com,1360702800,0,1,y,,ACH,3,,,,0,21
4,11,US,USD,0,artsandbusinesscouncil.org,1297440000,1,0,y,Arts and Business Council or Greater Boston,CHECK,3,US,42.353848,-71.044276,0,49


# Set up Random Forest

In [141]:
cleaned_data.columns

Index(['channels', 'country', 'currency', 'delivery_method', 'email_domain',
       'event_start', 'fb_published', 'has_logo', 'listed', 'payee_name',
       'payout_type', 'user_type', 'venue_country', 'venue_latitude',
       'venue_longitude', 'fraud', 'n_previous_payouts'],
      dtype='object')

In [136]:
def check_none(df):
    cols = df.columns
    for col in cols:
        print(col, (df[col] == 'None').sum())

In [149]:
# Limit columns for first round of modeling
to_keep = ['channels', 'fb_published', 'has_logo',  
            'user_type', 'fraud', 'n_previous_payouts']
model_df = cleaned_data[to_keep]
model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14337 entries, 0 to 14336
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   channels            14337 non-null  int64
 1   fb_published        14337 non-null  int64
 2   has_logo            14337 non-null  int64
 3   user_type           14337 non-null  int64
 4   fraud               14337 non-null  int64
 5   n_previous_payouts  14337 non-null  int64
dtypes: int64(6)
memory usage: 672.2 KB


In [150]:
# Check to make sure no nan in dataframe
model_df = model_df.dropna()
model_df.isna().sum()

channels              0
fb_published          0
has_logo              0
user_type             0
fraud                 0
n_previous_payouts    0
dtype: int64

In [151]:
y = model_df.pop('fraud')
X = model_df.copy()

In [152]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [154]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [158]:
y_hat = model.predict(X_test)
y_hat_proba = model.predict_proba(X_test)

In [159]:
model.score(X_test, y_test)

0.9308228730822873

In [160]:
model.feature_importances_

array([0.06595923, 0.01019287, 0.01659615, 0.05318978, 0.85406197])

In [166]:
y_hat_proba[0], model.classes_

(array([1., 0.]), array([0, 1]))

In [187]:
model.predict_proba(np.array(X_test.iloc[3]).reshape(1,-1))

array([[0.37, 0.63]])