#### Import modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import cPickle as pickle
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation

#### Load Data In

In [69]:
# Read data.json into the working DataFrame used by every cell below.
df_fraud = pd.read_json(path_or_buf='data.json')

#### Convert the 'acct_type' column into a 'fraud' column with 1 for fraud and 0 for non-fraud events

In [3]:
# Binary target: the three 'fraudster*' account types are labelled 1 and every
# other listed account type is labelled 0.
fraud = dict.fromkeys(
    ('premium', 'spammer_limited', 'spammer_warn', 'tos_warn',
     'spammer_noinvite', 'tos_lock', 'locked', 'spammer_web', 'spammer'),
    0)
fraud.update(dict.fromkeys(('fraudster_event', 'fraudster', 'fraudster_att'), 1))
# Series.map yields NaN for any acct_type that is not a key of `fraud`.
df_fraud['fraud'] = df_fraud['acct_type'].map(fraud)

#### Create an 'event_duration' column which is the difference between the event start and end times

In [4]:
# (event_end - event_start) / 3600: hours, assuming both columns are epoch
# seconds — TODO confirm against the raw data.
df_fraud['event_duration'] = df_fraud['event_end'].sub(df_fraud['event_start']).div(3600.)

#### Create a list of the top 30 most common domains and classify all other domains as 'other'

In [5]:
# value_counts() sorts by descending frequency, so the first 30 index entries
# are the 30 most common e-mail domains (a [0:31] slice would keep 31).
top_domains = set(df_fraud.email_domain.value_counts().index[:30])

# Collapse every other domain into a single 'other' bucket. A boolean mask with
# .loc does not depend on the frame having a 0..n-1 index (unlike pairing an
# enumerate() position with a label lookup) and avoids the deprecated .ix.
df_fraud.loc[~df_fraud['email_domain'].isin(list(top_domains)), 'email_domain'] = 'other'

I noticed that a lot of 'payee_name' values were missing and that most of the rows with a missing 'payee_name' corresponded to fraud events, so I recoded 'payee_name' as 1 if a name was given and 0 if no name was given. The 'payout_type' column was also missing entries, and I set the 'payout_type' for these samples to 'not_available'.

In [6]:
# Recode payee_name as a float flag: 1.0 when a non-empty name is present,
# 0.0 otherwise. .str.len() returns NaN for missing (None/NaN) entries, which
# fillna(0) treats the same as an empty string instead of raising on len().
df_fraud['payee_name'] = (
    df_fraud['payee_name'].str.len().fillna(0) > 0
).astype(float)

# Empty or missing payout types get an explicit 'not_available' category so the
# column can be label-encoded later.
df_fraud.loc[
    df_fraud['payout_type'].str.len().fillna(0) == 0, 'payout_type'
] = 'not_available'

The 'ticket_types' column was made up of dictionaries giving details about the different ticket types ordered for each event. Each ticket type dictionary had information on cost, quantity sold and total quantity available. From these dictionaries, I added columns to our dataframe to represent the count of ticket types offered, minimum price, maximum price, total quantity sold, total quantity of tickets available, value of sold tickets, total value of tickets available and a weighted price.

The 'previous_payouts' column was also made up of dictionaries with information on previous events created by the user. From these dictionaries, I added columns to our dataframe to represent the count of previous events created by the user and the average payout to the user per event.

In [7]:
def _event_features(ticket_types, previous_payouts):
    """Return the ten engineered ticket/payout features for one event.

    Parameters
    ----------
    ticket_types : list of dict
        Each dict is read for its 'cost', 'quantity_sold' and
        'quantity_total' keys.
    previous_payouts : list of dict
        Each dict is read for its 'amount' key.

    Returns
    -------
    list
        [types_count, min_price, max_price, quantity_sold, quantity_total,
         value_sold, value_total, payout_count, avg_payout, weighted_price].
        Every aggregate is 0 when the corresponding list is empty.
    """
    costs = [ticket['cost'] for ticket in ticket_types]
    sold = [ticket['quantity_sold'] for ticket in ticket_types]
    available = [ticket['quantity_total'] for ticket in ticket_types]

    quantity_total = sum(available)
    value_total = sum(qty * cost for qty, cost in zip(available, costs))

    payout_count = len(previous_payouts)
    # float() prevents floor division under Python 2 when amounts are integers;
    # the explicit emptiness checks replace a bare `except:` around the division.
    if payout_count:
        avg_payout = sum(payout['amount'] for payout in previous_payouts) / float(payout_count)
    else:
        avg_payout = 0
    weighted_price = value_total / float(quantity_total) if quantity_total else 0

    return [
        len(ticket_types),
        min(costs) if costs else 0,
        max(costs) if costs else 0,
        sum(sold),
        quantity_total,
        sum(qty * cost for qty, cost in zip(sold, costs)),
        value_total,
        payout_count,
        avg_payout,
        weighted_price,
    ]


# Column names in the order they are appended to df_fraud; the order matters
# because the model later consumes df_fraud.values positionally.
feature_names = ['types_count', 'min_price', 'max_price', 'quantity_sold',
                 'quantity_total', 'value_sold', 'value_total',
                 'payout_count', 'avg_payout', 'weighted_price']

# Build all features in one pass and attach them column-wise, instead of
# writing single cells through the deprecated .ix indexer.
feature_table = pd.DataFrame(
    [_event_features(tickets, payouts)
     for tickets, payouts in zip(df_fraud['ticket_types'], df_fraud['previous_payouts'])],
    columns=feature_names, dtype=float)
for feature_name in feature_names:
    # .values assigns positionally, so the result does not depend on the index.
    df_fraud[feature_name] = feature_table[feature_name].values

In [8]:
# Discard the original label column, the raw timestamp/venue/user fields and
# the nested ticket/payout columns; none of them is kept as a model input.
df_fraud = df_fraud.drop(
    labels=[
        'acct_type', 'event_end', 'event_start', 'object_id',
        'venue_name', 'venue_address', 'user_created',
        'venue_latitude', 'venue_longitude',
        'ticket_types', 'previous_payouts',
    ],
    axis=1,
)

In [9]:
# Integer-encode each categorical column and remember the fitted encoder per
# column in `encode`, so the same mapping can be re-applied later.
encode = {}
for column in ('country', 'currency', 'email_domain', 'listed', 'payout_type',
               'venue_country', 'venue_state'):
    le = LabelEncoder()
    df_fraud[column] = le.fit_transform(df_fraud[column])
    encode[column] = le

In [10]:
def tokenize(doc):
    """Convert an HTML/text document into a list of lemmatized lower-case tokens.

    The markup is stripped with BeautifulSoup, the remaining text is
    lower-cased and split on whitespace, tokens that are a single punctuation
    character, one of the quote markers `` / '' or an English stop word are
    dropped, and every surviving token is WordNet-lemmatized.

    Parameters
    ----------
    doc : unicode/str
        Raw document; non-ASCII characters are discarded before parsing.

    Returns
    -------
    list
        Lemmatized tokens in document order.
    """
    # Name the parser explicitly: without it bs4 picks the "best" installed one
    # and emits a warning on every call. The warning recorded in this notebook
    # recommends "lxml", i.e. that is the parser that was being selected.
    document = BeautifulSoup(doc.encode('ascii', 'ignore'), 'lxml').text
    wordnet_lemmatizer = WordNetLemmatizer()

    # One set for punctuation and stop words: O(1) membership tests instead of
    # scanning the stop-word list for every token.
    ignored = set(punctuation)
    ignored.update(('``', "''"))
    ignored.update(stopwords.words('english'))

    return [wordnet_lemmatizer.lemmatize(word)
            for word in document.lower().split()
            if word not in ignored]

In [11]:
# SECURITY: pickle.load can execute arbitrary code contained in the file; only
# load artefacts that were produced by this project.
# Pickles are byte streams, so the files are opened in binary mode ('rb').
with open('pickle/tfidf-description.pkl', 'rb') as f:
    tfidf_description = pickle.load(f)
with open('pickle/rfmodel-description.pkl', 'rb') as f:
    rf_description = pickle.load(f)
# Replace the raw text with column 1 of predict_proba (presumably the fraud
# class — confirm against rf_description.classes_).
df_fraud['description'] = rf_description.predict_proba(
    tfidf_description.transform(df_fraud['description']))[:, 1]



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [12]:
# SECURITY: pickle.load can execute arbitrary code contained in the file; only
# load artefacts that were produced by this project.
# Pickles are byte streams, so the files are opened in binary mode ('rb').
with open('pickle/tfidf-name.pkl', 'rb') as f:
    tfidf_name = pickle.load(f)
with open('pickle/rfmodel-name.pkl', 'rb') as f:
    rf_name = pickle.load(f)
# Replace the raw text with column 1 of predict_proba (presumably the fraud
# class — confirm against rf_name.classes_).
df_fraud['name'] = rf_name.predict_proba(
    tfidf_name.transform(df_fraud['name']))[:, 1]

In [13]:
# SECURITY: pickle.load can execute arbitrary code contained in the file; only
# load artefacts that were produced by this project.
# Pickles are byte streams, so the files are opened in binary mode ('rb').
with open('pickle/tfidf-org_name.pkl', 'rb') as f:
    tfidf_org_name = pickle.load(f)
with open('pickle/rfmodel-org_name.pkl', 'rb') as f:
    rf_org_name = pickle.load(f)
# Replace the raw text with column 1 of predict_proba (presumably the fraud
# class — confirm against rf_org_name.classes_).
df_fraud['org_name'] = rf_org_name.predict_proba(
    tfidf_org_name.transform(df_fraud['org_name']))[:, 1]

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [14]:
# SECURITY: pickle.load can execute arbitrary code contained in the file; only
# load artefacts that were produced by this project.
# Pickles are byte streams, so the files are opened in binary mode ('rb').
with open('pickle/tfidf-org_desc.pkl', 'rb') as f:
    tfidf_org_desc = pickle.load(f)
with open('pickle/rfmodel-org_desc.pkl', 'rb') as f:
    rf_org_desc = pickle.load(f)
# Replace the raw text with column 1 of predict_proba (presumably the fraud
# class — confirm against rf_org_desc.classes_).
df_fraud['org_desc'] = rf_org_desc.predict_proba(
    tfidf_org_desc.transform(df_fraud['org_desc']))[:, 1]

  ' that document to Beautiful Soup.' % decoded_markup


In [63]:
# Persist the fitted preprocessing state (kept domain set and per-column label
# encoders) so the same transformations can be re-applied outside this session.
for pickle_path, payload in (('pickle/top_domains.pkl', top_domains),
                             ('pickle/encode.pkl', encode)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

In [30]:
# Order matters: pop() removes the 'fraud' column from df_fraud in place, so
# the feature matrix taken on the next line holds only the remaining columns.
# Re-running this cell fails because 'fraud' is no longer in the frame.
y_data = df_fraud.pop('fraud').values
X_data = df_fraud.values

In [53]:
# Engineered ticket/payout columns, in the order they are appended to the
# frame (the returned matrix is positional, so the order is significant).
_ENGINEERED_COLUMNS = ('types_count', 'min_price', 'max_price', 'quantity_sold',
                       'quantity_total', 'value_sold', 'value_total',
                       'payout_count', 'avg_payout', 'weighted_price')
# Raw columns removed before encoding.
_DROPPED_COLUMNS = ('acct_type', 'event_end', 'event_start', 'object_id',
                    'venue_name', 'venue_address', 'user_created',
                    'venue_latitude', 'venue_longitude', 'ticket_types',
                    'previous_payouts')
# Categorical columns transformed with the pickled label encoders.
_ENCODED_COLUMNS = ('country', 'currency', 'email_domain', 'listed',
                    'payout_type', 'venue_country', 'venue_state')
# Free-text columns replaced by a per-column model probability.
_TEXT_COLUMNS = ('description', 'name', 'org_name', 'org_desc')


def _load_pickle(path):
    """Return the object unpickled from ``path``.

    The file is opened read-only in binary mode. SECURITY: pickle.load can
    execute arbitrary code contained in the file; only load artefacts that
    were produced by this project.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


def _ticket_payout_features(ticket_types, previous_payouts):
    """Return one event's engineered features, ordered as _ENGINEERED_COLUMNS.

    ``ticket_types`` is a list of dicts read for 'cost', 'quantity_sold' and
    'quantity_total'; ``previous_payouts`` is a list of dicts read for
    'amount'. Every aggregate is 0 when the corresponding list is empty.
    """
    costs = [ticket['cost'] for ticket in ticket_types]
    sold = [ticket['quantity_sold'] for ticket in ticket_types]
    available = [ticket['quantity_total'] for ticket in ticket_types]

    quantity_total = sum(available)
    value_total = sum(qty * cost for qty, cost in zip(available, costs))

    payout_count = len(previous_payouts)
    # float() prevents floor division under Python 2 when amounts are integers;
    # explicit emptiness checks replace a bare `except:` around the division.
    if payout_count:
        avg_payout = sum(payout['amount'] for payout in previous_payouts) / float(payout_count)
    else:
        avg_payout = 0
    weighted_price = value_total / float(quantity_total) if quantity_total else 0

    return [
        len(ticket_types),
        min(costs) if costs else 0,
        max(costs) if costs else 0,
        sum(sold),
        quantity_total,
        sum(qty * cost for qty, cost in zip(sold, costs)),
        value_total,
        payout_count,
        avg_payout,
        weighted_price,
    ]


def prep_data(df_fraud):
    """Apply the notebook's feature pipeline to a frame of raw event records.

    Steps, in order: add 'event_duration'; bucket e-mail domains outside the
    pickled top-domain set into 'other'; turn 'payee_name' into a 1.0/0.0
    "name given" flag and fill empty 'payout_type' with 'not_available';
    append the ticket/payout aggregates; drop the raw columns; label-encode the
    categorical columns with the pickled encoders; replace each text column by
    column 1 of its pickled model's predict_proba.

    Parameters
    ----------
    df_fraud : pandas.DataFrame
        Raw records with the same columns as data.json. The frame is copied,
        so the caller's object is not modified.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The transformed frame and its ``.values`` matrix.

    Notes
    -----
    Reads pickle/top_domains.pkl, pickle/encode.pkl and the
    pickle/tfidf-*.pkl / pickle/rfmodel-*.pkl files. The pickled encoders'
    ``transform`` is expected to raise on labels not seen when they were
    fitted (scikit-learn LabelEncoder behaviour — verify for new data).
    """
    # Work on a copy so columns are not added to the caller's frame.
    df_fraud = df_fraud.copy()

    df_fraud['event_duration'] = (df_fraud.event_end - df_fraud.event_start) / 3600.

    # Opened for reading: opening these pickles with mode 'w' would truncate
    # them and make pickle.load fail.
    top_domains = _load_pickle('pickle/top_domains.pkl')
    df_fraud.loc[~df_fraud['email_domain'].isin(list(top_domains)), 'email_domain'] = 'other'

    # Every row, including the one labelled 0, is converted; .str.len() gives
    # NaN for missing values, which fillna(0) treats as "no name"/"no type".
    df_fraud['payee_name'] = (
        df_fraud['payee_name'].str.len().fillna(0) > 0
    ).astype(float)
    df_fraud.loc[
        df_fraud['payout_type'].str.len().fillna(0) == 0, 'payout_type'
    ] = 'not_available'

    engineered = pd.DataFrame(
        [_ticket_payout_features(tickets, payouts)
         for tickets, payouts in zip(df_fraud['ticket_types'], df_fraud['previous_payouts'])],
        columns=list(_ENGINEERED_COLUMNS), dtype=float)
    for column in _ENGINEERED_COLUMNS:
        # Positional assignment: independent of the frame's index labels.
        df_fraud[column] = engineered[column].values

    # Only drop columns that are present, so unlabeled records (without
    # 'acct_type') can be prepared as well.
    df_fraud = df_fraud.drop(
        [column for column in _DROPPED_COLUMNS if column in df_fraud.columns],
        axis=1)

    encode = _load_pickle('pickle/encode.pkl')
    for column in _ENCODED_COLUMNS:
        df_fraud[column] = encode[column].transform(df_fraud[column])

    for column in _TEXT_COLUMNS:
        tfidf = _load_pickle('pickle/tfidf-%s.pkl' % column)
        text_model = _load_pickle('pickle/rfmodel-%s.pkl' % column)
        df_fraud[column] = text_model.predict_proba(tfidf.transform(df_fraud[column]))[:, 1]

    return df_fraud, df_fraud.values

### Modeling

In [64]:
from sklearn.model_selection import train_test_split

# test_size=0 requests no held-out rows, so X_train/y_train are meant to carry
# the full data set (newer scikit-learn releases may reject a zero test size —
# verify against the installed version).
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0,
)

In [44]:
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier()
# model.fit(X_train, y_train)
# model.predict_proba(X_test)
# model.score(X_test, y_test)

In [55]:
from xgboost import XGBClassifier

# Fit a default-parameter gradient-boosted classifier on the training split and
# keep column 1 of predict_proba for the same rows it was trained on.
model = XGBClassifier()
model.fit(X=X_train, y=y_train)
probabilities = model.predict_proba(X_train)[:, 1]

In [56]:
# Pickles are byte streams: write them in binary mode ('wb'), matching how the
# preprocessing artefacts (top_domains / encode) are written in this notebook.
with open('pickle/model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('pickle/probabilities.pkl', 'wb') as f:
    pickle.dump(probabilities, f)
with open('pickle/actual.pkl', 'wb') as f:
    pickle.dump(y_train, f)

In [46]:
from sklearn.metrics import confusion_matrix

# Print one confusion matrix per decision threshold, for 30 thresholds evenly
# spaced between 0.0001 and 0.9999, using the training rows themselves.
probs = model.predict_proba(X_train)[:, 1]
for threshold in np.linspace(0.0001, 0.9999, 30):
    # Vectorised comparison: 1 where the probability exceeds the threshold.
    preds = (probs > threshold).astype(int)
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(confusion_matrix(y_train, preds))

In [71]:
# Show the column labels of the frame currently bound to df_fraud.
df_fraud.columns.tolist()

[u'acct_type',
 u'approx_payout_date',
 u'body_length',
 u'channels',
 u'country',
 u'currency',
 u'delivery_method',
 u'description',
 u'email_domain',
 u'event_created',
 u'event_end',
 u'event_published',
 u'event_start',
 u'fb_published',
 u'gts',
 u'has_analytics',
 u'has_header',
 u'has_logo',
 u'listed',
 u'name',
 u'name_length',
 u'num_order',
 u'num_payouts',
 u'object_id',
 u'org_desc',
 u'org_facebook',
 u'org_name',
 u'org_twitter',
 u'payee_name',
 u'payout_type',
 u'previous_payouts',
 u'sale_duration',
 u'sale_duration2',
 u'show_map',
 u'ticket_types',
 u'user_age',
 u'user_created',
 u'user_type',
 u'venue_address',
 u'venue_country',
 u'venue_latitude',
 u'venue_longitude',
 u'venue_name',
 u'venue_state']

In [62]:
import json

# Serialise the current X_test array (as nested lists) to data_test.json.
with open('data_test.json', 'w') as f:
    f.write(json.dumps(X_test.tolist()))

In [61]:
# Re-split using labels taken from a freshly loaded copy of data.json.
# This cell rebinds X_train, X_test, y_train and y_test.
from sklearn.model_selection import train_test_split
df_fraud_test = pd.read_json('data.json')
# Labels are the raw 'acct_type' strings (not the 0/1 'fraud' mapping).
y_data_ = df_fraud_test.pop('acct_type').values
# NOTE(review): the features come from df_fraud, not from the df_fraud_test
# frame loaded above, so rows/columns depend on whatever df_fraud holds in the
# kernel when this runs — confirm whether df_fraud_test.values was intended.
X_data_ = df_fraud.values
# test_size=0 requests no held-out rows — NOTE(review): X_test is written to
# data_test.json by another cell; confirm an empty test split is intended.
X_train, X_test, y_train, y_test = train_test_split(X_data_, y_data_, test_size=0)