In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.decomposition import PCA, TruncatedSVD

In [None]:
df = pd.read_json('data/data.json')

# Account types that count as fraud for the binary target.
fraud_acct_types = ['fraudster_event', 'fraudster', 'fraudster_att']
is_fraud = df.acct_type.isin(fraud_acct_types)

# Index labels of the fraud rows (kept because it was defined here before).
fraud_idx = df[is_fraud].index

# Binary label column: 1 = fraud, 0 = not fraud.
# Built with a single vectorised assignment. The earlier chained form
# `df['label'].iloc[list(fraud_idx)] = 1` wrote through a temporary Series
# (a no-op under pandas copy-on-write) and passed index *labels* to the
# positional `.iloc` indexer.
df['label'] = is_fraud.astype('int64')

In [None]:
# Parse each event description (HTML) into a BeautifulSoup tree using the lxml parser
df['soups'] = df['description'].apply(
    lambda html: BeautifulSoup(html, features='lxml')
)

def get_text(soup):
    """Return the non-empty paragraph texts found in ``soup``.

    Input:
        soup:   object exposing ``find_all('p')`` whose items have a ``.text``
                string (e.g. a parsed BeautifulSoup document)
    Returns:
        list of str, one entry per ``<p>`` element that still has content
        after cleaning, in document order.
    """
    paragraphs = []
    for p in soup.find_all('p'):
        # A non-breaking space becomes a regular space (not deleted) so the
        # words it separated stay separate tokens; strip() then removes padding.
        text = p.text.replace('\xa0', ' ').strip()
        if text:
            paragraphs.append(text)
    return paragraphs

# Replace every parsed document with its list of paragraph strings
df['soups'] = df['soups'].apply(get_text)

In [None]:
# One string per event. Paragraphs are joined with a space so the last word of
# one paragraph and the first word of the next are not fused into a single token.
all_descriptions = df.soups.apply(lambda x: ' '.join(x))

In [None]:
# Vectorize descriptions: fit a TF-IDF vocabulary (English stop words removed)
# on all descriptions and transform them in the same call.
tfidf = TfidfVectorizer(stop_words='english')
vecs = tfidf.fit_transform(all_descriptions)

# Dense array copy of the TF-IDF matrix; `vecs` itself is left untouched.
# NOTE(review): a dense documents-by-vocabulary array can be very large - confirm it fits in memory.
vec_arr = vecs.toarray()

In [None]:
# Feature matrix (dense TF-IDF) and target vector (binary fraud label)
X = vec_arr
y = df['label'].to_numpy()

In [None]:
# Hold out a test set. A fixed random_state makes the split repeatable on
# Restart & Run All; stratify=y keeps the fraud / non-fraud ratio the same in
# the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [None]:
# Candidate classifiers. The two tree ensembles are stochastic, so they get a
# fixed random_state to make scores repeatable between runs.
grad_boost = GradientBoostingClassifier(random_state=42)
log_reg = LogisticRegression(solver='lbfgs')
rand_forest = RandomForestClassifier(
    criterion='entropy', max_depth=50, n_estimators=100, random_state=42
)
models = [grad_boost, log_reg, rand_forest]

In [None]:
# 3-fold cross-validated ROC AUC of the random forest on the training split
cv_results = cross_validate(
    estimator=rand_forest,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=3,
)

print(f"Cross Validated Roc Auc: {cv_results['test_score']}")
print(f"Mean: {cv_results['test_score'].mean():0.3f}")

In [None]:
# Dimensionality reduction: project the TF-IDF features onto 100 SVD components.
# random_state is fixed because TruncatedSVD's default solver is randomized,
# so unseeded runs give slightly different components.
# NOTE(review): the SVD is fitted on all rows of X, including rows that end up
# in the test split below - confirm that this is acceptable for the evaluation.
decomp = TruncatedSVD(n_components=100, random_state=42)
lsaX = decomp.fit_transform(X)

In [None]:
# Re-split using the reduced features. A fixed random_state makes the split
# repeatable; stratify=y keeps the fraud / non-fraud ratio the same in the
# train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    lsaX, y, random_state=42, stratify=y
)

In [None]:
# 3-fold cross-validated ROC AUC of gradient boosting on the current training split
cv_results = cross_validate(
    estimator=grad_boost,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=3,
)

print(f"Cross Validated Roc Auc: {cv_results['test_score']}")
print(f"Mean: {cv_results['test_score'].mean():0.3f}")

In [None]:
from sklearn.metrics import auc, roc_curve

def roc_plot(y_test, y_scores, g_params="ROC"):
    '''
    Plot a ROC curve with its area under the curve shown in the legend.
    Input:
        y_test:     true binary labels of the evaluation samples
        y_scores:   continuous scores for the positive class, one per sample,
                    e.g. classifier.decision_function(X_test) or
                    classifier.predict_proba(X_test)[:, 1]
        g_params:   str. describe model output (appended to the plot title)
    Returns:
        None. The figure is displayed with plt.show().
    '''
    # Thresholds are not needed for the plot, only the two rates.
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC: Random Forest:  {} '.format(g_params))
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# cross_validate only fits clones of the estimator, so `rand_forest` itself is
# still unfitted at this point and predict_proba would raise NotFittedError.
# Fit it on the current training split before scoring the held-out set.
rand_forest.fit(X_train, y_train)
roc_plot(y_test, rand_forest.predict_proba(X_test)[:, 1])