In [52]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, f1_score, plot_roc_curve

In [6]:
X = pd.read_csv("../data/number_df.csv").drop('Unnamed: 0', axis = 1)
y = pd.read_csv("../data/number_target.csv").drop('Unnamed: 0', axis = 1)

In [23]:
X.columns

Index(['approx_payout_date', 'body_length', 'channels', 'delivery_method',
       'event_created', 'event_end', 'event_published', 'event_start',
       'fb_published', 'gts', 'has_analytics', 'has_header', 'has_logo',
       'name_length', 'num_order', 'num_payouts', 'object_id', 'org_facebook',
       'org_twitter', 'sale_duration', 'sale_duration2', 'show_map',
       'user_age', 'user_created', 'user_type', 'venue_latitude',
       'venue_longitude'],
      dtype='object')

In [8]:
y

Unnamed: 0,Fraud
0,True
1,False
2,False
3,False
4,False
...,...
14332,True
14333,False
14334,False
14335,False


In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify = y)

In [15]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [16]:
y_hat = dt.predict(X_test)

In [17]:
dt.score(X_test, y_test)

0.9562552831783601

In [22]:
#want close to 1
f1_score(y_test, y_hat)

0.7650397275822928

In [27]:
ticket = pd.read_csv('../data/ticket_type_df.csv').drop('Unnamed: 0', axis = 1)
ticket

Unnamed: 0,cost,quantity,num_sold,percent_sold
0,208.333333,920.0,0.0,0.000000
1,35.000000,100.0,25.0,0.250000
2,93.510000,48.0,48.0,1.000000
3,13.666667,30000.0,58.0,0.001933
4,101.750000,264.0,39.0,0.147727
...,...,...,...,...
14332,45.000000,400.0,0.0,0.000000
14333,43.250000,3256.0,628.0,0.192875
14334,42.000000,148.0,1.0,0.006757
14335,79.330000,100.0,0.0,0.000000


In [29]:
new_x = pd.concat([X, ticket],axis=1)

In [34]:
def balance_work(y_train):
    n1 = np.sum(y_train)
    n2 = len(y_train) - n1
    n_samples = n1 + n2
    w1 = n_samples / (2 * n1)
    w2 = n_samples / (2 * n2)
    return w1, w2

In [42]:
def Random_forest_model(X, y, num_trees, num_features):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
    w1, w2 = balance_work(y_train)
    rf_model = RandomForestClassifier(n_estimators = num_trees, max_features = num_features, class_weight={1: w1, 0: w2})
    rf_model.fit(X_train, y_train.values.ravel())
    y_predict = rf_model.predict(X_test)
    score = rf_model.score(X_test,y_test)
    return score, confusion_matrix(y_test, y_predict), f1_score(y_test, y_predict), rf_model
rf_score2, rf_matrix2, f1_2, model2 = Random_forest_model(X, y, 50, 'sqrt')

In [46]:
rf_score2

0.9765690376569037

In [49]:
rf_matrix2.ravel()

array([3239,   23,   61,  262])

In [48]:
f1_2

0.861842105263158

In [63]:
def div_count_pos_neg(X, y):
    negatives, positives = y == 0, y == 1
    negative_count, positive_count = np.sum(negatives), np.sum(positives)
    X_positives, y_positives = X[positives], y[positives]
    X_negatives, y_negatives = X[negatives], y[negatives]
    return negative_count, positive_count, X_positives, \
           X_negatives, y_positives, y_negatives

In [91]:
def oversample(X, y, tp):
    """Randomly choose positive observations from X & y, with replacement
    to achieve the target proportion of positive to negative observations.

    Parameters
    ----------
    X  : ndarray - 2D
    y  : ndarray - 1D
    tp : float - range [0, 1], target proportion of positive class observations

    Returns
    -------
    X_undersampled : ndarray - 2D
    y_undersampled : ndarray - 1D
    """
#     if (0.5 < np.mean(y)).bool():
#         return X, y
    if 0.5 < np.mean(y):
        return X, y
    neg_count, pos_count, X_pos, X_neg, y_pos, y_neg = div_count_pos_neg(X, y)
    positive_range = np.arange(pos_count)
    positive_size = (tp * neg_count) / (1 - tp)
    positive_idxs = np.random.choice(a=positive_range,
                                     size=int(positive_size),
                                     replace=True)
    X_positive_oversampled = X_pos[positive_idxs]
    y_positive_oversampled = y_pos[positive_idxs]
    X_oversampled = np.vstack((X_positive_oversampled, X_neg))
    y_oversampled = np.concatenate((y_positive_oversampled, y_neg))

    return X_oversampled, y_oversampled

In [111]:
x_o, y_o = oversample(X.values, np.ravel(y.values), 0.5)

In [114]:
x_o.shape

(26088, 31)

In [115]:
y_o.shape

(26088,)

In [130]:
def Random_forest_model(X, y, num_trees, num_features):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
    w1, w2 = balance_work(y_train)
    rf_model = RandomForestClassifier(n_estimators = num_trees, max_features = num_features)
    rf_model.fit(X_train, y_train)
    y_predict = rf_model.predict(X_test)
    score = rf_model.score(X_test,y_test)
    return score, confusion_matrix(y_test, y_predict), f1_score(y_test, y_predict), rf_model
rf_score3, rf_matrix3, f1_3, model3 = Random_forest_model(x_o, y_o, 50, 'sqrt')

In [131]:
f1_3

0.997858017135863

In [132]:
rf_matrix3.ravel()

array([3247,   14,    0, 3261])

In [64]:
def smote(X, y, tp, k=None):
    """Generates new observations from the positive (minority) class.
    For details, see: https://www.jair.org/media/953/live-953-2037-jair.pdf


    Parameters
    ----------
    X  : ndarray - 2D
    y  : ndarray - 1D
    tp : float - [0, 1], target proportion of positive class observations

    Returns
    -------
    X_smoted : ndarray - 2D
    y_smoted : ndarray - 1D
    """
    if tp < float(np.mean(y)):
        return X, y
    if k is None:
        k = int(len(X) ** 0.5)

    neg_count, pos_count, X_pos, X_neg, y_pos, y_neg = div_count_pos_neg(X, y)
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_pos, y_pos)
    neighbors = knn.kneighbors(return_distance=False)

    positive_size = (tp * neg_count) / (1 - tp)
    smote_num = int(positive_size - pos_count)

    rand_idxs = np.random.randint(0, pos_count, size=smote_num)
    rand_nghb_idxs = np.random.randint(0, k, size=smote_num)
    rand_pcts = np.random.random((smote_num, X.shape[1]))
    smotes = []
    for r_idx, r_nghb_idx, r_pct in zip(rand_idxs, rand_nghb_idxs, rand_pcts):
        rand_pos, rand_pos_neighbors = X_pos[r_idx], neighbors[r_idx]
        rand_pos_neighbor = X_pos[rand_pos_neighbors[r_nghb_idx]]
        rand_dir = rand_pos_neighbor - rand_pos
        rand_change = rand_dir * r_pct
        smoted_point = rand_pos + rand_change
        smotes.append(smoted_point)

    X_smoted = np.vstack((X, np.array(smotes)))
    y_smoted = np.concatenate((y, np.ones((smote_num,))))
    return X_smoted, y_smoted

In [116]:
x_s, y_s = smote(X.values, np.ravel(y.values), 0.5)

In [117]:
x_s

array([[1.26606240e+09, 3.85200000e+03, 5.00000000e+00, ...,
        9.20000000e+02, 0.00000000e+00, 0.00000000e+00],
       [1.29672000e+09, 3.49900000e+03, 0.00000000e+00, ...,
        1.00000000e+02, 2.50000000e+01, 2.50000000e-01],
       [1.29617280e+09, 2.60100000e+03, 8.00000000e+00, ...,
        4.80000000e+01, 4.80000000e+01, 1.00000000e+00],
       ...,
       [1.37226303e+09, 7.01791893e+03, 3.97404372e+00, ...,
        1.24739303e+01, 0.00000000e+00, 0.00000000e+00],
       [1.37945793e+09, 0.00000000e+00, 0.00000000e+00, ...,
        3.00000000e+01, 0.00000000e+00, 0.00000000e+00],
       [1.33023460e+09, 0.00000000e+00, 5.99578794e-01, ...,
        6.07557381e+01, 7.55289817e+00, 4.87652846e-01]])

In [118]:
x_s.shape

(26088, 31)

In [127]:
def Random_forest_model(X, y, num_trees, num_features):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
    w1, w2 = balance_work(y_train)
    rf_model = RandomForestClassifier(n_estimators = num_trees, max_features = num_features)
    rf_model.fit(X_train, y_train)
    y_predict = rf_model.predict(X_test)
    score = rf_model.score(X_test,y_test)
    return score, confusion_matrix(y_test, y_predict), f1_score(y_test, y_predict), rf_model
rf_score4, rf_matrix4, f1_4, model4 = Random_forest_model(x_s, y_s, 50, 'sqrt')

In [128]:
rf_matrix4.ravel()

array([3233,   28,   35, 3226])

In [129]:
f1_4

0.9903300076745971