# Live Auction Neural Network

#### Plan
1. Try a new method of upsampling that randomly selects feature values
2. Randomized hyperparameter search
3. Unsupervised Pre-Training
4. MLP
5. Compare to other methods.

In [1]:
!pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip

Collecting https://github.com/Lasagne/Lasagne/archive/master.zip
  Downloading https://github.com/Lasagne/Lasagne/archive/master.zip (229kB)
[K    100% |████████████████████████████████| 235kB 5.5MB/s 
[?25hCollecting numpy (from Lasagne==0.2.dev1)
  Downloading numpy-1.14.2-cp36-cp36m-manylinux1_x86_64.whl (12.2MB)
[K    100% |████████████████████████████████| 12.2MB 112kB/s 
[?25hInstalling collected packages: numpy, Lasagne
  Found existing installation: numpy 1.14.0
    Uninstalling numpy-1.14.0:
      Successfully uninstalled numpy-1.14.0
  Found existing installation: Lasagne 0.1
    Uninstalling Lasagne-0.1:
      Successfully uninstalled Lasagne-0.1
  Running setup.py install for Lasagne ... [?25ldone
[?25hSuccessfully installed Lasagne-0.2.dev1 numpy-1.14.2
[33mYou are using pip version 9.0.1, however version 9.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
################################## Imports #####################################################

# Basic Imports
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import time
import datetime

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Model Infrastructure
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.utils import resample
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import cross_val_predict
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from scipy import stats
import random
from imblearn.over_sampling import SMOTE, ADASYN


import boto3
import io

# Models
from sklearn.ensemble import RandomForestClassifier
from sknn import ae, mlp
from sklearn import ensemble
from sklearn.neural_network import MLPClassifier
from sknn import ae, mlp

In [3]:
#################################### Bring in Data #############################################
# Start the wall-clock timer for this cell.
start_time = time.time()
# S3 client; no credentials are passed explicitly here.
s3 = boto3.client('s3')

#Bring in Training Data
obj = s3.get_object(Bucket='data-science-project-data', Key='Human_or_Robot/train.csv')
# Read the whole object body into memory and parse it as CSV.
train = pd.read_csv(io.BytesIO(obj['Body'].read()))
# Index by bidder_id; add_features() later joins the per-bidder features on this index.
train.set_index('bidder_id', inplace=True)

# Bring in bids data (one row per bid; `obj` is reused for the second download)
obj = s3.get_object(Bucket='data-science-project-data', Key='Human_or_Robot/bids.csv')
bids = pd.read_csv(io.BytesIO(obj['Body'].read()))

print("-- Execution time: %s seconds ---" % (time.time() - start_time))

-- Execution time: 56.84995913505554 seconds ---


In [24]:
# Preview the first rows of the labelled bidder table (indexed by bidder_id).
train.head()

Unnamed: 0_level_0,payment_account,address,outcome
bidder_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
91a3c57b13234af24875c56fb7e2b2f4rb56a,a3d2de7675556553a5f08e4c88d2c228754av,a3d2de7675556553a5f08e4c88d2c228vt0u4,0.0
624f258b49e77713fc34034560f93fb3hu3jo,a3d2de7675556553a5f08e4c88d2c228v1sga,ae87054e5a97a8f840a3991d12611fdcrfbq3,0.0
1c5f4fc669099bfbfac515cd26997bd12ruaj,a3d2de7675556553a5f08e4c88d2c2280cybl,92520288b50f03907041887884ba49c0cl0pd,0.0
4bee9aba2abda51bf43d639013d6efe12iycd,51d80e233f7b6a7dfdee484a3c120f3b2ita8,4cb9717c8ad7e88a9a284989dd79b98dbevyi,0.0
4ab12bc61c82ddd9c2d65e60555808acqgos1,a3d2de7675556553a5f08e4c88d2c22857ddh,2a96c3ce94b3be921e0296097b88b56a7x1ji,0.0


In [4]:
###################################### Functions ##############################################

# Map a raw bid time onto a calendar timestamp, assuming the smallest gap
# between two bid times (min_time_diff in the original notes) is one second.
def convert_time(time):
    """Convert a raw ``time`` value from the bids table into a datetime.

    The mapping is linear: the dataset's minimum raw time is pinned to a
    fixed epoch-seconds anchor, and every ``ticks_per_second`` raw units
    add one second.

    Parameters
    ----------
    time : int or float
        Raw time value as stored in the bids data.

    Returns
    -------
    datetime.datetime
        Naive datetime built with ``datetime.datetime.fromtimestamp`` (so it
        is expressed in the machine's local timezone).
    """
    ticks_per_second = 52631579
    # Anchor in seconds since 1970 (author's note: 1/1/2010 12:00AM)
    anchor_seconds = 1262347200
    # Minimum raw time value found in the dataset
    first_tick = 9631916842105263

    # Slope and intercept of the linear map raw-time -> epoch seconds.
    # The multiplication-by-reciprocal form is kept so results stay
    # bit-identical to the previous implementation.
    scale = 1/ticks_per_second
    intercept = anchor_seconds - scale*first_tick
    return datetime.datetime.fromtimestamp(scale*time + intercept)


########## Create the bids dataframe ################
def create_bid_dataframe(data):
    """Build one row of engineered features per bidder from a bids table.

    Parameters
    ----------
    data : pandas.DataFrame
        Bid-level rows with at least the columns ``bid_id``, ``bidder_id``,
        ``auction``, ``merchandise``, ``device``, ``time``, ``country``,
        ``ip`` and ``url``. A ``'Converted Time'`` column is added to this
        frame in place (same side effect as before).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``bidder_id``. Contains bid/auction/device/IP/URL/country
        counts, bid counts for five specific countries, and per-auction
        timing aggregates (min/max mean time per bid in seconds, time
        participation, started ratio, auctions won).

    Notes
    -----
    Every aggregate is computed from ``data``. Previously the per-country
    counts were read from the global ``bids`` frame, which silently gave
    wrong features whenever this was called on any other frame.
    """
    # Create converted Time column
    data['Converted Time'] = data['time'].apply(convert_time)

    # ---- Bidder-level counts -------------------------------------------
    by_bidder = data.groupby(['bidder_id'])
    bidder_aggregate = by_bidder['bid_id'].count().to_frame()
    bidder_aggregate.columns = ['Total Bids']
    bidder_aggregate['Total Auctions'] = by_bidder['auction'].nunique()
    bidder_aggregate['Number of Merchandise'] = by_bidder['merchandise'].nunique()
    bidder_aggregate['Number of Device'] = by_bidder['device'].nunique()
    bidder_aggregate['Number of IPs'] = by_bidder['ip'].nunique()
    bidder_aggregate['Number of URLs'] = by_bidder['url'].nunique()
    bidder_aggregate['Number of Countries'] = by_bidder['country'].nunique()
    # Bids placed from each of the selected countries (same column order as before).
    for code in ('in', 'ng', 'id', 'tr', 'us'):
        from_country = data[data['country'] == code]
        bidder_aggregate['Number of Bids ' + code.upper()] = (
            from_country.groupby(['bidder_id'])['country'].count())
    # Bidders with no bids from a given country get 0 instead of NaN.
    bidder_aggregate.fillna(0, inplace=True)

    # ---- One row per (bidder, auction) ---------------------------------
    by_bidder_auction = data.groupby(['bidder_id', 'auction'])
    bidder_auction = by_bidder_auction['Converted Time'].min().to_frame()
    bidder_auction.columns = ['First Bid Time']
    bidder_auction['Last Bid Time'] = by_bidder_auction['Converted Time'].max()
    bidder_auction['Bid Time Difference'] = bidder_auction['Last Bid Time'] - bidder_auction['First Bid Time']
    bidder_auction['Number of Bids'] = by_bidder_auction['time'].count()
    bidder_auction['Mean Time per Bid'] = bidder_auction['Bid Time Difference']/bidder_auction['Number of Bids']

    # ---- One row per auction -------------------------------------------
    by_auction = data.groupby(['auction'])
    auction = by_auction['Converted Time'].min().to_frame()
    auction.columns = ['Auction Started']
    auction['Auction Ended'] = by_auction['Converted Time'].max()
    auction['Auction Time Difference'] = auction['Auction Ended'] - auction['Auction Started']
    auction['Number of Bidders'] = by_auction['bidder_id'].nunique()

    # Join Auction with Bidder_auction (matches on the 'auction' index level)
    bidder_auction_full = bidder_auction.join(auction, how='inner')
    # Participation measure: share (in %) of the auction's observed duration
    # that remained when the bidder placed their first bid.
    bidder_auction_full['Time Particpation'] = (
        (bidder_auction_full['Auction Ended'] - bidder_auction_full['First Bid Time'])
        / bidder_auction_full['Auction Time Difference']) * 100
    # Started ratio: how far (in %) into the auction the bidder's first bid
    # came; the higher the number, the later they joined.
    bidder_auction_full['Started Ratio'] = (
        (bidder_auction_full['First Bid Time'] - bidder_auction_full['Auction Started'])
        / bidder_auction_full['Auction Time Difference']) * 100
    # "Won" here means the bidder placed the last bid observed in the auction.
    bidder_auction_full['Won Auction'] = bidder_auction_full['Last Bid Time'] == bidder_auction_full['Auction Ended']

    # ---- Collapse (bidder, auction) rows back to one row per bidder -----
    per_bidder = bidder_auction_full.groupby(['bidder_id'])
    bidder_auction_full_aggregate = per_bidder['Mean Time per Bid'].min().to_frame()
    bidder_auction_full_aggregate.columns = ['Min Mean Time per Bid']
    bidder_auction_full_aggregate['Max Mean Time per Bid'] = per_bidder['Mean Time per Bid'].max()
    bidder_auction_full_aggregate['Min Time Particpation'] = per_bidder['Time Particpation'].min()
    bidder_auction_full_aggregate['Max Time Particpation'] = per_bidder['Time Particpation'].max()
    bidder_auction_full_aggregate['Mean Time Particpation'] = per_bidder['Time Particpation'].mean()
    bidder_auction_full_aggregate['Min Started Ratio'] = per_bidder['Started Ratio'].min()
    bidder_auction_full_aggregate['Max Started Ratio'] = per_bidder['Started Ratio'].max()
    bidder_auction_full_aggregate['Mean Started Ratio'] = per_bidder['Started Ratio'].mean()
    won_rows = bidder_auction_full[bidder_auction_full['Won Auction']]
    bidder_auction_full_aggregate['Auctions Won'] = won_rows.groupby(['bidder_id'])['Won Auction'].count()
    # Missing values (e.g. bidders with no won auction) become 0.
    bidder_auction_full_aggregate.fillna(0, inplace=True)

    # Joing to bidder aggregate
    bidder_features = bidder_aggregate.join(bidder_auction_full_aggregate, how='inner')

    #Convert Time Delta to seconds
    bidder_features['Max Mean Time per Bid'] = bidder_features['Max Mean Time per Bid'].apply(lambda x: x.total_seconds())
    bidder_features['Min Mean Time per Bid'] = bidder_features['Min Mean Time per Bid'].apply(lambda x: x.total_seconds())

    return bidder_features
    

############### Join the bidder_feature ##################

def add_features(train, features):
    """Attach engineered features to the labelled bidders.

    Performs an inner join on the two frames' indexes, so only rows whose
    index value appears in both ``train`` and ``features`` are kept.

    Parameters
    ----------
    train : pandas.DataFrame
        Left-hand frame.
    features : pandas.DataFrame
        Right-hand frame whose columns are appended.

    Returns
    -------
    pandas.DataFrame
        ``train`` columns followed by ``features`` columns.
    """
    combined = train.join(features, how='inner')
    return combined


################# Custom Shuffle ########################
def custom_shuffle(df, n):
    """Generate ``n`` synthetic rows by sampling every column independently.

    For each new row and each column, one existing row position is drawn
    uniformly at random (``random.randint``) and that column's value is
    copied. Values within a synthetic row therefore generally come from
    different source rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows to sample from. Must be non-empty when ``n > 0``
        (``random.randint`` raises ``ValueError`` otherwise).
    n : int
        Number of synthetic rows to produce.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with a fresh default index and the same columns, in the
        same order, as ``df``.
    """
    columns = df.columns.tolist()
    last_position = len(df) - 1
    output = []
    for _ in range(n):
        row = {}
        for column in columns:
            position = random.randint(0, last_position)
            # Select random value by POSITION. Plain df[column][position] is
            # a label lookup and breaks on any non-default index (for
            # example a frame indexed by bidder_id, or a filtered subset).
            row[column] = df[column].iloc[position]
        output.append(row)
    # Passing columns= keeps the source column order and yields a correctly
    # shaped empty frame when n == 0.
    return pd.DataFrame(output, columns=columns)


################# Data Split #############################
def train_split(train, split_percent, minority_upsample_percent, random_state=None):
    '''
    Split into train/test sets and upsample the minority class of the
    training part only.

    train - the training data set; needs the columns 'outcome',
            'payment_account' and 'address' (the last two are dropped)
    split_percent - the train/test split (passed as test_size)
    minority_upsample_percent - the percent to upsample by: the minority
            class (outcome == 1) ends up with
            int(n_minority * (1 + minority_upsample_percent)) rows,
            drawn with replacement
    random_state - optional seed for the train/test split; the default
            None keeps the previous unseeded behaviour

    Returns X_train, X_test, y_train, y_test. The test split is left
    untouched, so no duplicated minority rows end up in it.
    '''
    # Create initial x and y ('outcome' stays in X for now so the classes
    # can be separated after the split)
    y = train['outcome']
    X = train.drop(columns=['payment_account', 'address'])

    # Split the Data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=split_percent, random_state=random_state)

    # Drop outcome from X_test
    X_test = X_test.drop(columns=['outcome'])

    # Do the upsample on the X_train
    df_majority = X_train[X_train.outcome == 0]
    df_minority = X_train[X_train.outcome == 1]

    # Target size of the minority class after upsampling
    nsamples = int(len(df_minority) + len(df_minority) * minority_upsample_percent)

    if nsamples > 0:
        # Upsample minority class
        df_minority_upsampled = resample(df_minority,
                                         replace=True,        # sample with replacement
                                         n_samples=nsamples,  # original size plus the requested extra share
                                         random_state=123)    # reproducible results
    else:
        # No minority rows landed in the training split: nothing to resample.
        df_minority_upsampled = df_minority

    # Combine majority class with upsampled minority class
    df_upsampled = pd.concat([df_majority, df_minority_upsampled])

    # Rename the X_train and y_train
    X_train = df_upsampled.drop(columns=['outcome'])
    y_train = df_upsampled['outcome']

    return X_train, X_test, y_train, y_test


######### Separate Outcome feature split ########################
def outcome_feature_split(train):
    """Separate the target column from the feature columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain 'outcome', 'payment_account' and 'address'.

    Returns
    -------
    (X, y)
        ``X`` is ``train`` without those three columns; ``y`` is the
        'outcome' column. ``train`` itself is not modified.
    """
    y = train['outcome']
    # Use columns= rather than a positional axis argument, which pandas 2.0
    # no longer accepts.
    X = train.drop(columns=['outcome', 'payment_account', 'address'])
    return X, y
    

In [5]:
###################### Assemble Data ########################################
start_time = time.time()

# Create Bidder Features
# NOTE: dropna(inplace=True) permanently removes every bid row with a missing
# value from the global `bids`; all later cells see the reduced frame.
bids.dropna(inplace=True)
bidder_features = create_bid_dataframe(bids)

#Create Training Set
# Inner join on bidder_id: labelled bidders without any computed features are dropped.
training_set = add_features(train, bidder_features)

print("-- Execution time: %s seconds ---" % (time.time() - start_time))

-- Execution time: 95.06350779533386 seconds ---


## Build Models

In [34]:
# Perform Test train split: hold out 30% for testing and grow the minority
# class (outcome == 1) of the training part by an extra 50% via resampling.
X_train, X_test, y_train, y_test = train_split(training_set, 0.3, 0.5)

In [35]:
####################### Gradient Boost Model ################################################

#Will use Grid Search to do the cross validation.
start_time = time.time()
# Single-candidate grid: the search is used only for its cross-validation.
parameters = {'subsample':[0.95],
              'max_depth':[2],
             'n_estimators':[500]}

# Initialize the model.
# NOTE(review): max_depth=6 here is superseded by the grid's max_depth of 2.
clf = ensemble.GradientBoostingClassifier(max_depth=6,loss='exponential')

#Create grid and perform 8-fold cross validation, scored by ROC AUC
gradient_grid = GridSearchCV(clf, parameters, cv=8, verbose=0, scoring='roc_auc')

#Fit the Data
gradient_grid.fit(X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))


#  Test Score: hard class predictions on the held-out split
gradient_prediction = gradient_grid.predict(X_test)

# Keep only the second probability column (index 1) of each prediction.
gradient_prediction_proba = gradient_grid.predict_proba(X_test)
gradient_prediction_proba = [p[1] for p in gradient_prediction_proba]

# Produce the AUROC
print(roc_auc_score(y_test, gradient_prediction_proba))

--- 6.471538782119751 seconds ---
0.9390973794888385


In [48]:
# Tree based models do better with unbalanced classes.
# Per-class precision / recall / F1 of the gradient boosting predictions on the test split.
print(classification_report(y_test, gradient_prediction))

             precision    recall  f1-score   support

        0.0       0.96      0.99      0.97       562
        1.0       0.64      0.21      0.32        33

avg / total       0.94      0.95      0.94       595



In [45]:
################## MLP Model #################################################################
start_time = time.time()
# Establish and fit the model, with two hidden layers of 1000 units each.

# Only alpha is searched over; the other settings are fixed.
parameters = {'hidden_layer_sizes':[(1000,1000)],
             'activation':['logistic'],
             'solver':['adam'],
             'alpha':[0.0001, 0.001, 0.01, 1, 10, 100]}

# NOTE(review): this rebinds `mlp`, shadowing the `mlp` module imported from sknn.
mlp = MLPClassifier()

# 5-fold cross validation, scored by ROC AUC
mlp_grid = GridSearchCV(mlp, parameters, scoring='roc_auc', cv=5, verbose=0)

# NOTE(review): features are fed in unscaled — consider standardising them for the MLP.
mlp_grid.fit(X_train, y_train)

# Keep only the second probability column (index 1) of each prediction.
mlp_prediction_proba = mlp_grid.predict_proba(X_test)
mlp_prediction_proba = [p[1] for p in mlp_prediction_proba]

# Produce the AUROC
print(roc_auc_score(y_test, mlp_prediction_proba))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

0.8793809985980805
-- Execution time: 199.10712671279907 seconds ---


In [46]:
# Parameter combination selected by the grid search.
mlp_grid.best_params_

{'activation': 'logistic',
 'alpha': 0.01,
 'hidden_layer_sizes': (1000, 1000),
 'solver': 'adam'}

In [47]:
# The MLP really doesn't do well with unbalanced classes.
mlp_prediction = mlp_grid.predict(X_test)
print(classification_report(y_test, mlp_prediction))

             precision    recall  f1-score   support

        0.0       0.95      1.00      0.97       562
        1.0       0.50      0.06      0.11        33

avg / total       0.92      0.94      0.92       595



## Use SMOTE to Upsample
Apart from random sampling with replacement, there are two popular methods for over-sampling minority classes: the Synthetic Minority Oversampling Technique (SMOTE) and the Adaptive Synthetic sampling method (ADASYN).

In [6]:
# Test SMOTE
#Create outcome variable y and feature dataframe X
X,y = outcome_feature_split(training_set)

# Do the test train split (20% held out; no random_state, so the split differs between runs)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

# Oversample the training part only; the test split keeps its original class balance.
# NOTE(review): newer imbalanced-learn releases name this method fit_resample — confirm against the installed version.
X_resampled, y_resampled = SMOTE().fit_sample(X_train, y_train)


In [24]:
################## MLP Model #################################################################
start_time = time.time()
# Establish and fit the model, with two hidden layers of 1000 units each.

# Single-candidate grid: the search is used only for its cross-validation.
parameters = {'hidden_layer_sizes':[(1000,1000)],
             'activation':['logistic'],
             'solver':['adam'],
             'alpha':[0.001]}

# NOTE(review): this rebinds `mlp`, shadowing the `mlp` module imported from sknn.
mlp = MLPClassifier()

# 5-fold cross validation, scored by ROC AUC
mlp_grid = GridSearchCV(mlp, parameters, scoring='roc_auc', cv=5, verbose=0)

# NOTE(review): cross-validating on already-oversampled data lets synthetic
# neighbours of a sample fall into the validation fold; CV scores may be optimistic.
mlp_grid.fit(X_resampled, y_resampled)

# Keep only the second probability column (index 1) of each prediction.
mlp_prediction_proba = mlp_grid.predict_proba(X_test)
mlp_prediction_proba = [p[1] for p in mlp_prediction_proba]

# Produce the AUROC
print(roc_auc_score(y_test, mlp_prediction_proba))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

0.8140583554376658
-- Execution time: 24.298595428466797 seconds ---


In [26]:
# The SMOTE algorithm really improved the minority-class recall of the MLP.
mlp_prediction = mlp_grid.predict(X_test)
print(classification_report(y_test, mlp_prediction))

             precision    recall  f1-score   support

        0.0       0.96      0.90      0.93       377
        1.0       0.16      0.35      0.22        20

avg / total       0.92      0.87      0.89       397



In [12]:
####################### Gradient Boost Model ################################################

#Will use Grid Search to do the cross validation.
start_time = time.time()
# 2 x 3 x 3 = 18 parameter combinations
parameters = {'subsample':[0.95,1],
              'max_depth':[2,4,6],
             'n_estimators':[300,500,800]}

# Initialize the model.
clf = ensemble.GradientBoostingClassifier(loss='exponential')

#Create grid and perform 5-fold cross validation, scored by F1
gradient_grid = GridSearchCV(clf, parameters, cv=5, verbose=0, scoring='f1')

#Fit the Data (the oversampled training set)
gradient_grid.fit(X_resampled, y_resampled)
print("--- %s seconds ---" % (time.time() - start_time))


#  Test Score: hard class predictions on the untouched test split
gradient_prediction = gradient_grid.predict(X_test)

# Keep only the second probability column (index 1) of each prediction.
gradient_prediction_proba = gradient_grid.predict_proba(X_test)
gradient_prediction_proba = [p[1] for p in gradient_prediction_proba]

# Produce the AUROC
print(roc_auc_score(y_test, gradient_prediction_proba))

--- 193.59356331825256 seconds ---
0.9640102827763496


In [13]:
# Per-class precision / recall / F1 of the gradient boosting predictions on the test split.
print(classification_report(y_test, gradient_prediction))

             precision    recall  f1-score   support

        0.0       0.99      0.96      0.98       389
        1.0       0.27      0.75      0.40         8

avg / total       0.98      0.95      0.96       397



Using SMOTE to handle the imbalanced classes improved the minority-class recall for the MLP.

In [14]:
def Find_Optimal_Cutoff(target, predicted):
    """ Find the probability cutoff at which sensitivity and specificity are closest

    The ROC curve is computed with ``roc_curve`` and the threshold whose
    ``tpr - (1 - fpr)`` is nearest to zero is selected, i.e. the point where
    the true-positive rate and the true-negative rate are (almost) equal.

    Parameters
    ----------
    target : Matrix with dependent or target data, where rows are observations

    predicted : Matrix with predicted data, where rows are observations

    Returns
    -------
    list type, with optimal cutoff value (a single element)

    """
    fpr, tpr, threshold = roc_curve(target, predicted)
    # Distance from the sensitivity == specificity point at each threshold.
    balance_gap = np.abs(tpr - (1 - fpr))
    # np.argmin replaces the former DataFrame.ix lookup (removed in
    # pandas 1.0); on ties the first such threshold is returned.
    best = int(np.argmin(balance_gap))

    return [threshold[best]]


# Predicted probabilities (second column, index 1) for the TEST split.
# NOTE(review): despite the `gradient_train` name, these come from X_test.
gradient_train = gradient_grid.predict_proba(X_test)
gradient_train = [p[1] for p in gradient_train]

# Find optimal probability threshold
# NOTE(review): the threshold is tuned on y_test and then evaluated on the same
# data, so the resulting report is optimistic; consider tuning on a
# validation split instead.
threshold = Find_Optimal_Cutoff(y_test, gradient_train)
print(threshold)

[0.008071129031701861]


In [15]:
#Applying threshold

# Keep only the second probability column (index 1) of each test prediction.
gradient_prediction_proba_root = gradient_grid.predict_proba(X_test)
gradient_prediction_proba = [p[1] for p in gradient_prediction_proba_root]

# One-column frame of probabilities with a default 0..n-1 index (not bidder_id).
prediction = pd.DataFrame(gradient_prediction_proba)
prediction.columns = ['Probability']
# Label as 1 when the probability is strictly above the tuned cutoff, else 0.
prediction['pred'] = prediction['Probability'].map(lambda x: 1 if x > threshold[0] else 0)

In [16]:
# Report for the thresholded predictions; rows are matched to y_test by position.
print(classification_report(y_test, prediction['pred']))

             precision    recall  f1-score   support

        0.0       1.00      0.92      0.96       389
        1.0       0.18      0.88      0.30         8

avg / total       0.98      0.92      0.94       397



### Use ADASYN for Upsampling

In [17]:
# Test ADASYN
#Create outcome variable y and feature dataframe X
X,y = outcome_feature_split(training_set)

# Do the test train split (30% held out; no random_state, so the split differs between runs)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

# Oversample the training part only; the test split keeps its original class balance.
# NOTE(review): newer imbalanced-learn releases name this method fit_resample — confirm against the installed version.
X_resampled, y_resampled = ADASYN().fit_sample(X_train, y_train)

In [19]:
################## MLP Model #################################################################
start_time = time.time()
# Establish and fit the model, with two hidden layers of 1000 units each.

# Single-candidate grid: the search is used only for its cross-validation.
parameters = {'hidden_layer_sizes':[(1000,1000)],
             'activation':['logistic'],
             'solver':['adam'],
             'alpha':[0.001]}

mlp = MLPClassifier()

# 5-fold cross validation, scored by ROC AUC
mlp_grid = GridSearchCV(mlp, parameters, scoring='roc_auc', cv=5, verbose=0)

# Fit on the ADASYN-oversampled training data
mlp_grid.fit(X_resampled, y_resampled)

# Keep only the second probability column (index 1) of each prediction.
mlp_prediction_proba = mlp_grid.predict_proba(X_test)
mlp_prediction_proba = [p[1] for p in mlp_prediction_proba]

# (A stray gradient_grid.predict(X_test) call used to sit here. It relied on
# a gradient boosting model left over from another cell and its result was
# never used by this cell, so it was removed.)

# Produce the AUROC
print(roc_auc_score(y_test, mlp_prediction_proba))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

0.7775258251448728
-- Execution time: 25.495235204696655 seconds ---


There were too many issues trying to use the autoencoder, which was the reason I had planned on using scikit-neuralnetwork.

In [18]:
####################### Gradient Boost Model ################################################

#Will use Grid Search to do the cross validation.
start_time = time.time()
# 2 x 3 x 3 = 18 parameter combinations
parameters = {'subsample':[0.95,1],
              'max_depth':[2,4,6],
             'n_estimators':[300,500,800]}

# Initialize the model.
clf = ensemble.GradientBoostingClassifier(loss='exponential')

#Create grid and perform 5-fold cross validation, scored by F1
gradient_grid = GridSearchCV(clf, parameters, cv=5, verbose=0, scoring='f1')

#Fit the Data (the ADASYN-oversampled training set)
gradient_grid.fit(X_resampled, y_resampled)
print("--- %s seconds ---" % (time.time() - start_time))


#  Test Score: hard class predictions on the untouched test split
gradient_prediction = gradient_grid.predict(X_test)

# Keep only the second probability column (index 1) of each prediction.
gradient_prediction_proba = gradient_grid.predict_proba(X_test)
gradient_prediction_proba = [p[1] for p in gradient_prediction_proba]

# Produce the AUROC
print(roc_auc_score(y_test, gradient_prediction_proba))

--- 172.67300868034363 seconds ---
0.8763134851138353


In [19]:
# Per-class precision / recall / F1 of the gradient boosting predictions on the test split.
print(classification_report(y_test, gradient_prediction))

             precision    recall  f1-score   support

        0.0       0.98      0.94      0.96       571
        1.0       0.31      0.62      0.41        24

avg / total       0.96      0.93      0.94       595

