In [10]:
# Data Process
import pandas as pd
import numpy as np 

# Data visulization
import plotly.express as px
import plotly
import matplotlib.pyplot as plt

# IO
from pathlib import Path

# Feature & Model
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, classification_report
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

plt.style.use('ggplot')
pd.set_option('display.max_columns', 500)

In [2]:
# read csv data file
home = str(Path.home())
data = pd.read_csv(home + '/Startup-Analysis/data/processed/startup_data_cleaned.csv')

feat = ['age_first_funding_year', 'age_last_funding_year', 'age_first_milestone_year_impute', 'age_last_milestone_year_impute', 'relationships', 
       'funding_rounds', 'funding_total_usd', 'milestones', 'is_CA', 'is_NY', 'is_MA', 'is_TX', 'is_otherstate', 
       'is_web', 'is_mobile', 'is_enterprise', 'is_advertising', 'is_gamesvideo', 'is_ecommerce', 'is_biotech', 'is_consulting',
       'is_othercategory', 'has_VC', 'has_angel', 'has_roundA', 'has_roundB', 'has_roundC',
       'has_roundD', 'avg_participants', 'is_top500', 'lat_decile', 'long_decile']
label = ['labels']

X = data[feat]
y = data[label]


# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [3]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'learning_rate': hp.uniform('learning_rate', 0,1),
        'n_estimators': 100,
        'seed': 0
    }

In [4]:
def objective(space):
    clf=XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']), learning_rate=space['learning_rate'] ,eval_metric='logloss', 
                    objective = 'binary:logistic', grow_policy = 'lossguide', early_stopping_rounds=15, random_state=seed
    )
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation,
            verbose=False)
    

    pred = clf.predict_proba(X_test)
    accuracy = average_precision_score(y_test, pred[:,1])
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [5]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

SCORE:                                                                                                                  
0.7449072080072877                                                                                                      
SCORE:                                                                                                                  
0.6131147540983607                                                                                                      
SCORE:                                                                                                                  
0.6131147540983607                                                                                                      
SCORE:                                                                                                                  
0.6131147540983607                                                                                                      
SCORE:                          

In [6]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

The best hyperparameters are :  

{'colsample_bytree': 0.5045024960728945, 'gamma': 1.025019489728992, 'learning_rate': 0.043898360152916927, 'max_depth': 3.0, 'min_child_weight': 4.0, 'reg_alpha': 79.0, 'reg_lambda': 0.7029778984753066}


In [7]:
# fit model no training data
model = XGBClassifier(colsample_bytree = best_hyperparams['colsample_bytree'], 
                      gamma = best_hyperparams['gamma'], 
                      max_depth = int(best_hyperparams['max_depth']), 
                      min_child_weight = best_hyperparams['min_child_weight'], 
                      reg_alpha = best_hyperparams['reg_alpha'], 
                      reg_lambda = best_hyperparams['reg_lambda'], 
                      learning_rate = best_hyperparams['learning_rate'],
                      objective = 'binary:logistic', 
                      grow_policy = 'lossguide',
                      eval_metric='logloss',
                      n_estimators = 100, 
                      random_state=seed)
model.fit(X_train, y_train)

In [8]:
# make predictions for test data
y_pred = model.predict_proba(X_test)


In [9]:
# evaluate predictions
pr_auc = average_precision_score(y_test, y_pred[:,1])
print("PR_AUC: %.2f%%" % (pr_auc * 100.0))

PR_AUC: 82.64%
