In [1]:
# Data processing
import pandas as pd
import numpy as np 

# Data visualization
import plotly.express as px
import plotly
import matplotlib.pyplot as plt

# IO
from pathlib import Path

# Feature & Model
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, classification_report
from sklearn.model_selection import GridSearchCV
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import pickle

# Notebook-wide display settings: ggplot look for matplotlib figures and a
# wide column limit so pandas does not truncate frames when displaying them.
pd.options.display.max_columns = 500
plt.style.use('ggplot')

In [2]:
# Load the cleaned startup dataset from the user's home directory.
home = str(Path.home())
csv_path = home + '/Startup-Analysis/data/processed/startup_data_cleaned.csv'
data = pd.read_csv(csv_path)

# Model input columns. The order is significant: it fixes the column order of
# X and of the feature list pickled at the end of the notebook.
feat = [
    # age_* columns and counts
    'age_first_funding_year',
    'age_last_funding_year',
    'age_first_milestone_year_impute',
    'age_last_milestone_year_impute',
    'relationships',
    'funding_rounds',
    'funding_total_usd',
    'milestones',
    # is_<state> flags
    'is_CA',
    'is_NY',
    'is_MA',
    'is_TX',
    'is_otherstate',
    # is_<category> flags
    'is_web',
    'is_mobile',
    'is_enterprise',
    'is_advertising',
    'is_gamesvideo',
    'is_ecommerce',
    'is_biotech',
    'is_consulting',
    'is_othercategory',
    # has_* flags
    'has_VC',
    'has_angel',
    'has_roundA',
    'has_roundB',
    'has_roundC',
    'has_roundD',
    # remaining columns
    'avg_participants',
    'is_top500',
    'latitude_decile',
    'longitude_decile',
]
# Kept as a one-element list, so `y` is a single-column DataFrame.
label = ['labels']

X = data[feat]
y = data[label]

# Hold out a third of the rows for testing; the fixed seed keeps the split
# identical between runs.
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

## Parameter tuning using grid search

In [None]:
# Draw 5 candidate values per hyperparameter from a generator seeded with
# `seed`, so the grid (and therefore the selected model and its score) is the
# same on every Restart & Run All. Drawing from the unseeded global np.random
# state produced a different grid on each run.
rng = np.random.default_rng(seed)
parameters = {
    'max_depth': rng.choice(np.arange(3, 20), 5, replace=False),
    'gamma': rng.uniform(1, 9, 5),
    'reg_alpha': rng.choice(np.arange(1, 20), 5, replace=False),
    'reg_lambda': rng.uniform(1, 9, 5),
    'colsample_bytree': rng.uniform(0, 1, 5),
    'min_child_weight': rng.uniform(0, 9, 5),
    'learning_rate': rng.uniform(0, 1, 5),
}

# Base estimator; every candidate shares these fixed settings.
xgb_clf = XGBClassifier(
    eval_metric='logloss',
    objective='binary:logistic',
    grow_policy='lossguide',
    n_estimators=100,
    random_state=seed,
)

# Exhaustive search over the Cartesian product of the sampled values:
# 5**7 = 78,125 candidates x 5 folds = 390,625 fits, scored by average
# precision (PR-AUC). refit=True retrains the best candidate on all of X_train.
# NOTE(review): since the candidate values are random samples anyway, a
# randomized search with a fixed iteration budget would likely be far cheaper
# - consider switching.
cv_clf = GridSearchCV(
    xgb_clf,
    parameters,
    scoring='average_precision',
    n_jobs=-1,
    cv=5,
    refit=True,
    verbose=1,
)
cv_clf.fit(X_train, y_train)

Fitting 5 folds for each of 78125 candidates, totalling 390625 fits


In [None]:
# Tabulate the per-candidate CV results and show the parameter set of the row
# with the highest mean test score.
df_cv_result = pd.DataFrame(cv_clf.cv_results_)
best_row = df_cv_result['mean_test_score'].idxmax()
df_cv_result.loc[best_row, 'params']

In [None]:
# Keep the winning estimator from the search. The search was created with
# refit=True, which per scikit-learn's documentation means this estimator has
# been refit on the full training data passed to fit().
model = cv_clf.best_estimator_

In [None]:
# Predict class probabilities for the held-out test rows. The evaluation cell
# reads column 1 of the result (presumably the positive-class probability -
# confirm against model.classes_).
y_pred = model.predict_proba(X_test)


In [None]:
# Score the test-set predictions with average precision (PR-AUC), using the
# second probability column, and report it as a percentage.
positive_scores = y_pred[:, 1]
pr_auc = average_precision_score(y_test, positive_scores)
print("PR_AUC: %.2f%%" % (pr_auc * 100.0))

The previous base model achieved a PR_AUC of 82.27%; with parameter tuning, it increased to 82.96%.

# Save the model artifact

In [None]:
artifact_name = "xgb_tuned_model.pkl"

# Save the tuned model. The context manager flushes and closes the file handle
# even if pickling raises; passing a bare open() to pickle.dump never closed
# the handle explicitly.
with open(home + '/Startup-Analysis/models/model_artifact/' + artifact_name, "wb") as artifact_file:
    pickle.dump(model, artifact_file)

# Save the feature list

In [None]:
feat_file_name = 'feature.pkl'

# Save the ordered feature list next to the model. The context manager flushes
# and closes the file handle even if pickling raises; passing a bare open() to
# pickle.dump never closed the handle explicitly.
with open(home + '/Startup-Analysis/models/feat/' + feat_file_name, "wb") as feat_file:
    pickle.dump(feat, feat_file)