In [1]:
# Data Process
import pandas as pd
import numpy as np 

# Data visualization
import plotly.express as px
import plotly
import matplotlib.pyplot as plt

# IO
from pathlib import Path

# Feature & Model
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report

# Notebook-wide display settings: let pandas render up to 500 columns
# and draw matplotlib figures with the 'ggplot' style sheet.
pd.options.display.max_columns = 500
plt.style.use('ggplot')

In [2]:
# Load the cleaned startup table from the project folder under the user's home directory.
home = str(Path.home())
data = pd.read_csv(home + '/Startup-Analysis/data/processed/startup_data_cleaned.csv')

# Columns fed to the model. The order is kept fixed on purpose: column
# subsampling (colsample_bytree) is seeded, so reordering could change the fit.
feat = [
    # age_* timing columns
    'age_first_funding_year', 'age_last_funding_year',
    'age_first_milestone_year_impute', 'age_last_milestone_year_impute',
    # counts and totals
    'relationships', 'funding_rounds', 'funding_total_usd', 'milestones',
    # is_<state> flags
    'is_CA', 'is_NY', 'is_MA', 'is_TX', 'is_otherstate',
    # is_<category> flags
    'is_web', 'is_mobile', 'is_enterprise', 'is_advertising', 'is_gamesvideo',
    'is_ecommerce', 'is_biotech', 'is_consulting', 'is_othercategory',
    # has_* funding flags
    'has_VC', 'has_angel', 'has_roundA', 'has_roundB', 'has_roundC', 'has_roundD',
    # remaining columns
    'avg_participants', 'is_top500', 'lat_decile', 'long_decile',
]
label = ['labels']

# `label` is a one-element list, so `y` is a single-column DataFrame (not a Series).
X, y = data[feat], data[label]

# Hold out 33% of the rows for testing; the seed makes the split repeatable.
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

# Fit the gradient-boosted classifier on the training split.
model = XGBClassifier(
    grow_policy='lossguide',
    eval_metric='logloss',
    colsample_bytree=0.8,
    random_state=seed,
)
model.fit(X_train, y_train)

In [3]:
# Score the held-out test rows with the fitted model.
# Despite the name, `y_pred` holds class probabilities rather than hard labels;
# the evaluation cell indexes it as a 2-D array and reads column 1.
y_pred = model.predict_proba(X_test)


In [4]:
# Summarise test performance as average precision (reported here as PR AUC),
# scored on column 1 of the predicted probabilities.
# NOTE(review): column 1 is presumably the positive class - confirm the label encoding.
pr_auc = average_precision_score(
    y_true=y_test,
    y_score=y_pred[:, 1],
)
print("PR_AUC: %.2f%%" % (pr_auc * 100.0))

PR_AUC: 82.27%
