In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

# Generate cross-validated scores for random forest and XGBoost to validate the Dataiku AutoML results

In [3]:
# Load the cleaned PNI table and split it into identifiers, target and features.
df = pd.read_csv("../data/PNI_001_cleaned.csv")

# 'ID' and 'PNI' are pulled out on their own; every remaining column is
# treated as a model feature.
ID = df['ID'].values
y = df['PNI'].values
feature_frame = df.drop(['ID', 'PNI'], axis='columns')
X = feature_frame.values

In [None]:
# Out-of-fold evaluation: fit the selected classifier on each training split,
# score the held-out split, and keep the per-sample probabilities, true labels
# and IDs so they can be exported and plotted by the following cells.
SEED = 42  # fixes the fold assignment and the models' internal randomness

# Both candidate models are declared side by side and MODEL_NAME selects one.
# Before, the random forest was constructed and then immediately overwritten
# by the XGBoost assignment, so only XGBoost was ever evaluated.
candidate_models = {
    'rf': lambda: RandomForestClassifier(n_estimators=200, max_depth=25,
                                         min_samples_leaf=1, verbose=0,
                                         random_state=SEED),
    'xgb': lambda: xgboost.XGBClassifier(random_state=SEED),
}
MODEL_NAME = 'xgb'  # same model that was effectively used before this change
# NOTE(review): the export cell writes to 'pni001_rf_predictions.csv'; confirm
# that filename matches the model selected here before trusting the CSV.
clf = candidate_models[MODEL_NAME]()

# shuffle=True without a random_state gives different folds (and AUCs) on every
# run; seeding makes Restart & Run All reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

id_list = []
y_test_list = []
y_pred_list = []
fold_aucs = []
for train, test in cv.split(X, y):
    clf.fit(X[train], y[train])
    # Column 1 of predict_proba is the probability of the second class in clf.classes_.
    y_pred = clf.predict_proba(X[test])[:, 1]

    fold_auc = roc_auc_score(y[test], y_pred)
    fold_aucs.append(fold_auc)
    print(fold_auc)

    y_pred_list.append(list(y_pred))
    y_test_list.append(list(y[test]))
    id_list.append(list(ID[test]))

# Aggregate view across folds, which is what gets compared with the AutoML score.
print('mean AUC ({}): {:.4f} +/- {:.4f}'.format(MODEL_NAME, np.mean(fold_aucs), np.std(fold_aucs)))

In [7]:
def _flatten(nested):
    """Concatenate the inner sequences of `nested` into one flat list, in order."""
    flat = []
    for inner in nested:
        flat.extend(inner)
    return flat


# Collapse the per-fold lists into single lists; fold order is preserved, so
# the three flattened lists stay aligned element by element.
y_pred_list_flat = _flatten(y_pred_list)
y_test_list_flat = _flatten(y_test_list)
id_list_flat = _flatten(id_list)

In [8]:
# One row per held-out sample: predicted probability, true label and sample ID.
result_columns = {
    'probability': y_pred_list_flat,
    'PNI': y_test_list_flat,
    'ID': id_list_flat,
}
df_results = pd.DataFrame(result_columns)

In [9]:
# Persist the out-of-fold predictions without the DataFrame index.
# NOTE(review): the filename says "rf", but the cross-validation cell currently
# fits an XGBClassifier - confirm which model this file is meant to hold.
predictions_path = 'pni001_rf_predictions.csv'
df_results.to_csv(predictions_path, index=False)

In [None]:
# Compare the distribution of out-of-fold predicted probabilities between the
# two PNI classes.
# Plain matplotlib histograms replace seaborn's deprecated `distplot`; with
# kde=False and norm_hist=True it only drew a density-normalised histogram.
fig, ax = plt.subplots()

# Predicted probabilities lie in [0, 1]; a shared bin grid makes the two
# class histograms directly comparable bar for bar.
bin_edges = np.linspace(0.0, 1.0, 26)

for pni_value, colour in ((0, 'blue'), (1, 'red')):
    class_probabilities = df_results.loc[df_results['PNI'] == pni_value, 'probability']
    # density=True normalises each class separately, so unequal class sizes
    # do not distort the comparison.
    ax.hist(class_probabilities, bins=bin_edges, density=True, alpha=0.4,
            color=colour, label='PNI = {}'.format(pni_value))

ax.set_xlabel('probability')
ax.set_ylabel('density')
ax.set_title('Normalized Histogram for PNI Probability')
ax.legend();
