In [1]:
import scipy as sp
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from scipy.stats import randint as sp_randint
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import brier_score_loss
from sklearn.model_selection import RandomizedSearchCV
from time import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [2]:
#NEED INSTALL IMBLANCE PACKAGE
#code: conda install -c glemaitre imbalanced-learn
from imblearn.over_sampling import SMOTE
#read tables: After variable selection, choosing 50 variables by baseline RF importance
train=pd.read_table('training_self1_0127.txt')
test=pd.read_table('testing_self1_0127.txt')

In [3]:
train_Ycat=train['Ycat']
train_PERT=train['PERT_Y']
train=train.drop('Ycat',1)
train=train.drop('MemberID',1)
train=train.drop('PERT_Y',1)

test_Ycat=test['Ycat']
test_PERT=test['PERT_Y']
test=test.drop('Ycat',1)
test=test.drop('PERT_Y',1)
test=test.drop('MemberID',1)

scaler = MinMaxScaler()
train = scaler.fit_transform(train)
test =  scaler.transform(test)

train_std=pd.DataFrame(train)
test_std=pd.DataFrame(test)

res=[1 if x =='Y' else 0 for x in train_PERT]

In [4]:
sm = SMOTE(random_state=1234)
X_res, y_res = sm.fit_sample(train_std, res)

In [5]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


In [6]:
#hyper parameter options, could also apply to other model. 
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 20),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}


# Optimize Hyper-parameters by Recall

In [7]:
rf=RandomForestClassifier(n_estimators=100)

In [8]:
#search 50 times     
seed = np.random.seed(1234)
n_iter_search = 20
random_search = RandomizedSearchCV(rf, param_distributions=param_dist,
                                   n_iter=n_iter_search,scoring ='recall',verbose=100)

start = time()
random_search.fit(X_res,y_res )
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

report(random_search.cv_results_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] min_samples_split=10, bootstrap=False, max_features=13, criterion=entropy, min_samples_leaf=5, max_depth=3 
[CV]  min_samples_split=10, bootstrap=False, max_features=13, criterion=entropy, min_samples_leaf=5, max_depth=3, score=0.949531, total= 4.1min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.2min remaining:    0.0s
[CV] min_samples_split=10, bootstrap=False, max_features=13, criterion=entropy, min_samples_leaf=5, max_depth=3 
[CV]  min_samples_split=10, bootstrap=False, max_features=13, criterion=entropy, min_samples_leaf=5, max_depth=3, score=0.950659, total= 4.4min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  8.7min remaining:    0.0s
[CV] min_samples_split=10, bootstrap=False, max_features=13, criterion=entropy, min_samples_leaf=5, max_depth=3 
[CV]  min_samples_split=10, bootstrap=False, max_features=13, criterion=entropy, min_samples_leaf=5, max_depth=3, score=0.950667, total= 4.4min
[Paralle

In [9]:
#RandomizedSearchCV took 20049.04 seconds for 20 candidates parameter settings.
#Model with rank: 1
#Mean validation score: 0.998 (std: 0.003)
#Parameters: {'min_samples_split': 4, 'bootstrap': True, 'max_features': 5, 'criterion': 'gini', 
#'min_samples_leaf': 4, 'max_depth': None}

rf=RandomForestClassifier(n_estimators=500)
rf=RandomForestClassifier(criterion='gini', max_depth=None, min_samples_split=4, 
                       min_samples_leaf=4,max_features=5,bootstrap=True,random_state=1234)


In [10]:
#fit the model
rf.fit(X_res,y_res)

predicted_probs = rf.predict_proba(train_std)
a= predicted_probs[:,1]
np.mean(a)
b= predicted_probs[:,0]
np.mean(b)
a1=sorted(a,reverse=True)
a2=pd.DataFrame(a1)
a2.iloc[train_PERT.value_counts()["Y"]*4-1,:]

0    0.205
Name: 6679, dtype: float64

In [11]:
#Synchronize with Jeff's Scoring Functions
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
    f1_score, roc_auc_score, average_precision_score, brier_score_loss, \
    fbeta_score, confusion_matrix
from creonmetrics import labeled_metric, assumed_metric, pu_score, \
    pr_one_unlabeled, brier_score_partial_loss

y_true = test_Ycat.values
y_prob = rf.predict_proba(test_std)

y_pred=np.array([1 if x >= 0.205 else 0 for x in y_prob[:,1]])

In [12]:
import collections
data = {'labeled_acc' : labeled_metric(y_true, y_pred, accuracy_score),
        'labeled_prec' : labeled_metric(y_true, y_pred, precision_score),
        'labeled_recall' : labeled_metric(y_true, y_pred, recall_score),
        'labeled_f1' : labeled_metric(y_true, y_pred, f1_score),
        'labeled_roc_auc' : labeled_metric(y_true, y_pred, roc_auc_score),
        'labeled_avg_prec' : labeled_metric(y_true, y_pred, average_precision_score),
        'labeled_brier' : labeled_metric(y_true, y_prob, brier_score_loss),
        'labeled_brier_pos' : labeled_metric(y_true, y_prob, brier_score_partial_loss, label=1),
        'labeled_brier_neg' : labeled_metric(y_true, y_prob, brier_score_partial_loss, label=0),
        'confusion_matrix_lab' : labeled_metric(y_true, y_pred, confusion_matrix),
        'pr_one_unlabeled' : pr_one_unlabeled(y_true, y_pred),
        'assumed_brier' : assumed_metric(y_true, y_prob, brier_score_loss),
        'assumed_brier_neg' : assumed_metric(y_true, y_prob, brier_score_partial_loss, label=0),
        'assumed_f1' : assumed_metric(y_true, y_pred, f1_score),
        'assumed_f1beta10' : assumed_metric(y_true, y_pred, fbeta_score, beta=10),
        'confusion_matrix_un' : assumed_metric(y_true, y_pred, confusion_matrix),
        'pu_score' : pu_score(y_true, y_pred),
        }
data_s = collections.OrderedDict(sorted(data.items()))
for k, v in data_s.items():
    print(k, ': ', v)


assumed_brier :  0.007355754824
assumed_brier_neg :  0.00457415648911
assumed_f1 :  0.162200785995
assumed_f1beta10 :  0.518933478192
confusion_matrix_lab :  [[189  27]
 [191 227]]
confusion_matrix_un :  [[85568  2154]
 [  191   227]]
labeled_acc :  0.656151419558
labeled_avg_prec :  0.869012409006
labeled_brier :  0.397944167705
labeled_brier_neg :  0.0241413318277
labeled_brier_pos :  0.591105441747
labeled_f1 :  0.675595238095
labeled_prec :  0.893700787402
labeled_recall :  0.543062200957
labeled_roc_auc :  0.709031100478
pr_one_unlabeled :  0.0243069046694
pu_score :  10.9172385884
