In [1]:
import scipy as sp
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from scipy.stats import randint as sp_randint
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import brier_score_loss
from sklearn.model_selection import RandomizedSearchCV
from time import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
#NEED INSTALL IMBLANCE PACKAGE
#code: conda install -c glemaitre imbalanced-learn
from imblearn.over_sampling import SMOTE
#read tables: After variable selection, choosing 50 variables by baseline RF importance
train=pd.read_table('training_self1_0127.txt')
test=pd.read_table('testing_self1_0127.txt')

In [3]:
train_Ycat=train['Ycat']
train_PERT=train['PERT_Y']
train=train.drop('Ycat',1)
train=train.drop('MemberID',1)
train=train.drop('PERT_Y',1)

test_Ycat=test['Ycat']
test_PERT=test['PERT_Y']
test=test.drop('Ycat',1)
test=test.drop('PERT_Y',1)
test=test.drop('MemberID',1)

scaler = MinMaxScaler()
train = scaler.fit_transform(train)
test =  scaler.transform(test)

train_std=pd.DataFrame(train)
test_std=pd.DataFrame(test)

res=[1 if x =='Y' else 0 for x in train_PERT]

In [4]:
sm = SMOTE(random_state=1234,ratio=.2)
X_res, y_res = sm.fit_sample(train_std, res)

In [5]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            

In [6]:
#hyper parameter options, could also apply to other model. 
param_dist = {"loss": ['deviance', 'exponential'],
              "learning_rate": sp.stats.expon(scale=.1),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "max_depth": sp_randint(1, 5),
              "max_features": ['auto', 'sqrt','log2']}


# Optimize Hyper-parameters by Recall*Precision

In [7]:
gbc=GradientBoostingClassifier(random_state=1234)

In [8]:
from sklearn.metrics import precision_recall_fscore_support,recall_score, make_scorer,precision_score

def pu(actual,prediction):
    precision = precision_score(actual, prediction)
    recall = recall_score(actual, prediction)
    pu= precision*recall
    return pu

pu_score=make_scorer(pu,greater_is_better=True)

In [10]:
#search 50 times     
seed = np.random.seed(1234)
n_iter_search = 20
random_search = RandomizedSearchCV(gbc, param_distributions=param_dist,
                                   n_iter=n_iter_search,scoring =pu_score, verbose=100)

start = time()
random_search.fit(X_res,y_res )
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

report(random_search.cv_results_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] learning_rate=0.0212598657618, max_features=auto, loss=deviance, max_depth=2, min_samples_split=10, min_samples_leaf=5 
[CV]  learning_rate=0.0212598657618, max_features=auto, loss=deviance, max_depth=2, min_samples_split=10, min_samples_leaf=5, score=0.742425, total= 2.0min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.0min remaining:    0.0s
[CV] learning_rate=0.0212598657618, max_features=auto, loss=deviance, max_depth=2, min_samples_split=10, min_samples_leaf=5 
[CV]  learning_rate=0.0212598657618, max_features=auto, loss=deviance, max_depth=2, min_samples_split=10, min_samples_leaf=5, score=0.763739, total= 2.1min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.2min remaining:    0.0s
[CV] learning_rate=0.0212598657618, max_features=auto, loss=deviance, max_depth=2, min_samples_split=10, min_samples_leaf=5 
[CV]  learning_rate=0.0212598657618, max_features=auto, loss=deviance, max_depth=2, min_samp

  'precision', 'predicted', average, warn_for)


[CV]  learning_rate=0.00485132452566, max_features=log2, loss=deviance, max_depth=1, min_samples_split=3, min_samples_leaf=4, score=0.000000, total=  14.5s
[Parallel(n_jobs=1)]: Done  28 out of  28 | elapsed: 29.8min remaining:    0.0s
[CV] learning_rate=0.00485132452566, max_features=log2, loss=deviance, max_depth=1, min_samples_split=3, min_samples_leaf=4 


  'precision', 'predicted', average, warn_for)


[CV]  learning_rate=0.00485132452566, max_features=log2, loss=deviance, max_depth=1, min_samples_split=3, min_samples_leaf=4, score=0.000000, total=  13.9s
[Parallel(n_jobs=1)]: Done  29 out of  29 | elapsed: 30.0min remaining:    0.0s
[CV] learning_rate=0.00485132452566, max_features=log2, loss=deviance, max_depth=1, min_samples_split=3, min_samples_leaf=4 


  'precision', 'predicted', average, warn_for)


[CV]  learning_rate=0.00485132452566, max_features=log2, loss=deviance, max_depth=1, min_samples_split=3, min_samples_leaf=4, score=0.000000, total=  13.5s
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 30.3min remaining:    0.0s
[CV] learning_rate=0.0824242867784, max_features=sqrt, loss=deviance, max_depth=3, min_samples_split=9, min_samples_leaf=7 
[CV]  learning_rate=0.0824242867784, max_features=sqrt, loss=deviance, max_depth=3, min_samples_split=9, min_samples_leaf=7, score=0.831788, total=  40.0s
[Parallel(n_jobs=1)]: Done  31 out of  31 | elapsed: 31.0min remaining:    0.0s
[CV] learning_rate=0.0824242867784, max_features=sqrt, loss=deviance, max_depth=3, min_samples_split=9, min_samples_leaf=7 
[CV]  learning_rate=0.0824242867784, max_features=sqrt, loss=deviance, max_depth=3, min_samples_split=9, min_samples_leaf=7, score=0.871310, total=  39.8s
[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed: 31.6min remaining:    0.0s
[CV] learning_rate=0.0824242867784, max_featur

  'precision', 'predicted', average, warn_for)


[CV]  learning_rate=0.00117002534447, max_features=sqrt, loss=exponential, max_depth=3, min_samples_split=5, min_samples_leaf=2, score=0.000000, total=  41.5s
[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed: 53.0min remaining:    0.0s
[CV] learning_rate=0.00117002534447, max_features=sqrt, loss=exponential, max_depth=3, min_samples_split=5, min_samples_leaf=2 


  'precision', 'predicted', average, warn_for)


[CV]  learning_rate=0.00117002534447, max_features=sqrt, loss=exponential, max_depth=3, min_samples_split=5, min_samples_leaf=2, score=0.000000, total=  41.1s
[Parallel(n_jobs=1)]: Done  53 out of  53 | elapsed: 53.7min remaining:    0.0s
[CV] learning_rate=0.00117002534447, max_features=sqrt, loss=exponential, max_depth=3, min_samples_split=5, min_samples_leaf=2 


  'precision', 'predicted', average, warn_for)


[CV]  learning_rate=0.00117002534447, max_features=sqrt, loss=exponential, max_depth=3, min_samples_split=5, min_samples_leaf=2, score=0.000000, total=  40.7s
[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed: 54.4min remaining:    0.0s
[CV] learning_rate=0.11712352736, max_features=sqrt, loss=exponential, max_depth=2, min_samples_split=2, min_samples_leaf=4 
[CV]  learning_rate=0.11712352736, max_features=sqrt, loss=exponential, max_depth=2, min_samples_split=2, min_samples_leaf=4, score=0.774687, total=  28.5s
[Parallel(n_jobs=1)]: Done  55 out of  55 | elapsed: 54.9min remaining:    0.0s
[CV] learning_rate=0.11712352736, max_features=sqrt, loss=exponential, max_depth=2, min_samples_split=2, min_samples_leaf=4 
[CV]  learning_rate=0.11712352736, max_features=sqrt, loss=exponential, max_depth=2, min_samples_split=2, min_samples_leaf=4, score=0.814464, total=  28.3s
[Parallel(n_jobs=1)]: Done  56 out of  56 | elapsed: 55.4min remaining:    0.0s
[CV] learning_rate=0.11712352736, max_f

In [11]:
#RandomizedSearchCV took 3722.75 seconds for 20 candidates parameter settings.
#Model with rank: 1
#Mean validation score: 0.961 (std: 0.028)
#Parameters: {'learning_rate': 0.24318158771707576, 'max_features': 'auto', 'loss': 'deviance', 
#'max_depth': 3, 'min_samples_split': 4, 'min_samples_leaf': 5}

gbc=GradientBoostingClassifier(random_state=1234,loss='deviance',learning_rate=
                                0.24318158771707576,min_samples_leaf=5,
                                min_samples_split=4,max_features='auto',max_depth=3)

In [12]:
#fit the model
gbc.fit(X_res,y_res)

predicted_probs = gbc.predict_proba(train_std)
a= predicted_probs[:,1]
np.mean(a)
b= predicted_probs[:,0]
np.mean(b)
a1=sorted(a,reverse=True)
a2=pd.DataFrame(a1)
a2.iloc[train_PERT.value_counts()["Y"]*4-1,:]

0    0.129059
Name: 6679, dtype: float64

In [13]:
#Synchronize with Jeff's Scoring Functions
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
    f1_score, roc_auc_score, average_precision_score, brier_score_loss, \
    fbeta_score, confusion_matrix
from creonmetrics import labeled_metric, assumed_metric, pu_score, \
    pr_one_unlabeled, brier_score_partial_loss

y_true = test_Ycat.values
y_prob = gbc.predict_proba(test_std)

y_pred=np.array([1 if x >= 0.129059 else 0 for x in y_prob[:,1]])

In [14]:
import collections
data = {'labeled_acc' : labeled_metric(y_true, y_pred, accuracy_score),
        'labeled_prec' : labeled_metric(y_true, y_pred, precision_score),
        'labeled_recall' : labeled_metric(y_true, y_pred, recall_score),
        'labeled_f1' : labeled_metric(y_true, y_pred, f1_score),
        'labeled_roc_auc' : labeled_metric(y_true, y_pred, roc_auc_score),
        'labeled_avg_prec' : labeled_metric(y_true, y_pred, average_precision_score),
        'labeled_brier' : labeled_metric(y_true, y_prob, brier_score_loss),
        'labeled_brier_pos' : labeled_metric(y_true, y_prob, brier_score_partial_loss, label=1),
        'labeled_brier_neg' : labeled_metric(y_true, y_prob, brier_score_partial_loss, label=0),
        'confusion_matrix_lab' : labeled_metric(y_true, y_pred, confusion_matrix),
        'pr_one_unlabeled' : pr_one_unlabeled(y_true, y_pred),
        'assumed_brier' : assumed_metric(y_true, y_prob, brier_score_loss),
        'assumed_brier_neg' : assumed_metric(y_true, y_prob, brier_score_partial_loss, label=0),
        'assumed_f1' : assumed_metric(y_true, y_pred, f1_score),
        'assumed_f1beta10' : assumed_metric(y_true, y_pred, fbeta_score, beta=10),
        'confusion_matrix_un' : assumed_metric(y_true, y_pred, confusion_matrix),
        'pu_score' : pu_score(y_true, y_pred),
        }
data_s = collections.OrderedDict(sorted(data.items()))
for k, v in data_s.items():
    print(k, ': ', v)


assumed_brier :  0.004992672206
assumed_brier_neg :  0.00177169141347
assumed_f1 :  0.204043253409
assumed_f1beta10 :  0.503734859454
confusion_matrix_lab :  [[200  16]
 [201 217]]
confusion_matrix_un :  [[86230  1492]
 [  201   217]]
labeled_acc :  0.657728706625
labeled_avg_prec :  0.8837519642
labeled_brier :  0.454738728296
labeled_brier_neg :  0.0169747207195
labeled_brier_pos :  0.680951708288
labeled_f1 :  0.666666666667
labeled_prec :  0.931330472103
labeled_recall :  0.519138755981
labeled_roc_auc :  0.722532340953
pr_one_unlabeled :  0.0168674148058
pu_score :  13.8994587053
