In [1]:
import scipy as sp
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from scipy.stats import randint as sp_randint
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import brier_score_loss
from sklearn.model_selection import RandomizedSearchCV
from time import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [2]:
#NEED INSTALL IMBLANCE PACKAGE
#code: conda install -c glemaitre imbalanced-learn
from imblearn.over_sampling import SMOTE
#read tables: After variable selection, choosing 50 variables by baseline RF importance
train=pd.read_table('training_self1_0127.txt')
test=pd.read_table('testing_self1_0127.txt')

In [3]:
train_Ycat=train['Ycat']
train_PERT=train['PERT_Y']
train=train.drop('Ycat',1)
train=train.drop('MemberID',1)
train=train.drop('PERT_Y',1)

test_Ycat=test['Ycat']
test_PERT=test['PERT_Y']
test=test.drop('Ycat',1)
test=test.drop('PERT_Y',1)
test=test.drop('MemberID',1)

scaler = MinMaxScaler()
train = scaler.fit_transform(train)
test =  scaler.transform(test)

train_std=pd.DataFrame(train)
test_std=pd.DataFrame(test)

res=[1 if x =='Y' else 0 for x in train_PERT]

In [4]:
sm = SMOTE(random_state=1234,ratio=.2)
X_res, y_res = sm.fit_sample(train_std, res)

In [5]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


In [6]:
#hyper parameter options, could also apply to other model. 
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 20),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}


# Optimize Hyper-parameters by F1 Score

In [7]:
#model1
rf=RandomForestClassifier(n_estimators=100)

In [8]:
#search 50 times
seed = np.random.seed(1234)
n_iter_search = 20
random_search = RandomizedSearchCV(rf, param_distributions=param_dist,
                                   n_iter=n_iter_search,scoring ='f1',  n_jobs=-1, verbose=100,
                                   pre_dispatch =10)

start = time()
random_search.fit(X_res,y_res )
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

report(random_search.cv_results_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Memmaping (shape=(421062L, 51L), dtype=float64) to new file c:\users\motoharu.dei\appdata\local\temp\joblib_memmaping_pool_6232_156298320\6232-278938624-9005ccc209ccf046c5d5153ae592621a.pkl
Memmaping (shape=(421062L,), dtype=int32) to new file c:\users\motoharu.dei\appdata\local\temp\joblib_memmaping_pool_6232_156298320\6232-278938624-83f7956e1aa6563de788f77dc68fb8e9.pkl
Memmaping (shape=(280707L,), dtype=int64) to new file c:\users\motoharu.dei\appdata\local\temp\joblib_memmaping_pool_6232_156298320\6232-278938624-5ff6027c1f0dfb71e9be14d66cfa5499.pkl
Memmaping (shape=(140355L,), dtype=int64) to new file c:\users\motoharu.dei\appdata\local\temp\joblib_memmaping_pool_6232_156298320\6232-278938624-a5091e1951d332bf63830e38c9eb6ee5.pkl
Memmaping (shape=(421062L, 51L), dtype=float64) to old file c:\users\motoharu.dei\appdata\local\temp\joblib_memmaping_pool_6232_156298320\6232-278938624-9005ccc209ccf046c5d5153ae592621a.pkl
Memmapi

In [13]:
#RandomizedSearchCV took 2039.49 seconds for 20 candidates parameter settings.
#Model with rank: 1
#Mean validation score: 0.988 (std: 0.009)
#Parameters: {'bootstrap': False, 'min_samples_leaf': 1, 'min_samples_split': 4, 
#'criterion': 'gini', 'max_features': 10, 'max_depth': None}

rf=RandomForestClassifier(n_estimators=500)
rf=RandomForestClassifier(criterion='gini', max_depth=None, min_samples_split=4, 
                       min_samples_leaf=1,max_features=10,bootstrap=False,random_state=1234)


In [14]:
#fit the model
rf.fit(X_res,y_res)

predicted_probs = rf.predict_proba(train_std)
a= predicted_probs[:,1]
np.mean(a)
b= predicted_probs[:,0]
np.mean(b)
a1=sorted(a,reverse=True)
a2=pd.DataFrame(a1)
a2.iloc[train_PERT.value_counts()["Y"]*4-1,:]

0    0.033333
Name: 6679, dtype: float64

In [15]:
#Synchronize with Jeff's Scoring Functions
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
    f1_score, roc_auc_score, average_precision_score, brier_score_loss, \
    fbeta_score, confusion_matrix
from creonmetrics import labeled_metric, assumed_metric, pu_score, \
    pr_one_unlabeled, brier_score_partial_loss

y_true = test_Ycat.values
y_prob = rf.predict_proba(test_std)

y_pred=np.array([1 if x >= 0.033333 else 0 for x in y_prob[:,1]])

In [16]:
import collections
data = {'labeled_acc' : labeled_metric(y_true, y_pred, accuracy_score),
        'labeled_prec' : labeled_metric(y_true, y_pred, precision_score),
        'labeled_recall' : labeled_metric(y_true, y_pred, recall_score),
        'labeled_f1' : labeled_metric(y_true, y_pred, f1_score),
        'labeled_roc_auc' : labeled_metric(y_true, y_pred, roc_auc_score),
        'labeled_avg_prec' : labeled_metric(y_true, y_pred, average_precision_score),
        'labeled_brier' : labeled_metric(y_true, y_prob, brier_score_loss),
        'labeled_brier_pos' : labeled_metric(y_true, y_prob, brier_score_partial_loss, label=1),
        'labeled_brier_neg' : labeled_metric(y_true, y_prob, brier_score_partial_loss, label=0),
        'confusion_matrix_lab' : labeled_metric(y_true, y_pred, confusion_matrix),
        'pr_one_unlabeled' : pr_one_unlabeled(y_true, y_pred),
        'assumed_brier' : assumed_metric(y_true, y_prob, brier_score_loss),
        'assumed_brier_neg' : assumed_metric(y_true, y_prob, brier_score_partial_loss, label=0),
        'assumed_f1' : assumed_metric(y_true, y_pred, f1_score),
        'assumed_f1beta10' : assumed_metric(y_true, y_pred, fbeta_score, beta=10),
        'confusion_matrix_un' : assumed_metric(y_true, y_pred, confusion_matrix),
        'pu_score' : pu_score(y_true, y_pred),
        }
data_s = collections.OrderedDict(sorted(data.items()))
for k, v in data_s.items():
    print(k, ': ', v)


assumed_brier :  0.00562505987948
assumed_brier_neg :  0.00238222896068
assumed_f1 :  0.0962722852512
assumed_f1beta10 :  0.630825201884
confusion_matrix_lab :  [[160  56]
 [121 297]]
confusion_matrix_un :  [[82267  5455]
 [  121   297]]
labeled_acc :  0.720820189274
labeled_avg_prec :  0.871368912088
labeled_brier :  0.457327812829
labeled_brier_neg :  0.0144765946502
labeled_brier_pos :  0.686169590643
labeled_f1 :  0.770428015564
labeled_prec :  0.841359773371
labeled_recall :  0.710526315789
labeled_roc_auc :  0.725633528265
pr_one_unlabeled :  0.0616986263799
pu_score :  7.73596513702


# Optimize Hyper-parameters by Recall*Precision

In [9]:
from sklearn.metrics import precision_recall_fscore_support,recall_score, make_scorer,precision_score

def pu(actual,prediction):
    precision = precision_score(actual, prediction)
    recall = recall_score(actual, prediction)
    pu= precision*recall
    return pu

pu_score=make_scorer(pu,greater_is_better=True)

In [10]:
rf=RandomForestClassifier(n_estimators=100)

In [11]:
#search 50 times     
seed = np.random.seed(1234)
n_iter_search = 20
random_search = RandomizedSearchCV(rf, param_distributions=param_dist,
                                   n_iter=n_iter_search,scoring =pu_score, verbose=100,
                                  )

start = time()
random_search.fit(X_res,y_res )
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

report(random_search.cv_results_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] max_features=13, max_depth=3, min_samples_leaf=5, criterion=entropy, bootstrap=False, min_samples_split=10 
[CV]  max_features=13, max_depth=3, min_samples_leaf=5, criterion=entropy, bootstrap=False, min_samples_split=10, score=0.682543, total=  44.6s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   46.4s remaining:    0.0s
[CV] max_features=13, max_depth=3, min_samples_leaf=5, criterion=entropy, bootstrap=False, min_samples_split=10 
[CV]  max_features=13, max_depth=3, min_samples_leaf=5, criterion=entropy, bootstrap=False, min_samples_split=10, score=0.726020, total=  43.6s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.5min remaining:    0.0s
[CV] max_features=13, max_depth=3, min_samples_leaf=5, criterion=entropy, bootstrap=False, min_samples_split=10 
[CV]  max_features=13, max_depth=3, min_samples_leaf=5, criterion=entropy, bootstrap=False, min_samples_split=10, score=0.714902, total=  43.7s
[Paralle

In [19]:
#RandomizedSearchCV took 4576.51 seconds for 20 candidates parameter settings.
#Model with rank: 1
#Mean validation score: 0.977 (std: 0.021)
#Parameters: {'max_features': 14, 'max_depth': None, 'min_samples_leaf': 1, 'criterion': 'entropy', 
#'bootstrap': False, 'min_samples_split': 2}

rf=RandomForestClassifier(n_estimators=500)
rf=RandomForestClassifier(criterion='entropy', max_depth=None, min_samples_split=2, 
                       min_samples_leaf=1,max_features=14,bootstrap=False,random_state=1234)


In [20]:
#fit the model
rf.fit(X_res,y_res)

predicted_probs = rf.predict_proba(train_std)
a= predicted_probs[:,1]
np.mean(a)
b= predicted_probs[:,0]
np.mean(b)
a1=sorted(a,reverse=True)
a2=pd.DataFrame(a1)
a2.iloc[train_PERT.value_counts()["Y"]*4-1,:]

0    0.0
Name: 6679, dtype: float64

In [21]:
#Synchronize with Jeff's Scoring Functions
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
    f1_score, roc_auc_score, average_precision_score, brier_score_loss, \
    fbeta_score, confusion_matrix
from creonmetrics import labeled_metric, assumed_metric, pu_score, \
    pr_one_unlabeled, brier_score_partial_loss

y_true = test_Ycat.values
y_prob = rf.predict_proba(test_std)

y_pred=np.array([1 if x >= 0.0 else 0 for x in y_prob[:,1]])

In [22]:
import collections
data = {'labeled_acc' : labeled_metric(y_true, y_pred, accuracy_score),
        'labeled_prec' : labeled_metric(y_true, y_pred, precision_score),
        'labeled_recall' : labeled_metric(y_true, y_pred, recall_score),
        'labeled_f1' : labeled_metric(y_true, y_pred, f1_score),
        'labeled_roc_auc' : labeled_metric(y_true, y_pred, roc_auc_score),
        'labeled_avg_prec' : labeled_metric(y_true, y_pred, average_precision_score),
        'labeled_brier' : labeled_metric(y_true, y_prob, brier_score_loss),
        'labeled_brier_pos' : labeled_metric(y_true, y_prob, brier_score_partial_loss, label=1),
        'labeled_brier_neg' : labeled_metric(y_true, y_prob, brier_score_partial_loss, label=0),
        'confusion_matrix_lab' : labeled_metric(y_true, y_pred, confusion_matrix),
        'pr_one_unlabeled' : pr_one_unlabeled(y_true, y_pred),
        'assumed_brier' : assumed_metric(y_true, y_prob, brier_score_loss),
        'assumed_brier_neg' : assumed_metric(y_true, y_prob, brier_score_partial_loss, label=0),
        'assumed_f1' : assumed_metric(y_true, y_pred, f1_score),
        'assumed_f1beta10' : assumed_metric(y_true, y_pred, fbeta_score, beta=10),
        'confusion_matrix_un' : assumed_metric(y_true, y_pred, confusion_matrix),
        'pu_score' : pu_score(y_true, y_pred),
        }
data_s = collections.OrderedDict(sorted(data.items()))
for k, v in data_s.items():
    print(k, ': ', v)


assumed_brier :  0.0054550714772
assumed_brier_neg :  0.00212307060943
assumed_f1 :  0.00944014092459
assumed_f1beta10 :  0.324903801755
confusion_matrix_lab :  [[  0 216]
 [  0 418]]
confusion_matrix_un :  [[    0 87722]
 [    0   418]]
labeled_acc :  0.659305993691
labeled_avg_prec :  0.829652996845
labeled_brier :  0.468722397476
labeled_brier_neg :  0.012037037037
labeled_brier_pos :  0.70471291866
labeled_f1 :  0.794676806084
labeled_prec :  0.659305993691
labeled_recall :  1.0
labeled_roc_auc :  0.5
pr_one_unlabeled :  1.0
pu_score :  1.0


# Optimize Hyper-parameters by Recall

In [23]:
rf=RandomForestClassifier(n_estimators=100)

In [24]:
#search 50 times     
seed = np.random.seed(1234)
n_iter_search = 20
random_search = RandomizedSearchCV(rf, param_distributions=param_dist,
                                   n_iter=n_iter_search,scoring ='recall',verbose=100)

start = time()
random_search.fit(X_res,y_res )
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

report(random_search.cv_results_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] max_features=13, max_depth=3, min_samples_leaf=5, criterion=entropy, bootstrap=False, min_samples_split=10 
[CV]  max_features=13, max_depth=3, min_samples_leaf=5, criterion=entropy, bootstrap=False, min_samples_split=10, score=0.734279, total=  48.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   49.9s remaining:    0.0s
[CV] max_features=13, max_depth=3, min_samples_leaf=5, criterion=entropy, bootstrap=False, min_samples_split=10 
[CV]  max_features=13, max_depth=3, min_samples_leaf=5, criterion=entropy, bootstrap=False, min_samples_split=10, score=0.780908, total=  57.8s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.8min remaining:    0.0s
[CV] max_features=13, max_depth=3, min_samples_leaf=5, criterion=entropy, bootstrap=False, min_samples_split=10 
[CV]  max_features=13, max_depth=3, min_samples_leaf=5, criterion=entropy, bootstrap=False, min_samples_split=10, score=0.807242, total= 1.3min
[Paralle

In [25]:
#RandomizedSearchCV took 7874.27 seconds for 20 candidates parameter settings.
#Model with rank: 1
#Mean validation score: 0.985 (std: 0.020)
#Parameters: {'max_features': 7, 'max_depth': None, 'min_samples_leaf': 1, 'criterion': 'entropy', 
#'bootstrap': True, 'min_samples_split': 3}

rf=RandomForestClassifier(n_estimators=500)
rf=RandomForestClassifier(criterion='entropy', max_depth=None, min_samples_split=3, 
                       min_samples_leaf=1,max_features=7,bootstrap=True,random_state=1234)


In [26]:
#fit the model
rf.fit(X_res,y_res)

predicted_probs = rf.predict_proba(train_std)
a= predicted_probs[:,1]
np.mean(a)
b= predicted_probs[:,0]
np.mean(b)
a1=sorted(a,reverse=True)
a2=pd.DataFrame(a1)
a2.iloc[train_PERT.value_counts()["Y"]*4-1,:]

0    0.1
Name: 6679, dtype: float64

In [27]:
#Synchronize with Jeff's Scoring Functions
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
    f1_score, roc_auc_score, average_precision_score, brier_score_loss, \
    fbeta_score, confusion_matrix
from creonmetrics import labeled_metric, assumed_metric, pu_score, \
    pr_one_unlabeled, brier_score_partial_loss

y_true = test_Ycat.values
y_prob = rf.predict_proba(test_std)

y_pred=np.array([1 if x >= 0.1 else 0 for x in y_prob[:,1]])

In [28]:
import collections
data = {'labeled_acc' : labeled_metric(y_true, y_pred, accuracy_score),
        'labeled_prec' : labeled_metric(y_true, y_pred, precision_score),
        'labeled_recall' : labeled_metric(y_true, y_pred, recall_score),
        'labeled_f1' : labeled_metric(y_true, y_pred, f1_score),
        'labeled_roc_auc' : labeled_metric(y_true, y_pred, roc_auc_score),
        'labeled_avg_prec' : labeled_metric(y_true, y_pred, average_precision_score),
        'labeled_brier' : labeled_metric(y_true, y_prob, brier_score_loss),
        'labeled_brier_pos' : labeled_metric(y_true, y_prob, brier_score_partial_loss, label=1),
        'labeled_brier_neg' : labeled_metric(y_true, y_prob, brier_score_partial_loss, label=0),
        'confusion_matrix_lab' : labeled_metric(y_true, y_pred, confusion_matrix),
        'pr_one_unlabeled' : pr_one_unlabeled(y_true, y_pred),
        'assumed_brier' : assumed_metric(y_true, y_prob, brier_score_loss),
        'assumed_brier_neg' : assumed_metric(y_true, y_prob, brier_score_partial_loss, label=0),
        'assumed_f1' : assumed_metric(y_true, y_pred, f1_score),
        'assumed_f1beta10' : assumed_metric(y_true, y_pred, fbeta_score, beta=10),
        'confusion_matrix_un' : assumed_metric(y_true, y_pred, confusion_matrix),
        'pu_score' : pu_score(y_true, y_pred),
        }
data_s = collections.OrderedDict(sorted(data.items()))
for k, v in data_s.items():
    print(k, ': ', v)


assumed_brier :  0.00613907466821
assumed_brier_neg :  0.00309353034625
assumed_f1 :  0.100098135427
assumed_f1beta10 :  0.650707427994
confusion_matrix_lab :  [[161  55]
 [112 306]]
confusion_matrix_un :  [[82332  5390]
 [  112   306]]
labeled_acc :  0.736593059937
labeled_avg_prec :  0.878179498525
labeled_brier :  0.431848400806
labeled_brier_neg :  0.01881719393
labeled_brier_pos :  0.64528079479
labeled_f1 :  0.785622593068
labeled_prec :  0.847645429363
labeled_recall :  0.732057416268
labeled_roc_auc :  0.738713893319
pr_one_unlabeled :  0.0609672479601
pu_score :  8.29265036363
