# Setup, matplotlib inline, automatically reload libraries on every evaluation

In [13]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
# Raise pandas' display caps so frames up to 400 rows / 400 columns render in full.
pd.set_option("display.max_rows", 400)
pd.set_option("display.max_columns", 400)
# Render matplotlib figures inline in the notebook output.
# (Comments stay on their own lines: a line magic consumes the rest of its line.)
%matplotlib inline
# Load the autoreload extension; mode 2 reloads imported modules before each
# cell executes, so edits to the project's helper modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
from loadcreon import LoadCreon
from creonmetrics import pu_scorer, prior_squared_error_scorer_015, brier_score_labeled_loss_scorer, \
    f1_assumed_scorer, f1_labeled_scorer, report_metrics, f1_assumed_beta10_scorer
from semisuperhelper import SemiSupervisedHelper
from pnuwrapper import PNUWrapper
from jeffsearchcv import JeffRandomSearchCV
from nestedcross import NestedCV, rerun_nested_for_estimator, rerun_nested_for_scoring
from frankenscorer import FrankenScorer, extract_scores_from_nested, extract_score_grid
from searchrf import save_search, load_search
from rfsubsample import RandomForestSubsample

In [15]:
# Location of the membership extract on the local Windows drive.
# A raw string keeps every backslash literal; the previous non-raw literal only
# produced the right value because `\D` and `\m` are unrecognised escapes,
# which newer Python versions warn about.
path = r"C:\Data\010317\membership14_final_0103.txt"
# Build the project loader from the file path; its `X` and `y` attributes are
# what the train/test split below consumes.
lc = LoadCreon(path)

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.base import clone

In [17]:
# Hold out 20% of the data as a test set, stratified on the labels, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    lc.X,
    lc.y,
    test_size=0.2,
    random_state=771,
    stratify=lc.y,
)

## Set up randomized search parameters

In [18]:
# Search space sampled by the randomized search.
# Keys prefixed with `base_estimator__` presumably follow scikit-learn's
# `<component>__<param>` convention and target the estimator wrapped by
# PNUWrapper; `pu_learning` has no prefix, so it presumably belongs to the
# wrapper itself -- confirm against PNUWrapper.
# Lists are sampled uniformly; scipy frozen distributions are sampled via rvs().
# `stats` is imported explicitly: attribute access through a bare
# `import scipy as sp` is not guaranteed to expose the `stats` submodule on
# every SciPy version.
rf_param_search = {
    # Continuous uniform on [0.1, 1.0] (loc=0.1, scale=0.9).
    'base_estimator__target_imbalance_ratio': stats.uniform(loc=0.1, scale=0.9),
    'base_estimator__class_weight': [None, 'balanced', 'balanced_subsample'],
    'base_estimator__criterion': ['gini', 'entropy'],
    'base_estimator__max_depth': [None] + list(range(2, 100)),
    'base_estimator__max_features': ['sqrt', 'log2', None] + list(range(5, 100)),
    'base_estimator__min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                         15, 20, 25, 30, 35, 40, 45, 50, 75, 100],
    # Mixes an absolute count (2) with fractional values.
    'base_estimator__min_samples_split': [2, 0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035, 0.04,
                                          0.045, 0.05, 0.07, 0.09, 0.1, 0.12, 0.15, 0.17, 0.2, 0.25],
    # Discrete uniform over the integers 10..499 (`high` is exclusive).
    'base_estimator__n_estimators': stats.randint(low=10, high=500),
    'pu_learning': [True, False],
}

## Set up PNU Wrapper with Random Forest, then JeffSearchCV, then NestedCV

In [21]:
# Wrap a RandomForestSubsample in the project's PNUWrapper; this is the
# estimator whose `base_estimator__*` / `pu_learning` settings get searched.
pnu = PNUWrapper(
    base_estimator=RandomForestSubsample(verbose=1),
    random_state=42,
    num_unlabeled=1.0,
)

In [22]:
# Inner loop: randomized search over `rf_param_search`, 100 sampled candidates,
# 3-fold CV, scored with the 'pu_mix_assumed_f1beta10' FrankenScorer.
jeffsearch = JeffRandomSearchCV(
    pnu,
    rf_param_search,
    n_iter=100,
    scoring=FrankenScorer('pu_mix_assumed_f1beta10'),
    n_jobs=-1,
    cv=3,
    verbose=1,
    pre_dispatch=8,
)

#### 3x3 (x100) nested cross validation

In [23]:
# Outer loop: 3-fold nested CV around the randomized search, using the same
# FrankenScorer metric as the inner search and a fixed seed.
nested_cross = NestedCV(
    jeffsearch,
    scoring=FrankenScorer('pu_mix_assumed_f1beta10'),
    cv=3,
    random_state=77,
    use_same_random_state=True,
)

## Score the nested cross - 900 models!

In [None]:
# Run the full nested cross-validation on the training split (long-running:
# every outer fold launches its own 100-candidate, 3-fold search).
scores = nested_cross.score(
    X_train.values,
    y=y_train.values,
    verbose=1,
)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


## Scores for the 3 folds of the outer loop (FrankenScorer metric `pu_mix_assumed_f1beta10`: 100 * F-beta with beta=10, plus the PU score)

In [None]:
# Display the value returned by nested_cross.score(...) above.
scores

In [None]:
# Persist the fitted nested search under ./res so the expensive run does not
# have to be repeated (presumably reloadable with `load_search` -- confirm).
save_search(nested_cross, './res/nested_cross_rfsubsample_large_20170219.pkl')

In [None]:
# Average every extracted test metric across the outer folds, ordered by name.
(
    extract_scores_from_nested(nested_cross.test_score_datas_)
    .mean()
    .sort_index()
)

# Let's dive in and see the parameters of the best model from each outer fold

In [None]:
# One score grid per outer-fold search estimator.
all_scores = list(map(extract_score_grid, nested_cross.estimators_))

In [None]:
# For each outer fold, the index of the candidate with the highest mean
# pu_mix_assumed_f1beta10 test score.
best_clf_idx = [
    grid.mean_pu_mix_assumed_f1beta10_test.idxmax()
    for grid in all_scores
]

In [None]:
# Parameter settings of the top-scoring candidate in each outer fold.
[
    search.cv_results_['params'][winner]
    for search, winner in zip(nested_cross.estimators_, best_clf_idx)
]

## Let's see what feature importance looks like, averaged over the best estimator from each outer fold

In [None]:
# Collect the importances of each outer fold's best estimator, average them
# element-wise across folds, and rank the features from most to least important.
all_feature_importances = [
    search.best_estimator_.feature_importances_
    for search in nested_cross.estimators_
]
feature_importances = np.mean(all_feature_importances, axis=0)
feature_table = (
    pd.DataFrame(feature_importances, index=X_test.columns)
    .sort_values(by=0, ascending=False)
)

In [None]:
# Display the averaged feature importances, sorted in descending order.
feature_table

## Let's see what happens when we use `assumed_f1beta10` instead

In [None]:
# Re-evaluate the existing nested search under the 'assumed_f1beta10' metric.
# NOTE(review): this rebinds `nested_cross`, so later cells see the re-run
# object rather than the original one.
nested_cross, new_estimators = rerun_nested_for_scoring(
    nested_cross,
    'assumed_f1beta10',
    X=X_train.values,
    y=y_train.values,
    how='max',
    n_jobs=-1,
    verbose=1,
    return_estimators=True,
)

In [None]:
# Average every extracted test metric across the outer folds after the re-run.
(
    extract_scores_from_nested(nested_cross.test_score_datas_)
    .mean()
    .sort_index()
)

In [None]:
# Rebuild the per-fold score grids from the re-run nested search.
all_scores = list(map(extract_score_grid, nested_cross.estimators_))

In [None]:
# For each outer fold, the index of the candidate with the highest mean
# assumed_f1beta10 test score.
best_clf_idx = [
    grid.mean_assumed_f1beta10_test.idxmax()
    for grid in all_scores
]

In [None]:
# Parameter settings of the top-scoring candidate in each outer fold under the
# new metric.
[
    search.cv_results_['params'][winner]
    for search, winner in zip(nested_cross.estimators_, best_clf_idx)
]

In [None]:
# Same ranking as before, but built from the estimators returned by the re-run:
# average their importances element-wise and sort in descending order.
all_feature_importances = [
    model.feature_importances_
    for model in new_estimators
]
feature_importances = np.mean(all_feature_importances, axis=0)
feature_table = (
    pd.DataFrame(feature_importances, index=X_test.columns)
    .sort_values(by=0, ascending=False)
)

In [None]:
# Display the averaged feature importances of the re-run estimators.
feature_table

## Let's see what the unlabeled probability histogram looks like

In [None]:
# Predicted probabilities on the held-out test set from the first outer-fold
# estimator; column 1 is presumably the positive class -- confirm.
probabs = (
    nested_cross.estimators_[0]
    .predict_proba(X_test.values)[:, 1]
)

In [None]:
# Histogram (100 bins) of the predicted probabilities.
(
    pd.DataFrame(probabs, columns=['Predicted Probability'])
    .hist(bins=100)
)

In [None]:
# Same test-set probabilities, this time from the first estimator returned by
# the re-run.
new_probabs = (
    new_estimators[0]
    .predict_proba(X_test.values)[:, 1]
)

In [None]:
# Histogram (100 bins) of the re-run estimator's predicted probabilities.
(
    pd.DataFrame(new_probabs, columns=['Predicted Probability'])
    .hist(bins=100)
)