# Setup, matplotlib inline, automatically reload libraries on every evaluation

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
pd.options.display.max_rows = 400
pd.options.display.max_columns = 400
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
from loadcreon import LoadCreon
from creonmetrics import pu_scorer, prior_squared_error_scorer_015, brier_score_labeled_loss_scorer, \
    f1_assumed_scorer, f1_labeled_scorer, report_metrics, f1_assumed_beta10_scorer
from semisuperhelper import SemiSupervisedHelper
from pnuwrapper import PNUWrapper
from jeffsearchcv import JeffRandomSearchCV
from nestedcross import NestedCV
from frankenscorer import FrankenScorer, extract_scores_from_nested, extract_score_grid
from searchrf import save_search, load_search

In [3]:
import xgboost

In [4]:
path = "C:\Data\\010317\membership14_final_0103.txt"
lc = LoadCreon(path)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.base import clone

In [6]:
X_train, X_test, y_train, y_test = train_test_split(lc.X, lc.y, test_size=0.2, random_state=771, stratify=lc.y)

# SET UP XGBOOST FIRST TRY!

In [11]:
y_train_xg = y_train.copy()
y_train_xg[y_train_xg==-1] = 0
y_train_xg.value_counts()

0    350886
1      1670
dtype: int64

In [12]:
xg = xgboost.XGBRegressor(max_depth=6, learning_rate=0.3)

In [13]:
xg.fit(X_train.values, y_train_xg.values, early_stopping_rounds=35)

IndexError: list index out of range

## Set up randomized search parameters

In [6]:
rf = RandomForestClassifier()
et = ExtraTreesClassifier()
pnu = PNUWrapper(base_estimator=rf)

In [7]:
rf_param_search = {'base_estimator':[rf, et],
 'base_estimator__bootstrap': [True, False],
 'base_estimator__class_weight': [None,'balanced','balanced_subsample'],
 'base_estimator__criterion': ['gini','entropy'],
 'base_estimator__max_depth': [None] + list(range(2,100)),
 'base_estimator__max_features': ['sqrt','log2',None] + list(range(5,100)),
 'base_estimator__min_samples_leaf': [1,2,3,4,5,6,7,8,9,10,15,20,25,30,35,40,45,50,75,100],
 'base_estimator__min_samples_split':[2,0.005,0.01,0.015,0.02,0.025,0.03,0.035,0.04,
                                                        0.045,0.05,0.07,0.09,0.1,0.12,0.15,0.17,0.2,0.25],
 'base_estimator__n_estimators': sp.stats.randint(low=10, high=300),
 'pu_learning': [True, False],
 'num_unlabeled':sp.stats.randint(low=0, high=15000)
                  }

## Set up PNU Wrapper with Random Forest, then JeffSearchCV, then NestedCV

In [8]:
jeffsearch = JeffRandomSearchCV(pnu, rf_param_search, n_iter=100, scoring=FrankenScorer(),
                                n_jobs=-1, cv=3, verbose=100, pre_dispatch=8)

#### 5x5 (x40) nested cross validation

In [9]:
nested_cross = NestedCV(jeffsearch, scoring=FrankenScorer(), cv=3, random_state=33, use_same_random_state=True)

## Score the nested cross - 1000 models!

In [None]:
scores = nested_cross.score(X_train.values, y=y_train.values, verbose=1)

## The scores of the 5 folds of the outer loop.  FrankenScorer by default uses labeled_f1 metric which probably isn't what we want to use for the end result

In [214]:
scores

(0.95601173020527863,
 0.9216589861751151,
 0.95718654434250761,
 0.89677419354838706,
 0.90822784810126567)