# Example of Using the `epiml.epimlmain` Module

In [36]:
import pandas as pd
import numpy as np
import scipy as sp
pd.options.display.max_rows = 400
pd.options.display.max_columns = 400
%matplotlib inline
%load_ext autoreload
%autoreload 2`

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
from epiml.epimlmain import EpimlModel

### Create an EpimlModel - see its documentation comment for more options

In [3]:
# Build a model using the constructor's default arguments (none are passed here).
model = EpimlModel()

### Train model

In [None]:
# Absolute, machine-specific location of the training file; adjust for your environment.
data_path = "C:\\Data\\010317\\membership14_final_0103.txt"
# Train `model` from the file at data_path; the call's return value is shown as the cell output.
model.generate_trained_model(data_path)

### Predict new data with model

In [None]:
# Absolute, machine-specific location of the file with new records to score; adjust as needed.
predict_new_data_path = "C:\\Data\\new_data.txt"
# As the last expression in the cell, the prediction result is displayed as its output.
model.predict(path=predict_new_data_path)

### Save model to disk

In [None]:
# Persist the model to trained_model.pkl (a relative path, presumably resolved
# against the kernel's working directory).
model.save_model(model_path="trained_model.pkl")

### Load model from disk

In [None]:
# Restore the model from the same file name used in the save step.
# NOTE(review): the .pkl extension suggests pickle; if so, only load files from
# trusted sources - confirm the on-disk format.
model.load_model(model_path="trained_model.pkl")

# Replicate Model 6 Memo numbers

In [42]:
# Separate model instance, built with default arguments, for the train/test-split run.
model_train = EpimlModel()

In [43]:
# Same training file as in the "Train model" example; absolute and machine-specific.
data_path = "C:\\Data\\010317\\membership14_final_0103.txt"

In [44]:
# The call returns five values, unpacked here as a classifier plus train/test
# features (X_*) and labels (y_*).
# NOTE(review): the meaning of each value is inferred from the variable names - confirm
# against the method's documentation.
clf, X_train, X_test, y_train, y_test = model_train.generate_trained_model_with_split(data_path)

In [8]:
from epiml.epimlsklearn.frankenscorer import FrankenScorer

In [41]:
# Build the scorer, then apply it to the classifier and the test split.
# The call's result (a metrics dict paired with a single score, per the output
# below) is displayed because it is the last expression in the cell.
scorer = FrankenScorer()
scorer(clf, X_test, y_test.values)

({'SCORE': 0.85496183206106879,
  'assumed_brier': 0.042653797253650108,
  'assumed_brier_neg': 0.042055028774489925,
  'assumed_f1': 0.1169305724725944,
  'assumed_f1beta10': 0.72006620127734511,
  'confusion_matrix_lab': array([[183,  32],
         [ 82, 336]]),
  'confusion_matrix_un': array([[82729,  4993],
         [   82,   336]]),
  'labeled_acc': 0.81990521327014221,
  'labeled_avg_prec': 0.92320654679803116,
  'labeled_brier': 0.14829033063607508,
  'labeled_brier_neg': 0.10936429540334816,
  'labeled_brier_pos': 0.16831209516965467,
  'labeled_f1': 0.85496183206106879,
  'labeled_prec': 0.91304347826086951,
  'labeled_recall': 0.80382775119617222,
  'labeled_roc_auc': 0.82749527094692332,
  'pr_one_unlabeled': 0.056692607448546976,
  'pu_mix_assumed_f1beta10': 82.693558799848489,
  'pu_score': 10.686938672113984},
 0.85496183206106879)

## Test splitting the data and verify that the results are the same

In [45]:
# Save the model trained on the split so it can be reloaded into a new instance below.
model_train.save_model('model_6_train_split.pkl')

In [52]:
# Dump each split to its own tab-separated file, without the index column, so
# later cells can pass the files to EpimlModel by path.
for split_frame, split_path in (
    (X_train, 'training_only.txt'),
    (X_test, 'testing_only.txt'),
):
    split_frame.to_csv(split_path, sep='\t', index=False)

In [49]:
# Start from a fresh instance and restore the model saved from the split run.
model_from_train_split = EpimlModel()
# load_model's return value is the cell output (the Pipeline repr shown below).
model_from_train_split.load_model('model_6_train_split.pkl')

Pipeline(steps=[('lc', <epiml.loadepiml.LoadEpimlTransformer object at 0x00000000159DCC50>), ('model', PNUWrapper(base_estimator=RepeatedRandomSubSampler(base_estimator=RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=47, max_features=49, max_leaf_nodes=None,...thresh'),
      num_unlabeled=1.0, pu_learning=True, random_state=1,
      threshold_set_pct=None))])

In [50]:
# Inspect the type of the object that was just loaded into.
type(model_from_train_split)

epiml.epimlmain.EpimlModel

In [53]:
# Score the exported test file with the model reloaded from disk.
res_from_train_split = model_from_train_split.predict('testing_only.txt')

In [59]:
# Score the same test rows with the in-memory classifier, keeping only the last
# column of predict_proba's output.
# NOTE(review): presumably the positive-class probability - confirm the class order.
res_from_clf = clf.predict_proba(X_test)[:, -1]

In [62]:
# Exact element-wise comparison reduced with np.all: True only if every
# prediction from the reloaded model equals the in-memory classifier's.
np.all(res_from_clf == res_from_train_split)

True

In [64]:
# Train a brand-new model from the exported training file alone.
model_from_train_only = EpimlModel()
# The call's return value is the cell output (the Pipeline repr shown below).
model_from_train_only.generate_trained_model('training_only.txt')

Pipeline(steps=[('lc', <epiml.loadepiml.LoadEpimlTransformer object at 0x0000000065D92B70>), ('model', PNUWrapper(base_estimator=RepeatedRandomSubSampler(base_estimator=RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=47, max_features=49, max_leaf_nodes=None,...thresh'),
      num_unlabeled=1.0, pu_learning=True, random_state=1,
      threshold_set_pct=None))])

In [65]:
# Score the exported test file with the model trained from training_only.txt.
res_from_train_only = model_from_train_only.predict('testing_only.txt')

In [66]:
# Exact element-wise comparison reduced with np.all: True only if the retrained
# model's predictions equal the original classifier's on every test row.
np.all(res_from_clf == res_from_train_only)

True