# Model Evaluation
Classification models need the following metrics:

Cross Validation
- Accuracy
- error

ROC Curve
Steps
- load data
- featurize
- choose model
- pass data and hyperparameters (HPs)
- run CV
- ROC possible?
- tabulate results
- report

In [1]:
#### Standard Libraries ####
import os
from pprint import pprint
import numpy as np
import pandas as pd
import multiprocessing as mp
from functools import partial
from itertools import product
import timeit
import uuid

#### third-party Libraries ####
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier as SRC
from lolopy.learners import RandomForestClassifier as LRC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

#### Local Libraries ####
from utils import (Result, run_k_folds, 
                   report_column_labels,
                   compile_data)
from data_manager import DataManager
from featurizer import Featurizer

In [2]:
# configuration
# Seed NumPy's global RNG so stochastic steps are repeatable between runs.
np.random.seed(8)
# Input training data and output location for the tabulated results.
load_path = os.path.join('data','training_data.csv')
save_path = os.path.join('results','final_model.csv')
# Credentials must not be committed with the notebook: read the API key from
# the MP_API_KEY environment variable (None when it is not set).
# NOTE(review): the key is not referenced by any other cell shown in this
# notebook; the previously committed literal should be treated as leaked and
# rotated.
mp_api_key = os.environ.get('MP_API_KEY')
# Passed to run_k_folds as `sampling` / `ramp` respectively.
oversample = True
data_ramp = False
# Feature sets handed to the Featurizer.
feature_set = ['standard']

## Load data
The `DataManager` object should handle data loading and sampling.

Should be able to do the following:
1. load and sample 
2. Run a data ramp

In [3]:
# Load data: build the DataManager from the configured load/save paths and
# read the training records (the recorded output reports how many were loaded).
dm = DataManager(load_path, save_path)
dm.load()

'Loaded 2572 records.'


In [4]:
# Sample data
#dm.sample_data(100)

In [5]:
# Format and create composition objects.
# These calls are made for their effect on `dm` and their order is kept as
# written. NOTE(review): what each step does is inferred from its name only
# (binary class labels, pymatgen compositions, dropping noble-gas entries and
# unused features) -- confirm against data_manager.py.
#dm.compute_formula()
dm.to_binary_classes()
dm.get_pymatgen_composition()
dm.remove_noble_gasses()
dm.remove_features()

In [6]:
# Build the featurizer for the feature sets chosen in the configuration cell.
f = Featurizer(feature_set)

In [7]:
# Featurize the prepared records and keep the result on the data manager;
# later cells pass `dm.featurized_data` to the models as their inputs.
dm.featurized_data = f.featurize(dm.data)

HBox(children=(IntProgress(value=0, description='MultipleFeaturizer', max=20570, style=ProgressStyle(descripti…




In [9]:
# Group id per record, taken from the 'group' column; passed as `groups` to
# run_k_folds and to the group-aware CV splitter further down.
dm.groups = dm.data['group']

In [10]:
# Set training labels: the 'stable' column is the classification target.
dm.outputs = dm.data['stable']

In [11]:
# scikit-learn RandomForestClassifier (imported as SRC) with default settings.
# NOTE(review): `model` is reassigned in the single-model section before any
# visible cell uses this binding; the CV run uses the `models` list instead.
model = SRC()

In [28]:
# Bind everything except the model, leaving a one-argument callable that
# can be mapped over the list of candidate models.
k_folds = partial(
    run_k_folds,
    inputs=dm.featurized_data,
    outputs=dm.outputs,
    groups=dm.groups,
    sampling=oversample,
    ramp=data_ramp,
)

In [29]:
# Candidate classifiers, all with default hyperparameters; the
# most-frequent-class dummy serves as the baseline.
models = [
    GaussianNB(),
    SVC(),
    SRC(),
    LogisticRegression(),
    DummyClassifier(strategy="most_frequent"),
]
#models = [GaussianNB()]

In [30]:
%%capture
# Worker pool sized to the machine's CPU count; the next cell uses it to
# evaluate the candidate models in parallel.
pool = mp.Pool(processes=mp.cpu_count())

In [31]:
%%capture
start_time = timeit.default_timer()
results = pool.map(k_folds, models)    
elapsed = timeit.default_timer() - start_time

In [32]:
# Flatten the per-model results into one list of report rows, then tabulate,
# persist to `save_path`, and display the table.
compiled = []
for result in results:
    compiled += compile_data(result)
res_df = pd.DataFrame(data=compiled, columns=report_column_labels)
res_df.to_csv(save_path)
res_df

Unnamed: 0,type,data_size,accuracy,accuracy_std,f1,f1_std,recall,recall_std,precision,precision_std
0,GaussianNB,4114,0.894984,0.023034,0.285615,0.04667,0.472003,0.079581,0.209712,0.047525
1,GaussianNB,8228,0.819395,0.012368,0.425016,0.046139,0.592549,0.057452,0.332174,0.041839
2,GaussianNB,12342,0.75798,0.019139,0.425815,0.022788,0.673705,0.041669,0.311791,0.020685
3,GaussianNB,16456,0.73432,0.022655,0.397885,0.022536,0.660335,0.044531,0.285167,0.018938
4,GaussianNB,20570,0.748274,0.014339,0.391613,0.02173,0.648737,0.049388,0.28081,0.016082
5,SVC,4114,0.955761,0.007033,0.0,0.0,0.0,0.0,0.0,0.0
6,SVC,8228,0.886852,0.011215,0.0,0.0,0.0,0.0,0.0,0.0
7,SVC,12342,0.866958,0.00563,0.0,0.0,0.0,0.0,0.0,0.0
8,SVC,16456,0.867039,0.009449,0.0,0.0,0.0,0.0,0.0,0.0
9,SVC,20570,0.875061,0.002801,0.0,0.0,0.0,0.0,0.0,0.0


# Single model test

In [220]:
import numpy
import pandas
from sklearn.utils import resample
from collections import Counter

def local_oversample(train, outputs, random_state=8):
    """Balance a two-class training fold by upsampling its minority class.

    Parameters
    ----------
    train : numpy.ndarray
        Indices of the training samples, as produced by a CV splitter.
    outputs : array-like
        Class labels; must support ``outputs[train]``.
    random_state : int, optional
        Seed forwarded to ``sklearn.utils.resample``. Defaults to ``8``, the
        value that used to be hard-coded, so existing calls are unchanged.

    Returns
    -------
    tuple
        ``(balanced, minority, majority, minority_upsampled)`` where
        ``balanced`` is the majority indices followed by the minority indices
        resampled with replacement to the majority size, and the other three
        are the individual index arrays.

    Raises
    ------
    ValueError
        If the training labels do not contain exactly two classes (a
        single-class fold has nothing to upsample against).
    """
    train = numpy.asarray(train)
    # Look the labels up once and work on a plain array so the boolean masks
    # below are positional with respect to `train`.
    train_labels = numpy.asarray(outputs[train])
    class_counts = Counter(train_labels).most_common()
    if len(class_counts) != 2:
        raise ValueError(
            "local_oversample expects exactly two classes in the training "
            "fold, found {}".format(len(class_counts)))
    # most_common() orders by frequency, so the first entry is the majority.
    (majority_label, _), (minority_label, _) = class_counts
    minority = train[train_labels == minority_label]
    majority = train[train_labels == majority_label]
    minority_upsampled = resample(minority,
                                  replace=True,
                                  n_samples=len(majority),
                                  random_state=random_state)
    balanced = numpy.append(majority, minority_upsampled)
    return (balanced, minority, majority, minority_upsampled)

In [236]:
# Fresh default random forest for the single-fold experiment.
model = SRC()
# Switch for oversampling the training fold; read by the `if ovs:` cells below.
ovs = False

In [237]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import KFold, GroupKFold, LeavePGroupsOut, LeaveOneGroupOut
# Short aliases for the featurized inputs, labels and group ids used by the
# single-fold experiment below.
inputs = dm.featurized_data
outputs = dm.outputs
groups = dm.groups
# Group-aware 10-fold splitter; this is the one used to build `splits`.
gkf = GroupKFold(n_splits=10)
# Alternative group-based splitters. Only `logo` appears again below, and
# only in the commented-out `splits` line.
lpgo = LeavePGroupsOut(n_groups=1)
logo = LeaveOneGroupOut()
# Materialise the (train, test) index pairs so one fold can be picked by position.
splits = list(gkf.split(inputs, outputs, groups=groups.values))
#splits = list(logo.split(inputs, outputs, groups))

In [238]:
# Pick a single fold (index 3 of the 10 GroupKFold splits) to inspect in detail.
train, test = splits[3]

In [239]:
%%capture
# Place all data into df's for easy analysis
train_df = pandas.DataFrame(dm.data.ix[train, ['formula','group','stable']])
test_df = pandas.DataFrame(dm.data.ix[test, ['formula','group','stable']])

In [240]:
# When oversampling is enabled, replace the training indices with the
# class-balanced set returned by local_oversample; the other three return
# values are the minority, majority and upsampled-minority index arrays.
if ovs:
    train, minority, majority, minority_upsampled = local_oversample(train, outputs)

In [242]:
# Train on the selected fold, predict the held-out fold, and score it.
model.fit(inputs[train], outputs[train])
prediction = model.predict(inputs[test])
acc = accuracy_score(outputs[test], prediction)
# Wrapped in a list so the (precision, recall, f1, support) tuple becomes a
# single row when it is later turned into a DataFrame.
f1s = [
    precision_recall_fscore_support(
        outputs[test],
        prediction,
        labels=[0,1],
        average='binary',
    )
]


The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



In [243]:
%%capture
# Place all data into df's for easy analysis
train_df = pandas.DataFrame(dm.data.ix[train, ['formula','group','stable']])
test_df = pandas.DataFrame(dm.data.ix[test, ['formula','group','stable']])
pred_df = pandas.DataFrame(dm.data.ix[test, ['formula','group','stable']])
pred_df['prediction'] = prediction
acc_df = pandas.DataFrame([acc], columns=['accuracy'])

f1_df = pandas.DataFrame(
        f1s, columns=['precision', 'recall', 'f1', 'support'])
ov_df = pd.DataFrame([train_df['group'].isin(test_df['group']).any()], columns=['overlap'])

try:
    lab_df = pandas.DataFrame([pred_df['prediction'].value_counts().values], columns=['pred_0','pred_1'])
except:
    lab_df = pandas.DataFrame([pred_df['prediction'].value_counts().values], columns=['pred_0'])
if ovs:
    sample_df = pandas.DataFrame([train_df['stable'].value_counts().values], columns=['smpl_0','smpl_1'])
else:
    sample_df = pandas.DataFrame([], columns=['sample'])
# Get the accuracy for each label
#ac_lab_df = pandas.DataFrame([test_df['stable'].value_counts().values], columns=['acc_0','acc_1'])

In [244]:
#res = f1_df.join(acc_df).join(ov_df).join(lab_df).join(ac_lab_df).join(sample_df)

In [245]:
# Join the one-row frames on their shared index into a single summary row.
res = (
    f1_df
    .join(acc_df)
    .join(ov_df)
    .join(lab_df)
    .join(sample_df)
)

In [246]:
#res.to_csv('results/final_modle.csv', index=False)

In [247]:
# Display the joined summary for the selected fold.
res

Unnamed: 0,precision,recall,f1,support,accuracy,overlap,pred_0,pred_1,sample
0,0.697531,0.43295,0.534279,,0.904229,False,1895,162,


In [255]:
import pickle
#pickle.dump(model, open('rfc.sav', 'wb+'))