# Model Evaluation
Classification models need the following metrics

Cross Validation
- Accuracy
- error

ROC Curve
Steps
- load data
- featurize
- choose model
- pass data and hyperparameters (HPs)
- run CV
- ROC possible?
- tabulate results
- report

In [63]:
#### Standard Libraries ####
import os
from pprint import pprint
import numpy as np
import pandas as pd
import multiprocessing as mp
from functools import partial
from itertools import product
from collections import Counter
import timeit
import uuid

#### third-party Libraries ####
from sklearn.svm import SVC
from sklearn.utils import resample
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier as SRC
from lolopy.learners import RandomForestClassifier as LRC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

#### Local Libraries ####
from utils import (Result, run_k_folds, 
                   report_column_labels,
                   compile_data)
from data_manager import DataManager
from featurizer import Featurizer

In [83]:
# configuration
# Seed NumPy's global RNG so that sampling and shuffling are repeatable.
np.random.seed(8)
load_path = os.path.join('data','training_data.csv')
save_path = os.path.join('results','final_model.csv')
# Never commit credentials to a notebook: the API key is read from the
# MP_API_KEY environment variable. Export it before starting the kernel.
# The key that used to be hardcoded on this line has been exposed in the
# notebook history and should be revoked.
# NOTE(review): mp_api_key is None when the variable is unset; how Featurizer
# reacts to a missing key is defined in featurizer.py — confirm.
mp_api_key = os.environ.get('MP_API_KEY')
# NOTE: a later cell defines a function that is also called `oversample`;
# running that cell rebinds this name from the boolean flag to the function.
oversample = True
data_ramp = False
feature_set = ['standard','cmpd_energy']

## Load data
This object should handle data load and sampling.

Should be able to do the following:
1. load and sample 
2. Run a data ramp

In [65]:
# Load Data
# Build the DataManager from the configured input and output paths, then read
# the records. The run recorded under this cell reported 'Loaded 2572 records.'
dm = DataManager(load_path, save_path)
dm.load()

'Loaded 2572 records.'


In [4]:
# Sample data
# Sub-sampling is skipped when a data ramp is requested.
# NOTE(review): what the argument 100 means (row count, percentage, ...) is
# defined by DataManager.sample_data, which is not shown here — confirm.
if not data_ramp:
    dm.sample_data(100)
    #dm.data.to_csv('data/sample_07_19_19.csv', index=False)

In [66]:
# Format and create composition objects
# NOTE(review): these DataManager calls presumably modify `dm` in place and
# are order-dependent; their exact effects are defined in data_manager.py.
#dm.compute_formula()
dm.to_binary_classes()
dm.get_pymatgen_composition()
dm.remove_noble_gasses()
dm.remove_features()

In [64]:
# For a data ramp, shuffle every row (sample(frac=1)) and renumber the index
# from zero so positions and index labels agree again.
if data_ramp:
    dm.data = dm.data.sample(frac=1).reset_index(drop=True)

In [84]:
# Build the featurizer from the configured feature set and API key.
f = Featurizer(feature_set, mp_api_key)

In [85]:
# Featurize the prepared records and keep the result on the data manager.
# This is the slow step: the recorded run shows a progress bar over 20570 items.
dm.featurized_data = f.featurize(dm.data)

HBox(children=(IntProgress(value=0, description='MultipleFeaturizer', max=20570, style=ProgressStyle(descripti…




In [23]:
# Per-record group labels; handed to the cross-validation as `groups`.
dm.groups = dm.data['group']

In [86]:
# Set training labels
# The 'stable' column is the target the classifiers are fitted against.
dm.outputs = dm.data['stable']

In [25]:
# Pre-bind the data and cross-validation settings so the only argument left to
# supply is the model; this lets the callable be mapped over the model list.
# `sampling` receives the `oversample` flag from the configuration cell, so
# this cell must run before any cell that rebinds `oversample`.
k_folds = partial(run_k_folds, inputs=dm.featurized_data, outputs=dm.outputs, groups=dm.groups, sampling=oversample, ramp=data_ramp, splits=5)

In [88]:
# set models
# Only the scikit-learn random forest (imported as SRC) is evaluated here; the
# commented-out list is the wider comparison set.
#models = [GaussianNB(), SVC(), SRC(), LogisticRegression(), DummyClassifier(strategy="most_frequent")]
models = [SRC()]

In [27]:
%%capture
pool = mp.Pool(processes=mp.cpu_count())

In [28]:
%%capture
start_time = timeit.default_timer()
results = pool.map(k_folds, models)    
elapsed = timeit.default_timer() - start_time

In [29]:
# Flatten the per-model results into one list of rows, tabulate them under the
# shared report column labels, save the table to `save_path` and display it.
compiled = []
for result in results:
    # compile_data returns an iterable of rows for one model's result.
    compiled.extend(compile_data(result))
res_df = pd.DataFrame(compiled, columns=report_column_labels)
res_df.to_csv(save_path)
res_df

Unnamed: 0,type,data_size,accuracy,accuracy_std,f1,f1_std,recall,recall_std,precision,precision_std
0,RandomForestClassifier,871,0.904609,0.02335,0.663163,0.070201,0.572087,0.112946,0.816171,0.113099


In [89]:
def oversample(train, outputs):
    """Balance a set of sample indices by upsampling the smaller classes.

    NOTE: defining this function rebinds the name ``oversample``, which the
    configuration cell uses for a boolean flag.

    Parameters
    ----------
    train : array-like of int
        Indices selecting the samples to balance.
    outputs : indexable
        Class labels. ``outputs[train]`` must return the labels of the
        selected samples (for example a NumPy array or an integer-indexed
        pandas Series).

    Returns
    -------
    numpy.ndarray
        The indices of the most frequent class, followed by indices of every
        other class drawn with replacement (``random_state=8``) until each of
        those classes is as large as the most frequent one. For exactly two
        classes this matches the previous behaviour. An empty or single-class
        selection is returned unchanged instead of raising ``ValueError``.
    """
    train = np.asarray(train)
    if train.size == 0:
        # Nothing to balance; avoid indexing `outputs` with an empty selection.
        return train

    labels = np.asarray(outputs[train])
    # most_common() orders classes by descending count; ties keep first-seen order.
    class_counts = Counter(labels.tolist()).most_common()
    majority_label, majority_size = class_counts[0]

    balanced = [train[labels == majority_label]]
    for label, _ in class_counts[1:]:
        balanced.append(resample(train[labels == label],
                                 replace=True,
                                 n_samples=majority_size,
                                 random_state=8))
    return np.concatenate(balanced)

In [90]:
# Oversample over every row position of the featurized data, then gather the
# matching feature rows and labels.
# NOTE(review): despite their names, `test` and `test_out` are what the next
# cell passes to fit(), i.e. they hold the final training set.
train = oversample(np.arange(0,len(dm.featurized_data),1), dm.outputs)
test = dm.featurized_data[train]
test_out = dm.outputs[train]

In [91]:
# Fit the first (and only) configured model on the oversampled data set.
# The recorded repr shows the library defaults in use (n_estimators=10,
# random_state=None).
models[0].fit(test, test_out)


The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [92]:
# Save the best model
import pickle
# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes; the bare open() used before left the handle open.
with open('rfc.sav', 'wb') as model_file:
    pickle.dump(models[0], model_file)

# Single model test

In [18]:
import numpy
import pandas
from sklearn.utils import resample
from collections import Counter

def local_oversample(train, outputs):
    """Upsample the minority class of a two-class index selection.

    Variant of the oversampling helper that also hands back its intermediate
    arrays for inspection and prints the size of the balanced selection.

    Parameters
    ----------
    train : numpy.ndarray of int
        Indices selecting the samples to balance.
    outputs : indexable
        Class labels; ``outputs[train]`` yields the labels of the selection.

    Returns
    -------
    tuple
        ``(t, minority, majority, minority_upsampled)`` where ``t`` is the
        majority indices followed by the upsampled minority indices.

    Raises
    ------
    ValueError
        If the selection does not contain exactly two classes.
    """
    (major_label, _), (minor_label, _) = Counter(outputs[train]).most_common()

    minority_positions = numpy.where(outputs[train] == minor_label)[0]
    minority = train[minority_positions]
    majority_positions = numpy.where(outputs[train] == major_label)[0]
    majority = train[majority_positions]

    # Draw minority indices with replacement until both classes are equal in size.
    target_size = len(majority)
    minority_upsampled = resample(minority, replace=True,
                                  n_samples=target_size, random_state=8)

    t = numpy.append(majority, minority_upsampled)
    pprint(len(t))
    return (t, minority, majority, minority_upsampled)

In [19]:
# Fresh scikit-learn random forest for the single-split experiment.
model = SRC()
# Oversampling switch for this experiment; the training split is used as-is
# while this is False.
ovs = False

In [20]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import KFold, GroupKFold, LeavePGroupsOut, LeaveOneGroupOut
# Short aliases for the featurized inputs, labels and group labels.
inputs = dm.featurized_data
outputs = dm.outputs
groups = dm.groups
# 10-fold splitter that assigns each group to a single fold.
gkf = GroupKFold(n_splits=10)
# lpgo and logo are built but not used by the active code in this cell; logo
# only appears in the commented-out alternative below.
lpgo = LeavePGroupsOut(n_groups=1)
logo = LeaveOneGroupOut()
# Materialise every (train, test) index pair so a single fold can be picked.
splits = list(gkf.split(inputs, outputs, groups=groups.values))
#splits = list(logo.split(inputs, outputs, groups))

In [21]:
# Take the fold at index 3 (the fourth of the ten) as the split under test.
# NOTE(review): no reason for choosing this particular fold is recorded.
train, test = splits[3]

In [22]:
%%capture
# Place all data into df's for easy analysis
train_df = pandas.DataFrame(dm.data.ix[train, ['formula','group','stable']])
test_df = pandas.DataFrame(dm.data.ix[test, ['formula','group','stable']])

In [23]:
# When oversampling is switched on, replace the training indices with the
# balanced selection and keep the intermediate arrays for inspection.
if ovs:
    train, minority, majority, minority_upsampled = local_oversample(train, outputs)

In [24]:
# Run the ML
# Fit on the training split, predict the held-out split, then score it.
model.fit(inputs[train], outputs[train])
prediction = model.predict(inputs[test])
acc = accuracy_score(outputs[test], prediction)
# The score tuple is wrapped in a list so it becomes a single row when it is
# later turned into a DataFrame.
# NOTE(review): with average='binary' the support entry is not reported (the
# summary table below shows it empty); whether `labels` has any effect in
# this mode should be checked against the scikit-learn documentation.
f1s = [precision_recall_fscore_support(
    outputs[test], prediction, labels=[0,1], average='binary')]


The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



In [31]:
# Number of training indices used for this split (18513 in the recorded run).
len(train)

18513

In [26]:
%%capture
# Place all data into df's for easy analysis
train_df = pandas.DataFrame(dm.data.ix[train, ['formula','group','stable']])
test_df = pandas.DataFrame(dm.data.ix[test, ['formula','group','stable']])
pred_df = pandas.DataFrame(dm.data.ix[test, ['formula','group','stable']])
pred_df['prediction'] = prediction
acc_df = pandas.DataFrame([acc], columns=['accuracy'])

f1_df = pandas.DataFrame(
        f1s, columns=['precision', 'recall', 'f1', 'support'])
ov_df = pd.DataFrame([train_df['group'].isin(test_df['group']).any()], columns=['overlap'])

try:
    lab_df = pandas.DataFrame([pred_df['prediction'].value_counts().values], columns=['pred_0','pred_1'])
except:
    lab_df = pandas.DataFrame([pred_df['prediction'].value_counts().values], columns=['pred_0'])
if ovs:
    sample_df = pandas.DataFrame([train_df['stable'].value_counts().values], columns=['smpl_0','smpl_1'])
else:
    sample_df = pandas.DataFrame([], columns=['sample'])
# Get the accuracy for each label
#ac_lab_df = pandas.DataFrame([test_df['stable'].value_counts().values], columns=['acc_0','acc_1'])

In [27]:
#res = f1_df.join(acc_df).join(ov_df).join(lab_df).join(ac_lab_df).join(sample_df)

In [28]:
# Join the one-row frames side by side (on their shared index) into a single
# summary row for this split.
res = f1_df.join(acc_df).join(ov_df).join(lab_df).join(sample_df)

In [29]:
#res.to_csv('results/final_modle.csv', index=False)

In [30]:
# Display the summary row for the selected split.
res

Unnamed: 0,precision,recall,f1,support,accuracy,overlap,pred_0,pred_1,sample
0,0.715232,0.413793,0.524272,,0.904716,False,1906,151,


In [255]:
import pickle
#pickle.dump(model, open('rfc.sav', 'wb+'))