# Model Evauation
Classification models need the following metrics

Cross Validation
- Accuracy
- error

ROC Curve

## SKL Models

In [67]:
import os
from pprint import pprint
import numpy as np
import pandas as pd
from utils import get_compostion, check_nobility, Result, run_k_folds, run_k_folds_oversample
from matminer.featurizers import composition as cf
from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates
from matminer.featurizers.base import MultipleFeaturizer
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier as SRC
from lolopy.learners import RandomForestClassifier as LRC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
import multiprocessing as mp
from functools import partial
from itertools import product
import timeit
import uuid

In [2]:
# load data
# featurize
# choose model
# pass data and HP's
# run CV
# ROC possible?
# tabulate results
# report

In [106]:
# configuration
np.random.seed(8)
load_path = os.path.join('data','processed_data.csv')
save_path = os.path.join('results','elec_features.csv')
mp_api_key = '7n6DwPUQ5cf8ZTWO'
lr = False
energy_features = True
electronegativity_features = True
oversample = False

In [4]:
# Load Data
data = pd.read_csv(load_path)
pprint(f"Loaded {len(data.index)} records.")
data.head()

'Loaded 25802 records.'


Unnamed: 0,formula,formulaA,formulaB,stable
0,Ne,Ne,He,1
1,Cs,Cs,He,1
2,K,K,He,1
3,Ba,Ba,He,1
4,Sr,Sr,He,1


In [5]:
if lr:
    # If working with logistic regression
    load_path = os.path.join('data','training_data.csv')
    data = pd.read_csv(load_path)
    classes = list(product([0.0,1.0],repeat=10))
    data['formula'] = data['formulaA'] + data['formulaB']
    data.rename({'stabilityVec':'stable'}, axis=1, inplace=True)
    data = data[['formula', 'stable']]

In [6]:
# Format and Choose Training Data
data = data.sample(1000)
data['composition'] = data['formula'].apply(get_compostion)
data['noble'] = data.apply(check_nobility, axis=1)
data = data[data['noble'] == False]
data = data[['composition','stable']]
data.reset_index(drop=True, inplace=True)
data.sample(10)

Unnamed: 0,composition,stable
789,"(Nb, Cs)",0
132,"(Ge, Ce)",1
49,"(Ce, Ac)",0
738,"(Pt, Si)",0
742,(Si),1
562,(Th),1
32,"(Ho, U)",0
353,"(Li, Sb)",0
673,"(Tb, Ir)",1
354,"(Pr, Sn)",0


In [110]:
# Featurize Data
#%%capture
standard_features = [cf.Stoichiometry(), cf.ElementProperty.from_preset("magpie"),
                             cf.ValenceOrbital(props=['avg']), cf.IonProperty(fast=True)]
if energy_features:
    standard_features.append(cf.CohesiveEnergy(mapi_key=mp_api_key))
    f = MultipleFeaturizer(standard_features)
    X = np.array(f.featurize_many(data['composition'], ignore_errors=True))
elif electronegativity_features:
    data = CompositionToOxidComposition(return_original_on_error=True, overwrite_data=True).featurize_dataframe(data, 'composition', ignore_errors=True)
    standard_features.extend([cf.OxidationStates(), cf.ElectronegativityDiff()])
    f = MultipleFeaturizer(standard_features)
    X = np.array(f.featurize_many(data['composition_oxid'], ignore_errors=True))
elif energy_features and electronegativity_features:
    data = CompositionToOxidComposition(return_original_on_error=True, overwrite_data=True).featurize_dataframe(data, 'composition', ignore_errors=True)
    standard_features.extend([cf.OxidationStates(), cf.ElectronegativityDiff(), cf.CohesiveEnergy(mapi_key=mp_api_key)])
    f = MultipleFeaturizer(standard_features)
    X = np.array(f.featurize_many(data['composition_oxid'], ignore_errors=True))
else:
    f = MultipleFeaturizer(standard_features)
    X = np.array(f.featurize_many(data['composition'], ignore_errors=True))

HBox(children=(IntProgress(value=0, description='MultipleFeaturizer', max=887, style=ProgressStyle(description…




In [111]:
# Imputation method
X = np.nan_to_num(X)
#X = X[~np.isnan(X).any(axis=1)]

In [112]:
X.shape

(887, 146)

In [113]:
# Set training labels
Y = data['stable']

In [114]:
if oversample:
    k_folds = partial(run_k_folds_oversample, inputs=X, outputs=Y)
else:
    k_folds = partial(run_k_folds, inputs=X, outputs=Y)

In [115]:
# set models
models = [GaussianNB(), SVC(), SRC(), LogisticRegression(), DummyClassifier(strategy="most_frequent")]

In [116]:
if lr:
# Logistic Regression
    models[0].classes = classes

In [117]:
%%capture
pool = mp.Pool(processes=mp.cpu_count())

In [118]:
%%capture
start_time = timeit.default_timer()
results = pool.map(k_folds, models)
elapsed = timeit.default_timer() - start_time

In [119]:
pprint(elapsed)

2.089395342998614


In [120]:
compiled = [[r.model,
  r.accuracy,
  r.accuracy_std,
  r.f1,
  r.f1_std,
  r.recall,
  r.recall_std,
  r.precision,
  r.precision_std] for r in results]

In [121]:
cols = ['type',
 'accuracy',
 'accuracy_std',
 'f1',
 'f1_std',
 'recall',
 'recall_std',
 'precision',
 'precision_std']

In [122]:
res_df = pd.DataFrame(compiled, columns=cols)
res_df

Unnamed: 0,type,accuracy,accuracy_std,f1,f1_std,recall,recall_std,precision,precision_std
0,GaussianNB,0.830784,0.043285,0.536414,0.118413,0.475538,0.121626,0.634367,0.152183
1,SVC,0.84105,0.037926,0.399498,0.078719,0.252276,0.060378,1.0,0.0
2,RandomForestClassifier,0.881588,0.03255,0.650557,0.11368,0.545587,0.142096,0.849737,0.117802
3,LogisticRegression,0.860189,0.040029,0.62659,0.102458,0.56901,0.121646,0.72073,0.13627
4,DummyClassifier,0.785853,0.055929,0.0,0.0,0.0,0.0,0.0,0.0


In [123]:
res_df.to_csv(save_path)