# Model Evaluation
Classification models need the following metrics:

Cross Validation
- Accuracy
- error

ROC Curve

## scikit-learn (SKL) Models

In [1]:
import os
from pprint import pprint
import numpy as np
import pandas as pd
from utils import get_compostion, check_nobility, Result, run_k_folds
from matminer.featurizers import composition as cf
from matminer.featurizers.base import MultipleFeaturizer
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier as SRC
from lolopy.learners import RandomForestClassifier as LRC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
import multiprocessing as mp
from functools import partial
from itertools import product
import timeit
import uuid

In [2]:
# load data
# featurize
# choose model
# pass data and hyperparameters
# run CV
# ROC possible?
# tabulate results
# report

In [3]:
# configuration
# Mode switch: when True, later cells reload 'training_data.csv' and attach
# a class list to a model instead of using the processed dataset as-is.
lr = False

# Input dataset and destination of the results report (relative paths).
load_path = os.path.join('data', 'processed_data.csv')
save_path = os.path.join('results', 'oversample_2_report.csv')

# Seed NumPy's global RNG so that calls drawing from it are repeatable.
np.random.seed(8)

In [4]:
# Load Data
# Read the CSV named by `load_path`, report its row count, and preview
# the first rows as the cell's output.
data = pd.read_csv(load_path)
n_records = len(data.index)
pprint(f"Loaded {n_records} records.")
data.head()

'Loaded 25802 records.'


Unnamed: 0,formula,formulaA,formulaB,stable
0,Ne,Ne,He,1
1,Cs,Cs,He,1
2,K,K,He,1
3,Ba,Ba,He,1
4,Sr,Sr,He,1


In [5]:
if lr:
    # If working with logistic regression: replace `data` with the training
    # set, reduced to a combined formula string and its label column.
    load_path = os.path.join('data', 'training_data.csv')
    lr_frame = pd.read_csv(load_path)
    # Every length-10 combination of 0.0 / 1.0 (2**10 tuples).
    classes = list(product([0.0, 1.0], repeat=10))
    data = (
        lr_frame
        .assign(formula=lr_frame['formulaA'] + lr_frame['formulaB'])
        .rename(columns={'stabilityVec': 'stable'})
        [['formula', 'stable']]
    )

In [6]:
# Format and Choose Training Data
# Work on a random 1000-row subset: parse each formula into a composition,
# compute the 'noble' flag per row, keep only rows whose flag equals False,
# and reduce the frame to the two columns used downstream.
subset = data.sample(1000)
subset['composition'] = subset['formula'].apply(get_compostion)
subset['noble'] = subset.apply(check_nobility, axis=1)
not_noble = subset['noble'] == False  # elementwise comparison, kept as written
data = subset.loc[not_noble, ['composition', 'stable']].reset_index(drop=True)
# Preview ten random rows of the filtered frame.
data.sample(10)

Unnamed: 0,composition,stable
789,"(Nb, Cs)",0
132,"(Ge, Ce)",1
49,"(Ce, Ac)",0
738,"(Pt, Si)",0
742,(Si),1
562,(Th),1
32,"(Ho, U)",0
353,"(Li, Sb)",0
673,"(Tb, Ir)",1
354,"(Pr, Sn)",0


In [7]:
# Featurize Data
# Combine four matminer composition featurizers into a single featurizer and
# apply it to every composition; the result is stored as a NumPy array.
featurizer_parts = [
    cf.Stoichiometry(),
    cf.ElementProperty.from_preset("magpie"),
    cf.ValenceOrbital(props=['avg']),
    cf.IonProperty(fast=True),
]
f = MultipleFeaturizer(featurizer_parts)

feature_rows = f.featurize_many(data['composition'])
X = np.array(feature_rows)

HBox(children=(IntProgress(value=0, description='MultipleFeaturizer', max=887, style=ProgressStyle(description…




In [8]:
# Set training labels
# Y is the 'stable' column of the same filtered frame whose 'composition'
# column was featurized into X.
Y = data['stable']

In [9]:
# Pre-bind the feature matrix and labels so that the remaining positional
# argument is the model. A functools.partial (unlike a lambda) can be pickled,
# which pool.map needs in order to send the callable to worker processes.
k_folds = partial(
    run_k_folds,
    inputs=X,
    outputs=Y,
)

In [10]:
# set models
# Estimators to compare, all with default settings except the dummy
# baseline, which always predicts the most frequent class.
models = [
    GaussianNB(),
    SVC(),
    SRC(),
    LogisticRegression(),
    DummyClassifier(strategy="most_frequent"),
]

In [11]:
if lr:
    # Logistic Regression
    # Attach the class list to the LogisticRegression estimator. The previous
    # code used models[0], which is GaussianNB in the `models` list, so the
    # attribute landed on the wrong estimator. Selecting by type keeps this
    # correct if the list is reordered.
    # NOTE(review): `classes` is not a constructor parameter of
    # LogisticRegression; presumably it is read downstream — confirm.
    for model in models:
        if isinstance(model, LogisticRegression):
            model.classes = classes

In [12]:
%%capture
pool = mp.Pool(processes=mp.cpu_count())

In [13]:
# Evaluate every model in parallel (one k_folds call per model) and time it.
# `elapsed` is the wall-clock duration of the pool.map call in seconds.
start_time = timeit.default_timer()
try:
    results = pool.map(k_folds, models)
    elapsed = timeit.default_timer() - start_time
finally:
    # Release the worker processes whether or not the map succeeded;
    # previously the pool was never closed and its workers stayed alive.
    # After this, the pool-creation cell must be re-run before re-running
    # this cell.
    pool.close()
    pool.join()

In [14]:
# Duration of the pool.map call, in seconds (difference of two
# timeit.default_timer() readings).
pprint(elapsed)

5.121948334999615


In [15]:
# Flatten each result object into one row. The attribute order here defines
# the column order of the results table, so it must line up with `cols`.
result_fields = (
    'model',
    'accuracy', 'accuracy_std',
    'f1', 'f1_std',
    'recall', 'recall_std',
    'precision', 'precision_std',
)
compiled = [[getattr(r, field) for field in result_fields] for r in results]

In [16]:
# Column headers for the results table: the model type, followed by each
# metric and its '_std' companion, in the same order as the compiled rows.
metric_names = ('accuracy', 'f1', 'recall', 'precision')
cols = ['type']
for metric in metric_names:
    cols.extend([metric, metric + '_std'])

In [17]:
# Build the results table: one row per model, columns named by `cols`.
res_df = pd.DataFrame(compiled, columns=cols)

In [18]:
# Write the results table to `save_path`. The parent directory is created
# first: to_csv does not create missing directories, and failing here would
# discard the results computed above.
save_dir = os.path.dirname(save_path)
if save_dir:  # empty when save_path has no directory component
    os.makedirs(save_dir, exist_ok=True)
res_df.to_csv(save_path)

In [19]:
# Display the per-model metrics table as the cell output.
res_df

Unnamed: 0,type,accuracy,accuracy_std,f1,f1_std,recall,recall_std,precision,precision_std
0,GaussianNB,0.805988,0.037232,0.50525,0.112822,0.489174,0.135322,0.541912,0.129551
1,SVC,0.974068,0.015164,0.936354,0.036369,0.882329,0.065131,1.0,0.0
2,RandomForestClassifier,0.97071,0.014374,0.92944,0.036209,0.88656,0.075837,0.982738,0.040706
3,LogisticRegression,0.90641,0.032774,0.749929,0.086352,0.668266,0.101458,0.863554,0.094961
4,DummyClassifier,0.785853,0.055929,0.0,0.0,0.0,0.0,0.0,0.0
