# Model Evauation
Classification models need the following metrics

Cross Validation
- Accuracy
- error

ROC Curve

## SKL Models

In [28]:
from sklearn.model_selection import KFold
import os
from pprint import pprint
import numpy as np
import pandas as pd
from utils import get_compostion, check_nobility
from matminer.featurizers import composition as cf
from matminer.featurizers.base import MultipleFeaturizer
from sklearn.metrics import accuracy_score, roc_curve
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier as SRC
from lolopy.learners import RandomForestClassifier as LRC

In [29]:
# load data
# featurize
# choose model
# pass data and HP's
# run CV
# ROC possible?
# tabulate results
# report

In [30]:
# configuration
np.random.seed(8)
load_path = os.path.join('data','processed_data.csv')

In [31]:
# Load Data
data = pd.read_csv(load_path)
pprint(f"Loaded {len(data.index)} records.")

'Loaded 25802 records.'


In [32]:
# Format and Choose Training Data
data = data.sample(1000)
data['composition'] = data['formula'].apply(get_compostion)
data['noble'] = data.apply(check_nobility, axis=1)
data = data[data['noble'] == False]
data = data[['composition','stable']]
data.reset_index(drop=True, inplace=True)
data.sample(10)

Unnamed: 0,composition,stable
789,"(Nb, Cs)",0
132,"(Ge, Ce)",1
49,"(Ce, Ac)",0
738,"(Pt, Si)",0
742,(Si),1
562,(Th),1
32,"(Ho, U)",0
353,"(Li, Sb)",0
673,"(Tb, Ir)",1
354,"(Pr, Sn)",0


In [33]:
# Featurize Data

In [34]:
%%capture
f =  MultipleFeaturizer([cf.Stoichiometry(), cf.ElementProperty.from_preset("magpie"),
                         cf.ValenceOrbital(props=['avg']), cf.IonProperty(fast=True)])

X = np.array(f.featurize_many(data['composition']))

In [35]:
# Set training labels
Y = data['stable']

In [44]:
class Result():
    def __init__(self, model, accuracies, roc, mean_accuracy):
        self.model = model
        self.accuracies = accuracies
        self.roc = roc
        self.mean_accuracy = mean_accuracy

In [79]:
# K-folds
def run_k_folds(model, inputs, outputs, binary_classifier):
    name = type(model).__name__
    roc = {}
    accuracies = {}
    cnt = 1
    
    for train, test in KFold(n_splits=10, shuffle=True, random_state=8).split(inputs):
        # Data Preprocessing
        model.fit(inputs[train], outputs[train])
        prediction = model.predict(inputs[test])
        accuracies[f"Fold_{cnt}"] = accuracy_score(outputs[test], prediction)
        if binary_classifier:
            roc[f"Fold_{cnt}"] = roc_curve(outputs[test], prediction)
        cnt += 1
        mean_score = sum([score for key, score in accuracies.items()])/len(accuracies)
    
    return Result(name, roc, accuracies, mean_score)


In [80]:
# set models
models = (GaussianNB(), SVC(), SRC(), LRC())

In [81]:
results = [run_k_folds(model, X, Y, True) for model in models]






















The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



In [84]:
for result in results:
    pprint(f"{result.model}: {result.mean_accuracy}")

'GaussianNB: 0.8093590398365679'
'SVC: 0.8410495403472931'
'RandomForestClassifier: 0.880464759959142'
'RandomForestClassifier: 0.8770939734422882'
