# Model Evauation
Classification models need the following metrics

Cross Validation
- Accuracy
- error

ROC Curve
Steps
- load data
- featurize
- choose model
- pass data and HP's
- run CV
- ROC possible?
- tabulate results
- report

In [1]:
import os
from pprint import pprint
import numpy as np
import pandas as pd
from utils import (Result, run_k_folds, 
                   report_column_labels,
                   compile_data)
from matminer.featurizers import composition as cf
from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates
from matminer.featurizers.base import MultipleFeaturizer
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier as SRC
from lolopy.learners import RandomForestClassifier as LRC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
import multiprocessing as mp
from functools import partial
from itertools import product
import timeit
import uuid

In [39]:
# configuration
np.random.seed(8)
load_path = os.path.join('data','training_data.csv')
save_path = os.path.join('results','baseline_100_groupkfolds.csv')
mp_api_key = '7n6DwPUQ5cf8ZTWO'
lr = False
energy_features = False
electronegativity_features = False
oversample = False
data_ramp = True

## Load data
This object should handle data load and sampling.

Should be able to do the following:
1. load and sample 
2. Run a data ramp

In [3]:
import pandas
import numpy
from functools import partial
from pymatgen import Composition

class DataManager():
    def __init__(self, load_path, save_path):
        self.load_path = load_path
        self.save_path = save_path
        self.data = None
        self.num_records = None
        self.groups = None
        
        
    def load(self):
        self.data = pandas.read_csv(self.load_path)
        self.num_records = len(self.data.index)
        pprint(f"Loaded {self.num_records} records.")
        
        
    def sample_data(self, sample_size: int = 1000):
        if not self.data.empty:
            self.data = self.data.sample(sample_size)
            self.data.reset_index(drop=True, inplace=True)
            self.num_records = len(self.data.index)
            
            
    def ramp(self):
        pass
    
    
    def get_pymatgen_composition(self):
        if not self.data.empty:
            def _get_composition(c):
                """Attempt to parse composition, return None if failed"""
                try:
                    return Composition(c)
                except:
                    return None
            
            self.data['composition'] = self.data['formula'].apply(_get_composition)
    
    
    def remove_noble_gasses(self):
        if not self.data.empty:
            def _check_nobility(row):
                comp = row['composition']
                return comp.contains_element_type('noble_gas')
            
            self.data['noble'] = self.data.apply(_check_nobility, axis=1)
            self.data = self.data[self.data['noble'] == False]
            self.data.reset_index(drop=True, inplace=True)
    
    
    def remove_features(self):
        if 'composition' in self.data.columns:
            self.data = self.data[['formula','composition','group','stable']]
    
    
    def compute_formula(self):
        self.data['formula'] = self.data['formulaA'] + self.data['formulaB']
    def save_to_csv(self):
        pass
    
    
    def to_binary_classes(self):    
        
        def _vec_to_stability(row: pd.Series, cols: list) -> pd.Series:
            vec = eval(row['stabilityVec'])
            for element, col in zip(vec, cols):
                row[col] = int(element)
            return row
        
        def _row_to_formula(row: pd.Series):
            w = float(row['weight_fraction_element_b'])
            a = row['formulaA']
            b = row['formulaB']

            if w == 0.0:
                return a
            elif w == 1.0:
                return b
            else:
                wa = 1.0-w
                def _compute_formula(wa):
                    return f"{a}{wa:.1f}{b}{w:.1f}"
                return _compute_formula(wa)
        
        cols = ['{}'.format(i/10) for i in range(11)]
        _vtf = partial(_vec_to_stability, cols=cols)

        self.groups = numpy.arange(self.num_records)
        self.data['group'] = self.groups
        original_cols = self.data.columns

        self.data = self.data.apply(_vtf, axis=1)
        self.data = self.data.melt(id_vars=original_cols, var_name='weight_fraction_element_b', value_name='stable')
        self.data['formula'] = self.data.apply(_row_to_formula, axis=1)
        self.data = self.data.drop_duplicates('formula')

In [23]:
# Load Data
dm = DataManager(load_path, save_path)
dm.load()

'Loaded 2572 records.'


In [24]:
# Sample data
dm.sample_data(100)

In [25]:
# Format and careate composition objects
#dm.compute_formula()
dm.to_binary_classes()
dm.get_pymatgen_composition()
dm.remove_noble_gasses()
dm.remove_features()

In [26]:
import pandas
from typing import List
from matminer.featurizers import composition as cf
from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates
from matminer.featurizers.base import MultipleFeaturizer

class Featurizer():
    def __init__(self, feature_set: ['str'] = ['standard']):
        self.feature_set = feature_set
        
        self.featurizers = {'standard': [cf.Stoichiometry(),
                                         cf.ElementProperty.from_preset("magpie"),
                                         cf.ValenceOrbital(props=['avg']),
                                         cf.IonProperty(fast=True)],
                           'energy': [cf.CohesiveEnergy(mapi_key=mp_api_key)],
                           'electronegtivity': [cf.OxidationStates(),
                                                cf.ElectronegativityDiff()]}     
    
    def featurize(self, data: pandas.DataFrame) -> pandas.DataFrame:
        impute = False
        if 'electronegtivity' in self.feature_set:
            impute = True
            data = CompositionToOxidComposition(return_original_on_error=True,
                                                overwrite_data=True).featurize_dataframe(data,
                                                                                         'composition',
                                                                                        ignore_errors=True)
        features = []
        for feature in self.feature_set:
            features.extend(self.featurizers.get(feature))

        f = MultipleFeaturizer(features)
        X = np.array(f.featurize_many(data['composition'], ignore_errors=True))

        if impute:
            return np.nan_to_num(X)
        else:
            return X

In [27]:
f = Featurizer(['standard'])

In [28]:
X = f.featurize(dm.data)

HBox(children=(IntProgress(value=0, description='MultipleFeaturizer', max=817, style=ProgressStyle(description…




In [29]:
groups = dm.data['group']

In [30]:
# Set training labels
Y = dm.data['stable']

In [31]:
k_folds = partial(run_k_folds, inputs=X, outputs=Y, groups=groups)

In [42]:
# set models
#models = [GaussianNB(), SRC(), LogisticRegression(), DummyClassifier(strategy="most_frequent")]
models = [SVC()]

In [43]:
%%capture
pool = mp.Pool(processes=mp.cpu_count())

In [45]:
%%capture
start_time = timeit.default_timer()
results = pool.map(k_folds, models)    
elapsed = timeit.default_timer() - start_time

In [46]:
compiled = compile_data(results)
res_df = pd.DataFrame(compiled, columns=report_column_labels)
#res_df.to_csv(save_path)
res_df

Unnamed: 0,type,accuracy,accuracy_std,f1,f1_std,recall,recall_std,precision,precision_std
0,SVC,0.805183,0.054827,0.0,0.0,0.0,0.0,0.0,0.0


# Single model test

In [166]:
model = SVC()

In [167]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import KFold, GroupKFold
inputs = X
outputs = Y
groups = dm.data['group']
gkf = GroupKFold(n_splits=10)
splits = list(gkf.split(inputs, outputs, groups=groups))

In [168]:
%%capture
train, test = splits[0]
train_df = pandas.DataFrame(dm.data.ix[train, ['formula','group']])
test_df = pandas.DataFrame(dm.data.ix[test, ['formula','group','stable']])

model.fit(inputs[train], outputs[train])
prediction = model.predict(inputs[test])
pred_df = pandas.DataFrame(dm.data.ix[test, ['formula','group','stable']])
pred_df['prediction'] = prediction
acc = accuracy_score(outputs[test], prediction)
acc_df = pandas.DataFrame([acc], columns=['accuracy'])


In [169]:
f1s = [precision_recall_fscore_support(
    outputs[test], prediction, labels=[0,1], average='binary')]
f1_df = pandas.DataFrame(
        f1s, columns=['precision', 'recall', 'f1', 'support'])


Precision and F-score are ill-defined and being set to 0.0 due to no predicted samples.



In [171]:
ov_df = pd.DataFrame([train_df['group'].isin(test_df['group']).any()], columns=['overlap'])

try:
    lab_df = pandas.DataFrame([pred_df['prediction'].value_counts().values], columns=['0','1'])
except:
    lab_df = pandas.DataFrame([pred_df['prediction'].value_counts().values], columns=['0'])

ac_lab_df = pandas.DataFrame([test_df['stable'].value_counts().values], columns=['acc_0','acc_1'])

f1_df.join(acc_df).join(ov_df).join(lab_df).join(ac_lab_df)

Unnamed: 0,precision,recall,f1,support,accuracy,overlap,0,acc_0,acc_1
0,0.0,0.0,0.0,,0.8125,False,80,65,15
