# Model Evauation
Classification models need the following metrics

Cross Validation
- Accuracy
- error

ROC Curve
Steps
- load data
- featurize
- choose model
- pass data and HP's
- run CV
- ROC possible?
- tabulate results
- report

In [1]:
import os
from pprint import pprint
import numpy as np
import pandas as pd
from utils import (Result, run_k_folds, 
                   report_column_labels,
                   compile_data)
from matminer.featurizers import composition as cf
from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates
from matminer.featurizers.base import MultipleFeaturizer
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier as SRC
from lolopy.learners import RandomForestClassifier as LRC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
import multiprocessing as mp
from functools import partial
from itertools import product
import timeit
import uuid

In [31]:
# configuration
np.random.seed(8)
load_path = os.path.join('data','training_data.csv')
save_path = os.path.join('results','testing_ene_eln_std_100_groupkfolds.csv')
mp_api_key = '7n6DwPUQ5cf8ZTWO'
oversample = False
data_ramp = True
feature_set = ['standard', 'energy', 'electronegativity']

## Load data
This object should handle data load and sampling.

Should be able to do the following:
1. load and sample 
2. Run a data ramp

In [3]:
import pandas
import numpy
from functools import partial
from pymatgen import Composition

class DataManager():
    def __init__(self, load_path, save_path):
        self.load_path = load_path
        self.save_path = save_path
        self.data = None
        self.num_records = None
        self.groups = None
        
        
    def load(self):
        self.data = pandas.read_csv(self.load_path)
        self.num_records = len(self.data.index)
        pprint(f"Loaded {self.num_records} records.")
        
        
    def sample_data(self, sample_size: int = 1000):
        if not self.data.empty:
            self.data = self.data.sample(sample_size)
            self.data.reset_index(drop=True, inplace=True)
            self.num_records = len(self.data.index)
            
            
    def ramp(self):
        pass
    
    
    def get_pymatgen_composition(self):
        if not self.data.empty:
            def _get_composition(c):
                """Attempt to parse composition, return None if failed"""
                try:
                    return Composition(c)
                except:
                    return None
            
            self.data['composition'] = self.data['formula'].apply(_get_composition)
    
    
    def remove_noble_gasses(self):
        if not self.data.empty:
            def _check_nobility(row):
                comp = row['composition']
                return comp.contains_element_type('noble_gas')
            
            self.data['noble'] = self.data.apply(_check_nobility, axis=1)
            self.data = self.data[self.data['noble'] == False]
            self.data.reset_index(drop=True, inplace=True)
    
    
    def remove_features(self):
        if 'composition' in self.data.columns:
            self.data = self.data[['formula','composition','group','stable']]
    
    
    def compute_formula(self):
        self.data['formula'] = self.data['formulaA'] + self.data['formulaB']
    def save_to_csv(self):
        pass
    
    
    def to_binary_classes(self):    
        
        def _vec_to_stability(row: pd.Series, cols: list) -> pd.Series:
            vec = eval(row['stabilityVec'])
            for element, col in zip(vec, cols):
                row[col] = int(element)
            return row
        
        def _row_to_formula(row: pd.Series):
            w = float(row['weight_fraction_element_b'])
            a = row['formulaA']
            b = row['formulaB']

            if w == 0.0:
                return a
            elif w == 1.0:
                return b
            else:
                wa = 1.0-w
                def _compute_formula(wa):
                    return f"{a}{wa:.1f}{b}{w:.1f}"
                return _compute_formula(wa)
        
        cols = ['{}'.format(i/10) for i in range(11)]
        _vtf = partial(_vec_to_stability, cols=cols)

        self.groups = numpy.arange(self.num_records)
        self.data['group'] = self.groups
        original_cols = self.data.columns

        self.data = self.data.apply(_vtf, axis=1)
        self.data = self.data.melt(id_vars=original_cols, var_name='weight_fraction_element_b', value_name='stable')
        self.data['formula'] = self.data.apply(_row_to_formula, axis=1)
        self.data = self.data.drop_duplicates('formula')

In [4]:
# Load Data
dm = DataManager(load_path, save_path)
dm.load()

'Loaded 2572 records.'


In [5]:
# Sample data
dm.sample_data(100)

In [6]:
# Format and careate composition objects
#dm.compute_formula()
dm.to_binary_classes()
dm.get_pymatgen_composition()
dm.remove_noble_gasses()
dm.remove_features()

In [7]:
import pandas
from typing import List
from matminer.featurizers import composition as cf
from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates
from matminer.featurizers.base import MultipleFeaturizer

class Featurizer():
    def __init__(self, feature_set: ['str'] = ['standard']):
        self.feature_set = feature_set
        
        self.featurizers = {'standard': [cf.Stoichiometry(),
                                         cf.ElementProperty.from_preset("magpie"),
                                         cf.ValenceOrbital(props=['avg']),
                                         cf.IonProperty(fast=True)],
                           'energy': [cf.CohesiveEnergy(mapi_key=mp_api_key)],
                           'electronegativity': [cf.OxidationStates(),
                                                cf.ElectronegativityDiff()]}     
    
    def featurize(self, data: pandas.DataFrame) -> pandas.DataFrame:
        impute = False
        if 'electronegtivity' in self.feature_set:
            data = CompositionToOxidComposition(return_original_on_error=True,
                                                overwrite_data=True).featurize_dataframe(data,
                                                                                         'composition',
                                                                                        ignore_errors=True)
        if any(c in self.feature_set for c in ('energy', 'electronegativity')):
            impute = True
            
        features = []
        for feature in self.feature_set:
            features.extend(self.featurizers.get(feature))

        f = MultipleFeaturizer(features)
        X = np.array(f.featurize_many(data['composition'], ignore_errors=True))

        if impute:
            return np.nan_to_num(X)
        else:
            return X

In [32]:
f = Featurizer(feature_set)

In [33]:
X = f.featurize(dm.data)

HBox(children=(IntProgress(value=0, description='MultipleFeaturizer', max=871, style=ProgressStyle(description…




In [34]:
np.isnan(X).any()

False

In [35]:
groups = dm.data['group']

In [36]:
# Set training labels
Y = dm.data['stable']

In [37]:
k_folds = partial(run_k_folds, inputs=X, outputs=Y, groups=groups, sampling=oversample)

In [38]:
# set models
models = [GaussianNB(), SVC(), SRC(), LogisticRegression(), DummyClassifier(strategy="most_frequent")]
#models = [SVC()]

In [39]:
%%capture
pool = mp.Pool(processes=mp.cpu_count())

In [40]:
%%capture
start_time = timeit.default_timer()
results = pool.map(k_folds, models)    
elapsed = timeit.default_timer() - start_time

In [41]:
compiled = compile_data(results)
res_df = pd.DataFrame(compiled, columns=report_column_labels)
res_df.to_csv(save_path)
res_df

Unnamed: 0,type,accuracy,accuracy_std,f1,f1_std,recall,recall_std,precision,precision_std
0,GaussianNB,0.822265,0.094755,0.516789,0.104154,0.532717,0.114971,0.555572,0.189302
1,SVC,0.833208,0.0304,0.0,0.0,0.0,0.0,0.0,0.0
2,RandomForestClassifier,0.904566,0.024164,0.649579,0.069213,0.530579,0.076656,0.853687,0.113564
3,LogisticRegression,0.878338,0.045551,0.624953,0.106217,0.596016,0.114602,0.679515,0.143107
4,DummyClassifier,0.833208,0.0304,0.0,0.0,0.0,0.0,0.0,0.0


# Single model test

In [276]:
from sklearn.utils import resample
from collections import Counter
def local_oversample(train, outputs):
    majority_class, minority_class = Counter(outputs[train]).most_common()
    minority = numpy.where(outputs[train] == minority_class[0])[0]
    majority = numpy.where(outputs[train] == majority_class[0])[0]
    minority_upsampled = resample(minority,
                              replace=True,
                              n_samples=len(majority),
                              random_state=8)                          
    return (numpy.append(minority, minority_upsampled), minority, majority, minority_upsampled)

In [277]:
model = SVC()
ovs = True

In [278]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import KFold, GroupKFold
inputs = X
outputs = Y
groups = dm.data['group']
gkf = GroupKFold(n_splits=10)
splits = list(gkf.split(inputs, outputs, groups=groups))

In [279]:
%%capture
train, test = splits[0]
if ovs:
    train, minority, majority, minority_upsampled = local_oversample(train, outputs)
train_df = pandas.DataFrame(dm.data.ix[train, ['formula','group','stable']])
test_df = pandas.DataFrame(dm.data.ix[test, ['formula','group','stable']])

model.fit(inputs[train], outputs[train])
prediction = model.predict(inputs[test])
pred_df = pandas.DataFrame(dm.data.ix[test, ['formula','group','stable']])
pred_df['prediction'] = prediction
acc = accuracy_score(outputs[test], prediction)
acc_df = pandas.DataFrame([acc], columns=['accuracy'])


In [280]:
f1s = [precision_recall_fscore_support(
    outputs[test], prediction, labels=[0,1], average='binary')]
f1_df = pandas.DataFrame(
        f1s, columns=['precision', 'recall', 'f1', 'support'])

In [281]:
ov_df = pd.DataFrame([train_df['group'].isin(test_df['group']).any()], columns=['overlap'])

try:
    lab_df = pandas.DataFrame([pred_df['prediction'].value_counts().values], columns=['pred_0','pred_1'])
except:
    lab_df = pandas.DataFrame([pred_df['prediction'].value_counts().values], columns=['pred_0'])
if ovs:
    sample_df = pandas.DataFrame([train_df['stable'].value_counts().values], columns=['smpl_0','smpl_1'])
else:
    sample_df = pandas.DataFrame([], columns=['sample'])
ac_lab_df = pandas.DataFrame([test_df['stable'].value_counts().values], columns=['acc_0','acc_1'])

f1_df.join(acc_df).join(ov_df).join(lab_df).join(ac_lab_df).join(sample_df)

Unnamed: 0,precision,recall,f1,support,accuracy,overlap,pred_0,pred_1,acc_0,acc_1,smpl_0,smpl_1
0,1.0,0.266667,0.421053,,0.8625,True,76,4,65,15,413,324


In [282]:
pprint(f"Minority: {len(minority)}")
pprint(f"Majority: {len(majority)}")
pprint(f"Minority Upsampled: {len(minority_upsampled)}")

'Minority: 143'
'Majority: 594'
'Minority Upsampled: 594'
