# Model Evauation
Classification models need the following metrics

Cross Validation
- Accuracy
- error

ROC Curve
Steps
- load data
- featurize
- choose model
- pass data and HP's
- run CV
- ROC possible?
- tabulate results
- report

In [3]:
import os
from pprint import pprint
import numpy as np
import pandas as pd
from utils import (get_compostion, 
                   check_nobility, 
                   Result, run_k_folds, 
                   run_k_folds_oversample,
                   report_column_labels,
                   compile_data)
from matminer.featurizers import composition as cf
from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates
from matminer.featurizers.base import MultipleFeaturizer
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier as SRC
from lolopy.learners import RandomForestClassifier as LRC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
import multiprocessing as mp
from functools import partial
from itertools import product
import timeit
import uuid

In [46]:
# configuration
np.random.seed(8)
load_path = os.path.join('data','training_data.csv')
save_path = os.path.join('results','data_ramp.csv')
mp_api_key = '7n6DwPUQ5cf8ZTWO'
lr = False
energy_features = False
electronegativity_features = False
oversample = False
data_ramp = True

## Load data
This object should handle data load and sampling.

Should be able to do the following:
1. load and sample 
2. Run a data ramp

In [106]:
import pandas
from pymatgen import Composition

class DataManager():
    def __init__(self, load_path, save_path):
        self.load_path = load_path
        self.save_path = save_path
        self.data = None
        self.num_records = None
        
        
    def load(self):
        self.data = pandas.read_csv(self.load_path)
        self.num_records = len(self.data.index)
        pprint(f"Loaded {self.num_records} records.")
        
        
    def sample_data(self, sample_size: int = 1000):
        if not self.data.empty:
            self.data = self.data.sample(sample_size)
            self.data.reset_index(drop=True, inplace=True)
            self.num_records = len(self.data.index)
            
            
    def ramp(self):
        pass
    
    
    def get_pymatgen_composition(self):
        if not self.data.empty:
            def _get_composition(c):
                """Attempt to parse composition, return None if failed"""
                try:
                    return Composition(c)
                except:
                    return None
            
            self.data['composition'] = self.data['formula'].apply(_get_composition)
    
    
    def remove_noble_gasses(self):
        if not self.data.empty:
            def _check_nobility(row):
                comp = row['composition']
                return comp.contains_element_type('noble_gas')
            
            self.data['noble'] = self.data.apply(_check_nobility, axis=1)
            self.data = self.data[self.data['noble'] == False]
            self.data.reset_index(drop=True, inplace=True)
    
    
    def remove_features(self):
        if 'composition' in self.data.columns:
            self.data = self.data[['formula','formulaA','formulaB','composition','stabilityVec']]
    
    def compute_formula(self):
        self.data['formula'] = self.data['formulaA'] + self.data['formulaB']
    def save_to_csv(self):
        pass
    

In [107]:
# Load Data
dm = DataManager(load_path, save_path)
dm.load()
#dm.data.head()

'Loaded 2572 records.'


In [108]:
if lr:
    # If working with logistic regression
    load_path = os.path.join('data','training_data.csv')
    data = pd.read_csv(load_path)
    classes = list(product([0.0,1.0],repeat=10))
    data['formula'] = data['formulaA'] + data['formulaB']
    data.rename({'stabilityVec':'stable'}, axis=1, inplace=True)
    data = data[['formula', 'stable']]

In [109]:
# Sample data
dm.sample_data()

In [110]:
# Format and careate composition objects
dm.compute_formula()
dm.get_pymatgen_composition()
dm.remove_noble_gasses()
dm.remove_features()

In [112]:
from typing import List

class Featurizer():
    def __init__(self, feature_set: List('str') = ['standard']):
        self.feature_set = feature_set
        
        standard_features = [cf.Stoichiometry(), cf.ElementProperty.from_preset("magpie"),
                             cf.ValenceOrbital(props=['avg']), cf.IonProperty(fast=True)]
        self.featurizers = {'standard': standard_features,
                           'energy': standard_features.extend([cf.OxidationStates(), cf.ElectronegativityDiff()]),
                           'electronegtivity':} 
    def _get_featurizer(self:)    
    
    
    def featurize(self, data: pandas.DataFrame) -> pandas.DataFrame:
        if 'energy' or 'electronegtivity' in feature_set:
                data = CompositionToOxidComposition(return_original_on_error=True,
                                                    overwrite_data=True).featurize_dataframe(data,
                                                                                             'composition',
                                                                                            ignore_errors=True)
        

            features = []

In [69]:
# Featurize Data
#%%capture
standard_features = [cf.Stoichiometry(), cf.ElementProperty.from_preset("magpie"),
                             cf.ValenceOrbital(props=['avg']), cf.IonProperty(fast=True)]
if energy_features:
    standard_features.append(cf.CohesiveEnergy(mapi_key=mp_api_key))
    f = MultipleFeaturizer(standard_features)
    X = np.array(f.featurize_many(data['composition'], ignore_errors=True))
elif electronegativity_features:
    standard_features.extend([cf.OxidationStates(), cf.ElectronegativityDiff()])
    f = MultipleFeaturizer(standard_features)
    X = np.array(f.featurize_many(data['composition_oxid'], ignore_errors=True))
elif energy_features and electronegativity_features:
    data = CompositionToOxidComposition(return_original_on_error=True, overwrite_data=True).featurize_dataframe(data, 'composition', ignore_errors=True)
    standard_features.extend([cf.OxidationStates(), cf.ElectronegativityDiff(), cf.CohesiveEnergy(mapi_key=mp_api_key)])
    f = MultipleFeaturizer(standard_features)
    X = np.array(f.featurize_many(data['composition_oxid'], ignore_errors=True))
else:
    f = MultipleFeaturizer(standard_features)
    X = np.array(f.featurize_many(data['composition'], ignore_errors=True))

TypeError: 'DataManager' object is not subscriptable

In [111]:
# Imputation method
X = np.nan_to_num(X)
#X = X[~np.isnan(X).any(axis=1)]

In [113]:
# Set training labels
Y = data['stable']

In [114]:
if oversample:
    k_folds = partial(run_k_folds_oversample, inputs=X, outputs=Y)
else:
    k_folds = partial(run_k_folds, inputs=X, outputs=Y)

In [115]:
# set models
models = [GaussianNB(), SVC(), SRC(), LogisticRegression(), DummyClassifier(strategy="most_frequent")]

In [116]:
if lr:
# Logistic Regression
    models[0].classes = classes

In [117]:
%%capture
pool = mp.Pool(processes=mp.cpu_count())

In [118]:
%%capture
start_time = timeit.default_timer()
if data_ramp:
    X_splits = np.split(X, 5)
    Y_splits = np.split(Y, 5)
    for i in range(5):
        x = np.ndarray()
        for arr in X_splits[:i]:
            x.append(arr)
        if oversample:
            results = [run_k_folds_oversample(model, inputs=x, outputs=y) for model in models]
        else:
            results = [run_k_folds(model, inputs=x, outputs=y) for model in models]
else:
    results = pool.map(k_folds, models)
    
    
elapsed = timeit.default_timer() - start_time

In [120]:
if not data_ramp:
    compiled = compile_results(results)
else:
    compiled
res_df = pd.DataFrame(compiled, columns=report_column_labels)
res_df.to_csv(save_path)
res_df