# Model Evauation
Classification models need the following metrics

Cross Validation
- Accuracy
- error

ROC Curve
Steps
- load data
- featurize
- choose model
- pass data and HP's
- run CV
- ROC possible?
- tabulate results
- report

In [231]:
import os
from pprint import pprint
import numpy as np
import pandas as pd
from utils import (get_compostion, 
                   check_nobility, 
                   Result, run_k_folds, 
                   run_k_folds_oversample,
                   report_column_labels,
                   compile_data)
from matminer.featurizers import composition as cf
from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates
from matminer.featurizers.base import MultipleFeaturizer
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier as SRC
from lolopy.learners import RandomForestClassifier as LRC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
import multiprocessing as mp
from functools import partial
from itertools import product
import timeit
import uuid

In [232]:
# configuration
np.random.seed(8)
load_path = os.path.join('data','training_data.csv')
save_path = os.path.join('results','data_ramp.csv')
mp_api_key = '7n6DwPUQ5cf8ZTWO'
lr = False
energy_features = False
electronegativity_features = False
oversample = False
data_ramp = True

## Load data
This object should handle data load and sampling.

Should be able to do the following:
1. load and sample 
2. Run a data ramp

In [258]:
import pandas
import numpy
from functools import partial
from pymatgen import Composition

class DataManager():
    def __init__(self, load_path, save_path):
        self.load_path = load_path
        self.save_path = save_path
        self.data = None
        self.num_records = None
        self.groups = None
        
        
    def load(self):
        self.data = pandas.read_csv(self.load_path)
        self.num_records = len(self.data.index)
        pprint(f"Loaded {self.num_records} records.")
        
        
    def sample_data(self, sample_size: int = 1000):
        if not self.data.empty:
            self.data = self.data.sample(sample_size)
            self.data.reset_index(drop=True, inplace=True)
            self.num_records = len(self.data.index)
            
            
    def ramp(self):
        pass
    
    
    def get_pymatgen_composition(self):
        if not self.data.empty:
            def _get_composition(c):
                """Attempt to parse composition, return None if failed"""
                try:
                    return Composition(c)
                except:
                    return None
            
            self.data['composition'] = self.data['formula'].apply(_get_composition)
    
    
    def remove_noble_gasses(self):
        if not self.data.empty:
            def _check_nobility(row):
                comp = row['composition']
                return comp.contains_element_type('noble_gas')
            
            self.data['noble'] = self.data.apply(_check_nobility, axis=1)
            self.data = self.data[self.data['noble'] == False]
            self.data.reset_index(drop=True, inplace=True)
    
    
    def remove_features(self):
        if 'composition' in self.data.columns:
            self.data = self.data[['formula','formulaA','formulaB','composition','group','stabilityVec']]
    
    
    def compute_formula(self):
        self.data['formula'] = self.data['formulaA'] + self.data['formulaB']
    def save_to_csv(self):
        pass
    
    
    def to_binary_classes(self):    
        
        def _vec_to_stability(row: pd.Series, cols: list) -> pd.Series:
            vec = eval(row['stabilityVec'])
            for element, col in zip(vec, cols):
                row[col] = int(element)
            return row
        
        def _row_to_formula(row: pd.Series):
            w = float(row['weight_fraction_element_b'])
            a = row['formulaA']
            b = row['formulaB']

            if w == 0.0:
                return a
            elif w == 1.0:
                return b
            else:
                wa = 1.0-w
                def _compute_formula(wa):
                    return f"{a}{wa:.1f}{b}{w:.1f}"
                return _compute_formula(wa)
        
        cols = ['{}'.format(i/10) for i in range(11)]
        _vtf = partial(_vec_to_stability, cols=cols)

        self.groups = numpy.arange(self.num_records)
        self.data['group'] = self.groups
        original_cols = self.data.columns

        self.data = self.data.apply(_vtf, axis=1)
        self.data = self.data.melt(id_vars=original_cols, var_name='weight_fraction_element_b', value_name='stable')
        self.data['formula'] = self.data.apply(_row_to_formula, axis=1)
        self.data = self.data.drop_duplicates('formula')

In [259]:
# Load Data
dm = DataManager(load_path, save_path)
dm.load()

'Loaded 2572 records.'


In [260]:
if lr:
    # If working with logistic regression
    load_path = os.path.join('data','training_data.csv')
    data = pd.read_csv(load_path)
    classes = list(product([0.0,1.0],repeat=10))
    data['formula'] = data['formulaA'] + data['formulaB']
    data.rename({'stabilityVec':'stable'}, axis=1, inplace=True)
    data = data[['formula', 'stable']]

In [261]:
# Sample data
dm.sample_data(100)

In [262]:
dm.to_binary_classes()

In [267]:
dm.data[dm.data['group'] == 99]

Unnamed: 0,formulaA,formulaB,formulaA_elements_AtomicVolume,formulaB_elements_AtomicVolume,formulaA_elements_AtomicWeight,formulaB_elements_AtomicWeight,formulaA_elements_BoilingT,formulaB_elements_BoilingT,formulaA_elements_BulkModulus,formulaB_elements_BulkModulus,...,formulaB_elements_SpaceGroupNumber,avg_coordination_A,avg_coordination_B,avg_nearest_neighbor_distance_A,avg_nearest_neighbor_distance_B,stabilityVec,group,weight_fraction_element_b,stable,formula
99,Kr,Ag,37.107495,17.075648,83.798,107.8682,119.78,2435.0,0.0,100.0,...,225,12.0,12.0,4.59369,2.94195,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]",99,0.0,1,Kr
199,Kr,Ag,37.107495,17.075648,83.798,107.8682,119.78,2435.0,0.0,100.0,...,225,12.0,12.0,4.59369,2.94195,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]",99,0.1,0,Kr0.9Ag0.1
299,Kr,Ag,37.107495,17.075648,83.798,107.8682,119.78,2435.0,0.0,100.0,...,225,12.0,12.0,4.59369,2.94195,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]",99,0.2,0,Kr0.8Ag0.2
399,Kr,Ag,37.107495,17.075648,83.798,107.8682,119.78,2435.0,0.0,100.0,...,225,12.0,12.0,4.59369,2.94195,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]",99,0.3,0,Kr0.7Ag0.3
499,Kr,Ag,37.107495,17.075648,83.798,107.8682,119.78,2435.0,0.0,100.0,...,225,12.0,12.0,4.59369,2.94195,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]",99,0.4,0,Kr0.6Ag0.4
599,Kr,Ag,37.107495,17.075648,83.798,107.8682,119.78,2435.0,0.0,100.0,...,225,12.0,12.0,4.59369,2.94195,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]",99,0.5,0,Kr0.5Ag0.5
699,Kr,Ag,37.107495,17.075648,83.798,107.8682,119.78,2435.0,0.0,100.0,...,225,12.0,12.0,4.59369,2.94195,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]",99,0.6,0,Kr0.4Ag0.6
799,Kr,Ag,37.107495,17.075648,83.798,107.8682,119.78,2435.0,0.0,100.0,...,225,12.0,12.0,4.59369,2.94195,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]",99,0.7,0,Kr0.3Ag0.7
899,Kr,Ag,37.107495,17.075648,83.798,107.8682,119.78,2435.0,0.0,100.0,...,225,12.0,12.0,4.59369,2.94195,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]",99,0.8,0,Kr0.2Ag0.8
999,Kr,Ag,37.107495,17.075648,83.798,107.8682,119.78,2435.0,0.0,100.0,...,225,12.0,12.0,4.59369,2.94195,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]",99,0.9,0,Kr0.1Ag0.9


In [265]:
dm.data['group'].value_counts()

52    11
98    11
26    11
21    11
8     11
12    11
27    11
1     11
3     11
4     11
40    10
31    10
33    10
35    10
51    10
36    10
39    10
50    10
43    10
44    10
30    10
45    10
46    10
47    10
48    10
42    10
99    10
28    10
25    10
2     10
      ..
15     9
93     9
85     9
92     9
86     9
79     9
88     9
94     9
53     9
78     9
67     9
57     9
58     9
60     9
61     9
63     9
64     9
41     9
66     9
38     9
77     9
37     9
68     9
34     9
32     9
72     9
29     9
76     9
24     9
75     9
Name: group, Length: 100, dtype: int64

In [216]:
# Format and careate composition objects
#dm.compute_formula()
dm.to_binary_classes()
dm.remove_features()
dm.get_pymatgen_composition()
dm.remove_noble_gasses()

ValueError: Length of values does not match length of index

In [180]:
dm.data

Unnamed: 0,formulaA,formulaB,formulaA_elements_AtomicVolume,formulaB_elements_AtomicVolume,formulaA_elements_AtomicWeight,formulaB_elements_AtomicWeight,formulaA_elements_BoilingT,formulaB_elements_BoilingT,formulaA_elements_BulkModulus,formulaB_elements_BulkModulus,...,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,weight_fraction_element_b,stable


In [167]:
from typing import List
from matminer.featurizers import composition as cf
from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates
from matminer.featurizers.base import MultipleFeaturizer

class Featurizer():
    def __init__(self, feature_set: ['str'] = ['standard']):
        self.feature_set = feature_set
        
        self.featurizers = {'standard': [cf.Stoichiometry(),
                                         cf.ElementProperty.from_preset("magpie"),
                                         cf.ValenceOrbital(props=['avg']),
                                         cf.IonProperty(fast=True)],
                           'energy': [cf.OxidationStates(),
                                      cf.ElectronegativityDiff()],
                           'electronegtivity': [cf.OxidationStates(),
                                                cf.ElectronegativityDiff()]}     
    
    def featurize(self, data: pandas.DataFrame) -> pandas.DataFrame:
        impute = False
        if ('energy' or 'electronegtivity') in self.feature_set:
            impute = True
            data = CompositionToOxidComposition(return_original_on_error=True,
                                                overwrite_data=True).featurize_dataframe(data,
                                                                                         'composition',
                                                                                        ignore_errors=True)
        features = []
        for feature in self.feature_set:
            features.extend(self.featurizers.get(feature))

        f = MultipleFeaturizer(features)
        X = np.array(f.featurize_many(data['composition'], ignore_errors=True))

        if impute:
            return np.nan_to_num(X)
        else:
            return X

In [168]:
f = Featurizer(['standard'])

In [169]:
X = f.featurize(dm.data)

KeyError: 'composition'

In [145]:
# Set training labels
Y = dm.data['stable']

TypeError: 'DataManager' object is not subscriptable

In [114]:
if oversample:
    k_folds = partial(run_k_folds_oversample, inputs=X, outputs=Y)
else:
    k_folds = partial(run_k_folds, inputs=X, outputs=Y)

In [115]:
# set models
models = [GaussianNB(), SVC(), SRC(), LogisticRegression(), DummyClassifier(strategy="most_frequent")]

In [116]:
if lr:
# Logistic Regression
    models[0].classes = classes

In [117]:
%%capture
pool = mp.Pool(processes=mp.cpu_count())

In [118]:
%%capture
start_time = timeit.default_timer()
if data_ramp:
    X_splits = np.split(X, 5)
    Y_splits = np.split(Y, 5)
    for i in range(5):
        x = np.ndarray()
        for arr in X_splits[:i]:
            x.append(arr)
        if oversample:
            results = [run_k_folds_oversample(model, inputs=x, outputs=y) for model in models]
        else:
            results = [run_k_folds(model, inputs=x, outputs=y) for model in models]
else:
    results = pool.map(k_folds, models)
    
    
elapsed = timeit.default_timer() - start_time

In [120]:
if not data_ramp:
    compiled = compile_results(results)
else:
    compiled
res_df = pd.DataFrame(compiled, columns=report_column_labels)
res_df.to_csv(save_path)
res_df