In [238]:
%autoreload 1

#### Standard Libraries ####
import os
from pprint import pprint
import numpy as np
import pandas as pd
import multiprocessing as mp
from functools import partial
from itertools import product
import timeit
import uuid

#### third-party Libraries ####
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier as SRC
from lolopy.learners import RandomForestClassifier as LRC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from matminer.featurizers.base import MultipleFeaturizer
from pymatgen import Composition

#### Local Libraries ####
from utils import (Result, run_k_folds, 
                   report_column_labels,
                   compile_data)
from data_manager import DataManager
%aimport featurizer

In [245]:
# configuration
# Everything a reader might tune for a run lives in this cell.
np.random.seed(8)  # fixed seed for NumPy's global random state
load_path = os.path.join('data','training_data.csv')
save_path = os.path.join('results','test_baseline.csv')
# Materials Project API key. It is read from the MP_API_KEY environment
# variable instead of being committed in the notebook; the value is None
# when the variable is unset.
# NOTE(review): the key that used to be hard-coded here is in the file
# history and should be revoked; also confirm the featurizers fail clearly
# (or fall back to pymatgen's own configured key) when this is None.
mp_api_key = os.environ.get('MP_API_KEY')
oversample = False
data_ramp = False
# Feature group handed to the featurizer; the commented line is an alternative.
feature_set = ['electronegativity']
#feature_set = ['energy_b']

In [246]:
# Load Data
# DataManager, load_path and save_path come from the import and
# configuration cells above.
dm = DataManager(load_path, save_path)
dm.load()
# NOTE(review): the argument 1 is presumably a sampling fraction (keep all
# rows) — confirm against data_manager.DataManager.sample_data.
dm.sample_data(1)

'Loaded 2572 records.'


In [247]:
# Format and create composition objects
# All of these are DataManager methods defined outside this notebook; they
# are applied to `dm` in this order before any featurizing is done.
# NOTE(review): what each step changes is inferred from its name only —
# confirm in data_manager.py.
#dm.compute_formula()
dm.to_binary_classes()
dm.get_pymatgen_composition()
dm.remove_noble_gasses()
dm.remove_features()

In [248]:
# Build the project featurizer for the configured feature set and API key.
# `%aimport featurizer` in the import cell binds only the module name
# `featurizer`; nothing in the notebook binds a bare `Featurizer`, so the
# class is reached through the module to survive Restart & Run All.
# NOTE(review): assumes the class is defined in featurizer.py — confirm.
f = featurizer.Featurizer(feature_set, mp_api_key)

In [249]:
# Run the featurizer over the working table and keep the result on the
# data manager.
# NOTE(review): the recorded output of the next cell is an all-zero array —
# confirm the selected feature set is actually being computed.
dm.featurized_data = f.featurize(dm.data)

HBox(children=(IntProgress(value=0, description='MultipleFeaturizer', max=11, style=ProgressStyle(description_…




In [250]:
# Inspect the featurizer output stored by the previous cell.
dm.featurized_data

array([[0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [74]:
# Switch to matminer's CohesiveEnergy featurizer, wrapped in a
# MultipleFeaturizer (imported in the first cell).
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell so the dependencies are visible in one place.
from matminer.featurizers import composition as cf
features = [cf.CohesiveEnergy(mapi_key=mp_api_key)]
# Rebinds `f`, which earlier held the project Featurizer instance.
f = MultipleFeaturizer(features)

In [75]:
# Featurize every entry of the 'composition' column. ignore_errors=True is
# passed so that a failing entry does not abort the whole run; the recorded
# output further down shows NaN for many rows.
dm.featurized_data = f.featurize_many(dm.data['composition'], ignore_errors=True)


HBox(children=(IntProgress(value=0, description='MultipleFeaturizer', max=1, style=ProgressStyle(description_w…




In [78]:
# First feature value of the first featurized row.
dm.featurized_data[0][0]

3.3614263291300004

In [32]:
# Preview the first rows of the working table.
dm.data.head()

Unnamed: 0,formula,formulaA,formulaB,composition,group,stable
0,Br,(Br),(Zr),(Br),0,1
1,Si,(Si),(Ne),(Si),1,1
2,Y,(Y),(Sn),(Y),3,1
3,K,(K),(Sn),(K),4,1
4,Te,(Te),(Se),(Te),5,1


In [31]:
# For a single element it works
# NOTE(review): the recorded output below has a value for the first rows but
# NaN for many later ones — presumably the multi-element entries; confirm.
dm.featurized_data

[[1.22],
 [4.63],
 [4.37],
 [0.934],
 [2.19],
 [5.81],
 [4.28],
 [5.31],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [3.596110979],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [4.13674427],
 [nan],
 [1.5933909569999998],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [4.7431766175],
 [4.3414006574999995],
 [2.329973943375],
 [2.3325621809635417],
 [3.493027030000001],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [4.1613667266666665],
 [2.6495544651666667],
 [2.375556925729166],
 [3.283997586666667],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [6.25],
 [3.14],
 [2.46],
 [2.75],
 [8.03]]

## Notes 07/26/2019
Using the dictionary approach I am now able to get some cohesive energies. I am still getting many NaNs, so I would like to include a weighted energy for the constituent atoms. One idea for how to do this is to try to pass multiple columns to `featurize_many`. To do this I will need to add an `element` column to the data. I may consider using PMG's `reduced_formula` in place of a dictionary.

### Update
It looks like this will not work; we will need to loop through and featurize each column separately. We also need to retain elements A and B.

In [115]:
# Add an element column to the data
# Can I do this using the pmg composition object?
def _get_composition(c):
    """Attempt to parse composition, return None if failed"""
    try:
        return Composition(c)
    except:
        return None

In [116]:
# Parse each 'formulaA' entry with _get_composition and store the result in a
# new 'element_a' column (None where parsing failed).
dm.data['element_a'] = dm.data['formulaA'].apply(_get_composition)

In [117]:
# Same as for 'formulaA': parse each 'formulaB' entry into a new 'element_b'
# column (None where parsing failed).
dm.data['element_b'] = dm.data['formulaB'].apply(_get_composition)

In [119]:
# Keep only these four columns. This rebinds dm.data, so the other columns
# (including 'formulaA' and 'formulaB') are gone from here on.
# NOTE(review): the two cells above read 'formulaA'/'formulaB', so they
# cannot be re-run after this one without reloading the data.
dm.data = dm.data[['element_a', 'element_b', 'composition', 'formula']]

In [125]:
# Featurize the 'element_a' column on its own; this overwrites the earlier
# dm.featurized_data result. The recorded output below contains no NaN.
dm.featurized_data = f.featurize_many(dm.data['element_a'], ignore_errors=True)


HBox(children=(IntProgress(value=0, description='MultipleFeaturizer', max=67, style=ProgressStyle(description_…

In [126]:
# Inspect the per-row values computed from 'element_a'.
dm.featurized_data

[[1.22],
 [4.63],
 [4.37],
 [0.934],
 [2.19],
 [5.81],
 [4.28],
 [5.31],
 [1.22],
 [4.37],
 [0.934],
 [2.19],
 [4.28],
 [5.31],
 [1.22],
 [4.37],
 [0.934],
 [2.19],
 [4.28],
 [5.31],
 [1.22],
 [4.37],
 [0.934],
 [2.19],
 [4.28],
 [5.31],
 [1.22],
 [4.37],
 [0.934],
 [2.19],
 [4.28],
 [5.31],
 [1.22],
 [4.37],
 [0.934],
 [2.19],
 [4.28],
 [5.31],
 [1.22],
 [4.37],
 [0.934],
 [2.19],
 [4.28],
 [5.31],
 [1.22],
 [4.37],
 [0.934],
 [2.19],
 [4.28],
 [5.31],
 [1.22],
 [4.37],
 [0.934],
 [2.19],
 [4.28],
 [5.31],
 [1.22],
 [4.37],
 [0.934],
 [2.19],
 [4.28],
 [5.31],
 [1.22],
 [4.37],
 [2.19],
 [4.28],
 [5.31]]

## Method for finding stoichiometric formulas

In [77]:
# Example input: a formula written with fractional amounts.
# (Composition is already imported in the first cell; this import repeats it.)
from pymatgen import Composition
fm = "Mg0.33Cl0.66"
comp = Composition(fm)

In [78]:
# Rebuild the composition from its integer formula: element [0] of the
# get_integer_formula_and_factor() result is passed back to Composition.
# The recorded output of the next cell is "Comp: Mg1 Cl2".
comp = Composition(comp.get_integer_formula_and_factor()[0])

In [79]:
# Display the normalised composition.
comp

Comp: Mg1 Cl2