In [1]:
#### Standard Libraries ####
import os
from pprint import pprint
import numpy as np
import pandas as pd
import multiprocessing as mp
from functools import partial
from itertools import product
import timeit
import uuid

#### third-party Libraries ####
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier as SRC
from lolopy.learners import RandomForestClassifier as LRC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from matminer.featurizers.base import MultipleFeaturizer

#### Local Libraries ####
from utils import (Result, run_k_folds, 
                   report_column_labels,
                   compile_data)
from data_manager import DataManager
from featurizer import Featurizer


calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.



In [110]:
# configuration
# Fixed seed: the load cell draws a random row sample, so seeding numpy keeps
# that draw repeatable between runs.
np.random.seed(8)
load_path = os.path.join('data','training_data.csv')
save_path = os.path.join('results','test_baseline.csv')
# API key is read from the environment instead of being committed with the
# notebook. NOTE(review): the key that was previously hardcoded here has been
# published with the file and should be revoked/regenerated.
# When the variable is unset this is None; presumably pymatgen then falls back
# to its own PMG_MAPI_KEY setting (e.g. ~/.pmgrc.yaml) -- TODO confirm.
mp_api_key = os.environ.get('PMG_MAPI_KEY')
oversample = False
data_ramp = False
feature_set = ['energy']

In [111]:
# Load Data
# Build the project DataManager from the configured paths and load the records.
dm = DataManager(load_path, save_path)
dm.load()
# Keep a random 10-row subset for quick experiments. drop=False keeps the
# pre-sampling row labels as an `index` column (visible in the table below).
dm.data = dm.data.sample(n=10).reset_index(drop=False)

'Loaded 2572 records.'


In [20]:
#f = Featurizer(feature_set)

In [21]:
#dm.featurized_data = f.featurize(dm.data)

In [112]:
# Record the row count of the (sub-sampled) frame on the manager.
dm.num_records = dm.data.shape[0]

In [113]:
# Display the sampled frame (10 rows after the sub-sampling in the load cell).
dm.data

Unnamed: 0,index,formulaA,formulaB,formulaA_elements_AtomicVolume,formulaB_elements_AtomicVolume,formulaA_elements_AtomicWeight,formulaB_elements_AtomicWeight,formulaA_elements_BoilingT,formulaB_elements_BoilingT,formulaA_elements_BulkModulus,...,formulaB_elements_Row,formulaA_elements_ShearModulus,formulaB_elements_ShearModulus,formulaA_elements_SpaceGroupNumber,formulaB_elements_SpaceGroupNumber,avg_coordination_A,avg_coordination_B,avg_nearest_neighbor_distance_A,avg_nearest_neighbor_distance_B,stabilityVec
0,1421,Br,Zr,42.527825,23.265943,79.904,91.224,332.0,4682.0,1.9,...,5,0.0,33.0,64,194,1.0,12.0,2.38875,3.19147,"[1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0]"
1,54,Si,Ne,20.016378,37.232186,28.0855,20.1791,3173.0,26.92,100.0,...,2,0.0,0.0,227,225,4.0,12.0,2.36803,3.04326,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]"
2,2539,He,B,37.236036,7.297767,4.002602,10.811,4.07,4273.0,0.0,...,2,0.0,0.0,225,166,8.0,5.5,2.73717,1.72536,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]"
3,1283,Y,Sn,33.013213,26.966785,88.90585,118.71,3618.0,2875.0,41.0,...,5,26.0,18.0,194,141,12.0,4.0,3.53391,2.87792,"[1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0]"
4,1274,K,Sn,75.847865,26.966785,39.0983,118.71,1032.0,2875.0,3.1,...,5,1.3,18.0,229,141,8.0,4.0,4.57083,2.87792,"[1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0]"
5,1226,Te,Se,33.956689,27.208805,127.6,78.96,1261.0,958.0,65.0,...,4,16.0,3.7,152,14,2.0,2.0,2.89227,2.364,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]"
6,151,B,Xe,7.297767,36.952924,10.811,131.293,4273.0,165.0,320.0,...,5,0.0,0.0,166,225,5.5,12.0,1.72536,4.85032,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]"
7,1088,Fe,Sb,11.777365,30.191423,55.845,121.76,3134.0,1860.0,170.0,...,5,82.0,20.0,229,166,8.0,3.0,2.46654,2.95449,"[1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]"
8,2040,V,Re,13.844898,14.710334,50.9415,186.207,3680.0,5869.0,160.0,...,6,47.0,178.0,229,194,8.0,12.0,2.59229,2.76299,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0]"
9,1216,He,Se,37.236036,27.208805,4.002602,78.96,4.07,958.0,0.0,...,4,0.0,3.7,225,14,8.0,2.0,2.73717,2.364,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]"


In [114]:
# Format and create composition objects
# NOTE(review): the DataManager methods below are defined in data_manager.py,
# which is not visible here; the notes on each call are assumptions to confirm.
#dm.compute_formula()
# presumably converts the `stabilityVec` labels to binary classes -- TODO confirm
dm.to_binary_classes()
# presumably adds the `composition` column that the featurizer cell reads -- TODO confirm
dm.get_pymatgen_composition()
# presumably drops rows that contain noble-gas elements -- TODO confirm
dm.remove_noble_gasses()
#dm.remove_features()

In [29]:
# Build the featurizer used to look up cohesive energies.
# NOTE(review): this import sits mid-notebook; it would normally live in the
# import cell at the top so a fresh-kernel run exposes all dependencies up front.
from matminer.featurizers import composition as cf
# The API key comes from the configuration cell.
features = [cf.CohesiveEnergy(mapi_key=mp_api_key)]
# Wrap the featurizer list so it can be applied through one object.
f = MultipleFeaturizer(features)

In [30]:
# Featurize every entry of the `composition` column. ignore_errors=True is
# passed so a failing entry does not abort the whole run.
# NOTE(review): failed entries presumably come back as NaN (the displayed
# result contains many) -- confirm against the matminer documentation.
dm.featurized_data = f.featurize_many(dm.data['composition'], ignore_errors=True)


HBox(children=(IntProgress(value=0, description='MultipleFeaturizer', max=67, style=ProgressStyle(description_…

In [31]:
# For a single element it works
# Display the raw featurizer output: one single-value list per input entry;
# the many `nan` entries are the motivation for the notes that follow.
dm.featurized_data

[[1.22],
 [4.63],
 [4.37],
 [0.934],
 [2.19],
 [5.81],
 [4.28],
 [5.31],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [3.596110979],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [4.13674427],
 [nan],
 [1.5933909569999998],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [4.7431766175],
 [4.3414006574999995],
 [2.329973943375],
 [2.3325621809635417],
 [3.493027030000001],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [4.1613667266666665],
 [2.6495544651666667],
 [2.375556925729166],
 [3.283997586666667],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [nan],
 [6.25],
 [3.14],
 [2.46],
 [2.75],
 [8.03]]

## Notes 07/26/2019
Using the dictionary approach I am now able to get some cohesive energies. I am still getting many NaNs, so I would like to include a weighted energy for the constituent atoms. One idea for how to do this is to pass multiple columns to `featurize_many`. To do this I will need to add an `element` column to the data. May consider using PMG's `reduced_formula` in place of a dictionary.

In [115]:
# Add an element column to the data
# Can I do this using the pmg composition object?
def _get_composition(c):
    """Attempt to parse composition, return None if failed"""
    try:
        return Composition(c)
    except:
        return None

In [116]:
# Parse each `formulaA` entry into a composition object (None where parsing fails).
dm.data['element_a'] = dm.data['formulaA'].map(_get_composition)

In [117]:
# Parse each `formulaB` entry into a composition object (None where parsing fails).
dm.data['element_b'] = dm.data['formulaB'].map(_get_composition)

In [119]:
# Keep only the columns needed for featurization; every other column is dropped.
dm.data = dm.data.loc[:, ['element_a', 'element_b', 'composition', 'formula']]

In [None]:
# Re-run the featurizer on the per-element `element_a` column instead of the
# full composition. This overwrites the earlier `dm.featurized_data` result.
# NOTE(review): `element_a` can hold None where parsing failed; presumably
# ignore_errors=True covers those entries -- TODO confirm.
dm.featurized_data = f.featurize_many(dm.data['element_a'], ignore_errors=True)


HBox(children=(IntProgress(value=0, description='MultipleFeaturizer', max=67, style=ProgressStyle(description_…

In [None]:
# Display the featurizer output for the `element_a` column.
dm.featurized_data

## Method for finding stoichiometric formulas

In [77]:
# NOTE(review): this import appears after `Composition` is first used (in
# `_get_composition` above); on a fresh top-to-bottom run it comes too late.
from pymatgen import Composition
# Example formula with fractional amounts.
fm = "Mg0.33Cl0.66"
comp = Composition(fm)

In [78]:
# Take the first item returned by get_integer_formula_and_factor() (the
# formula string) and rebuild the composition from it; for the example above
# the displayed result is Mg1 Cl2.
comp = Composition(comp.get_integer_formula_and_factor()[0])

In [79]:
# Display the rebuilt composition.
comp

Comp: Mg1 Cl2