In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
from matminer.data_retrieval.retrieve_Citrine import CitrineDataRetrieval
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers import composition as cf
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from pymatgen import Composition
from scipy.stats import norm
from collections import Counter
from pprint import pprint
import pandas as pd
import numpy as np
import statistics
import os



calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.



In [2]:
# Load Data
# Compute features
# Run CV
# Report

## Configure

In [3]:
load_path = os.path.join('data','training_data.csv')
api_key = os.environ.get('CITRINATION_API_KEY')
site = 'https://citrination.com'
config_path = os.path.join('config','config.json')
np.random.seed(8)

## Compute Classes

In [4]:
# There will be 2^10 possible class labels
from itertools import product
num_class = list(product([0.0,1.0],repeat=11))

In [5]:
len(num_class)

2048

## Load data

In [6]:
data = pd.read_csv(load_path)
pprint(f"Loaded {len(data.index)} records.")

'Loaded 2572 records.'


In [7]:
data['formula'] = data['formulaA'] + data['formulaB']

In [8]:
def get_compostion(c):
    """Attempt to parse composition, return None if failed"""
    
    try:
        return Composition(c)
    except:
        return None

In [9]:
data = data.sample(1000)

In [10]:
o_feat = data

In [11]:
n_feat = StrToComposition().featurize_dataframe(data, col_id='formula')

HBox(children=(IntProgress(value=0, description='StrToComposition', max=1000, style=ProgressStyle(description_…




In [12]:
o_feat.equals(n_feat.drop('composition', axis=1))

True

In [13]:
data['composition'] = data['formula'].apply(get_compostion)

In [14]:
data = data[['composition','stabilityVec']]

In [15]:
def check_nobility(row):
    comp = row['composition']
    return comp.contains_element_type('noble_gas')

In [16]:
data['noble'] = data.apply(check_nobility, axis=1)
data = data[data['noble'] == False]

In [17]:
data.reset_index(drop=True, inplace=True)

## Compute features using matminer

In [18]:
n_feat = StrToComposition(df=data, col_id='formula')

TypeError: __init__() got an unexpected keyword argument 'df'

In [19]:
n_feat.head()

Unnamed: 0,formulaA,formulaB,formulaA_elements_AtomicVolume,formulaB_elements_AtomicVolume,formulaA_elements_AtomicWeight,formulaB_elements_AtomicWeight,formulaA_elements_BoilingT,formulaB_elements_BoilingT,formulaA_elements_BulkModulus,formulaB_elements_BulkModulus,...,formulaB_elements_ShearModulus,formulaA_elements_SpaceGroupNumber,formulaB_elements_SpaceGroupNumber,avg_coordination_A,avg_coordination_B,avg_nearest_neighbor_distance_A,avg_nearest_neighbor_distance_B,stabilityVec,formula,composition
1421,Br,Zr,42.527825,23.265943,79.904,91.224,332.0,4682.0,1.9,0.0,...,33.0,64,194,1.0,12.0,2.38875,3.19147,"[1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0]",BrZr,"(Br, Zr)"
54,Si,Ne,20.016378,37.232186,28.0855,20.1791,3173.0,26.92,100.0,0.0,...,0.0,227,225,4.0,12.0,2.36803,3.04326,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]",SiNe,"(Si, Ne)"
2539,He,B,37.236036,7.297767,4.002602,10.811,4.07,4273.0,0.0,320.0,...,0.0,225,166,8.0,5.5,2.73717,1.72536,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]",HeB,"(He, B)"
1283,Y,Sn,33.013213,26.966785,88.90585,118.71,3618.0,2875.0,41.0,58.0,...,18.0,194,141,12.0,4.0,3.53391,2.87792,"[1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0]",YSn,"(Y, Sn)"
1274,K,Sn,75.847865,26.966785,39.0983,118.71,1032.0,2875.0,3.1,58.0,...,18.0,229,141,8.0,4.0,4.57083,2.87792,"[1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0]",KSn,"(K, Sn)"


In [20]:
f =  MultipleFeaturizer([cf.Stoichiometry(), cf.ElementProperty.from_preset("magpie"),
                         cf.ValenceOrbital(props=['avg']), cf.IonProperty(fast=True)])

In [21]:
%%capture
X = np.array(f.featurize_many(data['composition']))

We see that our featurized input does have a non-numeric value. This very likely due to magpie not being able to compute features for noble gases. Lets check if any row containing a noble gas is stable. If we don't see any signal we will drop those rows and re-featurize.

## Run cross validation

In [22]:
model = LogisticRegression()

In [23]:
model.classes = num_class

In [24]:
y = data['stabilityVec'].values

In [25]:
y_resid = []
y_uncer = []
for train_id, test_id in KFold(10, shuffle=True).split(X):
    model.fit(X[train_id], y[train_id])
    yf_pred, yf_std = model.predict(X[test_id], return_std=True)
    y_resid.extend(yf_pred - y[test_id])
    y_uncer.extend(yf_std)







TypeError: predict() got an unexpected keyword argument 'return_std'

In [None]:
y_resid = []
y_uncer = []
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=8)


In [101]:
model.fit(X_train, y_train)







LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [102]:
yf_pred = model.predict(X_test)

In [109]:
from sklearn.metrics import accuracy_score

In [110]:
print(accuracy_score(y_test, yf_pred))

0.38636363636363635


In [111]:
%%bash
git add -A
git commit -m "Tested using Logistic Regression with full vector prediction"

[feature-lr bac1144] Tested using Logistic Regression with full vector prediction
 1 file changed, 653 insertions(+), 46 deletions(-)
