In [131]:
%matplotlib inline
from matplotlib import pyplot as plt
from matminer.data_retrieval.retrieve_Citrine import CitrineDataRetrieval
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers import composition as cf
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from pymatgen import Composition
from scipy.stats import norm
from collections import Counter
from pprint import pprint
import pandas as pd
import numpy as np
import statistics
import os


In [132]:
# Load Data
# Compute features
# Run CV
# Report

## Configure

In [133]:
load_path = os.path.join('data','training_data.csv')
api_key = os.environ.get('CITRINATION_API_KEY')
site = 'https://citrination.com'
config_path = os.path.join('config','config.json')
np.random.seed(8)

## Compute Classes

In [135]:
# There will be 2^10 possible class labels
from itertools import product
num_class = list(product([0.0,1.0],repeat=10))

## Load data

In [136]:
data = pd.read_csv(load_path)
pprint(f"Loaded {len(data.index)} records.")

'Loaded 2572 records.'


In [137]:
data['formula'] = data['formulaA'] + data['formulaB']

In [138]:
def get_compostion(c):
    """Attempt to parse composition, return None if failed"""
    
    try:
        return Composition(c)
    except:
        return None

In [139]:
data = data.sample(1000)

In [140]:
o_feat = data

In [142]:
n_feat = StrToComposition().featurize_dataframe(data, col_id='formula')

HBox(children=(IntProgress(value=0, description='StrToComposition', max=1000, style=ProgressStyle(description_…




In [150]:
o_feat.equals(n_feat.drop('composition', axis=1))

True

In [117]:
data['composition'] = data['formula'].apply(get_compostion)

In [119]:
data = data[['composition','stabilityVec']]

In [92]:
def check_nobility(row):
    comp = row['composition']
    return comp.contains_element_type('noble_gas')

In [93]:
data['noble'] = data.apply(check_nobility, axis=1)
data = data[data['noble'] == False]

In [120]:
data.reset_index(drop=True, inplace=True)

## Compute features using matminer

In [129]:
n_feat = StrToComposition(df=data, col_id='formula')

TypeError: BaseFeaturizer() takes no arguments

In [122]:
n_feat.head()

AttributeError: 'list' object has no attribute 'head'

In [95]:
f =  MultipleFeaturizer([cf.Stoichiometry(), cf.ElementProperty.from_preset("magpie"),
                         cf.ValenceOrbital(props=['avg']), cf.IonProperty(fast=True)])

In [96]:
%%capture
X = np.array(f.featurize_many(data['composition']))

We see that our featurized input does have a non-numeric value. This very likely due to magpie not being able to compute features for noble gases. Lets check if any row containing a noble gas is stable. If we don't see any signal we will drop those rows and re-featurize.

## Run cross validation

In [97]:
model = LogisticRegression()

In [98]:
model.classes = num_class

In [99]:
y = data['stabilityVec'].values

In [None]:
y_resid = []
y_uncer = []
for train_id, test_id in KFold(10, shuffle=True).split(X):
    model.fit(X[train_id], y[train_id])
    yf_pred, yf_std = model.predict(X[test_id], return_std=True)
    y_resid.extend(yf_pred - y[test_id])
    y_uncer.extend(yf_std)

In [100]:
y_resid = []
y_uncer = []
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=8)


In [101]:
model.fit(X_train, y_train)







LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [102]:
yf_pred = model.predict(X_test)

In [109]:
from sklearn.metrics import accuracy_score

In [110]:
print(accuracy_score(y_test, yf_pred))

0.38636363636363635


In [111]:
%%bash
git add -A
git commit -m "Tested using Logistic Regression with full vector prediction"

[feature-lr bac1144] Tested using Logistic Regression with full vector prediction
 1 file changed, 653 insertions(+), 46 deletions(-)
