In [3]:
%matplotlib inline
%%capture
from matplotlib import pyplot as plt
from matminer.data_retrieval.retrieve_Citrine import CitrineDataRetrieval
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers import composition as cf
from sklearn.svm import S
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from pymatgen import Composition
from scipy.stats import norm
from collections import Counter
from pprint import pprint
import pandas as pd
import numpy as np
import statistics
import os


UsageError: Line magic function `%%capture` not found.


In [92]:
# Load Data
# Compute features
# Run CV
# Report

## Configure

In [93]:
load_path = os.path.join('data','processed_data.csv')
api_key = os.environ.get('CITRINATION_API_KEY')
site = 'https://citrination.com'
config_path = os.path.join('config','config.json')
np.random.seed(8)

## Load data

In [94]:
data = pd.read_csv(load_path)
pprint(f"Loaded {len(data.index)} records.")

'Loaded 25802 records.'


In [95]:
def get_compostion(c):
    """Attempt to parse composition, return None if failed"""
    
    try:
        return Composition(c)
    except:
        return None

In [96]:
data = data.sample(1000)

In [97]:
data['composition'] = data['formula'].apply(get_compostion)

In [98]:
data = data[['composition','stable']]

In [99]:
def check_nobility(row):
    comp = row['composition']
    return comp.contains_element_type('noble_gas')

In [100]:
data['noble'] = data.apply(check_nobility, axis=1)
data = data[data['noble'] == False]

In [101]:
data.reset_index(drop=True, inplace=True)

## Compute features using matminer

In [102]:
f =  MultipleFeaturizer([cf.Stoichiometry(), cf.ElementProperty.from_preset("magpie"),
                         cf.ValenceOrbital(props=['avg']), cf.IonProperty(fast=True)])

In [103]:
%capture
X = np.array(f.featurize_many(data['composition']))

We see that our featurized input does have a non-numeric value. This very likely due to magpie not being able to compute features for noble gases. Lets check if any row containing a noble gas is stable. If we don't see any signal we will drop those rows and re-featurize.

## Run cross validation

In [105]:
model = GaussianNB()

In [106]:
y = data['stable'].values

In [None]:
y_resid = []
y_uncer = []
for train_id, test_id in KFold(10, shuffle=True).split(X):
    model.fit(X[train_id], y[train_id])
    yf_pred, yf_std = model.predict(X[test_id], return_std=True)
    y_resid.extend(yf_pred - y[test_id])
    y_uncer.extend(yf_std)

In [81]:
y_resid = []
y_uncer = []
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=8)


In [82]:
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [84]:
yf_pred = model.predict(X_test)

In [85]:
y_resid.extend(yf_pred - y_test)

In [86]:
rmse = sum([i**2 for i in y_resid])**0.5/len(y_resid)

In [87]:
rmse

0.020890699963649756

In [88]:
from sklearn.metrics import accuracy_score

In [89]:
print(accuracy_score(y_test, yf_pred))

0.8450704225352113


In [90]:
%%bash
git add -A
git commit -m "Tested using naieve bayes"

[feature-sklearn-nb dd776fc] Tested using naieve bayes
 2 files changed, 171 insertions(+), 130 deletions(-)
