In [16]:
%matplotlib inline
from matplotlib import pyplot as plt
from matminer.data_retrieval.retrieve_Citrine import CitrineDataRetrieval
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers import composition as cf
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from pymatgen import Composition
from scipy.stats import norm
from collections import Counter
from pprint import pprint
import pandas as pd
import numpy as np
import statistics
import os
%%capture

UsageError: Line magic function `%%capture` not found.


In [17]:
# Load Data
# Compute features
# Run CV
# Report

## Configure

In [18]:
load_path = os.path.join('data','processed_data.csv')
api_key = os.environ.get('CITRINATION_API_KEY')
site = 'https://citrination.com'
config_path = os.path.join('config','config.json')
np.random.seed(8)

## Load data

In [19]:
data = pd.read_csv(load_path)
pprint(f"Loaded {len(data.index)} records.")

'Loaded 25802 records.'


In [20]:
def get_compostion(c):
    """Attempt to parse composition, return None if failed"""
    
    try:
        return Composition(c)
    except:
        return None

In [21]:
data = data.sample(1000)

In [22]:
data['composition'] = data['formula'].apply(get_compostion)

In [23]:
data = data[['composition','stable']]

In [66]:
def check_nobility(row):
    comp = row['composition']
    return comp.contains_element_type('noble_gas')

In [69]:
data['noble'] = data.apply(check_nobility, axis=1)
data = data[data['noble'] == False]

In [24]:
data.reset_index(drop=True, inplace=True)

### Debugging
It looks like our input has either NaN's or values that are too large.

In [35]:
# we can use the following line to check for NaN
data.composition.isna().value_counts()

False    1000
Name: composition, dtype: int64

#### Checking stability of noble compounds using pymatgen
pymatgen composition objects provide the class method `get_element_type` which we can provide an acceptable class to check if a noble gas is present. We can use this as a boolean indexer to slice the data.


In [51]:
data.composition[0].contains_element_type('noble_gas')

False

In [48]:
%%bash
git add -A
git commit -m "Debugging noble gasses"

[feature-sklearn-nb c97d0fd] Debugging noble gasses
 2 files changed, 882 insertions(+)
 create mode 100644 .ipynb_checkpoints/2_naive_bayes-checkpoint.ipynb
 create mode 100644 2_naive_bayes.ipynb


## Compute features using matminer

In [26]:
f =  MultipleFeaturizer([cf.Stoichiometry(), cf.ElementProperty.from_preset("magpie"),
                         cf.ValenceOrbital(props=['avg']), cf.IonProperty(fast=True)])

In [27]:
%%capture
X = np.array(f.featurize_many(data['composition']))

### Debugging
It looks like our input has either NaN's or values that are too large.

In [44]:
np.isnan(X).any()

True

We see that our featurized input does have a non-numeric value. This very likely due to magpie not being able to compute features for noble gases. Lets check if any row containing a noble gas is stable. If we don't see any signal we will drop those rows and re-featurize.

## Run cross validation

In [29]:
model = GaussianNB()

In [30]:
y = data['stable'].values

In [31]:
y[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [None]:
y_resid = []
y_uncer = []
for train_id, test_id in KFold(8, shuffle=True).split(X):
    model.fit(X[train_id], y[train_id])
    yf_pred, yf_std = model.predict(X[test_id], return_std=True)
    y_resid.extend(yf_pred - y[test_id])
    y_uncer.extend(yf_std)

In [13]:
y_resid = []
y_uncer = []
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=8)


In [14]:
model.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [17]:
yf_pred, yf_std = model.predict(X_test, return_std=True)

In [19]:
y_resid.extend(yf_pred - y_test)
y_uncer.extend(yf_std)

In [20]:
rmse = sum([i**2 for i in y_resid])**0.5/len(y_resid)

In [21]:
rmse

0.014233593747617724