In [22]:
%matplotlib inline
from matplotlib import pyplot as plt
from matminer.data_retrieval.retrieve_Citrine import CitrineDataRetrieval
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers import composition as cf
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from pymatgen import Composition
from scipy.stats import norm
from collections import Counter
from pprint import pprint
import pandas as pd
import numpy as np
import statistics
import os


In [23]:
# Load Data
# Compute features
# Run CV
# Report

## Configure

In [24]:
load_path = os.path.join('data','processed_data.csv')
api_key = os.environ.get('CITRINATION_API_KEY')
site = 'https://citrination.com'
config_path = os.path.join('config','config.json')
np.random.seed(8)

## Load data

In [25]:
data = pd.read_csv(load_path)
pprint(f"Loaded {len(data.index)} records.")

'Loaded 25802 records.'


In [26]:
def get_compostion(c):
    """Attempt to parse composition, return None if failed"""
    
    try:
        return Composition(c)
    except:
        return None

In [27]:
data = data.sample(1000)

In [28]:
data['composition'] = data['formula'].apply(get_compostion)

In [29]:
data = data[['composition','stable']]

In [30]:
def check_nobility(row):
    comp = row['composition']
    return comp.contains_element_type('noble_gas')

In [31]:
data['noble'] = data.apply(check_nobility, axis=1)
data = data[data['noble'] == False]

In [32]:
data.reset_index(drop=True, inplace=True)

## Compute features using matminer

In [33]:
f =  MultipleFeaturizer([cf.Stoichiometry(), cf.ElementProperty.from_preset("magpie"),
                         cf.ValenceOrbital(props=['avg']), cf.IonProperty(fast=True)])

In [34]:
%%capture
X = np.array(f.featurize_many(data['composition']))

We see that our featurized input does have a non-numeric value. This very likely due to magpie not being able to compute features for noble gases. Lets check if any row containing a noble gas is stable. If we don't see any signal we will drop those rows and re-featurize.

## Run cross validation

In [35]:
model = SVC()

In [36]:
y = data['stable'].values

In [None]:
y_resid = []
y_uncer = []
for train_id, test_id in KFold(10, shuffle=True).split(X):
    model.fit(X[train_id], y[train_id])
    yf_pred, yf_std = model.predict(X[test_id], return_std=True)
    y_resid.extend(yf_pred - y[test_id])
    y_uncer.extend(yf_std)

In [37]:
y_resid = []
y_uncer = []
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=8)


In [38]:
model.fit(X_train, y_train)





SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [39]:
yf_pred = model.predict(X_test)

In [40]:
y_resid.extend(yf_pred - y_test)

In [41]:
rmse = sum([i**2 for i in y_resid])**0.5/len(y_resid)

In [42]:
rmse

0.0449438202247191

In [43]:
from sklearn.metrics import accuracy_score

In [44]:
print(accuracy_score(y_test, yf_pred))

0.8202247191011236


In [45]:
%%bash
git add -A
git commit -m "Tested using SVM"

[feature-svm 1f4a7e2] Tested using SVM
 2 files changed, 899 insertions(+)
 create mode 100644 .ipynb_checkpoints/2_svm-checkpoint.ipynb
 create mode 100644 2_svm.ipynb
