In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd; pd.set_option('mode.chained_assignment',None)

data_class = pd.read_csv('mb_expt_is_metal_magpie.csv', index_col=0) # the matbench classification task dataset is slightly different

data_reg = pd.read_csv('mb_expt_gap_magpie.csv', index_col=0)

In [2]:
# get lasso features saved in csv and append to regression dataframe
lasso_fea = pd.read_csv('lasso_magpie.csv', index_col=0)['significant features']
data_reg_magpie_lasso = data_reg.iloc[:, 3:][data_reg.iloc[:, 3:].columns.intersection(lasso_fea)]
data_reg = pd.concat([data_reg.iloc[:, :3], data_reg_magpie_lasso], axis=1)
print(data_reg.columns[3:]) # print the 15 LASSO features

Index(['MagpieData maximum Number', 'MagpieData mean MeltingT',
       'MagpieData mean Row', 'MagpieData maximum Electronegativity',
       'MagpieData mean Electronegativity',
       'MagpieData avg_dev Electronegativity', 'MagpieData maximum NpValence',
       'MagpieData maximum NdValence', 'MagpieData range NdUnfilled',
       'MagpieData avg_dev NdUnfilled', 'MagpieData mean NUnfilled',
       'MagpieData range GSvolume_pa', 'MagpieData avg_dev GSvolume_pa',
       'MagpieData maximum GSmagmom', 'MagpieData range GSmagmom'],
      dtype='object')


In [3]:
# append lasso features to classification dataframe
data_class_magpie_lasso = data_class.iloc[:, 3:][data_class.iloc[:, 3:].columns.intersection(lasso_fea)]
data_class = pd.concat([data_class.iloc[:, :3], data_class_magpie_lasso], axis=1)
data_class

Unnamed: 0,formula,label,composition,MagpieData maximum Number,MagpieData mean MeltingT,MagpieData mean Row,MagpieData maximum Electronegativity,MagpieData mean Electronegativity,MagpieData avg_dev Electronegativity,MagpieData maximum NpValence,MagpieData maximum NdValence,MagpieData range NdUnfilled,MagpieData avg_dev NdUnfilled,MagpieData mean NUnfilled,MagpieData range GSvolume_pa,MagpieData avg_dev GSvolume_pa,MagpieData maximum GSmagmom,MagpieData range GSmagmom
0,Ag(AuS)2,True,Ag1 Au2 S2,79.0,937.262000,4.600000,2.58,2.434000,0.201600,4.0,10.0,0.0,0.000000,1.400000,9.456875,4.420900,0.000000,0.000000
1,Ag(W3Br7)2,True,Ag1 W6 Br14,74.0,1291.720476,4.619048,2.96,2.739524,0.293968,5.0,10.0,6.0,2.448980,2.428571,13.430000,5.951111,0.000000,0.000000
2,Ag0.5Ge1Pb1.75S4,False,Ag0.5 Ge1 Pb1.75 S4,82.0,611.499655,4.000000,2.58,2.396207,0.202806,4.0,10.0,0.0,0.000000,2.689655,11.780000,1.875196,0.000000,0.000000
3,Ag0.5Ge1Pb1.75Se4,False,Ag0.5 Ge1 Pb1.75 Se4,82.0,669.783793,4.551724,2.55,2.379655,0.187967,4.0,10.0,0.0,0.000000,2.689655,11.780000,1.905589,0.000000,0.000000
4,Ag2BBr,True,Ag2 B1 Br1,47.0,1270.915000,4.000000,2.96,2.215000,0.372500,5.0,10.0,0.0,0.000000,2.000000,22.307500,6.075938,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4916,ZrTaN3,False,Zr1 Ta1 N3,73.0,1121.430000,3.400000,3.04,2.390000,0.780000,3.0,3.0,8.0,3.600000,4.800000,8.426250,2.826600,0.000000,0.000000
4917,ZrTe,True,Zr1 Te1,52.0,1425.330000,5.000000,2.10,1.715000,0.385000,4.0,10.0,8.0,4.000000,5.000000,11.568333,5.784167,0.000000,0.000000
4918,ZrTi2O,True,Zr1 Ti2 O1,40.0,1516.200000,3.750000,3.44,1.962500,0.738750,4.0,2.0,8.0,3.000000,6.500000,14.090000,3.657500,0.000023,0.000023
4919,ZrTiF6,True,Zr1 Ti1 F6,40.0,548.750000,2.625000,3.98,3.343750,0.954375,5.0,2.0,8.0,3.000000,2.750000,13.487500,3.838125,0.000023,0.000023


In [4]:
from sklearn.model_selection import GridSearchCV, cross_validate, KFold, StratifiedKFold

In [5]:
# Use predefined CV splits found on Matbench's github
import json
with open('matbench_v0.1_validation.json', 'r') as f:
  mb_split = json.load(f)
mb_split_df = pd.DataFrame(data=mb_split)

mb_split_expt_gap = mb_split_df.loc['matbench_expt_gap', :]['splits']
mb_split_expt_is_metal = mb_split_df.loc['matbench_expt_is_metal', :]['splits']

In [6]:
# Custom CV iterator using Matbench CV splits JSON
mb_expt_gap_cv_outer = []
for i in range(5):
    train_idx = [int(x[12:])-1 for x in mb_split_expt_gap[f'fold_{i}']['train']]
    test_idx = [int(x[12:])-1 for x in mb_split_expt_gap[f'fold_{i}']['test']]
    mb_expt_gap_cv_outer.append((train_idx, test_idx))

mb_expt_is_metal_cv_outer = []
for i in range(5):
    train_idx = [int(x[17:])-1 for x in mb_split_expt_is_metal[f'fold_{i}']['train']]
    test_idx = [int(x[17:])-1 for x in mb_split_expt_is_metal[f'fold_{i}']['test']]
    mb_expt_is_metal_cv_outer.append((train_idx, test_idx))  

In [7]:
# use helper function written a separate file to identify and label compound types
from helper import add_type

data_class_typed, types = add_type(data_class)
data_class_typed.head()

86 nitrides in dataset
677 oxides in dataset
211 borides in dataset
318 halidess in dataset
1716 chalocogenidess in dataset
270 phosphides in dataset
94 phosphates in dataset
1549 others in dataset


Unnamed: 0,formula,label,composition,type,MagpieData maximum Number,MagpieData mean MeltingT,MagpieData mean Row,MagpieData maximum Electronegativity,MagpieData mean Electronegativity,MagpieData avg_dev Electronegativity,MagpieData maximum NpValence,MagpieData maximum NdValence,MagpieData range NdUnfilled,MagpieData avg_dev NdUnfilled,MagpieData mean NUnfilled,MagpieData range GSvolume_pa,MagpieData avg_dev GSvolume_pa,MagpieData maximum GSmagmom,MagpieData range GSmagmom
0,Ag(AuS)2,True,Ag1 Au2 S2,chalocogenides,79.0,937.262,4.6,2.58,2.434,0.2016,4.0,10.0,0.0,0.0,1.4,9.456875,4.4209,0.0,0.0
1,Ag(W3Br7)2,True,Ag1 W6 Br14,halides,74.0,1291.720476,4.619048,2.96,2.739524,0.293968,5.0,10.0,6.0,2.44898,2.428571,13.43,5.951111,0.0,0.0
2,Ag0.5Ge1Pb1.75S4,False,Ag0.5 Ge1 Pb1.75 S4,chalocogenides,82.0,611.499655,4.0,2.58,2.396207,0.202806,4.0,10.0,0.0,0.0,2.689655,11.78,1.875196,0.0,0.0
3,Ag0.5Ge1Pb1.75Se4,False,Ag0.5 Ge1 Pb1.75 Se4,chalocogenides,82.0,669.783793,4.551724,2.55,2.379655,0.187967,4.0,10.0,0.0,0.0,2.689655,11.78,1.905589,0.0,0.0
4,Ag2BBr,True,Ag2 B1 Br1,halides,47.0,1270.915,4.0,2.96,2.215,0.3725,5.0,10.0,0.0,0.0,2.0,22.3075,6.075938,0.0,0.0


In [8]:
x_class = data_class_typed.iloc[:, 4:]
y_class = data_class_typed['label']

In [33]:
from sklearn.pipeline import make_pipeline
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict

In [74]:
from matbench.bench import MatbenchBenchmark

mb = MatbenchBenchmark(autoload=False)
is_metal = mb.matbench_expt_is_metal


# train and validate your model
pipeline_rfc = make_pipeline(StandardScaler(), RandomForestClassifier(max_depth= None, n_estimators = 500, random_state=123))
# the inner loop of the nested CV will be StratifiedKFold as per Matbench paper
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=18012019)

UT_TD = cross_validate(pipeline_rfc, x_class, y_class, cv = mb_expt_is_metal_cv_outer,scoring=['roc_auc', 'f1', 'balanced_accuracy'], return_estimator=True)

# Get testing data



# Predict w/model using CV-predict
predictions = cross_val_predict(pipeline_rfc, x_class, y_class, cv = mb_expt_is_metal_cv_outer)
predictions = pd.DataFrame(predictions)
# Combine test/train splits with predictions data frame to create fold, predictions in matbench example


2022-05-16 17:47:35 INFO     Initialized benchmark 'matbench_v0.1' with 13 tasks: 
['matbench_dielectric',
 'matbench_expt_gap',
 'matbench_expt_is_metal',
 'matbench_glass',
 'matbench_jdft2d',
 'matbench_log_gvrh',
 'matbench_log_kvrh',
 'matbench_mp_e_form',
 'matbench_mp_gap',
 'matbench_mp_is_metal',
 'matbench_perovskites',
 'matbench_phonons',
 'matbench_steels']


AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [84]:
# Combine test/train splits with predictions data frame to create fold, predictions in matbench example
prediction = pd.DataFrame()
predictions = pd.DataFrame(predictions)
# Combine test/train splits with predictions data frame to create fold, predictions in matbench example
df = pd.DataFrame(mb_expt_is_metal_cv_outer, columns = ['train', 'test'])
for fold in range (0,4):
    idx = df['test'][fold]
    prediction = predictions.iloc[idx]
    print(prediction)
    is_metal.record(fold, prediction)

          0
3     False
17    False
20     True
21     True
24    False
...     ...
4907  False
4910   True
4911   True
4914   True
4918   True

[985 rows x 1 columns]
2022-05-16 17:51:48 ERROR    Fold number 0 already recorded! Aborting record...
          0
1      True
2     False
6     False
7     False
8     False
...     ...
4894   True
4896   True
4901   True
4902   True
4912  False

[984 rows x 1 columns]
2022-05-16 17:51:48 ERROR    Fold number 1 already recorded! Aborting record...
          0
5      True
12    False
14    False
19    False
34     True
...     ...
4905   True
4909   True
4917   True
4919   True
4920   True

[984 rows x 1 columns]
2022-05-16 17:51:48 ERROR    Fold number 2 already recorded! Aborting record...
          0
16    False
18    False
31    False
36    False
41    False
...     ...
4895   True
4903   True
4906   True
4908  False
4916   True

[984 rows x 1 columns]
2022-05-16 17:51:48 ERROR    Fold number 3 already recorded! Aborting record...


In [85]:
is_metal.to_file("UT_TD_benchmark.json.gz")

2022-05-16 17:54:56 INFO     Successfully wrote MatbenchTask to file 'UT_TD_benchmark.json.gz'.
