In [31]:
from collections import defaultdict
import itertools
import os
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from IPython.display import Markdown
from matminer.datasets import load_dataset, get_all_dataset_info
from pymatgen.core import Composition

from modnet.preprocessing import MODData
from modnet.featurizers import MODFeaturizer
from modnet.featurizers.presets import DeBreuck2020Featurizer

In [32]:
# Load the `matbench_expt_gap` dataset and pass every entry of its
# `composition` column through pymatgen's `Composition` constructor, so the
# featurizer below receives Composition objects. Built as a single chained
# expression so re-running the cell always starts from a freshly loaded table.
df = load_dataset('matbench_expt_gap').assign(
    composition=lambda table: table['composition'].map(Composition)
)
df

Unnamed: 0,composition,gap expt
0,"(Ag, Au, S)",0.00
1,"(Ag, W, Br)",0.00
2,"(Ag, Ge, Pb, S)",1.83
3,"(Ag, Ge, Pb, Se)",1.51
4,"(Ag, B, Br)",0.00
...,...,...
4599,"(Zr, Ta, N)",1.72
4600,"(Zr, Te)",0.00
4601,"(Zr, Ti, O)",0.00
4602,"(Zr, Ti, F)",0.00


In [42]:
from matminer.featurizers.composition import ElementProperty

# Take the data source and feature list from matminer's 'matscholar_el'
# preset, then build a second featurizer over the same source/features that
# computes only the 'mean' and 'std_dev' statistics.
matscholar = ElementProperty.from_preset('matscholar_el')
feat = ElementProperty(matscholar.data_source, matscholar.features, stats=['mean', 'std_dev'])

# One row of features per composition, labelled with the featurizer's own
# column names. The rows are given `df`'s index explicitly: the raw output of
# `transform` carries no index, and the later column-wise concat with `df`
# matches rows by index label, so relying on an implicit 0..n-1 index would
# silently misalign rows if `df` were ever filtered or re-indexed.
feat_df = pd.DataFrame(
    feat.transform(df['composition']),
    columns=feat.feature_labels(),
    index=df.index,
).select_dtypes('number')  # keep numeric feature columns only
feat_df

Unnamed: 0,MatscholarElementData mean embedding 1,MatscholarElementData std_dev embedding 1,MatscholarElementData mean embedding 2,MatscholarElementData std_dev embedding 2,MatscholarElementData mean embedding 3,MatscholarElementData std_dev embedding 3,MatscholarElementData mean embedding 4,MatscholarElementData std_dev embedding 4,MatscholarElementData mean embedding 5,MatscholarElementData std_dev embedding 5,...,MatscholarElementData mean embedding 196,MatscholarElementData std_dev embedding 196,MatscholarElementData mean embedding 197,MatscholarElementData std_dev embedding 197,MatscholarElementData mean embedding 198,MatscholarElementData std_dev embedding 198,MatscholarElementData mean embedding 199,MatscholarElementData std_dev embedding 199,MatscholarElementData mean embedding 200,MatscholarElementData std_dev embedding 200
0,-0.003591,0.057427,0.035249,0.083360,0.011725,0.034370,0.021616,0.052416,0.031669,0.045029,...,0.040068,0.048842,0.024804,0.033855,0.111262,0.028340,-0.003313,0.061645,0.054466,0.042503
1,-0.017097,0.001886,-0.011967,0.039746,0.033914,0.019850,0.016558,0.064538,-0.026530,0.031111,...,0.009553,0.009222,0.022442,0.037096,0.002889,0.046974,-0.048617,0.011682,0.010825,0.079961
2,0.026121,0.060369,-0.001658,0.065045,0.030543,0.027779,0.051504,0.023307,-0.013456,0.021123,...,0.025710,0.051473,-0.003740,0.029255,0.081189,0.040458,0.016426,0.053277,0.032452,0.052363
3,0.068723,0.096610,0.059994,0.017914,-0.010385,0.035647,0.053803,0.025920,-0.058548,0.051351,...,0.018905,0.055868,0.023300,0.039493,0.067591,0.041128,-0.085493,0.069807,0.031954,0.052091
4,-0.036843,0.050088,0.028695,0.064734,0.009536,0.045812,0.050019,0.041757,-0.000367,0.051653,...,0.016798,0.018342,0.054368,0.039552,0.075689,0.087409,-0.041892,0.059353,0.063164,0.077328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4599,-0.065811,0.034929,-0.037832,0.022494,0.078252,0.059726,0.007499,0.054294,-0.032627,0.030212,...,0.000830,0.033519,0.007771,0.065592,0.093478,0.044513,0.003102,0.049275,-0.035063,0.090402
4600,0.026611,0.063109,-0.001758,0.064243,0.036091,0.007541,0.039583,0.072248,-0.069230,0.000061,...,-0.015139,0.060341,0.020321,0.062876,0.039449,0.053558,-0.053556,0.083282,0.042594,0.029055
4601,-0.023105,0.017138,-0.065338,0.015014,0.008387,0.027283,-0.065576,0.079565,-0.057071,0.013174,...,0.016886,0.048513,0.021705,0.048863,0.078352,0.009439,-0.031303,0.029489,0.034271,0.034546
4602,-0.010181,0.016565,-0.052636,0.008669,0.085890,0.067159,0.076440,0.140190,0.005393,0.058058,...,-0.013365,0.024270,-0.043544,0.084062,0.044121,0.033574,0.028537,0.051811,0.022088,0.028930


In [49]:
# Join the feature columns with the original table side by side (rows are
# matched on the index), then keep only numeric columns. This drops the
# object-valued `composition` column and leaves the features plus `gap expt`.
combo_df = pd.concat([feat_df, df], axis=1).select_dtypes('number')
display(combo_df)

# Writing to a path whose parent directory is missing raises an error, so
# create the output folder first; this keeps Restart & Run All working on a
# fresh checkout. `exist_ok=True` makes re-running the cell harmless.
out_path = 'datasets/mb_expt_gap.feather'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
combo_df.to_feather(out_path)

Unnamed: 0,MatscholarElementData mean embedding 1,MatscholarElementData std_dev embedding 1,MatscholarElementData mean embedding 2,MatscholarElementData std_dev embedding 2,MatscholarElementData mean embedding 3,MatscholarElementData std_dev embedding 3,MatscholarElementData mean embedding 4,MatscholarElementData std_dev embedding 4,MatscholarElementData mean embedding 5,MatscholarElementData std_dev embedding 5,...,MatscholarElementData std_dev embedding 196,MatscholarElementData mean embedding 197,MatscholarElementData std_dev embedding 197,MatscholarElementData mean embedding 198,MatscholarElementData std_dev embedding 198,MatscholarElementData mean embedding 199,MatscholarElementData std_dev embedding 199,MatscholarElementData mean embedding 200,MatscholarElementData std_dev embedding 200,gap expt
0,-0.003591,0.057427,0.035249,0.083360,0.011725,0.034370,0.021616,0.052416,0.031669,0.045029,...,0.048842,0.024804,0.033855,0.111262,0.028340,-0.003313,0.061645,0.054466,0.042503,0.00
1,-0.017097,0.001886,-0.011967,0.039746,0.033914,0.019850,0.016558,0.064538,-0.026530,0.031111,...,0.009222,0.022442,0.037096,0.002889,0.046974,-0.048617,0.011682,0.010825,0.079961,0.00
2,0.026121,0.060369,-0.001658,0.065045,0.030543,0.027779,0.051504,0.023307,-0.013456,0.021123,...,0.051473,-0.003740,0.029255,0.081189,0.040458,0.016426,0.053277,0.032452,0.052363,1.83
3,0.068723,0.096610,0.059994,0.017914,-0.010385,0.035647,0.053803,0.025920,-0.058548,0.051351,...,0.055868,0.023300,0.039493,0.067591,0.041128,-0.085493,0.069807,0.031954,0.052091,1.51
4,-0.036843,0.050088,0.028695,0.064734,0.009536,0.045812,0.050019,0.041757,-0.000367,0.051653,...,0.018342,0.054368,0.039552,0.075689,0.087409,-0.041892,0.059353,0.063164,0.077328,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4599,-0.065811,0.034929,-0.037832,0.022494,0.078252,0.059726,0.007499,0.054294,-0.032627,0.030212,...,0.033519,0.007771,0.065592,0.093478,0.044513,0.003102,0.049275,-0.035063,0.090402,1.72
4600,0.026611,0.063109,-0.001758,0.064243,0.036091,0.007541,0.039583,0.072248,-0.069230,0.000061,...,0.060341,0.020321,0.062876,0.039449,0.053558,-0.053556,0.083282,0.042594,0.029055,0.00
4601,-0.023105,0.017138,-0.065338,0.015014,0.008387,0.027283,-0.065576,0.079565,-0.057071,0.013174,...,0.048513,0.021705,0.048863,0.078352,0.009439,-0.031303,0.029489,0.034271,0.034546,0.00
4602,-0.010181,0.016565,-0.052636,0.008669,0.085890,0.067159,0.076440,0.140190,0.005393,0.058058,...,0.024270,-0.043544,0.084062,0.044121,0.033574,0.028537,0.051811,0.022088,0.028930,0.00
