# MODNet 'matbench_log_gvrh' and 'matbench_log_kvrh' benchmarking

Multi-target benchmarking of the ~11,000 entry elastic properties datasets provided by matminer.

In [1]:
from collections import defaultdict
import itertools
import os
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from matminer.datasets import load_dataset
from pymatgen.core import Composition

from IPython.display import Markdown
from modnet.preprocessing import MODData
from modnet.models import MODNetModel
from modnet.featurizers import MODFeaturizer
from modnet.featurizers.presets import DeBreuck2020Featurizer

# Expose no CUDA devices to this process (empty device list), so the run stays on the CPU.
# NOTE(review): this is set after the modnet imports above — confirm the deep-learning
# backend has not already initialised CUDA by the time this line runs.
os.environ["CUDA_VISIBLE_DEVICES"] = ""


If you use the ChemEnv tool for your research, please consider citing the following reference(s) :
David Waroquiers, Xavier Gonze, Gian-Marco Rignanese, Cathrin Welker-Nieuwoudt, Frank Rosowski,
Michael Goebel, Stephan Schenk, Peter Degelmann, Rute Andre, Robert Glaum, and Geoffroy Hautier,
"Statistical analysis of coordination environments in oxides",
Chem. Mater., 2017, 29 (19), pp 8346-8360,
DOI: 10.1021/acs.chemmater.7b02766



In [None]:
# Render the README located next to this notebook as the output of this cell.
Markdown(filename="./README.md")

## Data exploration

In [None]:
# Fetch the two matbench elastic-property datasets through matminer.
# Later cells read the "log10(G_VRH)" column from the first frame and the
# "log10(K_VRH)" column from the second one.
df_gvrh, df_kvrh = (
    load_dataset(dataset_name)
    for dataset_name in ("matbench_log_gvrh", "matbench_log_kvrh")
)

In [None]:
# Merge both elastic-property datasets into one frame.
# Work on an explicit copy so that adding columns can never modify `df_gvrh`.
df_combined = df_gvrh.copy()

# Column assignment aligns on the index; if the two frames did not share it,
# pandas would silently fill the unmatched rows with NaN.
# NOTE(review): equal indices do not prove that row i describes the same
# structure in both datasets — confirm against the matbench documentation.
assert df_gvrh.index.equals(df_kvrh.index), "gvrh/kvrh datasets are not index-aligned"

# Keep K under its original matbench column name as well: the plotting and
# MODData cells further down read df_combined["log10(K_VRH)"] and would raise
# a KeyError if only the short alias existed.
df_combined["log10(K_VRH)"] = df_kvrh["log10(K_VRH)"]

# Short aliases for both targets.
df_combined["logK"] = df_kvrh["log10(K_VRH)"]
df_combined["logG"] = df_gvrh["log10(G_VRH)"]

In [None]:
# Export every structure as a plain dict so that it can be written to a CSV file.
# The frame is built in a single expression instead of adding a column to a
# slice of `df_combined`, which is what triggers pandas' SettingWithCopyWarning.
# The result is the same: the original index plus one "structure_dict" column.
df_temp = (
    df_combined["structure"]
    .apply(lambda structure: structure.as_dict())
    .to_frame(name="structure_dict")
)
df_temp.to_csv("df_GK_struc.csv")

### Target space

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_combined.describe()

In [None]:
# Distribution of each target, followed by their joint distribution.
# Values are read from the "logG"/"logK" columns, which the cell building
# `df_combined` always creates; the axis labels keep the matbench names.
for axis_label, column in (("log10(G_VRH)", "logG"), ("log10(K_VRH)", "logK")):
    fig, ax = plt.subplots(facecolor="w")
    ax.hist(df_combined[column], bins=100, density=True)
    # density=True normalises the histogram to unit area, so the y axis is a
    # probability density and not a raw count/frequency.
    ax.set_ylabel("Probability density")
    ax.set_xlabel(axis_label)

# Joint distribution; the low alpha keeps dense regions readable.
fig, ax = plt.subplots(facecolor="w")
ax.scatter(df_combined["logG"], df_combined["logK"], alpha=0.1)
ax.set_xlabel("log10(G_VRH)")
ax.set_ylabel("log10(K_VRH)");

## Featurization and feature selection

In [2]:
# Location of the cached, already featurized MODData object.
PRECOMPUTED_MODDATA = "./precomputed/elastic_benchmark_moddata_MPCNMI.pkl.gz"

if os.path.isfile(PRECOMPUTED_MODDATA):
    # NOTE(review): the ".pkl.gz" extension suggests a pickle-based format —
    # only load cache files that were produced by a trusted run.
    data = MODData.load(PRECOMPUTED_MODDATA)
else:
    # Build the target table from the columns that `df_combined` is known to
    # contain ("logG"/"logK") and give them the matbench names, so that a
    # freshly computed object uses the same target names as the cached one.
    targets = df_combined[["logG", "logK"]].rename(
        columns={"logG": "log10(G_VRH)", "logK": "log10(K_VRH)"}
    )
    data = MODData(
        structures=df_combined["structure"].tolist(),
        targets=targets,
        target_names=["log10(G_VRH)", "log10(K_VRH)"],
        featurizer=DeBreuck2020Featurizer(n_jobs=8),
    )
    data.featurize()
    # As this is a small data/feature set, order all features
    data.feature_selection(n=-1)
    # Persist the result; otherwise the cache checked above is never created
    # and the expensive featurization is repeated on every run.
    os.makedirs(os.path.dirname(PRECOMPUTED_MODDATA), exist_ok=True)
    data.save(PRECOMPUTED_MODDATA)

INFO:root:Loaded <modnet.preprocessing.MODData object at 0x7fcb34ad6730> object, created with modnet version 0.1.8~develop


In [9]:
# Shorten the target names used throughout the rest of the notebook.
TARGET_RENAMES = {"log10(G_VRH)": "logG", "log10(K_VRH)": "logK"}
data.df_targets.rename(columns=TARGET_RENAMES, inplace=True)

# Keep the per-target feature lists reachable under the new names as well.
# The notebook adds these aliases only in a cell located after the benchmark
# cell, so a top-to-bottom run reaches the benchmark with the old keys only.
# NOTE(review): presumably this is what causes the recorded KeyError: 'logG' — confirm.
features_by_target = getattr(data, "optimal_features_by_target", None)
if features_by_target:
    for old_name, new_name in TARGET_RENAMES.items():
        if old_name in features_by_target:
            features_by_target[new_name] = features_by_target[old_name]

In [None]:
#data.num_classes = {"log10(G_VRH)":0, "log10(K_VRH)":0}
#data.feature_selection(n=-1)
#data.save("./precomputed/elastic_benchmark_moddata_MPCNMI.pkl.gz")

In [None]:
#data.save("./precomputed/elastic_benchmark_moddata_MPCNMI.pkl.gz")

In [12]:
# Inspect the target table (pandas truncates the display of long frames).
data.df_targets

Unnamed: 0,logG,logK
id0,1.447158,1.707570
id1,1.518514,1.633468
id2,1.740363,1.908485
id3,1.707570,2.117271
id4,1.602060,1.690196
...,...,...
id10982,1.414973,1.778151
id10983,1.431364,1.724276
id10984,1.000000,1.342423
id10985,1.579784,1.770852


In [22]:
# List the current target names, i.e. the columns of the target table.
# (The original cell contained the incomplete expression `data.`, which is a
# SyntaxError; this expression yields the list shown in the recorded output.)
list(data.df_targets.columns)

['logG', 'logK']

## Training

With only ~11,000 entries, this is a relatively small dataset, so we must make judicious use of the data available. First, let's generate test folds according to [matbench's suggestions](https://hackingmaterials.lbl.gov/automatminer/datasets.html#benchmarking-and-reporting-your-algorithm) and benchmark the model on them:

In [18]:
from modnet.matbench.benchmark import matbench_benchmark

# Settings handed to the benchmark run as its fourth positional argument.
best_settings = dict(
    increase_bs=False,
    num_neurons=[[256], [64], [64], [32]],
    n_feat=350,
    lr=0.005,
    epochs=200,
    verbose=0,
    act="elu",
    batch_size=64,
    loss="mae",
)

# NOTE(review): the nested list groups the two targets and the dict maps each
# target to 1 — presumably the target hierarchy and per-target weights; confirm
# against the signature of matbench_benchmark.
results = matbench_benchmark(
    data,
    [[["logG"], ["logK"]]],
    {"logG": 1, "logK": 1},
    best_settings,
    multi_target=True,
    save_folds=True,
)

# Average the scores over the entries at positions 0, 1, 2 and 4 only;
# position 3 is left out of the mean.
np.mean(
    np.array(results["scores"])[[0, 1, 2, 4]],
    axis=0,
)

INFO:root:Loaded <modnet.preprocessing.MODData object at 0x7fca142e8550> object, created with modnet version 0.1.8
INFO:root:Data successfully saved as folds/train_moddata_f1!
INFO:root:Training preset #1/24: {'batch_size': 32, 'lr': 0.01, 'n_feat': 70, 'num_neurons': [[140], [70, 70], [17], [17]], 'epochs': 1000, 'loss': 'mae', 'act': 'elu'}


KeyError: 'logG'

In [17]:
# Make the per-target feature lists stored under the original matbench target
# names available under the shortened names too.
for short_name, matbench_name in (("logG", "log10(G_VRH)"), ("logK", "log10(K_VRH)")):
    data.optimal_features_by_target[short_name] = data.optimal_features_by_target[matbench_name]

In [None]:
# Mean absolute K and G errors per test fold, read from the per-fold CSV files.
errors_K, errors_G = [], []

for f in range(5):
    # Files are numbered from 1, the loop index from 0.
    df = pd.read_csv('folds/test_f{}.csv'.format(f + 1), index_col=0)

    if f == 3:
        # Fourth fold: show its five largest absolute K errors, then remove
        # entry 'id6951' before any statistics are computed for this fold.
        print(df['K_error'].abs().nlargest(5))
        df = df.drop(index='id6951')

    print(df['K_error'].abs().describe())
    errors_K.append(df['K_error'].abs().mean())
    errors_G.append(df['G_error'].abs().mean())

# Averages of the per-fold means: K first, then G.
print(np.mean(errors_K))
print(np.mean(errors_G))

In [None]:
# Formula of entry 'id6951', the one removed from the fourth fold in the error analysis.
data.df_structure.loc['id6951', 'structure'].formula

In [None]:
# Loss curve of each of the five models on a single, labelled figure.
SKIPPED_EPOCHS = 50  # leading entries of every loss history that are not plotted
fold_colours = ['b', 'k', 'r', 'g', 'y']

fig, ax = plt.subplots(facecolor="w")
for i, c in enumerate(fold_colours):
    loss = results["models"][i].history.history["loss"][SKIPPED_EPOCHS:]
    # Plot against explicit x values: without them the curve would start at
    # x=0 although the first SKIPPED_EPOCHS entries were cut off.
    # NOTE(review): assumes the history holds one loss value per epoch — confirm.
    ax.plot(
        range(SKIPPED_EPOCHS, SKIPPED_EPOCHS + len(loss)),
        loss,
        c=c,
        label="fold {}".format(i + 1),
    )
ax.set_xlabel("Epoch")
ax.set_ylabel("Training loss")
ax.legend();