# MODNet 'matbench_phonons' benchmarking

In [None]:
from collections import defaultdict
import itertools
import os
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from IPython.display import Markdown
from matminer.datasets import load_dataset
from pymatgen.core import Composition

from modnet.preprocessing import MODData
from modnet.models import MODNetModel
from modnet.featurizers import MODFeaturizer
from modnet.featurizers.presets import DeBreuck2020Featurizer

# Restrict the run to a single GPU, but only when nothing upstream (the
# shell, a job scheduler, ...) has already chosen the visible devices.
# An unconditional assignment would silently override that choice; "3" stays
# the fallback device index.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "3")

In [None]:
# Render the README.md from the current directory as this cell's output.
Markdown(filename="./README.md")

## Data exploration

In [None]:
# Load the "matbench_phonons" dataset through matminer; later cells read its
# "structure" and "last phdos peak" columns.
df = load_dataset("matbench_phonons")

In [None]:
# Show which columns the loaded dataset provides.
df.columns

### Target space

In [None]:
# Summary statistics of the dataset's columns, to get a feel for the target range.
df.describe()

In [None]:
# Distribution of the regression target over the whole dataset.
fig, ax = plt.subplots(facecolor="w")
# density=True normalises the histogram to unit area, so the y-axis shows a
# probability density rather than a count. "Frequency" was also ambiguous
# here because the quantity on the x-axis is itself a phonon frequency.
ax.hist(df["last phdos peak"], bins=100, density=True)
ax.set_ylabel("Density")
# The trailing semicolon keeps the notebook from echoing the returned Text object.
ax.set_xlabel("Last PhDOS peak");

## Featurization and feature selection

First, we define some convenience classes that wrap composition data in a fake structure container, and we define a composition-only featurizer preset based on `DeBreuck2020Featurizer`.

In [None]:
# Location of the cached, already featurized MODData object. When the file
# exists it is loaded instead of recomputing the features.
PRECOMPUTED_MODDATA = "./precomputed/phonon_benchmark_moddata.pkl.gz"

if os.path.isfile(PRECOMPUTED_MODDATA):
    data = MODData.load(PRECOMPUTED_MODDATA)
else:
    # Create the cache directory *before* the expensive steps: a missing
    # folder should not surface only when data.save() runs at the very end
    # and throw away the featurization work.
    os.makedirs(os.path.dirname(PRECOMPUTED_MODDATA), exist_ok=True)

    data = MODData(
        structures=df["structure"].tolist(),
        targets=df["last phdos peak"].tolist(),
        target_names=["last phdos peak"],
        featurizer=DeBreuck2020Featurizer(n_jobs=8),
    )
    data.featurize()
    # NOTE(review): n=-1 presumably asks for a ranking of all features
    # rather than a fixed-size subset -- confirm against the MODData docs.
    data.feature_selection(n=-1)
    data.save(PRECOMPUTED_MODDATA)

In [None]:
#data.optimal_features=None
#data.cross_nmi = None
#data.num_classes = {"w":0}
#data.feature_selection(n=-1)
#data.save("./precomputed/phonon_benchmark_moddata_MPCNMI.pkl.gz")

## Training

In [None]:
# Import the shared benchmark helpers only once per kernel session: if
# `plot_benchmark` is already defined, the star import below has been run.
try:
    plot_benchmark
except NameError:
    # Only a missing name means "not imported yet". A bare `except:` would
    # also swallow KeyboardInterrupt and unrelated errors.
    import sys
    sys.path.append('..')
    from modnet_matbench.utils import *
from sklearn.model_selection import KFold
from modnet.models import MODNetModel
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Give the single target column the short name "w" used in the
# matbench_benchmark call below.
data.df_targets.rename(columns={data.target_names[0]: "w"}, inplace=True)

# Hyperparameters handed to matbench_benchmark.
# NOTE(review): the list below is presumably an earlier `num_neurons`
# setting kept for reference -- confirm or drop.
# [[512], [128], [32], [16]]
best_settings = {
    "increase_bs": True,
    "num_neurons": [[512], [128], [64], [64]],
    "n_feat": 280,
    "lr": 0.005,
    "epochs": 800,
    "act": "elu",
    "batch_size": 64,
    "loss": "mae",
}

results = matbench_benchmark(data, [[["w"]]], {"w": 1}, best_settings, save_folds=True)
# Mean of the scores returned by the benchmark, shown as the cell output.
np.mean(results['scores'])

In [None]:
# Hyperparameter set for the MODNet benchmark, built with keyword arguments;
# keys, values and insertion order match the literal used for the benchmark
# call in the training cell.
best_settings = dict(
    increase_bs=True,
    num_neurons=[[512], [128], [64], [64]],
    n_feat=280,
    lr=0.005,
    epochs=800,
    act="elu",
    batch_size=64,
    loss="mae",
)

In [None]:
# Parity plot: predictions against targets, coloured by split, with a single
# regression line drawn over all points.
# NOTE(review): `reg_df` and `sns` are not defined in the visible cells;
# presumably they come from the star import or an earlier session -- confirm.
fig, ax = plt.subplots()
sns.scatterplot(
    data=reg_df,
    x="targets",
    y="predictions",
    hue="split",
    palette="Dark2",
    ax=ax,
    alpha=0.5,
)
sns.regplot(
    data=reg_df,
    x="targets",
    y="predictions",
    ax=ax,
    scatter=False,
)
ax.set_xlabel("True")
ax.set_ylabel("Pred.")

In [None]:
# Joint distribution of prediction errors and predictions, with per-split
# marginal densities.
# alpha=0.0 hides the scatter drawn by jointplot itself; the visible layers
# are added explicitly below.
# `fill` replaces the deprecated `shade` keyword of kdeplot (same effect:
# unfilled marginal curves).
g = sns.jointplot(
    data=reg_df,
    x="errors",
    y="predictions",
    hue="split",
    palette="Dark2",
    alpha=0.0,
    marginal_kws={"fill": False},
)
# hue=None presumably overrides the grid-level hue so that all points are
# drawn in plain black.
g.plot_joint(sns.scatterplot, hue=None, c="black", s=5, alpha=0.8)
# NOTE(review): color="split" is not a colour name; presumably hue="split"
# was meant (the grid's hue already appears to be forwarded) -- confirm.
g.plot_joint(sns.kdeplot, color="split", zorder=0, levels=5, alpha=0.5)

In [None]:
# Bivariate density contours of predictions against targets, one set per split.
# `fill=False` replaces the deprecated `shade=False` (unfilled contours).
sns.kdeplot(
    data=reg_df,
    x="targets",
    y="predictions",
    hue="split",
    fill=False,
    levels=3,
    palette="Dark2",
    alpha=0.5,
)