# MODNet 'matbench_dielectric' benchmarking

In [None]:
from collections import defaultdict
import itertools
import os
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from matminer.datasets import load_dataset
from pymatgen.core import Composition

from modnet.preprocessing import MODData
from modnet.models import MODNetModel
from modnet.featurizers import MODFeaturizer
from modnet.featurizers.presets import DeBreuck2020Featurizer

## Data exploration

In [None]:
# Load the "matbench_dielectric" dataset through matminer into `df`.
df = load_dataset("matbench_dielectric")
# Disabled step, kept for reference only: the cells below only read the
# `structure` and `n` columns of `df`.
# df["composition"] = df["composition"].map(Composition)

In [None]:
from matminer.datasets import get_all_dataset_info

# Fetch matminer's metadata for the dataset used throughout this notebook.
info = get_all_dataset_info("matbench_dielectric")

# Print the metadata; `info` stays available to later cells.
print(info)

In [None]:
# Show the column labels of the loaded dataset.
df.columns

### Target space

In [None]:
# Show summary statistics of the loaded dataset.
df.describe()

In [None]:
# Distribution of the target column `n` over the whole dataset.
fig, ax = plt.subplots(facecolor="w")
# density=True normalises the histogram so the bar areas sum to 1; the y-axis
# is therefore a probability density, not a raw count/frequency.
ax.hist(df["n"], bins=100, density=True)
ax.set_ylabel("Probability density")
ax.set_xlabel("n")

In [None]:
# Store a dict representation of every structure (via its `as_dict()` method)
# in a new `structure_dict` column, row for row.
df["structure_dict"] = [structure.as_dict() for structure in df["structure"]]

In [None]:
# Remove the original `structure` column from the frame.
df = df.drop(columns=["structure"])

In [None]:
# Write the current frame to `df.csv` (path relative to the working directory).
df.to_csv('df.csv')

In [None]:
# Display the frame as it stands after the column changes above.
df

## Featurization and feature selection

In [None]:
import warnings
# Silence every RuntimeWarning for the rest of the session so that the
# output of the following cells stays readable.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [None]:
# Location of the cached, already-featurized MODData object.
PRECOMPUTED_MODDATA = "./precomputed/dielectric_benchmark_moddata.pkl.gz"

if os.path.isfile(PRECOMPUTED_MODDATA):
    # Reuse the cached featurization instead of recomputing it.
    # NOTE(review): judging by the file extension the cache is a pickle;
    # only load cache files that come from a trusted source.
    data = MODData.load(PRECOMPUTED_MODDATA)
else:
    # Create the cache directory up front: saving into a missing directory
    # would fail only *after* the featurization has finished, throwing the
    # whole computation away.
    os.makedirs(os.path.dirname(PRECOMPUTED_MODDATA), exist_ok=True)

    # Use a fresh copy of the dataset
    df = load_dataset("matbench_dielectric")

    data = MODData(
        structures=df["structure"].tolist(),
        targets=df["n"].tolist(),
        target_names=["n"],
        featurizer=DeBreuck2020Featurizer(n_jobs=8),
    )
    data.featurize()
    data.save(PRECOMPUTED_MODDATA)
    # Feature selection is intentionally left disabled here.
    # data.feature_selection(n=-1)

In [None]:
# Look at some of the top features chosen by MODNet
for feat in data.optimal_features[:10]:
    fig, ax = plt.subplots(facecolor="w")
    plt.scatter(data.df_featurized[feat], data.df_targets, alpha=0.5)
    plt.xlabel(feat)
    plt.ylabel("Yield strength (MPa)")

## Training

In [None]:
from sklearn.model_selection import KFold
from modnet.models import MODNetModel
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Reuse `matbench_benchmark` if it is already defined in this session;
# otherwise import it from the `modnet_matbench` package one directory up.
# Only NameError is caught so that unrelated failures (e.g. an interrupt)
# are not silently swallowed.
try:
    matbench_benchmark
except NameError:
    import sys
    sys.path.append('..')
    from modnet_matbench.utils import matbench_benchmark


# Settings handed to `matbench_benchmark` as its fourth argument.
# NOTE(review): presumably these are forwarded to the MODNet model fit
# (architecture, learning rate, loss, ...) -- confirm against
# `modnet_matbench.utils.matbench_benchmark`.
best_settings = {
    "increase_bs": False,
    "num_neurons": [[128], [32], [8], [8]],
    "n_feat": 512,
    "lr": 0.005,
    "epochs": 1000,
    "verbose": 0,
    "act": "relu",
    "batch_size": 64,
    "loss": "mae",
}

# Run the benchmark on the single target "n" with weight 1, keeping the
# per-fold data (save_folds=True).
results = matbench_benchmark(data, [[["n"]]], {"n": 1}, best_settings, save_folds=True)
# Mean of the scores reported by the benchmark (shown as the cell output).
np.mean(results['scores'])

In [None]:
# Plot the training-loss curve of every model returned by the benchmark.
# Iterating over the list itself avoids hard-coding the number of models.
for model in results["models"]:
    # Skip the first 50 recorded loss values so the later part of the curve
    # is not dwarfed by the initial ones.
    plt.plot(model.history.history["loss"][50:])