In [None]:
#import necessary packages including matbench

import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matbench.bench import MatbenchBenchmark

In [None]:
#import the crabnet algorithm and its functions

import train_crabnet
from train_crabnet import get_model, load_model, get_results

In [None]:
#Defining a number of helper functions to prepare the data for the CrabNet algorithm

#condense_formula takes a material and returns the chemical formula in the correct format for CrabNet
def condense_formula(mat):
    if isinstance(mat, str):
        return mat
    else:
        return mat.formula.replace(' ', '')

#change_input runs condense_formula on all the input data used for training
def change_input(train_inputs):
  inputs = []
  for input in train_inputs:
    inputs.append(condense_formula(input))
  return inputs

#make_df creates a data frame containing the train inputs and outputs for CrabNet
def make_df(train_inputs, train_outputs):
  input_df = pd.DataFrame({'formula': train_inputs, 'target': train_outputs})
  return input_df

#make_df_test creates a data frame containing the test inputs for CrabNet
def make_df_test(test_inputs):
  test_df = pd.DataFrame({'formula' : test_inputs})
  test_df['target'] = np.nan
  return test_df

#split_train_val splits the training data into two sets: training and validation
def split_train_val(df):
  df = df.sample(frac = 1.0, random_state = 7)
  val_df = df.sample(frac = 0.25, random_state = 7)
  train_df = df.drop(val_df.index)

  return train_df, val_df

In [None]:
#Defining a subset containing all of the regression tasks from the matbench tasks

subset = ["matbench_jdft2d", "matbench_steels", 
          "matbench_perovskites", "matbench_expt_gap",
          "matbench_phonons", "matbench_dielectric", 
          "matbench_log_gvrh", "matbench_log_kvrh",
          "matbench_mp_gap", "matbench_mp_e_form"]

# autoload=False is passed here; each task is loaded explicitly with
# task.load() inside the loop below.
mb = MatbenchBenchmark(autoload=False, subset=subset)
# Directory under which the per-task CSV files are written.
data_dir = 'data/matbench_temp'
os.makedirs(data_dir, exist_ok= True)

# NOTE(review): results_dict is never read or updated in the visible code --
# confirm whether it is still needed.
results_dict = {}

# For every task and fold: write train/val/test CSVs, train CrabNet, predict
# on the test inputs and record the predictions on the task.
for task in mb.tasks:
    task.load()
    # The dataset name is used as the sub-directory name and is also passed
    # to get_model / load_model.
    mat_prop = task.dataset_name
    os.makedirs(f'{data_dir}/{mat_prop}', exist_ok= True)
    for fold in task.folds:
        train_inputs, train_outputs = task.get_train_and_val_data(fold)
        # include_target=False: only the test inputs are requested here.
        test_inputs = task.get_test_data(fold, include_target=False)


        #Preparing the input data for CrabNet
        inputs = change_input(train_inputs)
        df = make_df(inputs, train_outputs)

        #Creating the training and validation sets
        # The file paths do not contain the fold, so every fold overwrites
        # the CSVs written by the previous fold of the same task.
        # NOTE(review): to_csv is called without index=False, so the frame's
        # index is written as an extra leading column -- confirm that the
        # train_crabnet loader tolerates it.
        train_df, val_df = split_train_val(df)
        train_df.to_csv(f'{data_dir}/{mat_prop}/train.csv')
        val_df.to_csv(f'{data_dir}/{mat_prop}/val.csv')

        #Getting and preparing the testing data
        test_inputs = change_input(test_inputs)
        output_df = make_df_test(test_inputs)
        output_df.to_csv(f'{data_dir}/{mat_prop}/test.csv')

        #Training CrabNet
        # The value returned here is replaced by load_model's result just
        # below; presumably get_model persists the trained model so that
        # load_model can pick it up -- TODO confirm in train_crabnet.
        model = get_model(data_dir, mat_prop, classification = False, verbose = True, drop_unary = False)


        #Predicting on the testing data
        model = load_model(data_dir, mat_prop, classification = False, file_name = 'test.csv', verbose = True, drop_unary = False)
        model, output = get_results(model)
        
        # Recording our data!
        # NOTE(review): assumes the element at index 1 of output holds the
        # predictions -- confirm against get_results in train_crabnet.
        predictions = output[1]
        task.record(fold, predictions)
        

# Saving our results
# Runs once, after all tasks and folds have been processed.
mb.to_file("my_models_benchmark.json")