In [None]:
import pandas as pd
import numpy as np
import shap
from automatminer import MatPipe
from automatminer.automl.adaptors import SinglePipelineAdaptor, TPOTAdaptor
from automatminer.featurization import AutoFeaturizer
from automatminer.preprocessing import DataCleaner, FeatureReducer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from matbench.bench import MatbenchBenchmark
from sklearn.model_selection import KFold
from matminer.datasets.dataset_retrieval import load_dataset

In [None]:
# Fetch the matbench phonons dataset through matminer.
df_phonon = load_dataset("matbench_phonons")

# automatminer featurizer on its "debug" preset, using 10 parallel jobs.
af = AutoFeaturizer(preset="debug", n_jobs=10)

# Fit the featurizer on the raw frame and transform it in one step;
# "last phdos peak" is passed as the target column.
df = af.fit_transform(df_phonon, target="last phdos peak")

In [None]:
# Shuffled 5-fold splitter. The seed is fixed; per the original author it is
# chosen so that the splits match the ones matbench uses.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=18012019,
)

In [None]:
# Configure automatminer's MatPipe.

# Seed shared by both random forests so that a fresh "Restart & Run All"
# reproduces the same fitted models (and therefore the same scores, SHAP
# values and feature importances). The value mirrors the seed given to the
# KFold splitter used for the CV splits.
RF_RANDOM_STATE = 18012019

# Single fixed model per problem type: a 500-tree random forest for
# regression and one for classification.
learner = SinglePipelineAdaptor(
    regressor=RandomForestRegressor(
        n_estimators=500,
        random_state=RF_RANDOM_STATE,
    ),
    classifier=RandomForestClassifier(
        n_estimators=500,
        random_state=RF_RANDOM_STATE,
    ),
)

pipe_config = {
    "learner": learner,
    # Reducer is given an empty list of reducers
    # (presumably this disables feature reduction -- confirm against automatminer docs).
    "reducer": FeatureReducer(reducers=[]),
    "cleaner": DataCleaner(
        feature_na_method="mean",
        max_na_frac=0.01,
        na_method_fit="drop",
        na_method_transform="mean",
    ),
    # Same featurizer settings as the standalone featurization cell.
    "autofeaturizer": AutoFeaturizer(n_jobs=10, preset="debug"),
}

pipe = MatPipe(**pipe_config)

In [None]:
# Benchmark restricted to the phonons task. autoload is off, so each task's
# data is loaded explicitly inside the loop.
mb = MatbenchBenchmark(subset=["matbench_phonons"], autoload=False)

for task in mb.tasks:
    task.load()

    # Pair each matbench fold label with the positional train/test indices
    # produced by the KFold splitter `cv` on the pre-featurized frame `df`.
    # NOTE(review): this assumes `df` has the same rows, in the same order, as
    # the data matbench holds for this task -- confirm; otherwise the
    # predictions recorded below are attributed to the wrong samples.
    for fold, (train_ix, test_ix) in zip(task.folds, cv.split(df)):
        df_train = df.iloc[train_ix, :]
        df_test = df.iloc[test_ix, :]

        # Refit the whole pipeline on this fold's training rows.
        pipe.fit(df_train, task.metadata.target)

        # Predictions are read from the "<target> predicted" column of the
        # frame returned by the pipeline.
        df_pred = pipe.predict(df_test)
        predictions = df_pred[f"{task.metadata.target} predicted"]

        params = {"note": "single config; see benchmark user metadata"}
        task.record(fold, predictions, params=params)
        

In [None]:
# Show the scores recorded for the model. `task` here is the loop variable left
# bound by the benchmark cell (the last task it iterated over), so that cell
# must have been run first.
task.scores

In [None]:
# Feature matrix handed to SHAP: every column of the training frame except the
# first one. `df_train` and the fitted `pipe` are the ones left behind by the
# fold loop, i.e. they correspond to its final iteration.
# NOTE(review): this presumes column 0 of `df_train` is the target and that the
# remaining columns are exactly what the model was trained on -- confirm.
shap_inputs = df_train.loc[:, df_train.columns[1:]]

# Tree explainer over the fitted model, with the same frame used as the
# explainer's data argument and as the samples to explain.
explainer = shap.TreeExplainer(pipe.learner.best_pipeline, shap_inputs)
shap_values = explainer.shap_values(shap_inputs, check_additivity=False)

# NOTE(review): `fig` captures whatever summary_plot returns; verify that it is
# actually a figure object before relying on it later.
fig = shap.summary_plot(
    shap_values,
    features=shap_inputs,
    feature_names=shap_inputs.columns,
)

In [None]:
# Table pairing each feature name with the fitted model's importance score.
# Names are the `df_train` columns after the first one, so they must line up
# (in count and order) with `feature_importances_`; pandas raises a ValueError
# if the lengths differ.
features_imp = pd.DataFrame(
    {
        'features': df_train.loc[:, df_train.columns[1:]].columns,
        'importances': pipe.learner.best_pipeline.feature_importances_,
    }
)

# Optional export, left disabled:
# features_imp.to_csv('RF_feature_imp_data.csv')