# Regression with ML, tabular data and a tree-based model

**Author**: Jonathan TRICARD

**Summary**: using a dataset provided by scikit-learn, we build a Random Forest model to predict the price of a house. Then, we explain the model in the tabular-data, tree-based setting.

**ExplainML**: creates an HTML report with visualizations that explain how a machine learning model works.

## Import libraries

In [None]:
import os

import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from readml.logger import ROOT_DIR
from readml.explainers.ml.explain_ml import ExplainML

## Import data

In [None]:
def create_and_split_dataframe():
    """Load the California housing data and split it into train and test sets.

    The split keeps 30% of the rows for testing and uses a fixed random
    state (42).

    Returns:
        A 6-tuple ``(X_train, X_test, y_train, y_test, df_train, df_test)``.
        The ``X_*`` frames hold the features and the ``y_*`` frames hold the
        target. Each ``df_*`` frame is a copy of the matching ``X_*`` frame
        with the target added under the name of the first target column.
    """
    housing = fetch_california_housing()
    features = pd.DataFrame(housing["data"], columns=housing["feature_names"])
    target = pd.DataFrame(housing["target"], columns=housing["target_names"])
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=42
    )

    def _with_target(feature_part, target_part):
        # Work on a copy so the feature-only frame is left untouched.
        combined = feature_part.copy()
        combined[target_part.columns.values[0]] = target_part
        return combined

    return (
        X_train,
        X_test,
        y_train,
        y_test,
        _with_target(X_train, y_train),
        _with_target(X_test, y_test),
    )

In [None]:
# Feature frames, target frames, and the combined feature+target frames
# for the train and test splits.
(
    X_train,
    X_test,
    y_train,
    y_test,
    df_train,
    df_test,
) = create_and_split_dataframe()

## Train model

In [None]:
# Shallow forest (max_depth=2) with a fixed seed so the run is reproducible.
rf = RandomForestRegressor(max_depth=2, random_state=42)
# y_train is a one-column DataFrame; flatten it to 1-D, the shape
# scikit-learn expects, to avoid a DataConversionWarning during fit.
rf.fit(X_train, y_train.values.ravel())

In [None]:
# Predict the target for the held-out test features.
y_pred = rf.predict(X_test) 

In [None]:
# Score the test-set predictions with three regression metrics.
mae_rf, mse_rf, r2_rf = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report each metric on its own line.
for label, score in (("MAE : ", mae_rf), ("MSE : ", mse_rf), ("R2 : ", r2_rf)):
    print(label, score)

## Use intelligibility from readml

**WARNING**: Remember to adapt `config_local.cfg` to your use case; you may need to re-run ```pip install -e .``` after changing the configuration.

In [None]:
# Settings handed to ExplainML in the next cell.
model_explain = rf  # the fitted model to explain
task = "regression"  # kind of problem being solved
tree_based_model = True  # set to True when the model is tree based
features_name = list(X_train.columns)  # every feature, without the target column
features_to_interpret = features_name  # features to produce explanations for
target_col = list(y_train.columns)[0]  # name of the target column
# Reports are written to this directory, resolved relative to ROOT_DIR.
out_path = os.path.join(ROOT_DIR, "../outputs/notebooks/")
# exist_ok=True creates the directory only when it is missing.
os.makedirs(out_path, exist_ok=True)

In [None]:
# Build the explainer from the settings defined in the previous cell.
exp = ExplainML(
    model=model_explain,
    task_name=task,
    tree_based_model=tree_based_model,
    features_name=features_name,
    features_to_interpret=features_to_interpret,
    target_col=target_col,
    out_path=out_path,
)

In [None]:
# Global explanation on the training data
# (presumably Accumulated Local Effects plots; confirm in the readml docs).
exp.global_ale(df_train) # pass the dataframe that holds both the features and the target column

In [None]:
# Global explanation on the training data
# (presumably partial dependence / ICE plots; confirm in the readml docs).
exp.global_pdp_ice(df_train) 

In [None]:
# Global explanation on the training data
# (presumably SHAP-based; confirm in the readml docs).
exp.global_shap(df_train) 

In [None]:
# Local explanation for the first rows of the test set (DataFrame.head()).
exp.local_shap(df_test.head()) # be careful: this produces one output per row