# Training Complex Models

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error, r2_score


# Load the California housing dataset with pandas containers (as_frame=True).
data = fetch_california_housing(as_frame=True)
X, y = data.data, data.target

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Candidate regressors keyed by the display name used in the reports further down.
# Only the linear model is wrapped in a pipeline that standardizes the features first.
models = {
    "Linear Regression": Pipeline(
        steps=[("scaler", StandardScaler()), ("lr", LinearRegression())]
    ),
    "Random Forest": RandomForestRegressor(random_state=0),
    "Gradient Boosting": GradientBoostingRegressor(random_state=0),
}

# Fit every candidate on the training split; the estimators stored in `models`
# are the fitted objects used by the later cells.
for model in models.values():
    model.fit(X_train, y_train)
    
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_test : features passed to ``model.predict``.
    y_test : true targets compared against the predictions.

    Returns
    -------
    tuple
        ``(rmse, r2)`` as computed by ``root_mean_squared_error`` and
        ``r2_score`` on ``y_test`` versus the model's predictions.
    """
    predictions = model.predict(X_test)
    return (
        root_mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

# Score each fitted model on the held-out split and print one summary line per model
# (RMSE and R², both rounded to three decimals).
for name, model in models.items():
    rmse, r2 = evaluate_model(model, X_test, y_test)
    print(f"{name} - RMSE: {rmse:.3f}, R²: {r2:.3f}")


Linear Regression - RMSE: 0.727, R²: 0.594
Random Forest - RMSE: 0.513, R²: 0.798
Gradient Boosting - RMSE: 0.539, R²: 0.777


# Selecting a random instance to explain

In [5]:
def get_random_instance(
    y, X, model, percentile=0.95, above=True, random_state=0
):
    """Pick one random row from a tail of ``y`` and predict it with ``model``.

    Parameters
    ----------
    y : pandas Series of target values.
    X : pandas DataFrame addressable with the same index labels as ``y``.
    model : object exposing ``predict``; it is called on a one-row frame.
    percentile : quantile passed to ``y.quantile`` (a fraction, e.g. 0.95).
    above : if True sample from values ``>=`` the threshold, otherwise
        from values ``<=`` the threshold.
    random_state : seed forwarded to ``Series.sample``.

    Returns
    -------
    dict
        Keys ``idx`` (index label of the chosen row), ``true_val`` (its value
        in ``y``), ``pred_val`` (the model's prediction for that row) and
        ``percentile_value`` (the threshold that was used).
    """
    threshold = y.quantile(percentile)
    mask = (y >= threshold) if above else (y <= threshold)
    picked = y[mask].sample(n=1, random_state=random_state)

    row_label = picked.index[0]
    # Double brackets keep the selection two-dimensional for ``predict``.
    prediction = model.predict(X.loc[[row_label]])[0]

    return dict(
        idx=row_label,
        true_val=picked.values[0],
        pred_val=prediction,
        percentile_value=threshold,
    )


def show_random_instance(instance_dict, percentile=95, label="instance"):
    """Print a short, human-readable summary of a sampled instance.

    Parameters
    ----------
    instance_dict : mapping with the keys ``percentile_value``, ``idx``,
        ``true_val`` and ``pred_val``.
    percentile : number shown in the header line (display only).
    label : name used for the instance in the second line.

    Returns
    -------
    None; the summary is written to standard output.
    """
    print(f"\n{percentile}th percentile of y_test: {instance_dict['percentile_value']:.3f}")
    print(f"Random {label}:")
    # (caption, key, format spec) for each indented detail line, in print order.
    detail_rows = (
        ("Index", "idx", ""),
        ("True value", "true_val", ".3f"),
        ("Predicted value", "pred_val", ".3f"),
    )
    for caption, key, spec in detail_rows:
        print(f"  {caption}: {format(instance_dict[key], spec)}")




# Model to explain in the rest of the notebook.
chosen_model = models["Random Forest"]

# Draw one test row whose target is at or above the median of y_test and tag it
# with a display label. The quantile here (0.5) and the percentile shown in the
# printout (50) are separate arguments and have to be kept in sync by hand.
sample = {
    **get_random_instance(y_test, X_test, chosen_model, percentile=0.5, above=True),
    "label": "sample instance",
}
show_random_instance(sample, percentile=50, label=sample["label"])


50th percentile of y_test: 1.778
Random sample instance:
  Index: 10241
  True value: 2.655
  Predicted value: 2.119


# LIME

In [7]:
from lime.lime_tabular import LimeTabularExplainer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# LIME explainer for the tabular regression task, built from the training split.
# NOTE(review): kernel_width is set explicitly to 0.75 * sqrt(n_features); this
# appears to match LIME's documented default — confirm before removing it.
explainer = LimeTabularExplainer(
    X_train.to_numpy(),
    mode="regression",
    feature_names=list(X.columns),
    kernel_width=np.sqrt(X_train.shape[1]) * 0.75,
    discretize_continuous=False,  # TODO: try True
    sample_around_instance=True,
    random_state=0,
)


def predict_fn(X_array):
    """Prediction callback handed to LIME.

    Wraps the incoming array in a DataFrame that carries ``X_train``'s column
    names and returns ``chosen_model.predict`` evaluated on that frame.
    """
    labelled_rows = pd.DataFrame(X_array, columns=X_train.columns)
    predictions = chosen_model.predict(labelled_rows)
    return predictions

# Explain the sampled row with a local surrogate fitted on 3000 perturbed samples.
# num_features=20 is larger than the number of feature columns listed in the
# outputs below (eight), so every feature receives a weight.
exp = explainer.explain_instance(
    data_row=X.loc[sample["idx"]].to_numpy(),
    predict_fn=predict_fn,
    num_features=20,
    num_samples=3000,
    distance_metric="euclidean",
)

# Questions

The `as_map` method returns a dict of the form `{label: [(feature_index, weight), ...]}`. In regression mode it always returns two labels, 0 and 1, and the weights under one label are the exact negation of the weights under the other.

- What do the labels 0 and 1 represent?
- Why are their weights negations of each other?

The output of `as_list` always seems to correspond to the label-1 entry of the `as_map` output.

In [10]:
# Raw explanation as {label: [(feature_index, weight), ...]}. In the output shown
# for this cell, label 1 carries the same weights that `exp.as_list()` reports and
# label 0 carries their negation.
# NOTE(review): presumably LIME's regression mode fits one surrogate model and
# stores a sign-flipped copy under the other label for its two-sided plots —
# confirm against the `explain_instance` source of the installed lime version.
exp.as_map()

{0: [(np.int64(0), np.float64(-0.5298087337759396)),
  (np.int64(5), np.float64(0.45932409178040723)),
  (np.int64(7), np.float64(0.34674288851385254)),
  (np.int64(6), np.float64(0.20342177481127297)),
  (np.int64(1), np.float64(-0.16818346774366183)),
  (np.int64(3), np.float64(-0.06095483829008354)),
  (np.int64(4), np.float64(-0.03225760743187124)),
  (np.int64(2), np.float64(-0.013053547744401778))],
 1: [(np.int64(0), np.float64(0.5298087337759396)),
  (np.int64(5), np.float64(-0.45932409178040723)),
  (np.int64(7), np.float64(-0.34674288851385254)),
  (np.int64(6), np.float64(-0.20342177481127297)),
  (np.int64(1), np.float64(0.16818346774366183)),
  (np.int64(3), np.float64(0.06095483829008354)),
  (np.int64(4), np.float64(0.03225760743187124)),
  (np.int64(2), np.float64(0.013053547744401778))]}

In [8]:
# Same explanation keyed by feature name instead of column index. No label is
# passed here; the weights shown match the label-1 entry of `exp.as_map()`.
exp.as_list()

[('MedInc', 0.5298087337759396),
 ('AveOccup', -0.45932409178040723),
 ('Longitude', -0.34674288851385254),
 ('Latitude', -0.20342177481127297),
 ('HouseAge', 0.16818346774366183),
 ('AveBedrms', 0.06095483829008354),
 ('Population', 0.03225760743187124),
 ('AveRooms', 0.013053547744401778)]

https://lime-ml.readthedocs.io/en/latest/lime.html?highlight=map#lime.explanation.Explanation.as_map