# Feature Importance

## Outline

Feature importance is a technique for identifying the features in a dataset that contribute most to a model's predictions, according to some scoring method. The score can be defined by any model fit on the dataset. We score each feature by scrambling (randomly permuting) its values and re-evaluating the already-fitted model on the scrambled data. The change in the model's performance on the scrambled dataset relative to the original dataset tells us the importance of that feature.

### Load Dependencies

In [None]:
import numpy as np
import pandas as pd
import math

import torch
from torch.autograd import Variable, grad
from torch.autograd.functional import jacobian
import torch.nn.functional as F
import torch.utils.data as Data
from torch import nn, optim

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

from L96_model import (
    L96,
    L96_eq1_xdot,
    integrate_L96_2t,
    EulerFwd,
    RK2,
    RK4,
)

%matplotlib inline

In [None]:
# Seed NumPy's and PyTorch's global random number generators with the same
# fixed value so that repeated runs of this notebook start from the same state.
np.random.seed(14)  # For reproducibility
torch.manual_seed(14)  # For reproducibility

### Generate L96 data

In [None]:
# Integration settings: number of steps, step size, and the total time T
# (step size times number of steps) handed to the model run below.
time_steps = 20000
dt = 0.01
T = dt * time_steps
Forcing = 18

# Create a "synthetic world" with K=8 and J=32
K, J = 8, 32
W = L96(K, J, F=Forcing)

# Get training data for the neural network:
# run the true state and keep both the X history (X_true) and the subgrid
# tendencies, i.e. the effect of Y on X (xy_true). The two middle return
# values are not needed here.
X_true, _, _, xy_true = W.run(dt, T, store=True, return_coupling=True)

In [None]:
def plot_feature_importance(resultdf):
    """Show a horizontal bar chart of the score shift recorded for each column.

    Args:
        resultdf: DataFrame with a ``pred`` column (the label shown on the
            y-axis) and a ``score`` column (the bar length).

    The rows are ranked by ``score`` so that the largest shift is drawn as
    the topmost bar. The figure is displayed with ``plt.show()``; nothing is
    returned.
    """
    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])
    ax.set_title("Permutation Feature Importance")
    ax.set_ylabel("Column")
    ax.set_xlabel("Shift in score")

    # Rank from largest to smallest score.
    ranked = resultdf.sort_values(by="score", ascending=False)
    positions = range(len(ranked))

    # Bar position 0 sits at the bottom of the axes, so the descending ranking
    # is reversed to put the largest score at the top.
    bar_lengths = ranked.score[::-1]
    bar_labels = ranked.pred[::-1]

    ax.barh(positions, bar_lengths)
    plt.yticks(positions, bar_labels)
    plt.show()

### Fit model on training data

In [None]:
# Fit a scikit-learn LinearRegression that maps the X history (X_true) to the
# coupling term (xy_true). The same data is reused later for scoring.
model = LinearRegression()
model.fit(X_true, xy_true)

## Feature importance using Linear Regression

In [None]:
# Permutation feature importance for the fitted linear regression model:
# shuffle one input column at a time, re-score the (unchanged) model on the
# shuffled inputs, and record how far the R^2 score moves from the baseline.

# r2_score expects (y_true, y_pred); R^2 is not symmetric in its arguments,
# so the targets must come first.
baseline = r2_score(xy_true, model.predict(X_true))
results = []
df_x_true = pd.DataFrame(X_true)
for column in df_x_true:
    # Work on a copy so that only the current column differs from the original
    df_x_true_copy = df_x_true.copy()

    # Scramble the values of the given predictor. to_numpy() discards the
    # shuffled index so the assignment is positional rather than index-aligned
    # (index alignment would silently undo the shuffle).
    df_x_true_copy[column] = df_x_true[column].sample(frac=1).to_numpy()

    # R^2 of the already-fitted model on the scrambled inputs
    score = r2_score(xy_true, model.predict(df_x_true_copy))

    # Record the absolute shift in R^2 caused by scrambling this column
    results.append({"pred": column, "score": abs(score - baseline)})

# Put the results into a pandas dataframe (one row per input column). The
# explicit column list keeps the expected schema even if there were no columns.
resultsdf = pd.DataFrame(results, columns=["pred", "score"])

In [None]:
# Bar chart of the per-column score shifts stored in resultsdf.
plot_feature_importance(resultsdf)

## Feature importance using Artificial Neural Network

In [None]:
# Specify a path
# Specify a path
PATH = "networks/network_3_layers_100_epoches.pth"
# Load the saved weights. map_location="cpu" lets the checkpoint load on
# machines without a GPU even if it was saved from GPU tensors.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True where the installed torch
# version supports it).
weights = torch.load(PATH, map_location="cpu")
# Show the names of the stored entries
weights.keys()

In [None]:
class Net_ANN(nn.Module):
    """Fully connected network with 8 inputs, two 16-unit hidden layers and 8 outputs.

    The layers are stored as ``linear1``, ``linear2`` and ``linear3``; ReLU is
    applied after the first two, and the last layer's output is returned
    without an activation.
    """

    def __init__(self):
        super().__init__()
        # 8 inputs -> 16 neurons in the first hidden layer
        self.linear1 = nn.Linear(8, 16)
        # 16 -> 16 neurons in the second hidden layer
        self.linear2 = nn.Linear(16, 16)
        # 16 -> 8 outputs
        self.linear3 = nn.Linear(16, 8)

    def forward(self, x):
        """Return the network output for ``x``.

        The input is first cast to the dtype of the first layer's weights, so
        tensors of a different floating dtype are accepted.
        """
        hidden = x.to(self.linear1.weight.dtype)
        for layer in (self.linear1, self.linear2):
            hidden = F.relu(layer(hidden))
        return self.linear3(hidden)

In [None]:
# Instantiate the network, copy the loaded weights into it, and switch it to
# evaluation mode. Note that this rebinds the name `model`.
model = Net_ANN()
model.load_state_dict(weights)
model.eval()

In [None]:
# Permutation feature importance for the neural network: shuffle one input
# column at a time, re-score the (unchanged) network on the shuffled inputs,
# and record how far the R^2 score moves from the baseline.


def _ann_predict(inputs):
    """Return the network's predictions for a NumPy array as a NumPy array."""
    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        return model(torch.tensor(inputs)).numpy()


# r2_score expects (y_true, y_pred); R^2 is not symmetric in its arguments,
# so the targets must come first.
baseline = r2_score(xy_true, _ann_predict(X_true))
results = []
df_x_true = pd.DataFrame(X_true)
for column in df_x_true:
    # Work on a copy so that only the current column differs from the original
    df_x_true_copy = df_x_true.copy()

    # Scramble the values of the given predictor. to_numpy() discards the
    # shuffled index so the assignment is positional rather than index-aligned
    # (index alignment would silently undo the shuffle).
    df_x_true_copy[column] = df_x_true[column].sample(frac=1).to_numpy()

    # R^2 of the network on the scrambled inputs
    score = r2_score(xy_true, _ann_predict(df_x_true_copy.to_numpy()))

    # Record the absolute shift in R^2 caused by scrambling this column
    results.append({"pred": column, "score": abs(score - baseline)})

# Put the results into a pandas dataframe (one row per input column). The
# explicit column list keeps the expected schema even if there were no columns.
resultsdf = pd.DataFrame(results, columns=["pred", "score"])

In [None]:
# Bar chart of the per-column score shifts stored in resultsdf.
plot_feature_importance(resultsdf)

## Observation

We observe that the importance of the input features is strongly dependent on the model used to evaluate the data.