In [None]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Keep rendered DataFrames compact: at most 10 rows and 60 columns are shown.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 60
# Do not emit warning messages.
warnings.simplefilter('ignore')


In [None]:
# Regression model fitted by get_model: one of "RF", "RidgeCV" or "KRCV".
REGNAME = "KRCV"  # RF, RidgeCV, KRCV
# Seed used for the random column, the K-fold splitters and the permutation importance.
RANDOM_STATE = 1
# When True, get_Xy appends a pure-noise "random" column to X.
ADD_RANDOM_VAR = True # add random variable or not.

In [None]:
# Labels used to build output file names and the directory the figures are saved to.
METADATA = dict(
    outputdir="image_executed",
    prefix="permutationimportancerandom",
    dataname="ReCo",
    regtype=REGNAME,
)

In [None]:
import get_data
# Load the dataset named in METADATA, so the data that is analysed and the
# dataset label written into the output file names cannot drift apart.
# NOTE(review): get_data.load is assumed to return (DataFrame, descriptor names,
# target name), matching how the three values are passed to get_Xy — confirm.
g_df, g_raw_descriptor_names, g_target_name = get_data.load(METADATA["dataname"])

In [None]:
from sklearn.preprocessing import StandardScaler
from copy import deepcopy

def get_Xy(df, descriptor_names, target_name, add_random_var=True, random_state=RANDOM_STATE):
    """Build the standardized design matrix X and the target vector y.

    The descriptor columns are standardized with ``StandardScaler``. When
    ``add_random_var`` is true, one extra column named "random", drawn from
    N(0, 1), is appended as the last column of X.

    Args:
        df (pd.DataFrame): source data.
        descriptor_names (list[str]): names of the explanatory-variable columns.
            The caller's list is not modified.
        target_name (str): name of the target column.
        add_random_var (bool): add the random variable or not. Defaults to True.
        random_state (int | None): seed for the random variable. With ``None``
            the column is drawn without reseeding. Defaults to RANDOM_STATE.

    Returns:
        tuple containing

        - np.ndarray: X
        - np.ndarray: y
        - list[str]: descriptor names (including "random" when it was added)
    """
    descriptor_names = list(descriptor_names)
    print(descriptor_names)
    # Build a fresh frame from the scaled values. Writing floats into
    # ``iloc[:, :]`` of a copy keeps the original column dtypes and relies on
    # silent upcasting for integer columns, which pandas has deprecated.
    scaled = StandardScaler().fit_transform(df[descriptor_names].values)
    df_std = pd.DataFrame(scaled, index=df.index, columns=descriptor_names)
    if add_random_var:
        if random_state is not None:
            # Seeds NumPy's global generator on purpose: later draws that use
            # the global state stay reproducible as well.
            np.random.seed(random_state)
        df_std["random"] = np.random.normal(0, 1, size=df_std.shape[0])
        descriptor_names.append("random")
    X = df_std.values
    y = df[target_name].values
    return X, y, descriptor_names


g_X, g_y, g_descriptor_names = get_Xy(g_df, g_raw_descriptor_names, g_target_name,
                                      add_random_var=ADD_RANDOM_VAR, random_state=RANDOM_STATE)
len(g_descriptor_names),g_descriptor_names

In [None]:
# Histogram of the appended "random" column. The last column of g_X holds the
# random values only when get_Xy actually added them; otherwise it is a real
# descriptor and must not be shown under this label.
if list(g_descriptor_names)[-1:] == ["random"]:
    fig, ax = plt.subplots()
    ax.hist(g_X[:, -1])
    ax.set_xlabel("random")
    ax.set_ylabel("count")
else:
    print("no random variable. skipped.")

In [None]:
def show_X(X):
    """Draw X as a line plot against the sample index.

    Args:
        X (np.ndarray): X
    """
    figure, axes = plt.subplots()
    axes.plot(X)
    axes.set(xlabel="index", ylabel="x")
    figure.show()

show_X(g_X)


def show_hist(y):
    """Draw a histogram of y.

    Args:
        y (np.ndarray): y
    """
    figure, axes = plt.subplots()
    axes.hist(y)
    axes.set(xlabel="y")
    figure.show()


show_hist(g_y)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.linear_model import RidgeCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold


In [None]:
def get_model(regname, X, y, random_state=RANDOM_STATE):
    """Fit the regression model selected by ``regname`` on (X, y).

    Prints the R^2 score on the data the model was fitted on; for "KRCV" the
    best estimator found by the grid search is printed as well.

    Args:
        regname (str): regression model. "RF" (random forest), "RidgeCV"
            (ridge with 10-fold CV) or "KRCV" (RBF kernel ridge tuned by a
            10-fold grid search over alpha and gamma).
        X (np.ndarray): X
        y (np.ndarray): y
        random_state (int | None): seed for the random forest and for the
            shuffled K-fold splitters. Defaults to RANDOM_STATE.

    Returns:
        Union[RandomForestRegressor, RidgeCV, GridSearchCV]: fitted model.

    Raises:
        ValueError: if ``regname`` is not one of the supported names.
    """
    if regname == "RF":
        # Pass the seed explicitly so the forest does not depend on whatever
        # state the global NumPy generator happens to be in.
        reg = RandomForestRegressor(n_estimators=100, random_state=random_state)
    elif regname == "RidgeCV":
        kf = KFold(n_splits=10, shuffle=True, random_state=random_state)
        reg = RidgeCV(cv=kf)
    elif regname == "KRCV":
        kf = KFold(n_splits=10, shuffle=True, random_state=random_state)
        estimator = KernelRidge(alpha=1, gamma=1, kernel="rbf")
        param_grid = {"alpha": np.logspace(-6, 0, 11), "gamma": np.logspace(-4, 0, 10)}
        reg = GridSearchCV(estimator, cv=kf, param_grid=param_grid)
    else:
        # Reject an unsupported name before any fitting, and say which one it was.
        raise ValueError(
            "unknown regname: {!r} (expected 'RF', 'RidgeCV' or 'KRCV')".format(regname))

    reg.fit(X, y)
    if regname == "KRCV":
        print(reg.best_estimator_)
    print("score", reg.score(X,y))
    return reg

g_reg = get_model(REGNAME, g_X, g_y)

In [None]:
import os
def show_rf_importance(reg, descriptor_names, metadata: dict=METADATA):
    """Plot the feature importances of a random forest and save the figure.

    The importances are drawn as a bar chart on a log scale, sorted in
    descending order, and written as a PNG into ``metadata["outputdir"]``
    (created if it does not exist). The file name is printed.

    When ``reg`` is not a RandomForestRegressor, a notice is printed and
    nothing else is done.

    Args:
        reg (Union[RandomForestRegressor, RidgeCV, GridSearchCV]): fitted model.
        descriptor_names (list[str]): names of the explanatory variables, in
            the column order the model was fitted with.
        metadata (dict): labels for the output; the keys "outputdir", "prefix",
            "dataname" and "regtype" are read. Defaults to METADATA.
    """
    if not isinstance(reg, RandomForestRegressor):
        print("not RandomForestRegressor. skipped.")
        return

    df_rf_imp = pd.DataFrame(
        {"descriptor": descriptor_names, "importance": reg.feature_importances_}
    ).sort_values(by="importance", ascending=False)
    ax = df_rf_imp.plot.bar(x="descriptor", y="importance")
    ax.set_yscale('log')
    ax.figure.tight_layout()

    filename = "_".join([metadata["prefix"], metadata["dataname"], metadata["regtype"], "rf_importance"])+".png"
    print(filename)
    # savefig does not create missing directories; make sure the target exists.
    os.makedirs(metadata["outputdir"], exist_ok=True)
    ax.figure.savefig(os.path.join(metadata["outputdir"], filename))

show_rf_importance(g_reg, g_descriptor_names)

In [None]:
from sklearn.inspection import permutation_importance
# Permutation importance of every column of g_X for the fitted model,
# repeated 30 times with a fixed seed.
g_feature_importance = permutation_importance(
    g_reg, g_X, g_y, n_repeats=30, random_state=RANDOM_STATE
)
# Lay the raw importances out with one column per descriptor.
g_df_perm = pd.DataFrame(
    g_feature_importance["importances"].T, columns=g_descriptor_names
)

In [None]:
from importance_misc import show_r2_decrease

# Plot the permutation importances; the comment records whether the random column was added.
show_r2_decrease(
    g_df_perm,
    comment="random_var_{}".format(ADD_RANDOM_VAR),
    metadata=METADATA,
)

In [None]:
from sklearn.model_selection import cross_val_score
# Cross-validate the selected model with its hyper-parameters held fixed.
# Only the grid search ("KRCV") exposes best_estimator_; for "RF" and "RidgeCV"
# the fitted model itself is evaluated instead of failing on the missing attribute.
# NOTE(review): relies on cross_val_score fitting clones of the estimator for each
# fold, so passing an already fitted object does not leak the earlier fit — confirm.
estimator = getattr(g_reg, "best_estimator_", g_reg)
NSPLIT=10
kf = KFold(NSPLIT, shuffle=True, random_state=RANDOM_STATE)
cv_scores = cross_val_score(estimator, g_X, g_y, cv = kf)
# Mean and standard deviation of the per-fold scores.
cv_scores.mean(), cv_scores.std()