In [None]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
%matplotlib inline

# pandas display settings: cap how many rows and columns are rendered.
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 80)

In [None]:
# Name of the data set handed to get_data.load() further down.
DATA_NAME = "x5sin"
# Normalization requested from scale_X() for the explanatory variables.
NORMALIZATIONTYPE = "standard"
# Model selected by choose_linear_model().
REGTYPE = "linear" # linear, lasso, ridge
# Seed recorded in METADATA and used as the default KFold random_state
# of linear_regression_CV_score_ytestp().
RANDOM_STATE = 1 

In [None]:
SHUFFLE = True # whether KFold shuffles the samples in linear_regression_CV_score().

In [None]:
# Settings the plotting helpers use to build output file names and locations.
METADATA = {
    "outputdir": "image_executed",
    "prefix": "linear_regression_CV",
    "dataname": DATA_NAME,
    "normalizationtype": NORMALIZATIONTYPE,
    "regtype": REGTYPE,
    "random_state": RANDOM_STATE,
}

In [None]:
def get_data(data_name):
    """Read the observed and the new CSV files of a named data set.

    Args:
        data_name (str): name of the data set; "x5_sin" or "x123".

    Raises:
        ValueError: if data_name is not one of the known names.

    Returns:
        tuple containing

        - pd.DataFrame: observed data.
        - pd.DataFrame: new data.
        - list[str]: names of the explanatory-variable columns.
        - str: name of the target column.
    """
    # One entry per data set:
    # (name, observed csv, new csv, explanatory columns, target column)
    catalog = (
        ("x5_sin",
         "../data_calculated/x5_sin.csv",
         "../data_calculated/x5_sin_new.csv",
         # alternative kept from the original: use only 'x1'..'x5'
         ['x1', 'x2', 'x3', 'x4', 'x5', 'x6'],
         'y'),
        ("x123",
         "../data_calculated/x123.csv",
         "../data_calculated/x123_new.csv",
         ['x1', 'x2', 'x3'],
         'y'),
    )
    for known_name, obs_path, new_path, descriptor_names, target_name in catalog:
        if data_name == known_name:
            break
    else:
        # No entry matched: fail before touching the file system.
        raise ValueError("unknown data_name={}".format(data_name))

    observed = pd.read_csv(obs_path)
    unseen = pd.read_csv(new_path)
    return observed, unseen, descriptor_names, target_name

# The loader module has the same name as the get_data() function defined above.
# Import it under an alias so the function is not silently replaced by the module.
import get_data as get_data_module

# Observed data set, then the matching "<name>_new" data set. The column names
# returned by the second call overwrite those returned by the first.
g_df_obs, g_descriptor_names, g_target_name = get_data_module.load(DATA_NAME)
g_df_new, g_descriptor_names, g_target_name = get_data_module.load(DATA_NAME+"_new")

g_df_obs

In [None]:
# Observed data: explanatory-variable columns and target column, taken via .values.
g_Xraw = g_df_obs.loc[:, g_descriptor_names].values
g_y = g_df_obs.loc[:, g_target_name].values

# New data, extracted with the same column names.
g_Xraw_new = g_df_new.loc[:, g_descriptor_names].values
g_y_new = g_df_new.loc[:, g_target_name].values

In [None]:
def scale_X(Xraw, normalizationtype=None, scaler=None):
    """Normalize the explanatory variables.

    When an already fitted ``scaler`` is given it is only applied (nothing is
    fitted); otherwise a new scaler chosen by ``normalizationtype`` is fitted
    on ``Xraw`` and then applied to it.

    Args:
        Xraw (np.ndarray): explanatory variables.
        normalizationtype (str, optional): "standard" (StandardScaler),
            "minmax" (MinMaxScaler) or None (no normalization). The legacy
            misspelling "mimax" is still accepted as an alias of "minmax".
            Ignored when ``scaler`` is given. Defaults to None.
        scaler (Union[StandardScaler, MinMaxScaler], optional): fitted scaler
            to reuse. Defaults to None.

    Raises:
        ValueError: unsupported normalizationtype.

    Returns:
        tuple containing

        - np.ndarray: normalized explanatory variables (``Xraw`` itself when
          no normalization is requested).
        - the scaler that was used, or None when no normalization was done.
    """
    if scaler is not None:
        # Reuse the supplied scaler so new data gets the same transformation.
        print("use", scaler)
        return scaler.transform(Xraw), scaler

    print("normalizationtype", normalizationtype)
    if normalizationtype is None:
        # No normalization requested.
        return Xraw, None

    if normalizationtype == "standard":
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
    elif normalizationtype in ("minmax", "mimax"):
        # "mimax" was the only spelling the original code recognized; keep it
        # working while also accepting the intended "minmax".
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
    else:
        raise ValueError("unkown normalizationtype={}".format(normalizationtype))

    scaler.fit(Xraw)
    X = scaler.transform(Xraw)
    return X, scaler


# Fit the scaler on the observed data, then apply that same fitted scaler to the new data.
g_X, g_scaler = scale_X(g_Xraw, NORMALIZATIONTYPE)
g_X_new, _ = scale_X(g_Xraw_new, scaler=g_scaler)

In [None]:
# Scaled explanatory variables: observed data first, then the new data.
plt.plot(g_X)
plt.show()
plt.plot(g_X_new)
# Render explicitly instead of ending the cell with a bare `_`: `_` was rebound
# by the tuple unpacking `g_X_new, _ = scale_X(...)`, so echoing it would print
# that object rather than suppress output.
plt.show()

In [None]:
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import KFold

def choose_linear_model(regtype :str, alpha:float=1e-2):
    """Create the linear model named by ``regtype``.

    Args:
        regtype (str): model name; "linear", "lasso" or "ridge".
        alpha (float, optional): hyperparameter passed to Lasso and Ridge
            (not used for "linear"). Defaults to 1e-2.

    Raises:
        ValueError: unsupported model name.

    Returns:
        Union[LinearRegression, Lasso, Ridge]: a new, unfitted model instance.
    """
    if regtype == "linear":
        return LinearRegression()
    if regtype == "lasso":
        return Lasso(alpha=alpha)
    if regtype == "ridge":
        return Ridge(alpha=alpha)
    raise ValueError("unkown regtype={}".format(regtype))

def linear_regression_CV_score(X, y, regtype="linear", 
                               n_splits=10, shuffle=True, random_state=1):
    """Linear regression scored by K-fold cross validation (R2).

    Args:
        X (np.array): descriptor
        y (np.array): target variable
        regtype (str, optional): model name passed to choose_linear_model().
            Defaults to "linear".
        n_splits (int, optional): the number of splits in CV. Defaults to 10.
        shuffle (bool, optional): whether KFold shuffles the samples before
            splitting. Defaults to True.
        random_state (int, optional): random state in KFold(). Only used when
            shuffle is True. Defaults to 1.

    Returns:
        tuple containing

        - dict: the mean value of the CV score ("mean(R2)") and the stddev
          value of the CV score ("std(R2)").
        - Union[LinearRegression, Lasso, Ridge]: the model instance, as left
          after fitting on the training part of the last fold.
    """
    reg = choose_linear_model(regtype)
    # KFold rejects a random_state when shuffle is False, so only forward the
    # seed when it is actually used.
    kf = KFold(n_splits=n_splits, shuffle=shuffle,
               random_state=random_state if shuffle else None)

    test_score_list = []
    for train, test in kf.split(X):
        reg.fit(X[train], y[train])
        ytestp = reg.predict(X[test])
        test_score_list.append(r2_score(y[test], ytestp))

    return {"mean(R2)":np.mean(test_score_list), "std(R2)":np.std(test_score_list)}, reg

# CV score with the configured model and shuffle setting (default 10 splits);
# the bare name on the last line displays the result dict.
g_result, g_reg = linear_regression_CV_score(g_X, g_y, REGTYPE, shuffle=SHUFFLE)
g_result

In [None]:
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold


def linear_regression_CV_score_ytestp(X, y, regtype="linear", 
                                      n_splits=10, random_state=RANDOM_STATE):
    """Cross-validated linear regression that also keeps the test predictions.

    The folds come from a shuffled KFold. For each fold the model is fitted on
    the training part and evaluated (R2) on the test part.

    Args:
        X (np.array): explanatory variables
        y (np.array): target variable
        regtype (str, optional): model name passed to choose_linear_model().
            Defaults to "linear".
        n_splits (int, optional): the number of splits in CV. Defaults to 10.
        random_state (int, optional): random state in KFold(). Defaults to RANDOM_STATE.

    Returns:
        tuple containing

        - dict: "mean(R2)" and "std(R2)" of the per-fold scores, plus "ytest"
          and "ytestp", lists holding one array per fold with the observed and
          the predicted test targets.
        - Union[LinearRegression, Lasso, Ridge]: the model instance as fitted
          in the last fold.
    """
    model = choose_linear_model(regtype)
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_scores, fold_truth, fold_pred = [], [], []
    for train_idx, test_idx in splitter.split(X):
        model.fit(X[train_idx], y[train_idx])

        y_true = y[test_idx]
        y_hat = model.predict(X[test_idx])
        fold_truth.append(y_true)
        fold_pred.append(y_hat)
        fold_scores.append(r2_score(y_true, y_hat))

    summary = {
        "mean(R2)": np.mean(fold_scores),
        "std(R2)": np.std(fold_scores),
        "ytest": fold_truth,
        "ytestp": fold_pred,
    }
    return summary, model

# Re-run the CV, this time keeping the per-fold test targets and predictions.
# This overwrites the g_result / g_reg produced by linear_regression_CV_score().
g_result, g_reg = linear_regression_CV_score_ytestp( g_X, g_y, regtype=REGTYPE)

# Print only the scalar summary; the per-fold arrays are plotted later.
for _key in ["mean(R2)","std(R2)"]:
    print(_key,":",g_result[_key])


In [None]:
# Predict the new data. g_reg is the estimator returned by
# linear_regression_CV_score_ytestp(), i.e. the one fitted on the training part
# of the last CV fold; it is not refitted on all observed data here.
g_yp_new = g_reg.predict(g_X_new)

In [None]:
def show_CV_splot(X, shuffle, n_splits = 10, random_state=1, metadata: dict=METADATA):
    """Plot which sample indices fall into the test part of each CV fold.

    The figure is saved as a PNG in ``metadata["outputdir"]`` (created when it
    does not exist yet) and then shown.

    Args:
        X (np.ndarray): explanatory variables; only used to generate the splits.
        shuffle (bool): shuffle flag of KFold.
        n_splits (int, optional): number of KFold splits. Defaults to 10.
        random_state (int, optional): random_state of KFold; only used when
            shuffle is True. Defaults to 1.
        metadata (dict): values used to build the image file name; the keys
            "outputdir", "prefix", "dataname", "normalizationtype" and
            "regtype" are read.
    """
    # KFold only accepts a random_state when shuffling is enabled.
    kf = KFold(n_splits=n_splits, shuffle=shuffle,
               random_state=random_state if shuffle else None)

    for i, (_, test) in enumerate(kf.split(X)):
        # One row of markers per fold: x = test sample index, y = fold id.
        plt.plot(test, np.full(test.shape[0], float(i)), "o")
    plt.xlabel("index")
    plt.ylabel("CV id")

    plt.tight_layout()

    filename = "_".join([metadata["prefix"], metadata["dataname"], metadata["normalizationtype"], 
                         metadata["regtype"], "Kfold","shuffle",str(shuffle)])+".png"
    print(filename)
    # savefig does not create missing directories.
    os.makedirs(metadata["outputdir"], exist_ok=True)
    plt.savefig(os.path.join(metadata["outputdir"],filename))
    plt.show()
    
# Fold layout without shuffling (5 splits).
show_CV_splot(g_X, shuffle=False, n_splits=5)

In [None]:
# Same plot with shuffling enabled, for comparison with the unshuffled split.
show_CV_splot(g_X, shuffle=True, n_splits=5)

In [None]:
def _as_flat_array(values):
    """Return ``values`` as one 1-D array, concatenating a list/tuple of arrays."""
    if isinstance(values, (list, tuple)):
        parts = [np.ravel(v) for v in values]
        return np.concatenate(parts) if parts else np.empty(0)
    return np.ravel(values)


def plot_y_yp(y,yp, title: str=None, metadata: dict=METADATA):
    """Scatter plot of observed vs predicted target values with a y = x guide line.

    The figure is saved as a PNG in ``metadata["outputdir"]`` (created when it
    does not exist yet) and then shown.

    Args:
        y (np.ndarray or list of np.ndarray): observed target values. A list of
            per-fold arrays (possibly of different lengths) is concatenated.
        yp (np.ndarray or list of np.ndarray): predicted target values, in the
            same layout as ``y``.
        title (str, optional): figure title. It is also the last part of the
            file name, so calls without a title all write "..._predict_None.png".
            Defaults to None.
        metadata (dict): values used to build the image file name; the keys
            "outputdir", "prefix", "dataname", "normalizationtype" and
            "regtype" are read.
    """
    y_flat = _as_flat_array(y)
    yp_flat = _as_flat_array(yp)

    fig, ax = plt.subplots(figsize=(5,5))

    # $y^{obs}$ vs $y^{predict}$
    ax.plot(y_flat, yp_flat, "o")

    # Diagonal guide line spanning the common range of both axes.
    yall = np.hstack([y_flat, yp_flat])
    if yall.size:
        ylim = yall.min(), yall.max()
        ax.plot(ylim,ylim,"--")

    # Axis labels and optional title.
    ax.set_xlabel("$y_{obs}$")
    ax.set_ylabel("$y_{pred}$")
    if title is not None:
        ax.set_title(title)
        
    fig.tight_layout()
    filename = "_".join([metadata["prefix"], metadata["dataname"], metadata["normalizationtype"], 
                         metadata["regtype"], "predict",str(title)])+".png"
    print(filename)
    # savefig does not create missing directories.
    os.makedirs(metadata["outputdir"], exist_ok=True)
    fig.savefig(os.path.join(metadata["outputdir"],filename))

    # plt.show() works with the inline backend; Figure.show() needs a GUI
    # backend and only emits a warning otherwise.
    plt.show()

# CV test predictions. The title also ends up in the saved file name, which keeps
# this image from being overwritten by other untitled plot_y_yp() calls.
plot_y_yp(g_result["ytest"],g_result["ytestp"], title="CV")

In [None]:
# Predictions for the new data. The title also ends up in the saved file name,
# so this image no longer shares "..._predict_None.png" with other untitled calls.
plot_y_yp(g_y_new, g_yp_new, title="new")

In [None]:
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold


def linear_regression_CV_coef(X, y, regtype="linear",
                              n_splits=10, random_state=1):
    """Collect the fitted linear coefficients of every CV fold.

    The folds come from a shuffled KFold; the model is refitted on the
    training part of each fold. The test parts are not used.

    Args:
        X (np.array): explanatory variables
        y (np.array): target variable
        regtype (str, optional): model name passed to choose_linear_model().
            Defaults to "linear".
        n_splits (int, optional): the number of splits in CV. Defaults to 10.
        random_state (int, optional): random state in KFold(). Defaults to 1.

    Returns:
        list: one list of coefficients (flattened ``coef_``) per fold
    """
    reg = choose_linear_model(regtype)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    coef_list = []
    for train, _ in kf.split(X):
        reg.fit(X[train], y[train])
        coef_list.append(list(reg.coef_.ravel()))
    return coef_list


# Per-fold coefficients of the configured model (default 10 shuffled splits).
g_coef_list = linear_regression_CV_coef(g_X, g_y, regtype=REGTYPE)


In [None]:
def show_coeflist(coeflist, data_name, regtype, 
                  tickfontsize=15, labelfontsize=15, titlefontsize=15, legendfontsize=15):
    """Plot the linear-model coefficients across CV folds and save the figure as PDF.

    Each coefficient is drawn as one line over the row index of ``coeflist``.
    The file is written to "image_executed/" (created when it does not exist yet).

    Args:
        coeflist (list[list[float]]): one list of coefficients per CV fold.
        data_name (str): data set name; used in the title and the file name.
        regtype (str): model name; used in the title and the file name.
        tickfontsize (int, optional): tick label font size. Defaults to 15.
        labelfontsize (int, optional): x-axis label font size. Defaults to 15.
        titlefontsize (int, optional): title font size. Defaults to 15.
        legendfontsize (int, optional): legend font size. Defaults to 15.
    """
    fig, ax = plt.subplots()
    # Rows are CV folds, columns are coefficients.
    dfcoef = pd.DataFrame(coeflist)
    dfcoef.plot(ax=ax)
    ax.set_xlabel("CV set index", fontsize=labelfontsize)
    ax.set_title("{},{}".format(data_name,regtype), fontsize=titlefontsize)
    ax.tick_params(axis = 'x', labelsize =tickfontsize)
    ax.tick_params(axis = 'y', labelsize =tickfontsize)    
    # Place the legend outside the axes, to the right.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=legendfontsize)
    fig.tight_layout()
    filepath = "image_executed/fig_regression_CV_{}_{}.pdf".format(data_name,regtype)
    # savefig does not create missing directories.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    fig.savefig(filepath)
    print(filepath)
    
# Plot how the fitted coefficients vary from fold to fold.
show_coeflist(g_coef_list, DATA_NAME, REGTYPE)