In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, LeaveOneOut
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Limit how much of a DataFrame pandas renders: at most 60 columns and 10 rows.
pd.set_option("display.max_columns", 60)
pd.set_option("display.max_rows", 10)

# Name of the dataset: passed to get_data.load() and stored in METADATA["dataname"],
# where it becomes part of the saved figure's file name.
DATA_NAME = "ReCo"

In [None]:
import get_data
# Load the dataset selected by DATA_NAME.
# NOTE(review): get_data is a project-local module not shown here; judging from the
# usage below, load() presumably returns (DataFrame, descriptor column names,
# target column name) -- confirm against get_data.
g_df,g_descriptor_names, g_target_name = get_data.load(DATA_NAME)


In [None]:
# Settings used by show_y_yp when saving its figure:
#   "outputdir" - directory the PDF is written to
#   "prefix"    - first part of the file name
#   "dataname"  - second part of the file name
METADATA = {"outputdir": "image_executed", "prefix": "randomforest", 
              "dataname":DATA_NAME}

In [None]:
# Dict that collects results.
g_result = {}

In [None]:
# Pull the target column and the descriptor columns (explanatory variables)
# out of the DataFrame as NumPy arrays.
g_y = g_df[g_target_name].values
g_Xraw = g_df[g_descriptor_names].values

# Data preprocessing: rescale every descriptor with MinMaxScaler.
# fit_transform() fits the scaler on g_Xraw and returns the transformed array in
# one call; g_scaler stays available (fitted) for later use.
# NOTE(review): the scaler is fitted on all rows, i.e. before the CV split.
g_scaler = MinMaxScaler()
g_X = g_scaler.fit_transform(g_Xraw)


In [None]:
# Display the loaded DataFrame (rendering is capped by the pd.set_option calls above).
g_df

In [None]:
def show_X(X):
    """Plot the explanatory variables.

    The values are drawn as lines with point markers against the row index;
    for a 2-D array matplotlib draws one line per column.

    Args:
        X (np.ndarray): explanatory variables.
    """
    fig, ax = plt.subplots()
    # Draw on the Axes created above instead of going through pyplot's
    # implicit "current axes", in the same way show_hist does.
    ax.plot(X, ".-")
    ax.set_xlabel("sample index")
    ax.set_ylabel("X")
    plt.show()
show_X(g_X)

def show_hist(y):
    """Plot a histogram of the target variable.

    Args:
        y (np.ndarray): target variable.
    """
    fig, ax = plt.subplots()
    ax.hist(y)
    ax.set_xlabel("y")
    ax.set_ylabel("count")
    # Use plt.show() rather than fig.show(): Figure.show() is meant for GUI
    # applications and may warn or show nothing on a non-GUI backend, and
    # plt.show() is what show_X uses as well.
    plt.show()
show_hist(g_y)

In [None]:
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
def make_CV_y_yp_test(X, y, nsplit=5, random_state=None):
    """Compute observed and predicted y on the test folds of a K-fold CV.

    The rows are split with a shuffled KFold. For every fold a fresh
    RandomForestRegressor is fitted on the training part and evaluated on
    the test part.

    Args:
        X (np.ndarray): explanatory variables, indexed by row.
        y (np.ndarray): target variable, one value per row of X.
        nsplit (int, optional): number of KFold splits. Defaults to 5.
        random_state (int | np.random.RandomState | None, optional): seed
            passed to both the KFold shuffle and the random forest. With the
            default None the folds and the forests differ on every call, so
            the scores are not reproducible.

    Returns:
        tuple containing

        - list[float]: observed y of the test samples, concatenated fold by fold.
        - list[float]: predicted y for the same samples, in the same order.
        - list[float]: R2 score of each test fold.
    """
    y_list = []
    yp_list = []
    score_list = []
    kf = KFold(n_splits=nsplit, shuffle=True, random_state=random_state)
    for train, test in kf.split(X):
        Xtrain, ytrain = X[train], y[train]
        Xtest, ytest = X[test], y[test]
        # A new, unfitted model per fold so no state leaks between folds.
        # Another regressor (e.g. the imported SVR) can be swapped in here.
        model = RandomForestRegressor(random_state=random_state)
        model.fit(Xtrain, ytrain)
        ytestp = model.predict(Xtest)
        score_list.append(r2_score(ytest, ytestp))
        # extend() flattens the per-fold arrays into one flat list of values.
        y_list.extend(ytest)
        yp_list.extend(ytestp)
    return y_list, yp_list, score_list

# Fixed seed so that "Restart Kernel -> Run All" reproduces the same folds,
# forests and scores.
g_y_list, g_yp_list, g_score_list = make_CV_y_yp_test(g_X, g_y, random_state=0)

In [None]:
# Report the mean R2 over the CV test folds, with the standard deviation
# across folds in parentheses.
print("R2 CV(test)={}({})".format(np.mean(g_score_list),np.std(g_score_list)))

In [None]:
import os
def show_y_yp(y,yp, metadata=METADATA, tickfontsize=15, labelfontsize=15):
    """Plot observed vs. predicted y and save the figure as a PDF.

    The file is written to
    ``<outputdir>/<prefix>_<dataname>_yobs_ypredCV.pdf`` and its name is
    printed. The output directory is created if it does not exist.

    Args:
        y (np.ndarray): observed values of the target variable.
        yp (np.ndarray): predicted values of the target variable.
        metadata (dict, optional): must provide the keys "outputdir",
            "prefix" and "dataname". Defaults to METADATA.
        tickfontsize (int, optional): ticks fontsize. Defaults to 15.
        labelfontsize (int, optional): label fontsize. Defaults to 15.
    """
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.plot(y, yp, "o")
    # Draw the diagonal y_pred = y_obs over the full range of both series.
    yall = np.hstack([yp, y])
    y1, y2 = np.min(yall), np.max(yall)
    ax.plot([y1, y2], [y1, y2], "--")
    ax.set_xlabel("$y_{obs}$", fontsize=labelfontsize)
    ax.set_ylabel("$y_{pred}$", fontsize=labelfontsize)
    ax.tick_params(axis="both", labelsize=tickfontsize)
    fig.tight_layout()

    filename = "_".join([metadata["prefix"], metadata["dataname"],"yobs_ypredCV"])+".pdf"
    print(filename)
    outputdir = metadata["outputdir"]
    # savefig() does not create missing directories; without this a fresh
    # checkout fails with FileNotFoundError. An empty outputdir means the
    # current directory, which needs no creation.
    if outputdir:
        os.makedirs(outputdir, exist_ok=True)
    fig.savefig(os.path.join(outputdir, filename))

show_y_yp(g_y_list, g_yp_list)