In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
# NOTE(review): this installs a blanket "ignore" filter, so every warning category
# is silenced for the rest of the notebook — consider narrowing it to the specific
# warnings that are known to be noise.
warnings.filterwarnings('ignore')


%matplotlib inline
# Keep rendered DataFrames compact: output is truncated beyond 10 rows / 60 columns.
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 60)


In [None]:
# Dataset key passed to get_data.load().
DATA_NAME = "ZB_WZ_all"  # choices noted by the author: ZB_WZ_all | ZB_WZ_3 | ZB_WZ_2
# Seed handed to classify_df, where it fixes the shuffled K-fold split.
RANDOM_STATE = 3

In [None]:

# get_data is not part of this notebook (presumably a project-local helper — verify).
# Its load() result is unpacked into the data frame, the descriptor names and the
# target name that the classification cells below consume.
import get_data
g_df, g_descriptor_names, g_target_name = get_data.load(DATA_NAME)

In [None]:
# Show the loaded data (rendering is limited by the pandas display options set above).
g_df

In [None]:
from sklearn.model_selection import cross_val_predict
def classify_df(df, descriptor_names, target_name, random_state=1, prediction="cv"):
    """Classify the data with a cross-validated logistic regression.

    The target is binarised as ``target > 0`` and the descriptors are standardised.
    ``LogisticRegressionCV`` is fitted with a shuffled 5-fold split. If
    ``prediction == "cv"`` the reported predictions are out-of-fold predictions of a
    ``LogisticRegression`` that uses the selected C on the same split; for any other
    value they are the predictions of the fitted model on the data it was fitted on.
    The shape of X, the score on (X, y), the prediction mode, a classification
    report and a confusion matrix are printed / displayed.

    Args:
        df (pd.DataFrame): input data.
        descriptor_names ([str]): column names of the explanatory variables.
        target_name: column label(s) of the target, as accepted by ``df.loc``.
        random_state (int): seed of the shuffled K-fold split. Defaults to 1.
        prediction (str): kind of prediction to report. Defaults to "cv".

    Returns:
        tuple containing

        - LogisticRegressionCV: the fitted LogisticRegressionCV instance.
        - np.ndarray: X (standardised descriptors).
        - np.ndarray: y (boolean target).
        - np.ndarray: predicted labels.
        - np.ndarray: predicted class probabilities.
    """
    features_raw = df.loc[:, descriptor_names].values
    target_raw = df.loc[:, target_name].values
    y = target_raw > 0

    # Preprocessing. The scaler is fitted on all rows before any fold is split off,
    # so the scaling statistics also include the rows later used as test folds.
    X = StandardScaler().fit(features_raw).transform(features_raw)
    print(X.shape)

    # Model selection: C is chosen by LogisticRegressionCV on the shuffled folds.
    folds = KFold(5, shuffle=True, random_state=random_state)
    cls_cv = LogisticRegressionCV(cv=folds)
    cls_cv.fit(X, y)
    print("score=", cls_cv.score(X, y))
    print("prediction",prediction)

    if prediction != "cv":
        # Predictions of the fitted model on its own training data.
        yp = cls_cv.predict(X)
        yp_proba = cls_cv.predict_proba(X)
    else:
        # Out-of-fold predictions; the integer seed makes both calls use the same folds.
        fold_model = LogisticRegression(C=cls_cv.C_[0])
        yp = cross_val_predict(fold_model, X, y, cv=folds)
        yp_proba = cross_val_predict(fold_model, X, y, cv=folds, method="predict_proba")

    print(classification_report(y, yp))
    row_labels = ["actual({})".format(label) for label in cls_cv.classes_]
    col_labels = ["predict({})".format(label) for label in cls_cv.classes_]
    cmdf = pd.DataFrame(
        confusion_matrix(y, yp, labels=cls_cv.classes_),
        index=row_labels,
        columns=col_labels,
    )
    display(cmdf)

    return cls_cv, X, y, yp, yp_proba

# Run the classification with the default out-of-fold ("cv") predictions and keep
# the fitted model, the scaled X, the boolean y and the predictions for later cells.
g_cls, g_X, g_y, g_yp, g_yp_proba =  classify_df(g_df, g_descriptor_names, g_target_name, 
                                                 random_state=RANDOM_STATE)

In [None]:
# Display g_df again after classification; classify_df only reads from the frame.
g_df

In [None]:
def plot_X(X):
    """Plot the explanatory variables against the sample index.

    Args:
        X (np.ndarray): explanatory variables, passed as-is to ``Axes.plot``.
    """
    _, axes = plt.subplots()
    axes.plot(X)
    axes.set(xlabel="index", ylabel="X")
    plt.show()
    
# Visual check of the standardised descriptors returned by classify_df.
plot_X(g_X)

# Class balance of the boolean target.
from collections import Counter
print("Counter", Counter(g_y))

In [None]:
# Regularisation parameter C selected by LogisticRegressionCV (first entry of C_).
g_Copt = g_cls.C_[0]
print("Copt=", g_Copt)

In [None]:
def plot_CV_scores(cls):
    """Plot the cross-validation scores of a fitted LogisticRegressionCV against log10(C).

    The score grid is averaged over axis 0 (the folds, per the scikit-learn
    description of ``scores_``) and drawn with the standard deviation as error
    bars. The index and value of the best mean score are printed.

    Args:
        cls (LogisticRegressionCV): fitted LogisticRegressionCV instance.

    """
    # scores_ is a dict keyed by class label. Look the key up from classes_ instead
    # of hard-coding True, so the function also works when the labels are not
    # booleans; for boolean labels this selects the same entry as scores_[True].
    fold_scores = cls.scores_[cls.classes_[-1]]
    scores_mean = np.mean(fold_scores, axis=0)
    scores_std = np.std(fold_scores, axis=0)
    ic = np.argmax(scores_mean)
    print("index=", ic, "score=", scores_mean[ic])

    fig, ax = plt.subplots()
    ax.errorbar(np.log10(cls.Cs_), scores_mean, yerr=scores_std, capsize=5)
    ax.set_xlabel("log10(C)")
    ax.set_ylabel("score")
    plt.show()
    
# Mean CV score (with std error bars) over the C grid of the fitted model.
plot_CV_scores(g_cls)

In [None]:
def plot_CV_scores_as_boxplot(cls, filename="image_executed/ZB_WZ_cls_boxplot.png"):
    """Show the cross-validation scores of a fitted LogisticRegressionCV as box plots.

    One box is drawn per C value (labelled with log10(C)), the figure is saved to
    ``filename`` and then shown.

    Args:
        cls (LogisticRegressionCV): fitted LogisticRegressionCV instance.
        filename (str): path of the saved figure. Missing parent directories are
            created. Defaults to "image_executed/ZB_WZ_cls_boxplot.png".

    """
    labels = ["{:.3f}".format(x) for x in np.log10(cls.Cs_)]
    # scores_ is a dict keyed by class label; take the key from classes_ rather than
    # hard-coding True (identical entry for boolean labels).
    df_score = pd.DataFrame(cls.scores_[cls.classes_[-1]], columns=labels)
    fig, ax = plt.subplots()
    df_score.boxplot(rot=90, ax=ax)
    ax.set_xlabel("log10(C)")
    # savefig does not create directories, so make sure the target folder exists;
    # a bare file name has an empty dirname and needs no directory.
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    fig.savefig(filename)
    plt.show()
    
# Distribution of the CV scores per C value; the figure is also written to disk.
plot_CV_scores_as_boxplot(g_cls)

In [None]:
def calc_CV_score(X, y, Copt, random_state=None, n_splits=5):
    """Run a shuffled K-fold CV of LogisticRegression(C=Copt) and collect the results.

    Args:
        X (np.ndarray): explanatory variables.
        y (np.ndarray): target labels.
        Copt (float): C of logistic regression.
        random_state (int, optional): seed of the fold shuffling. Pass an int to
            make the splits (and therefore the scores) reproducible. Defaults to
            None, which leaves the shuffle unseeded.
        n_splits (int): number of folds. Defaults to 5.

    Returns:
        tuple containing

        - list[float]: one test score per fold.
        - list: true test labels of all folds, concatenated in fold order.
        - list: predicted test labels, aligned with the true labels.

    """
    ytest_list = []
    ytestp_list = []
    score_list = []
    kf = KFold(n_splits, shuffle=True, random_state=random_state)
    for train, test in kf.split(X):
        Xtrain, ytrain = X[train], y[train]
        Xtest, ytest = X[test], y[test]
        cls = LogisticRegression(C=Copt)
        cls.fit(Xtrain, ytrain)
        ytestp = cls.predict(Xtest)
        # extend (not append): labels of all folds end up in one flat list
        ytest_list.extend(ytest)
        ytestp_list.extend(ytestp)
        score_list.append(cls.score(Xtest, ytest))
    return score_list, ytest_list, ytestp_list

# Seed the split so that Restart & Run All reproduces the same scores.
g_score_list, g_ytest_list, g_ytestp_list = calc_CV_score(g_X, g_y, g_Copt,
                                                          random_state=RANDOM_STATE)

In [None]:
def show_scores(score_list, ytest_list, ytestp_list, classes,
                report_path="image_executed/ZB_WZ_cls_report.txt"):
    """Print and store the CV results: scores, classification report, confusion matrix.

    The mean and standard deviation of the scores are printed, the classification
    report is printed and written to ``report_path``, and the confusion matrix is
    displayed as a labelled DataFrame.

    Args:
        score_list (list[float]): a list of KFold scores.
        ytest_list (list): true test labels.
        ytestp_list (list): predicted test labels, aligned with ``ytest_list``.
        classes (list): class labels; they fix the row/column order of the
            confusion matrix.
        report_path (str): path of the text file the classification report is
            written to. Missing parent directories are created. Defaults to
            "image_executed/ZB_WZ_cls_report.txt".
    """
    print("score = {}({})".format(np.mean(score_list), np.std(score_list)))
    # Build the report once and reuse it for both the printout and the file.
    report = classification_report(ytest_list, ytestp_list)
    print(report)
    # open() does not create directories, so make sure the target folder exists;
    # a bare file name has an empty dirname and needs no directory.
    report_dir = os.path.dirname(report_path)
    if report_dir:
        os.makedirs(report_dir, exist_ok=True)
    with open(report_path, "w") as f:
        f.write(report)
    index = ["actual({})".format(s) for s in classes]
    columns = ["predict({})".format(s) for s in classes]
    df_cm = pd.DataFrame(confusion_matrix(ytest_list, ytestp_list, labels=classes), index=index,
                         columns=columns)
    display(df_cm)
    
# Summarise the K-fold results, using the class order of the fitted model.
show_scores(g_score_list, g_ytest_list, g_ytestp_list, g_cls.classes_ )