In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import warnings
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
%matplotlib inline
pd.set_option('display.max_rows', 100)
warnings.filterwarnings('ignore')


In [None]:
# Load the data: get_data.load('mono') returns three values, unpacked here as the
# raw frame, the descriptor column names and the target column name(s).
import get_data 
g_dfraw, descriptor_names, target_names = get_data.load('mono')
# Seed passed to predict_and_score() as its random_state argument.
RANDOM_STATE = 2

In [None]:
# Run metadata: output directory, file-name prefix, data-set name and model type.
# NOTE(review): none of the cells visible in this section read METADATA; the
# output paths used by the functions below are hard-coded strings.
METADATA = {"outputdir": "image_executed", 
              "prefix": "mono_logistic_regression", 
              "dataname":"mono_structure", 
              "regtype":"LogisticRegressionCV"}

In [None]:
def convert_crystaltype(dfraw, target_name, 
                        target_str = {0: "misc", 1:"hcp", 2:"bcc", 3:"fcc"}):
    """Convert numeric crystal-structure codes into their string labels.

    With the default mapping:

        0: misc (black)
        1: hcp (red)
        2: bcc (blue)
        3: fcc (green)

    The conversion is applied to a copy, so the frame passed in is left
    untouched.

    Args:
        dfraw (pd.DataFrame): data.
        target_name (str): name of the target column holding the codes.
        target_str (dict, optional): mapping from code to label.
            Defaults to {0: "misc", 1:"hcp", 2:"bcc", 3:"fcc"}.

    Returns:
        pd.DataFrame: copy of ``dfraw`` whose target column holds the labels.

    Raises:
        KeyError: if the target column contains a value that is not a key of
            ``target_str`` (for example when the column was already converted).
    """
    # The default dict is only read, never modified, so sharing it between
    # calls is safe.
    labels = [target_str[code] for code in dfraw[target_name].values]
    converted = dfraw.copy()
    converted[target_name] = labels
    return converted

# Replace the numeric codes in the target column with their string labels and
# display the result.
# NOTE: g_dfraw is rebound here, so re-running this cell without reloading the
# data raises KeyError (the labels are not keys of the mapping).
g_dfraw =  convert_crystaltype(g_dfraw, target_names)
g_dfraw

In [None]:
# Drop the rows labelled "misc" and renumber the index
# (before the label conversion the same filter was written as `!= 0`).
g_df = g_dfraw[g_dfraw["crystal_structure"] != "misc"].reset_index(drop=True)

In [None]:
from sklearn.model_selection import cross_val_predict

def predict_and_score(df, descriptor_names, target_name, random_state=1, prediction="cv"):
    """Fit a cross-validated logistic regression, predict, and report the scores.

    The descriptors are standardized, a one-vs-rest ``LogisticRegressionCV`` is
    fitted on all rows, and predictions are produced either out-of-fold
    (``prediction="cv"``) or on the training data (any other value).
    The classification report is printed and written to
    ``image_executed/mono_structure_cls_report.txt``; the confusion matrix is
    displayed.

    Args:
        df (pd.DataFrame): data.
        descriptor_names (list[str]): names of the explanatory variables.
        target_name (str): name of the target variable.
        random_state (int): seed of the shuffled 5-fold split. Defaults to 1.
        prediction (str): "cv" to predict out-of-fold with
            ``cross_val_predict``; any other value predicts on the training
            data with the fitted ``LogisticRegressionCV``. Defaults to "cv".

    Returns:
        tuple containing

        - np.ndarray: standardized explanatory variables.
        - np.ndarray: observed target variables.
        - np.ndarray: predicted target variables.
        - np.ndarray: predicted probability of target variables.
        - LogisticRegressionCV: fitted LogisticRegressionCV instance.
    """
    Xraw = df.loc[:, descriptor_names].values
    y = df.loc[:, target_name].values

    # Preprocessing: standardize every descriptor.
    # NOTE(review): the scaler is fitted on all rows before cross-validation,
    # so the held-out folds contribute to the scaling statistics.
    X = StandardScaler().fit_transform(Xraw)

    # Seed the shuffled split so that results are reproducible and so that every
    # consumer of `kf` below (hyperparameter search, label prediction and
    # probability prediction) sees exactly the same folds.
    kf = KFold(5, shuffle=True, random_state=random_state)
    cls_cv = LogisticRegressionCV(cv=kf, refit=True, multi_class='ovr')
    cls_cv.fit(X, y)
    score = cls_cv.score(X, y)
    print("score", score)
    print("prediction",prediction)
    if prediction=="cv":
        # NOTE(review): only the first entry of C_ is used, and this estimator
        # keeps the default multi-class setting rather than 'ovr' — confirm
        # that this is intended.
        cls = LogisticRegression(C=cls_cv.C_[0])
        yp = cross_val_predict(cls, X, y, cv=kf)
        yp_proba = cross_val_predict(cls, X, y, cv=kf, method="predict_proba")
    else:
        yp = cls_cv.predict(X)
        yp_proba = cls_cv.predict_proba(X)

    # Build the report once and reuse it for both the console and the file.
    report = classification_report(y, yp, digits=3)
    print(report)
    with open("image_executed/mono_structure_cls_report.txt", "w") as f:
        f.write(report)

    index = ["actual({})".format(i) for i in cls_cv.classes_]
    columns = ["predict({})".format(i) for i in cls_cv.classes_]
    cmdf = pd.DataFrame(confusion_matrix(y, yp, labels=cls_cv.classes_), index=index,
                        columns=columns)
    display(cmdf)
    return X, y, yp, yp_proba, cls_cv 

# Run the fit on the filtered data. The five returned values are, in order:
# scaled descriptors, observed targets, predicted targets, predicted
# probabilities and the fitted LogisticRegressionCV.
g_X, g_y,g_yp,g_yproba,g_cls = predict_and_score(g_df, descriptor_names, target_names,
                                                 random_state=RANDOM_STATE)

In [None]:
# Class labels known to the fitted classifier.
g_cls.classes_

In [None]:
def plot_X(X):
    """Draw the explanatory variables against the sample index.

    Args:
        X (np.ndarray): explanatory variables.
    """
    _, axis = plt.subplots()
    axis.plot(X)
    axis.set(xlabel="index", ylabel="X")
    
# Plot the scaled descriptors returned by predict_and_score().
plot_X(g_X)

def hist_y(y):
    """Draw a histogram of the target variable in a new figure.

    Args:
        y (np.ndarray): target variable.
    """
    _, axis = plt.subplots()
    axis.hist(y)
    axis.set_xlabel("y")

# Histogram of the observed targets.
hist_y(g_y)

In [None]:
# Regularization parameter(s) C selected by the fitted LogisticRegressionCV.
g_cls.C_

In [None]:
def show_CV_score_multi(cls,uniquey):
    """Plot the cross-validation score against log10(C) for every class.

    For each class the mean and standard deviation of ``cls.scores_[label]``
    along axis 0 are drawn as an error-bar curve, and the C with the highest
    mean score is printed. The figure is saved to
    ``image_executed/mono_structure_hyperparameter_vs_score.png``.

    Args:
        cls (LogisticRegressionCV): fitted LogisticRegressionCV instance.
        uniquey (np.ndarray): unique values of the target variable.
    """
    plt.figure()
    plt.xlabel("log10(C)")
    plt.ylabel("score")

    for position, class_label in enumerate(uniquey):
        fold_scores = cls.scores_[class_label]
        mean_by_c = np.mean(fold_scores, axis=0)
        spread_by_c = np.std(fold_scores, axis=0)

        # Each curve is shifted by 0.1 per class along the x axis.
        shifted_log_c = np.log10(cls.Cs_) + 0.1 * position
        plt.errorbar(shifted_log_c, mean_by_c, yerr=spread_by_c,
                     capsize=5, label=str(class_label))

        best = np.argmax(mean_by_c)
        print("y=", class_label, "Copt=", cls.Cs_[best])

    plt.legend()
    plt.savefig("image_executed/mono_structure_hyperparameter_vs_score.png")
    plt.show()
    
# Score-vs-C curves, one per distinct observed target value.
show_CV_score_multi(g_cls,np.unique(g_y))

In [None]:
def plot_CV_scores_as_boxplot(cls, y):
    """Draw, for every class, a box plot of the CV scores at each value of C.

    One figure is created per unique value in ``y``; its boxes are the columns
    of ``cls.scores_[label]``, labelled with log10(C) to two decimals.

    Args:
        cls (LogisticRegressionCV): fitted LogisticRegressionCV instance.
        y (np.ndarray): target variable.
    """
    column_names = ["{:.2f}".format(log_c) for log_c in np.log10(cls.Cs_)]

    for class_label in np.unique(y):
        fig, ax = plt.subplots()
        ax.set_title("target={}".format(class_label))
        scores_by_c = pd.DataFrame(cls.scores_[class_label], columns=column_names)
        scores_by_c.boxplot(rot=90, ax=ax)
        ax.set_xlabel("log10(C)")
        
# Box plots of the per-class CV scores.
plot_CV_scores_as_boxplot(g_cls, g_y)

In [None]:
%matplotlib inline


def plot_y(y, y_predict, proba, symbols, labels):
    """Plot observed vs. predicted targets and tabulate the misclassified samples.

    Args:
        y (np.array): target values.
        y_predict (np.array): predicted target values.
        proba (np.array): predicted probabilities, one row per sample.
        symbols (list[str]): material names, one per sample.
        labels (list[str]): target labels, used to name the probability columns.

    Returns:
        pd.DataFrame: one row per sample whose prediction differs from the
        observation, with the columns "index", "element", "actual", "pred" and
        one "P(<label>)" column per entry of ``labels``.
    """
    plt.plot(y, "b-", label="y")
    plt.plot(y_predict, "r-", label="predict_y")
    plt.legend()
    plt.show()

    # Keep only the samples where observation and prediction disagree.
    mismatches = [
        [position, symbol, observed, predicted, *probabilities]
        for position, (observed, predicted, probabilities, symbol)
        in enumerate(zip(y, y_predict, proba, symbols))
        if observed != predicted
    ]

    header = ["index", "element", "actual", "pred"]
    header += ["P({})".format(label) for label in labels]

    return pd.DataFrame(mismatches, columns=header)
    
# Plot observed vs. predicted targets and collect the misclassified samples,
# then display that table.
g_df_failed = plot_y(g_y, g_yp, g_yproba, g_df["symbol"], g_cls.classes_)
print("failed at ")
g_df_failed

In [None]:
# Show the predicted probabilities

def plot_proba(y, yp, proba, labels):
    """Plot observed and predicted targets above the per-class probabilities.

    Args:
        y (np.array): target values
        yp (np.array): predicted target values
        proba (np.array): probability, one column per class
        labels (list[str]): legend label for each column of ``proba``
    """
    fig, (ax_targets, ax_proba) = plt.subplots(2, 1, figsize=(12, 8))

    # Upper panel: observed and predicted targets.
    ax_targets.plot(y, "o-", label="$y^{obs}$")
    ax_targets.plot(yp, "o-", label="$y^{pred}$")
    ax_targets.legend()

    # Lower panel: one curve per probability column.
    for column in range(proba.shape[1]):
        ax_proba.plot(proba[:, column], "o-", label=labels[column])
    ax_proba.legend()

    fig.show()


# Targets and predicted probabilities for every sample.
plot_proba(g_y, g_yp, g_yproba, g_cls.classes_)

In [None]:
# Break down the misclassified samples by actual and predicted class.
def show_failed_data(cls, df_failed):
    """Count the misclassified samples per (actual, predicted) class pair.

    Args:
        cls: object whose ``classes_`` array lists the class labels.
        df_failed (pd.DataFrame): misclassified samples with the columns
            "actual" and "pred".

    Returns:
        pd.DataFrame: integer counts with rows "actual(<class>)" and columns
        "pred(<class>)".
    """
    class_names = cls.classes_.tolist()
    size = len(class_names)

    counts = np.zeros((size, size))
    for actual, predicted in zip(df_failed["actual"], df_failed["pred"]):
        row = class_names.index(actual)
        col = class_names.index(predicted)
        counts[row, col] += 1

    row_labels = ["actual({})".format(name) for name in class_names]
    col_labels = ["pred({})".format(name) for name in class_names]

    # This is not a confusion matrix.
    return pd.DataFrame(counts, index=row_labels, columns=col_labels).astype(int)

# Table of misclassification counts by actual and predicted class.
show_failed_data(g_cls, g_df_failed)