特徴抽出をしてみる。手始めにV1~V3まで可視化

In [None]:
##Visualization
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()

##import
import pandas as pd
import numpy as np
from IPython.core.display import display
from tqdm import tqdm_notebook as tqdm
from copy import deepcopy as cp

##visualization
from ipywidgets import interact
from bokeh.plotting import figure
from bokeh.io import output_notebook, show, push_notebook
from bokeh.models import ColumnDataSource, Range1d, LabelSet, Label
from bokeh.palettes import Category10 as palette
from bokeh.resources import INLINE
output_notebook(resources=INLINE)
import itertools

##import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, precision_recall_curve, average_precision_score, auc


In [None]:
def showCurves(recalls, precisions, fprs, tprs):
    """Plot every PR curve and every ROC curve side by side in the notebook.

    Args:
        recalls: Sequence of recall arrays, one entry per curve.
        precisions: Sequence of precision arrays, aligned with ``recalls``.
        fprs: Sequence of false-positive-rate arrays, one entry per curve.
        tprs: Sequence of true-positive-rate arrays, aligned with ``fprs``.

    Returns:
        None. The two figures are rendered with ``show``.
    """
    # gridplot is not imported by the notebook's import cell, so bring it
    # into scope here; without this the call below raises NameError.
    from bokeh.layouts import gridplot

    ##make figure
    s1 = figure(
        title = "PR曲線",
        plot_width=300, plot_height=300,
        x_range=(-0.02,1.02), y_range=(0,1.02)
    )

    s2 = figure(
        title = "ROC曲線",
        plot_width=300, plot_height=300,
        x_range=(-0.02,1.02), y_range=(0,1.02)
    )

    colors = palette[10]

    for i in tqdm(range(len(recalls))):
        # Wrap around the 10-colour palette so more than 10 curves
        # do not raise IndexError.
        color = colors[i % len(colors)]

        ##add the PR line for this entry
        s1.line(
            recalls[i],
            precisions[i],
            line_width = 1,
            color=color
        )

        ##add the ROC line for this entry
        s2.line(
            fprs[i],
            tprs[i],
            line_width = 1,
            color=color
        )

    s1.xaxis.axis_label = 'Recall'
    s1.yaxis.axis_label = 'Precision'

    s2.xaxis.axis_label = 'FPR'
    s2.yaxis.axis_label = 'TPR'

    p = gridplot([[s1, s2]])

    show(p)
    
def getMeans(recalls, precisions, fprs, tprs):
    """Average PR and ROC curves on a shared 100-point grid over [0, 1].

    Each curve is linearly interpolated onto the grid with ``np.interp``
    and the interpolated y-values are averaged point-wise.

    Args:
        recalls: Sequence of recall arrays, one entry per curve.
        precisions: Sequence of precision arrays, aligned with ``recalls``.
        fprs: Sequence of false-positive-rate arrays, one entry per curve.
        tprs: Sequence of true-positive-rate arrays, aligned with ``fprs``.

    Returns:
        Tuple ``(mean_recall, mean_precision, mean_fpr, mean_tpr)`` where
        ``mean_recall`` and ``mean_fpr`` are the grids themselves.
    """
    grid_fpr = np.linspace(0, 1, 100)
    grid_recall = np.linspace(0, 1, 100)
    curve_ids = range(len(recalls))

    # Interpolated values go into fresh lists so the caller's lists are
    # never modified.
    interp_tprs = [np.interp(grid_fpr, fprs[k], tprs[k]) for k in curve_ids]

    # Recall arrays arrive starting from 1 (per the original author's note),
    # so both arrays are reversed before interpolation.
    interp_precisions = [
        np.interp(grid_recall, recalls[k][::-1], precisions[k][::-1])
        for k in curve_ids
    ]

    avg_tpr = np.mean(interp_tprs, axis=0)
    avg_precision = np.mean(interp_precisions, axis=0)

    return grid_recall, avg_precision, grid_fpr, avg_tpr

def showMeanCurve(mean_recall, mean_precision, mean_fpr, mean_tpr):
    """Plot the averaged PR and ROC curves with a shaded area and print AUCs.

    Args:
        mean_recall: x-values of the averaged PR curve.
        mean_precision: y-values of the averaged PR curve.
        mean_fpr: x-values of the averaged ROC curve.
        mean_tpr: y-values of the averaged ROC curve.

    Returns:
        None. The figures are rendered with ``show`` and the two AUC
        values (computed with ``auc`` on the given curves) are printed.
    """
    # gridplot is not imported by the notebook's import cell, so bring it
    # into scope here; without this the call below raises NameError.
    from bokeh.layouts import gridplot

    ##make figure
    s1 = figure(
        title = "PR曲線",
        plot_width=300, plot_height=300,
        x_range=(-0.02,1.02), y_range=(0,1.02)
    )

    s2 = figure(
        title = "ROC曲線",
        plot_width=300, plot_height=300,
        x_range=(-0.02,1.02), y_range=(0,1.02)
    )

    ##add the mean PR line
    s1.line(
        mean_recall,
        mean_precision,
        line_width = 2
    )

    ##shade under the PR curve: the polygon is closed through (1, 0) and (0, 0)
    s1.patch(
        np.hstack((mean_recall, 1, 0)),
        np.hstack((mean_precision, 0, 0)),
        alpha = 0.1,
    )

    ##add the mean ROC line
    s2.line(
        mean_fpr,
        mean_tpr,
        line_width = 2,
    )

    ##shade under the ROC curve the same way
    s2.patch(
        np.hstack((mean_fpr, 1, 0)),
        np.hstack((mean_tpr, 0, 0)),
        alpha = 0.1,
    )

    s1.xaxis.axis_label = 'Recall'
    s1.yaxis.axis_label = 'Precision'

    s2.xaxis.axis_label = 'FPR'
    s2.yaxis.axis_label = 'TPR'

    p = gridplot([[s1, s2]])

    show(p)
    print('PR曲線のAUCスコア:',auc(mean_recall,mean_precision),'\tROC曲線のAUCスコア:',auc(mean_fpr, mean_tpr))

In [None]:
## Load the CSV and split its rows by the value of the Class column.
df = pd.read_csv('./creditcard.csv')
df0, df1 = (df[df['Class'] == label] for label in (0, 1))
## Preview the first rows of the Class == 1 subset.
df1.head()

In [None]:
## Random under-sampling: keep a random 5% of the Class == 0 rows.
df0u = df0.sample(frac=0.05)
## Report how many rows each class has after sampling.
print('Class 0:', df0u.shape[0], ', Class 1:', df1.shape[0])

In [None]:
## One 3-D scatter trace per class, with V1/V2/V3 on the x/y/z axes.
## Class 0 uses the under-sampled frame and is drawn semi-transparent.
trace0 = go.Scatter3d(
    x=df0u['V1'], y=df0u['V2'], z=df0u['V3'],
    name='class0',
    mode='markers',
    opacity=0.4,
    marker={'size': 2},
)
trace1 = go.Scatter3d(
    x=df1['V1'], y=df1['V2'], z=df1['V3'],
    name='class1',
    mode='markers',
    marker={'size': 3},
)
## Collect both traces for the figure.
data = [trace0, trace1]

## Layout: each axis range spans the min/max of that column in the full dataframe.
layout = go.Layout(
    title='3D-PCA',
    width=600,
    height=500,
    scene={
        axis + 'axis': {
            'nticks': 4,
            'range': [min(df[col]), max(df[col])],
            'title': col,
        }
        for axis, col in zip('xyz', ('V1', 'V2', 'V3'))
    },
    showlegend=True,
)

fig = {'data': data, 'layout': layout}
offline.iplot(fig)

ちょっとグリグリ動かしただけでもわかるように正常(0)と異常(1)は結構よく分かれそう。
とりあえず何も考えずに、分類器に突っ込んでみる。

In [None]:
## Features: every column except Class. Target: the Class column.
X = df.drop(labels='Class', axis='columns')
y = df['Class']

In [None]:
## Per-fold results: predicted probabilities, curve points and AUC values.
probs = []
pr_aucs = []
roc_aucs = []
fprs = []
tprs = []
precisions = []
recalls = []

## cross validation
cv = StratifiedKFold(n_splits=10)

lr = LogisticRegression()
## cv.split is a generator, so pass total explicitly for the progress bar.
for train, test in tqdm(cv.split(X, y), total=cv.get_n_splits()):
    ## train/test are positional row indices, so select both X and y with
    ## .iloc (y[test] would do a label lookup and only works by coincidence
    ## when the index is 0..n-1).
    y_test = y.iloc[test]

    ## Logistic regression: fit on the training fold, score the held-out fold.
    prob = lr.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    probs.append(prob)

    ## ROC curve (column 1 of prob is passed as the score for pos_label=1)
    fpr, tpr, _thresholds = roc_curve(y_test, prob[:, 1], pos_label=1)
    fprs.append(fpr)
    tprs.append(tpr)
    roc_aucs.append(auc(fpr, tpr))

    ## PR curve
    precision, recall, _thresholds = precision_recall_curve(y_test, prob[:, 1], pos_label=1)
    precisions.append(precision)
    recalls.append(recall)
    pr_aucs.append(auc(recall, precision))


In [None]:
## Draw one PR line and one ROC line per entry of the collected lists.
showCurves(recalls, precisions, fprs, tprs)

In [None]:
## Interpolate the collected curves onto a common grid and average them,
## then plot the mean curves and print their AUC values.
mean_recall, mean_precision, mean_fpr, mean_tpr = getMeans(recalls, precisions, fprs, tprs)
showMeanCurve(mean_recall, mean_precision, mean_fpr, mean_tpr)

In [None]:
## Mean of the PR-curve AUC values collected in pr_aucs.
np.mean(pr_aucs)

In [None]:
## Mean of the ROC-curve AUC values collected in roc_aucs.
np.mean(roc_aucs)

横軸を揃えて各foldのy値を平均した曲線から求めたAUCと、各foldで得られたAUCを単純に平均した値は異なるので注意。

## RF

In [None]:
## Per-fold results: predicted probabilities, curve points and AUC values.
probs = []
pr_aucs = []
roc_aucs = []
fprs = []
tprs = []
precisions = []
recalls = []


random_forest = RandomForestClassifier(n_estimators=10) ##n is decided roughly.

## cv.split is a generator, so pass total explicitly for the progress bar.
for train, test in tqdm(cv.split(X, y), total=cv.get_n_splits()):
    ## train/test are positional row indices, so select both X and y with
    ## .iloc (y[test] would do a label lookup and only works by coincidence
    ## when the index is 0..n-1).
    y_test = y.iloc[test]

    ## Random Forest: fit on the training fold, score the held-out fold.
    prob = random_forest.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    probs.append(prob)

    ## ROC curve (column 1 of prob is passed as the score for pos_label=1)
    fpr, tpr, _thresholds = roc_curve(y_test, prob[:, 1], pos_label=1)
    fprs.append(fpr)
    tprs.append(tpr)
    roc_aucs.append(auc(fpr, tpr))

    ## PR curve
    precision, recall, _thresholds = precision_recall_curve(y_test, prob[:, 1], pos_label=1)
    precisions.append(precision)
    recalls.append(recall)
    pr_aucs.append(auc(recall, precision))


    

In [None]:
## Draw one PR line and one ROC line per entry of the collected lists.
showCurves(recalls, precisions, fprs, tprs)

In [None]:
## Interpolate the collected curves onto a common grid and average them,
## then plot the mean curves and print their AUC values.
mean_recall, mean_precision, mean_fpr, mean_tpr = getMeans(recalls, precisions, fprs, tprs)
showMeanCurve(mean_recall, mean_precision, mean_fpr, mean_tpr)

Classに対して有意に異なる特徴だけを用いることにする。

In [None]:
## statistical visualization
from string import ascii_letters
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
## For every column except the last, overlay the distribution of the
## Class == 0 rows (green) and the Class == 1 rows (red), one figure per column.
for i in tqdm(range(df.shape[1] - 1)):
    for frame, shade in ((df0, 'green'), (df1, 'red')):
        g = sns.distplot(frame.iloc[:, i], color=shade)
    plt.show()

上の確率分布から、使う変数を選択する。
ロジスティック回帰には線形分離でうまく両分布が分かれそうな特徴を選ぶ。例えば今回ならば、V3, V4, V10, V11, V12, V14, V16を使う。
ランダムフォレストは非線形にClassを判別できるため、上記の特徴に加えて、分布の形が異なる特徴を加えることにする。V3, V4, V10, V11, V12, V14, V16に加えて(Time), V17, V18を説明変数とする。

In [None]:
## Feature subset used for logistic regression.
X_LR = X[['V3','V4','V10','V11','V12','V14','V16']]

## Per-fold results: predicted probabilities, curve points and AUC values.
probs = []
pr_aucs = []
roc_aucs = []
fprs = []
tprs = []
precisions = []
recalls = []

## cross validation
cv = StratifiedKFold(n_splits=10)

lr = LogisticRegression()
## cv.split is a generator, so pass total explicitly for the progress bar.
for train, test in tqdm(cv.split(X_LR, y), total=cv.get_n_splits()):
    ## train/test are positional row indices, so select both X_LR and y with
    ## .iloc (y[test] would do a label lookup and only works by coincidence
    ## when the index is 0..n-1).
    y_test = y.iloc[test]

    ## Logistic regression: fit on the training fold, score the held-out fold.
    prob = lr.fit(X_LR.iloc[train], y.iloc[train]).predict_proba(X_LR.iloc[test])
    probs.append(prob)

    ## ROC curve (column 1 of prob is passed as the score for pos_label=1)
    fpr, tpr, _thresholds = roc_curve(y_test, prob[:, 1], pos_label=1)
    fprs.append(fpr)
    tprs.append(tpr)
    roc_aucs.append(auc(fpr, tpr))

    ## PR curve
    precision, recall, _thresholds = precision_recall_curve(y_test, prob[:, 1], pos_label=1)
    precisions.append(precision)
    recalls.append(recall)
    pr_aucs.append(auc(recall, precision))

In [None]:
## Draw one PR line and one ROC line per entry of the collected lists.
showCurves(recalls, precisions, fprs, tprs)

In [None]:
## Interpolate the collected curves onto a common grid and average them,
## then plot the mean curves and print their AUC values.
mean_recall, mean_precision, mean_fpr, mean_tpr = getMeans(recalls, precisions, fprs, tprs)
showMeanCurve(mean_recall, mean_precision, mean_fpr, mean_tpr)

In [None]:
## Feature subset used for the random forest.
X_RF = X[['V3','V4','V10','V11','V12','V14','V16','V17','V18']]

## Per-fold results: predicted probabilities, curve points and AUC values.
probs = []
pr_aucs = []
roc_aucs = []
fprs = []
tprs = []
precisions = []
recalls = []


random_forest = RandomForestClassifier(n_estimators=10) ##n is decided roughly.

## cv.split is a generator, so pass total explicitly for the progress bar.
for train, test in tqdm(cv.split(X_RF, y), total=cv.get_n_splits()):
    ## train/test are positional row indices, so select both X_RF and y with
    ## .iloc (y[test] would do a label lookup and only works by coincidence
    ## when the index is 0..n-1).
    y_test = y.iloc[test]

    ## Random Forest: fit on the training fold, score the held-out fold.
    prob = random_forest.fit(X_RF.iloc[train], y.iloc[train]).predict_proba(X_RF.iloc[test])
    probs.append(prob)

    ## ROC curve (column 1 of prob is passed as the score for pos_label=1)
    fpr, tpr, _thresholds = roc_curve(y_test, prob[:, 1], pos_label=1)
    fprs.append(fpr)
    tprs.append(tpr)
    roc_aucs.append(auc(fpr, tpr))

    ## PR curve
    precision, recall, _thresholds = precision_recall_curve(y_test, prob[:, 1], pos_label=1)
    precisions.append(precision)
    recalls.append(recall)
    pr_aucs.append(auc(recall, precision))


    

In [None]:
## Draw one PR line and one ROC line per entry of the collected lists.
showCurves(recalls, precisions, fprs, tprs)

In [None]:
## Interpolate the collected curves onto a common grid and average them,
## then plot the mean curves and print their AUC values.
mean_recall, mean_precision, mean_fpr, mean_tpr = getMeans(recalls, precisions, fprs, tprs)
showMeanCurve(mean_recall, mean_precision, mean_fpr, mean_tpr)

RFの方がLRよりスコアが悪いのは、インバランスのためクラス境界において圧倒的に多い0に埋もれてしまうため。RFだとインバランスが弱点になりがちである。

todo
* 10-fold CV
* LR, RF in raw data
* PR, ROC
* select feature by filter method
* LR, RF in filtered feature
* evaluation
* select feature by wrapper method
* LR, RF in selected feature
* evaluation
* SMOTE and select feature by wrapper method
* LR, RF in selected feature
* evaluation