<a href="https://colab.research.google.com/github/naomori/codexa_LinearRegression_Practice/blob/master/Chapter5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Section.1: データとライブラリのインポート


In [0]:
## データ処理と可視化のためのライブラリー
import numpy as np 
import seaborn as sns
import pandas as pd 
import matplotlib.pyplot as plt

In [0]:
# 機械学習ライブラリ「Scikit-learn」のインポート
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [0]:
# Plot styling and pandas display limits.
# One sns.set call is sufficient: every call re-applies the complete theme,
# so a preceding style="dark" call would be overwritten by this one anyway.
sns.set(style="darkgrid", color_codes=True)
# Show at most 50 rows / 50 columns when a DataFrame is displayed.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

In [0]:
# Read the mushroom CSV (relative path, so it must be in the working
# directory) into a DataFrame.
mushroom = pd.read_csv(filepath_or_buffer='mushrooms.csv')

# Section.2: データの前処理

In [0]:
# One-hot encode every feature column.
# The features are chosen by name (all columns except the 'class' target)
# instead of by position, so the encoding does not silently include the
# target or skip a feature if the CSV column order ever changes.
# drop_first=True drops the first category level of each feature.
mushroom2 = pd.get_dummies(
    mushroom,
    columns=mushroom.columns.drop('class'),
    drop_first=True,
)

In [8]:
# Summary statistics for the one-hot encoded frame.
mushroom2.describe()

Unnamed: 0,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_c,cap-color_e,cap-color_g,cap-color_n,cap-color_p,cap-color_r,cap-color_u,cap-color_w,cap-color_y,bruises_t,odor_c,odor_f,odor_l,odor_m,odor_n,odor_p,odor_s,...,ring-number_o,ring-number_t,ring-type_f,ring-type_l,ring-type_n,ring-type_p,spore-print-color_h,spore-print-color_k,spore-print-color_n,spore-print-color_o,spore-print-color_r,spore-print-color_u,spore-print-color_w,spore-print-color_y,population_c,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
count,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,...,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0
mean,0.000492,0.387986,0.10192,0.003939,0.450025,0.000492,0.314623,0.399311,0.005416,0.184638,0.226489,0.281142,0.017725,0.001969,0.001969,0.128016,0.131955,0.415559,0.023634,0.265879,0.049237,0.004431,0.434269,0.031512,0.070901,...,0.921713,0.073855,0.005908,0.159527,0.004431,0.488429,0.200886,0.230428,0.242245,0.005908,0.008863,0.005908,0.293944,0.005908,0.041851,0.049237,0.153619,0.497292,0.210734,0.264402,0.102413,0.035943,0.140817,0.045298,0.023634
std,0.022185,0.487321,0.302562,0.062641,0.497527,0.022185,0.464394,0.489787,0.073399,0.388028,0.418585,0.449584,0.131959,0.044338,0.044338,0.334128,0.338462,0.492848,0.151914,0.441827,0.216375,0.066425,0.495691,0.174706,0.256675,...,0.268639,0.261551,0.076644,0.36619,0.066425,0.499897,0.400688,0.421133,0.428468,0.076644,0.093729,0.076644,0.455595,0.076644,0.200262,0.216375,0.360605,0.500023,0.407855,0.441041,0.303209,0.186159,0.347854,0.207969,0.151914
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# Summary of the raw frame (count / unique / top / freq per column).
mushroom.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,2,5,4,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,t,b,s,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,4608,3776,5176,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [12]:
# Compare the shapes of the raw and the one-hot encoded DataFrames.
mushroom.shape, mushroom2.shape

((8124, 23), (8124, 96))

In [0]:
# Convert the target column 'class' from string labels to integer codes.
labelencoder = LabelEncoder()
labelencoder.fit(mushroom2['class'])
mushroom2['class'] = labelencoder.transform(mushroom2['class'])

In [14]:
# Inspect the first five rows.
mushroom2.head()

Unnamed: 0,class,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_c,cap-color_e,cap-color_g,cap-color_n,cap-color_p,cap-color_r,cap-color_u,cap-color_w,cap-color_y,bruises_t,odor_c,odor_f,odor_l,odor_m,odor_n,odor_p,...,ring-number_o,ring-number_t,ring-type_f,ring-type_l,ring-type_n,ring-type_p,spore-print-color_h,spore-print-color_k,spore-print-color_n,spore-print-color_o,spore-print-color_r,spore-print-color_u,spore-print-color_w,spore-print-color_y,population_c,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,...,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,...,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


# Section.3: 特徴選択（Feature Selection）

特徴選択とは...
**複数ある特徴量のうち、機械学習の学習モデルの質を高める特徴量のみを選択/検討する手法**
のことを意味します。

特徴選択は、特徴量選択、変数選択、特徴削減と呼ばれることもあります。

In [0]:
# Hold out 20% of the rows as the test set and keep the other 80% for
# training. random_state is fixed so the split is reproducible.
train_set, test_set = train_test_split(
    mushroom2,
    random_state=42,
    test_size=0.2,
)

In [0]:
# Separate the feature matrix (X) from the target column (y) for each split.
y_train = train_set['class'].copy()
X_train = train_set.drop(columns='class')

y_test = test_set['class'].copy()
X_test = test_set.drop(columns='class')

特徴選択にはさまざまな手法がありますが、今回は
Scikit-learn の Recursive Feature Elimination（RFE、再帰的特徴量削減）
を使ってみます。

1. 現在残っている特徴量（最初は与えられたすべての特徴量）を使って、モデルを訓練する
2. 訓練済みモデルから、重要度が最も低い特徴量を削減する
3. 残りの特徴量が設定した数になるまで、1 と 2 を繰り返す



In [18]:
# Feature selection with RFE: repeatedly fit the logistic regression and
# eliminate features until only five remain.
# n_features_to_select is passed by keyword because passing it positionally
# is deprecated and rejected with a TypeError by recent scikit-learn
# releases, where it is a keyword-only parameter.
logreg = LogisticRegression(solver='lbfgs')
rfe = RFE(logreg, n_features_to_select=5, verbose=1)
rfe = rfe.fit(X_train, y_train)

Fitting estimator with 95 features.
Fitting estimator with 94 features.
Fitting estimator with 93 features.
Fitting estimator with 92 features.
Fitting estimator with 91 features.
Fitting estimator with 90 features.
Fitting estimator with 89 features.
Fitting estimator with 88 features.
Fitting estimator with 87 features.
Fitting estimator with 86 features.
Fitting estimator with 85 features.
Fitting estimator with 84 features.
Fitting estimator with 83 features.
Fitting estimator with 82 features.
Fitting estimator with 81 features.
Fitting estimator with 80 features.
Fitting estimator with 79 features.
Fitting estimator with 78 features.
Fitting estimator with 77 features.
Fitting estimator with 76 features.
Fitting estimator with 75 features.
Fitting estimator with 74 features.
Fitting estimator with 73 features.
Fitting estimator with 72 features.
Fitting estimator with 71 features.
Fitting estimator with 70 features.
Fitting estimator with 69 features.
Fitting estimator with 68 fe

In [19]:
# Keep only the features RFE selected; rfe.support_ is a boolean mask over
# the columns RFE was fitted on (every column of train_set except 'class').
# The selection is built from train_set / test_set instead of slicing
# X_train / X_test with themselves, so this cell can be re-run safely:
# slicing the already-reduced frames a second time fails because the mask
# no longer matches their number of columns.
selected_features = train_set.drop(columns='class').columns[rfe.support_]
X_train = train_set[selected_features]
X_test = test_set[selected_features]

# Check the resulting frame.
X_train.head()

Unnamed: 0,odor_c,odor_n,odor_p,spore-print-color_k,spore-print-color_n
7873,0,0,0,0,0
6515,0,0,0,0,0
6141,0,0,0,0,0
2764,0,1,0,0,1
438,0,0,0,0,1


# Section.4: モデルの訓練と評価

In [21]:
# Fit a logistic regression on the training data. fit() returns the
# estimator itself, so the last line displays the fitted model.
logclassifier = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
logclassifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# Predict on the training data.
y_pred = logclassifier.predict(X_train)

In [23]:
# Confusion matrix of the training-set predictions.
cnf_matrix = confusion_matrix(y_true=y_train, y_pred=y_pred)
cnf_matrix

array([[3331,   34],
       [  95, 3039]])

In [24]:
# Accuracy on the training set.
accuracy_score(y_true=y_train, y_pred=y_pred)

0.9801507924296046

最終的にテストデータを使って予測と評価をします。

In [0]:
# Predict on the test data.
y_pred_test = logclassifier.predict(X_test)

In [26]:
# Confusion matrix of the test-set predictions.
cnf_matrix_test = confusion_matrix(y_true=y_test, y_pred=y_pred_test)
cnf_matrix_test

array([[829,  14],
       [ 25, 757]])

In [27]:
# Accuracy on the test set.
accuracy_score(y_true=y_test, y_pred=y_pred_test)

0.976