In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from src.decorator import add_print

In [5]:
# sklearnデータセットに収録されたiris(アヤメ)のデータセットをロードしてデータフレームを作成
def load_iris_data():
    data = load_iris()
    x = pd.DataFrame(data["data"],columns=data["feature_names"])
    y = pd.DataFrame(data["target"],columns=["target"])
    return x, y

# 手書き文字のデータセットをダウンロードして、実験用データを準備 (70000枚のうち7000枚を利用)
def load_mnist_data():
    data = fetch_openml('mnist_784', version=1)
    _x = np.array(data['data'].astype(np.float32))
    _y = np.array(data['target'].astype(np.int32))
    _, x, _, y = train_test_split(_x, _y, test_size=0.1, random_state=1, stratify=_y)
    return x, y

In [3]:
# 一括処理のためにモデルの辞書を作成
model = {
    # k近傍法のモデル
    'kNN(k=3)':
    KNeighborsClassifier(n_neighbors=3, # k を指定 (デフォルトは 5)
                         weights='uniform',  # 距離を考慮しない(uniform:デフォルト)、する(distance)
                         algorithm='auto', # 近傍点計算アルゴリズム (auto:デフォルト,ball_tree,kd_tree,brute)
                         leaf_size=30,  # ball_tree,kd_tree指定時のリーフサイズの設定 (デフォルトは 30)
                         p=2),  # 距離計算の次元 (2:デフォルト、1)
    # svm (kernel="linear", C=1.0) のモデル
    'SVC(kernel="linear", C=1)':
    svm.SVC(kernel="linear", C=1, max_iter=100000, verbose=True, random_state=1),
    # svm (kernel="rbf", C=1) のモデル
    'SVC(kernel="rbf", C=1)':
    svm.SVC(kernel="rbf", C=1, max_iter=100000, verbose=True, random_state=1),
    # 決定木
    'DecisionTree(max_depth=10)':
    DecisionTreeClassifier(max_depth=10, # 木の深さの最大
                                random_state=2), # 乱数シード
    # ランダムフォレストのモデル
    'randomforest(max_depth=10, n_estimators=10)':
    RandomForestClassifier(max_depth=10, # 木の深さの最大
                             n_estimators=10, # 木の数
                             random_state=2), # 乱数シード
    # アダブーストのモデル
    'Adaboost(dct(max_depth=10), n_estimators=170)':
    AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=10, random_state=1), # ベースモデルを指定
                            n_estimators=170, # 木の数
                            random_state=1), # 乱数シード
    # 勾配ブースティングのモデル
    'GradientBoostingClassifier(max_depth=5, n_estimators=170)':
    GradientBoostingClassifier(max_depth=5, # 木の深さの最大
                               n_estimators=170, # 木の数
                               random_state=1), # 乱数シード
}


In [4]:
dataset_key = "Iris"
# データを学習用と検証用に分割
x, y = load_iris_data()

x_train, x_test, y_train, y_test = \
    train_test_split(x, y, test_size=0.25, random_state=1, stratify=y) # 検証用データに25%を割当て
print(f'## dataset:{dataset_key} ',
        f'x_train:{len(x_train)} x_test:{len(x_test)} y_train:{len(y_train)} y_test:{len(y_test)}')

# データを標準化
print('# with scaling')
scaler = StandardScaler()
scaler.fit(x)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# 辞書に格納したモデルそれぞれについて性能を測定
for model_key in model.keys():
    # 学習用データを利用してモデルを学習
    clf = model[model_key]
    clf = clf.fit(x_train, np.array(y_train).ravel()) 

    # 学習したモデルの性能(正答率)を学習用データと検証用データで評価
    predict_train = clf.predict(x_train)
    train_score = metrics.accuracy_score(y_train, predict_train)
    predict_test = clf.predict(x_test)
    test_score = metrics.accuracy_score(y_test, predict_test)
    print(f'dataset:{dataset_key} model:{model_key}', 
        f'accuracy_score: train_data:{train_score: 0.5} test_data:{test_score: 0.5}')

## dataset:Iris  x_train:112 x_test:38 y_train:112 y_test:38
# with scaling
dataset:Iris model:kNN(k=3) accuracy_score: train_data: 0.95536 test_data: 0.94737
[LibSVM]dataset:Iris model:SVC(kernel="linear", C=1) accuracy_score: train_data: 0.97321 test_data: 0.97368
[LibSVM]dataset:Iris model:SVC(kernel="rbf", C=1) accuracy_score: train_data: 0.97321 test_data: 0.97368
dataset:Iris model:DecisionTree(max_depth=10) accuracy_score: train_data: 1.0 test_data: 0.97368
dataset:Iris model:randomforest(max_depth=10, n_estimators=10) accuracy_score: train_data: 0.99107 test_data: 0.97368
dataset:Iris model:Adaboost(dct(max_depth=10), n_estimators=170) accuracy_score: train_data: 1.0 test_data: 0.97368




dataset:Iris model:GradientBoostingClassifier(max_depth=5, n_estimators=170) accuracy_score: train_data: 1.0 test_data: 0.97368


In [6]:
dataset_key = "MNIST"
# データを学習用と検証用に分割
x, y = load_mnist_data()

x_train, x_test, y_train, y_test = \
    train_test_split(x, y, test_size=0.25, random_state=1, stratify=y) # 検証用データに25%を割当て
print(f'## dataset:{dataset_key} ',
        f'x_train:{len(x_train)} x_test:{len(x_test)} y_train:{len(y_train)} y_test:{len(y_test)}')

# データを標準化
print('# with scaling')
scaler = StandardScaler()
scaler.fit(x)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# 辞書に格納したモデルそれぞれについて性能を測定
for model_key in model.keys():
    # 学習用データを利用してモデルを学習
    clf = model[model_key]
    clf = clf.fit(x_train, np.array(y_train).ravel()) 

    # 学習したモデルの性能(正答率)を学習用データと検証用データで評価
    predict_train = clf.predict(x_train)
    train_score = metrics.accuracy_score(y_train, predict_train)
    predict_test = clf.predict(x_test)
    test_score = metrics.accuracy_score(y_test, predict_test)
    print(f'dataset:{dataset_key} model:{model_key}', 
        f'accuracy_score: train_data:{train_score: 0.5} test_data:{test_score: 0.5}')

  warn(


## dataset:MNIST  x_train:5250 x_test:1750 y_train:5250 y_test:1750
# with scaling
dataset:MNIST model:kNN(k=3) accuracy_score: train_data: 0.94571 test_data: 0.89943
[LibSVM]dataset:MNIST model:SVC(kernel="linear", C=1) accuracy_score: train_data: 1.0 test_data: 0.91371
[LibSVM]dataset:MNIST model:SVC(kernel="rbf", C=1) accuracy_score: train_data: 0.98457 test_data: 0.92686
dataset:MNIST model:DecisionTree(max_depth=10) accuracy_score: train_data: 0.95619 test_data: 0.77486
dataset:MNIST model:randomforest(max_depth=10, n_estimators=10) accuracy_score: train_data: 0.97962 test_data: 0.87886




dataset:MNIST model:Adaboost(dct(max_depth=10), n_estimators=170) accuracy_score: train_data: 1.0 test_data: 0.93543
dataset:MNIST model:GradientBoostingClassifier(max_depth=5, n_estimators=170) accuracy_score: train_data: 1.0 test_data: 0.93429


In [7]:
# Fashion-MNISTデータセットをダウンロードして、実験用データを準備 (70000枚のうち7000枚を利用)
def load_fashion_mnist_data():
    data = fetch_openml('Fashion-MNIST')
    _x = np.array(data['data'].astype(np.float32))
    _y = np.array(data['target'].astype(np.int32))
    _, x, _, y = train_test_split(_x, _y, test_size=0.1, random_state=1, stratify=_y) 
    return x, y

dataset_key = "Fashion-MNIST"
# データを学習用と検証用に分割
x, y = load_fashion_mnist_data()

x_train, x_test, y_train, y_test = \
    train_test_split(x, y, test_size=0.25, random_state=1, stratify=y) # 検証用データに25%を割当て
print(f'## dataset:{dataset_key} ',
        f'x_train:{len(x_train)} x_test:{len(x_test)} y_train:{len(y_train)} y_test:{len(y_test)}')

# データを標準化
print('# with scaling')
scaler = StandardScaler()
scaler.fit(x)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# 辞書に格納したモデルそれぞれについて性能を測定
for model_key in model.keys():
    # 学習用データを利用してモデルを学習
    clf = model[model_key]
    clf = clf.fit(x_train, np.array(y_train).ravel()) 

    # 学習したモデルの性能(正答率)を学習用データと検証用データで評価
    predict_train = clf.predict(x_train)
    train_score = metrics.accuracy_score(y_train, predict_train)
    predict_test = clf.predict(x_test)
    test_score = metrics.accuracy_score(y_test, predict_test)
    print(f'dataset:{dataset_key} model:{model_key}', 
        f'accuracy_score: train_data:{train_score: 0.5} test_data:{test_score: 0.5}')

  warn(


## dataset:Fashion-MNIST  x_train:5250 x_test:1750 y_train:5250 y_test:1750
# with scaling
dataset:Fashion-MNIST model:kNN(k=3) accuracy_score: train_data: 0.89257 test_data: 0.79886
[LibSVM]dataset:Fashion-MNIST model:SVC(kernel="linear", C=1) accuracy_score: train_data: 1.0 test_data: 0.79943
[LibSVM]dataset:Fashion-MNIST model:SVC(kernel="rbf", C=1) accuracy_score: train_data: 0.9141 test_data: 0.82571
dataset:Fashion-MNIST model:DecisionTree(max_depth=10) accuracy_score: train_data: 0.90895 test_data: 0.73029
dataset:Fashion-MNIST model:randomforest(max_depth=10, n_estimators=10) accuracy_score: train_data: 0.94 test_data: 0.80343




dataset:Fashion-MNIST model:Adaboost(dct(max_depth=10), n_estimators=170) accuracy_score: train_data: 1.0 test_data: 0.82857
dataset:Fashion-MNIST model:GradientBoostingClassifier(max_depth=5, n_estimators=170) accuracy_score: train_data: 1.0 test_data: 0.83486


In [8]:
import lightgbm as lgb

x, y = load_fashion_mnist_data()

x_train, x_test, y_train, y_test = \
    train_test_split(x, y, test_size=0.25, random_state=1, stratify=y) # 検証用データに25%を割当て

print(
f"""

Total size: {len(x)}
Train size: {len(x_train)}
Test size : {len(x_test)}

"""
)

# データを標準化
print('# with scaling')
scaler = StandardScaler()
scaler.fit(x)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

lgb_train = lgb.Dataset(x_train, y_train)

gbm = lgb.LGBMClassifier(
    num_leaves=15,
    max_depth=10,
    n_estimators=170,
    verbosity=-1
)

gbm.fit(x_train, y_train)
p = gbm.predict(x_test)

print("Test Accuracy\t: {:.4f}".format(metrics.accuracy_score(y_test, p)))
print("Train Accuracy\t: {:.4f}".format(metrics.accuracy_score(y_train, gbm.predict(x_train))))

  warn(




Total size: 7000
Train size: 5250
Test size : 1750


# with scaling
Test Accuracy	: 0.8480
Train Accuracy	: 1.0000


In [11]:
# データを正規化
x_train, x_test, y_train, y_test = \
    train_test_split(x, y, test_size=0.25, random_state=1, stratify=y) # 検証用データに25%を割当て
x_train, x_test = x_train / 255.0, x_test / 255.0

lgb_train = lgb.Dataset(x_train, y_train)

num_leaves = 31
max_depth = -1
n_estimators = 170

print(
f"""
# Params
- `num_leaves`: {num_leaves}
- `max_depth`: {max_depth}
- `n_estimators`: {n_estimators}
- 正規化
"""
)

gbm = lgb.LGBMClassifier(
    num_leaves=num_leaves,
    max_depth=max_depth,
    n_estimators=n_estimators,
    verbosity=-1
)

gbm.fit(x_train, y_train)
p = gbm.predict(x_test)

@add_print()
def print_acc():
    print("Test Accuracy\t: {:.4f}".format(metrics.accuracy_score(y_test, p)))
    print("Train Accuracy\t: {:.4f}".format(metrics.accuracy_score(y_train, gbm.predict(x_train))))
print_acc()



Total size: 7000
Train size: 5250
Test size : 1750


# with scaling


Test Accuracy	: 0.8417
Train Accuracy	: 1.0000




In [12]:
# データを正規化
x_train, x_test, y_train, y_test = \
    train_test_split(x, y, test_size=0.25, random_state=1, stratify=y) # 検証用データに25%を割当て
x_train, x_test = x_train / 255.0, x_test / 255.0

lgb_train = lgb.Dataset(x_train, y_train)

num_leaves = 31
max_depth = -1
n_estimators = 170

print(
f"""
# Params
- `num_leaves`: {num_leaves}
- `max_depth`: {max_depth}
- `n_estimators`: {n_estimators}
- 正規化
"""
)

gbm = lgb.LGBMClassifier(
    num_leaves=num_leaves,
    max_depth=max_depth,
    n_estimators=n_estimators,
    verbosity=-1
)

gbm.fit(x_train, y_train)
p = gbm.predict(x_test)

@add_print()
def print_acc():
    print("Test Accuracy\t: {:.4f}".format(metrics.accuracy_score(y_test, p)))
    print("Train Accuracy\t: {:.4f}".format(metrics.accuracy_score(y_train, gbm.predict(x_train))))
print_acc()


# Params
- `num_leaves`: 31
- `max_depth`: -1
- `n_estimators`: 170
- 正規化



Test Accuracy	: 0.8400
Train Accuracy	: 1.0000




In [13]:
# データを正規化
x_train, x_test, y_train, y_test = \
    train_test_split(x, y, test_size=0.25, random_state=1, stratify=y) # 検証用データに25%を割当て
x_train, x_test = x_train / 255.0, x_test / 255.0

lgb_train = lgb.Dataset(x_train, y_train)

num_leaves = 63
max_depth = -1
n_estimators = 170

print(
f"""
# Params
- `num_leaves`: {num_leaves}
- `max_depth`: {max_depth}
- `n_estimators`: {n_estimators}
- 正規化
"""
)

gbm = lgb.LGBMClassifier(
    num_leaves=num_leaves,
    max_depth=max_depth,
    n_estimators=n_estimators,
    verbosity=-1
)

gbm.fit(x_train, y_train)
p = gbm.predict(x_test)

@add_print()
def print_acc():
    print("Test Accuracy\t: {:.4f}".format(metrics.accuracy_score(y_test, p)))
    print("Train Accuracy\t: {:.4f}".format(metrics.accuracy_score(y_train, gbm.predict(x_train))))
print_acc()


# Params
- `num_leaves`: 63
- `max_depth`: -1
- `n_estimators`: 170
- 正規化



Test Accuracy	: 0.8463
Train Accuracy	: 1.0000




In [14]:
# データを正規化
x_train, x_test, y_train, y_test = \
    train_test_split(x, y, test_size=0.25, random_state=1, stratify=y) # 検証用データに25%を割当て
x_train, x_test = x_train / 255.0, x_test / 255.0

lgb_train = lgb.Dataset(x_train, y_train)

num_leaves = 31
max_depth = -1
n_estimators = 100

print(
f"""
# Params
- `num_leaves`: {num_leaves}
- `max_depth`: {max_depth}
- `n_estimators`: {n_estimators}
- 正規化
"""
)

gbm = lgb.LGBMClassifier(
    num_leaves=num_leaves,
    max_depth=max_depth,
    n_estimators=n_estimators,
    verbosity=-1
)

gbm.fit(x_train, y_train)
p = gbm.predict(x_test)

@add_print()
def print_acc():
    print("Test Accuracy\t: {:.4f}".format(metrics.accuracy_score(y_test, p)))
    print("Train Accuracy\t: {:.4f}".format(metrics.accuracy_score(y_train, gbm.predict(x_train))))
print_acc()


# Params
- `num_leaves`: 31
- `max_depth`: -1
- `n_estimators`: 100
- 正規化



Test Accuracy	: 0.8394
Train Accuracy	: 1.0000




In [20]:
# データを正規化
x_train, x_test, y_train, y_test = \
    train_test_split(x, y, test_size=0.25, random_state=1, stratify=y) # 検証用データに25%を割当て
# x_train, x_test = x_train / 255.0, x_test / 255.0

# lgb_train = lgb.Dataset(x_train, y_train)
scaling = True
num_leaves = 31
max_depth = 10
n_estimators = 170

if scaling:
    scaler = StandardScaler()
    scaler.fit(x)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

print(
f"""
# Params
- Defalut
- `num_leaves`: {num_leaves}
- `max_depth`: {max_depth}
- `n_estimators`: {n_estimators}
- `scalign`: {scaling}
"""
)

gbm = lgb.LGBMClassifier(
    # num_leaves=num_leaves,
    # max_depth=max_depth,
    # n_estimators=n_estimators,
    verbosity=-1
)

gbm.fit(x_train, y_train)
p = gbm.predict(x_test)

@add_print()
def print_acc():
    print("Test Accuracy\t: {:.4f}".format(metrics.accuracy_score(y_test, p)))
    print("Train Accuracy\t: {:.4f}".format(metrics.accuracy_score(y_train, gbm.predict(x_train))))
print_acc()


# Params
- Defalut
- `num_leaves`: 31
- `max_depth`: 10
- `n_estimators`: 170
- `scalign`: True



Test Accuracy	: 0.8440
Train Accuracy	: 1.0000




In [None]:
# データを正規化
x_train, x_test, y_train, y_test = \
    train_test_split(x, y, test_size=0.25, random_state=1, stratify=y) # 検証用データに25%を割当て
# x_train, x_test = x_train / 255.0, x_test / 255.0

lgb_train = lgb.Dataset(x_train, y_train)

num_leaves = 31
max_depth = 10
n_estimators = 170

print(
f"""
# Params
- Defalut
- `num_leaves`: {num_leaves}
- `max_depth`: {max_depth}
- `n_estimators`: {n_estimators}
- `norm`: `False`
"""
)

gbm = lgb.LGBMClassifier(
    num_leaves=num_leaves,
    max_depth=max_depth,
    n_estimators=n_estimators,
    verbosity=-1
)

gbm.fit(x_train, y_train)
p = gbm.predict(x_test)

@add_print()
def print_acc():
    print("Test Accuracy\t: {:.4f}".format(metrics.accuracy_score(y_test, p)))
    print("Train Accuracy\t: {:.4f}".format(metrics.accuracy_score(y_train, gbm.predict(x_train))))
print_acc()