In [253]:
# Chapter1
# NN構築試行
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score  # モデルの評価を行うための関数
# 学習データ読み込み
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,id,loan_amnt,term,interest_rate,grade,employment_length,purpose,credit_score,application_type,loan_status
0,0,609.296068,3 years,8.421982,A5,0 years,debt_consolidation,714.061803,Individual,FullyPaid
1,1,1183.266999,5 years,10.286776,B1,10 years,credit_card,697.706701,Individual,ChargedOff
2,2,695.783256,3 years,14.723425,C2,1 year,debt_consolidation,656.419357,Individual,FullyPaid
3,3,738.392546,3 years,14.260708,C1,0 years,credit_card,657.906852,Individual,FullyPaid
4,4,1642.400654,5 years,25.217452,E5,10 years,debt_consolidation,662.972297,Individual,FullyPaid


In [254]:
# # Chapter2
# 学習の再現を行うために乱数シードを固定するためのコード
import os
import random as rn
from tensorflow.compat.v1.keras import backend as K

os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(0)
rn.seed(0)

tf.compat.v1.set_random_seed(0)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph())
K.set_session(sess)

pd.set_option("max_columns", None)
# pd.set_option('max_rows', None)

In [255]:
# print("デフォルトした人:", len(data[data["loan_status"]=="ChargedOff"]))
# print("デフォルトしていない人:", len(data[data["loan_status"]=="FullyPaid"]))

In [256]:
#Down Dampling
Train1 = data.where(data["loan_status"] == "ChargedOff").dropna()
Train2 = data.where(data["loan_status"] == "FullyPaid").dropna()
Train3 = Train2.sample(n=len(Train1), random_state=0)
TrainData = pd.concat([Train1, Train3])

In [257]:
TrainData.shape
data = TrainData

In [258]:
data.head()

Unnamed: 0,id,loan_amnt,term,interest_rate,grade,employment_length,purpose,credit_score,application_type,loan_status
1,1.0,1183.266999,5 years,10.286776,B1,10 years,credit_card,697.706701,Individual,ChargedOff
7,7.0,2147.822844,5 years,23.738449,A3,10 years,debt_consolidation,656.789397,Individual,ChargedOff
9,9.0,701.82435,3 years,11.321683,B4,0 years,credit_card,656.609116,Individual,ChargedOff
12,12.0,1244.631171,3 years,18.932798,D3,0 years,debt_consolidation,656.933143,Individual,ChargedOff
24,24.0,2278.04473,5 years,13.757983,C5,1 year,credit_card,719.038856,Individual,ChargedOff


In [259]:
# Chapter3
# def pre_process(input_data):
# データ前処理
# One-Hotエンコーディング
# pre_X = input_data.drop(["loan_status", "id"], axis=1).replace({
pre_X = data.drop(["loan_status", "purpose"], axis=1).replace({
    "A1": 1, "A2": 2, "A3": 3, "A4": 4, "A5": 5,
    "B1": 6, "B2": 7, "B3": 8, "B4": 9, "B5": 10,
    "C1": 11, "C2": 12, "C3": 13, "C4": 14, "C5": 15,
    "D1": 16, "D2": 17, "D3": 18, "D4": 19, "D5": 20,
    "E1": 21, "E2": 22, "E3": 23, "E4": 24, "E5": 25,
    "F1": 26, "F2": 27, "F3": 28, "F4": 29, "F5": 30,
    "0 year": 0, "0 years": 0, "1 years": 1, "1 year": 1, "2 years": 2, "3 years": 3, "4 years": 4, "5 years": 5,
    "6 years": 6, "7 years": 7, "8 years": 8, "9 years": 9, "10 years": 10,
    "Individual": 1, "Joint App": 0,
}).astype({
    "term": "int8",
    "grade": "int8",
    "employment_length": "int8",
    "application_type": "int8",
})
pre_y = data["loan_status"]
X = pd.get_dummies(pre_X)
y = pre_y.replace({"ChargedOff": 1, "FullyPaid": 0})
# pre_X_train, pre_X_test = train_test_split(X, test_size=0.1, shuffle=False)
# y_train, y_test = train_test_split(y, test_size=0.1, shuffle=False)

# X_train = X
# X_test = X
y_train = y
y_test = y

# 加工が終わった時点でCSVに出すと、ほかのモデルでの使いまわしがきく。
# # Min-Maxスケーリング
# X_train = ((pre_X_train - pre_X_train.min()) / (pre_X_train.max() - pre_X_train.min()))
pre_X_train = ((X - X.min()) / (X.max() - X.min()))
# X_train = pre_X_train
# # Min-Maxスケーリング
# X_test = ((pre_X_test - pre_X_test.min()) / (pre_X_test.max() - pre_X_test.min()))
pre_X_test = ((X - X.min()) / (X.max() - X.min()))
# X_test = pre_X_test
    # return X_train, X_test, y_train, y_test
# X_train, X_test, y_train, y_test = pre_process(data)

In [260]:
pre_X_train.head()

Unnamed: 0,id,loan_amnt,term,interest_rate,grade,employment_length,credit_score,application_type
1,0.0,0.242699,1.0,0.206435,0.172414,1.0,0.27814,1.0
7,2.5e-05,0.516423,1.0,0.814364,0.068966,1.0,0.008612,1.0
9,3.3e-05,0.106075,0.0,0.253206,0.275862,0.0,0.007425,1.0
12,4.5e-05,0.260113,0.0,0.597179,0.586207,0.0,0.009559,1.0
24,9.5e-05,0.553377,1.0,0.363311,0.482759,0.1,0.418657,1.0


In [261]:
pre_X_train = pre_X_train.astype("float32")
X_train = pre_X_train.astype({
    "term": "int8",
    "grade": "int8",
    "employment_length": "int8",
    "application_type": "int8",
    # "purpose_car": "int8",
    # "purpose_credit_card": "int8",
    # "purpose_debt_consolidation": "int8",
    # "purpose_home_improvement": "int8",
    # "purpose_house": "int8",
    # "purpose_major_purchase": "int8",
    # "purpose_medical": "int8",
    # "purpose_other": "int8",
    # "purpose_small_business": "int8",
})
X_test = X_train

In [262]:
print("訓練データの特徴量", X_train.shape)
print("訓練データのターゲット", y_train.shape)


訓練データの特徴量 (84712, 8)
訓練データのターゲット (84712,)


In [267]:
from tensorflow import keras  # Keras
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.metrics import binary_accuracy, Accuracy
import optuna


def create_model(units1, units2, lr):
    keras.backend.clear_session()
    # Keras以外を利用するのも一つの手段。
    model = keras.models.Sequential()
    model.add(Dense(units=units1, input_dim = 8, activation=tf.nn.relu)) 
    # model.add(Dropout(0.5))
    model.add(keras.layers.BatchNormalization())  # バッチ正規化層
    model.add(Dense(units=units2, activation=tf.nn.relu)) 
    # model.add(Dropout(0.5))
    model.add(keras.layers.BatchNormalization())  # バッチ正規化層
    model.add(Dense(units=2, activation="softmax"))  # 出力層（活性化関数はソフトマックス関数）
    model.compile(optimizer = keras.optimizers.SGD(lr=lr),
           loss="sparse_categorical_crossentropy",
            metrics="accuracy")
         #    f1scoreにするのがよい

    return model


def objective(trial):
    keras.backend.clear_session()
    # 調整したいハイパーパラメータの設定
    # n_layer = trial.suggest_int('n_layer', 1, 2) # 追加する層を1-3から選ぶ
    units1 = int(trial.suggest_discrete_uniform('units1', 100, 300, 1)) # ユニット数
    units2 = int(trial.suggest_discrete_uniform('units2', 100, 300, 1)) # ユニット数
    epochs = int(trial.suggest_discrete_uniform('epochs', 100, 300, 10)) # ユニット数
    lr = trial.suggest_float('lr', 0.01, 0.1,step=0.01) # 学習率

    model = create_model(units1, units2, lr)

        # (5)学習を実行してください
    es_cb = keras.callbacks.EarlyStopping(monitor = "val_loss", patience = 3)
    history = model.fit(x = X_train,
        y = y_train,
        batch_size = 256, # バッチサイズ
        # epochs = 100, # エポック数
        epochs = epochs, # エポック数
        validation_split = 0.2, # 検証データの割合
        # callbacks = [es_cb], # 早期終了の設定
        verbose = 0) # 進捗の確認を行うか（0:行わない, 1:行う）

    # return history.history["loss"][-1]
    # return history.history["val_loss"][-1]
    # return 1 - history.history["accuracy"][-1]
    return 1 - history.history["val_accuracy"][-1]



In [268]:
study = optuna.create_study()
study.optimize(objective, n_trials=10)

[I 2020-10-18 17:51:58,024] Trial 0 finished with value: 0.7402467131614685 and parameters: {'units1': 275.0, 'units2': 190.0, 'epochs': 110.0, 'lr': 0.06999999999999999}. Best is trial 0 with value: 0.7402467131614685.
[I 2020-10-18 17:53:18,095] Trial 1 finished with value: 0.6169509589672089 and parameters: {'units1': 204.0, 'units2': 259.0, 'epochs': 200.0, 'lr': 0.09999999999999999}. Best is trial 1 with value: 0.6169509589672089.
[I 2020-10-18 17:55:00,071] Trial 2 finished with value: 0.6220858097076416 and parameters: {'units1': 184.0, 'units2': 297.0, 'epochs': 250.0, 'lr': 0.04}. Best is trial 1 with value: 0.6169509589672089.
[I 2020-10-18 17:56:49,285] Trial 3 finished with value: 0.7036534249782562 and parameters: {'units1': 121.0, 'units2': 189.0, 'epochs': 270.0, 'lr': 0.03}. Best is trial 1 with value: 0.6169509589672089.
[I 2020-10-18 17:57:51,137] Trial 4 finished with value: 0.7333412170410156 and parameters: {'units1': 271.0, 'units2': 178.0, 'epochs': 150.0, 'lr': 

In [269]:
sorted_best_params = sorted(study.best_params.items(), key=lambda x : x[0])
best_param = dict()
for i, k in sorted_best_params:
    print(i + ' : ' + str(k))
    best_param[i] = k

epochs : 200.0
lr : 0.09999999999999999
units1 : 204.0
units2 : 259.0


In [270]:
study.best_value

0.6169509589672089

In [273]:
# Keras以外を利用するのも一つの手段。
keras.backend.clear_session()
# Keras以外を利用するのも一つの手段。
model = keras.models.Sequential()
model.add(Dense(units=best_param["units1"], input_dim = 8, activation=tf.nn.relu)) 
# model.add(Dropout(0.5))
model.add(keras.layers.BatchNormalization())  # バッチ正規化層
model.add(Dense(units=best_param["units2"], activation=tf.nn.relu)) 
# model.add(Dropout(0.5))
model.add(keras.layers.BatchNormalization())  # バッチ正規化層
model.add(Dense(units=2, activation="softmax"))  # 出力層（活性化関数はソフトマックス関数）
model.compile(optimizer = keras.optimizers.SGD(lr=best_param["lr"]),
        loss="sparse_categorical_crossentropy",
        metrics="accuracy")

In [274]:
hoge = ["mean_squared_error","mean_absolute_error","mean_absolute_percentage_error","mean_squared_logarithmic_error","squared_hinge","hinge","categorical_hinge","sparse_categorical_crossentropy","kullback_leibler_divergence","poisson"]

# for loss in hoge:
#     # Keras以外を利用するのも一つの手段。
#     try:
#         keras.backend.clear_session()
#         model = keras.models.Sequential()
#         model.add(Dense(units=10, activation=tf.nn.relu)) 
#         model.add(keras.layers.BatchNormalization())  # バッチ正規化層
#         model.add(Dense(units=10, activation=tf.nn.relu)) 
#         model.add(keras.layers.BatchNormalization())  # バッチ正規化層
#         model.add(Dense(units=2, activation=tf.nn.sigmoid))  # 出力層（活性化関数はソフトマックス関数）
#         model.compile(optimizer = keras.optimizers.SGD(lr=best_param["lr"]),
#                 loss=loss,
#                 metrics="accuracy")
#         model.fit(x=X_train,
#         y=y_train,
#         batch_size=256,  # バッチサイズ
#         epochs=int(10),  # エポック数
#         validation_split=0.2,  # 検証データの割合
#         verbose=0)  # 進捗の確認を行うか（0:行わない, 1:行う）
#     except:
#         print("unknown loss:" + loss)

In [275]:
history = model.fit(x=X_train,
       y=y_train,
       batch_size=256,  # バッチサイズ
       epochs=int(best_param["epochs"]),  # エポック数
       validation_split=0.2,  # 検証データの割合
       verbose=1)  # 進捗の確認を行うか（0:行わない, 1:行う）

338 - val_accuracy: 0.2968
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Ep

In [276]:
# 混同行列による評価
y_proba = model.predict(X_test)
y_pred = np.argmax(y_proba, axis=1)
confmat = confusion_matrix(y_test, y_pred)
print(confmat)

[[17093 25263]
 [ 6615 35741]]


In [277]:
# 正答率による評価
print("正解率:", accuracy_score(y_test, y_pred))

正解率: 0.6236896779677024


In [278]:
print(pd.DataFrame({'y': y_test, 'y_pred': y_pred}).head())  # 実際のクラスと分類結果を上から5つだけ表示

    y  y_pred
1   1       0
7   1       1
9   1       1
12  1       1
24  1       1


In [279]:
# 重回帰
from sklearn import linear_model  # 線形回帰を行うためのモジュール
# 変数の準備
# X = data_oh.drop(["loan_status_ChargedOff", "loan_status_FullyPaid"], axis=1)  # 説明変数の設定
# y = data_oh["loan_status_FullyPaid"]  # 目的変数の設定

# 学習
lr = linear_model.LinearRegression()  # 線形回帰モデルのインスタンスを作成
lr.fit(X_train, y_train)  # 回帰の実行

# 結果の確認
print("回帰係数:")
print(pd.DataFrame({"Name": X_train.columns,
                    "Coefficients": lr.coef_}).sort_values(by='Coefficients'))  # 回帰係数
print("切片:", lr.intercept_)  # 切片
print("決定係数:", lr.score(X_train, y_train))  # 決定係数

回帰係数:
                Name  Coefficients
6       credit_score     -0.198829
0                 id     -0.010394
5  employment_length      0.018097
7   application_type      0.044508
2               term      0.050185
1          loan_amnt      0.057248
4              grade      0.173729
3      interest_rate      0.631489
切片: 0.19068694
決定係数: 0.11894131042361988


In [280]:
X_train.head()

Unnamed: 0,id,loan_amnt,term,interest_rate,grade,employment_length,credit_score,application_type
1,0.0,0.242699,1,0.206435,0,1,0.27814,1
7,2.5e-05,0.516423,1,0.814364,0,1,0.008612,1
9,3.3e-05,0.106075,0,0.253206,0,0,0.007425,1
12,4.5e-05,0.260113,0,0.597179,0,0,0.009559,1
24,9.5e-05,0.553377,1,0.363311,0,0,0.418657,1


In [281]:
# # ランダムフォレスト
# # データ前処理
# # データ分割
# X = data.drop(["id", "term", "grade", "employment_length", "purpose", "application_type", "loan_status"], axis=1)
# y_train = data["loan_status"].values
# # # Min-Maxスケーリング
# X = ((X - X.min()) / (X.max() - X.min()))

In [282]:
from sklearn.ensemble import RandomForestClassifier  # ランダムフォレストを実行するためのクラス
# 学習
RFC = RandomForestClassifier(max_depth=3, random_state=1)  # ランダムフォレストのインスタンスを作成
RFC.fit(X_train, y_train)  # ランダムフォレストの学習

# 分類結果の確認
y_pred = RFC.predict(X_train)  # 分類結果
print(pd.DataFrame({'y': y_train, 'y_pred': y_pred}).head())  # 実際のクラスと分類結果を上から5つだけ表示

    y  y_pred
1   1       0
7   1       1
9   1       0
12  1       1
24  1       1


In [283]:
# コード例2
# モデルの精度（正解率）の確認
print('正解率:', RFC.score(X_train, y_train))

正解率: 0.6346798564548116


In [284]:
# 混同行列による評価
y_proba = model.predict(X_train)
y_pred = np.argmax(y_proba, axis=1)
confmat = confusion_matrix(y_train, y_pred)
print(confmat)

[[17093 25263]
 [ 6615 35741]]


In [285]:
# history.history["val_accuracy"]
np.savetxt("C:\work\AI\hoge.dat", y_pred)

In [291]:
# データ前処理
data_test = pd.read_csv("test.csv")
# Chapter3
# データ前処理
# One-Hotエンコーディング
# pre_X = input_data.drop(["loan_status", "id"], axis=1).replace({
pre_X = data_test.drop(["purpose"], axis=1).replace({
    "A1": 1, "A2": 2, "A3": 3, "A4": 4, "A5": 5,
    "B1": 6, "B2": 7, "B3": 8, "B4": 9, "B5": 10,
    "C1": 11, "C2": 12, "C3": 13, "C4": 14, "C5": 15,
    "D1": 16, "D2": 17, "D3": 18, "D4": 19, "D5": 20,
    "E1": 21, "E2": 22, "E3": 23, "E4": 24, "E5": 25,
    "F1": 26, "F2": 27, "F3": 28, "F4": 29, "F5": 30,
    "0 year": 0, "0 years": 0, "1 years": 1, "1 year": 1, "2 years": 2, "3 years": 3, "4 years": 4, "5 years": 5,
    "6 years": 6, "7 years": 7, "8 years": 8, "9 years": 9, "10 years": 10,
    "Individual": 1, "Joint App": 0,
}).astype({
    "term": "int8",
    "grade": "int8",
    "employment_length": "int8",
    "application_type": "int8",
})
X = pd.get_dummies(pre_X)

# 加工が終わった時点でCSVに出すと、ほかのモデルでの使いまわしがきく。
X_test = ((X - X.min()) / (X.max() - X.min()))

In [292]:
X_test = X_test.astype("float32")
X_test = X_test.astype({
    "term": "int8",
    "grade": "int8",
    "employment_length": "int8",
    "application_type": "int8"
})

In [293]:
X_test.head()

Unnamed: 0,id,loan_amnt,term,interest_rate,grade,employment_length,credit_score,application_type
0,0.0,0.233748,0,0.338255,0,1,0.170125,1
1,3.7e-05,0.550417,1,0.388391,0,1,0.081955,1
2,7.4e-05,0.528645,0,0.11775,0,1,0.306207,1
3,0.000112,0.048324,0,0.055643,0,0,0.484529,1
4,0.000149,0.563678,0,0.271754,0,0,0.27479,1


In [296]:
y_proba = model.predict(X_test)
y_pred = np.argmax(y_proba, axis=1)

In [297]:
np.savetxt("C:\work\AI\hoge.dat", y_pred)

In [298]:
type(y_pred[0])

NameError: name 'pred' is not defined