In [1]:
# Chapter1
# NN構築試行
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score  # モデルの評価を行うための関数
# 学習データ読み込み
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,id,loan_amnt,term,interest_rate,grade,employment_length,purpose,credit_score,application_type,loan_status
0,0,609.296068,3 years,8.421982,A5,0 years,debt_consolidation,714.061803,Individual,FullyPaid
1,1,1183.266999,5 years,10.286776,B1,10 years,credit_card,697.706701,Individual,ChargedOff
2,2,695.783256,3 years,14.723425,C2,1 year,debt_consolidation,656.419357,Individual,FullyPaid
3,3,738.392546,3 years,14.260708,C1,0 years,credit_card,657.906852,Individual,FullyPaid
4,4,1642.400654,5 years,25.217452,E5,10 years,debt_consolidation,662.972297,Individual,FullyPaid


In [2]:
# # Chapter2
# 学習の再現を行うために乱数シードを固定するためのコード
import os
import random as rn
from tensorflow.compat.v1.keras import backend as K

os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(0)
rn.seed(0)

tf.compat.v1.set_random_seed(0)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph())
K.set_session(sess)

pd.set_option("max_columns", None)
pd.set_option('max_rows', None)

In [3]:
# print("デフォルトした人:", len(data[data["loan_status"]=="ChargedOff"]))
# print("デフォルトしていない人:", len(data[data["loan_status"]=="FullyPaid"]))

In [4]:
#Down Dampling
Train1 = data.where(data["loan_status"] == "ChargedOff").dropna()
Train2 = data.where(data["loan_status"] == "FullyPaid").dropna()
Train3 = Train2.sample(n=len(Train1), random_state=0)
TrainData = pd.concat([Train1, Train3])

In [5]:
TrainData.shape
data = TrainData

In [6]:
data.head()

Unnamed: 0,id,loan_amnt,term,interest_rate,grade,employment_length,purpose,credit_score,application_type,loan_status
1,1.0,1183.266999,5 years,10.286776,B1,10 years,credit_card,697.706701,Individual,ChargedOff
7,7.0,2147.822844,5 years,23.738449,A3,10 years,debt_consolidation,656.789397,Individual,ChargedOff
9,9.0,701.82435,3 years,11.321683,B4,0 years,credit_card,656.609116,Individual,ChargedOff
12,12.0,1244.631171,3 years,18.932798,D3,0 years,debt_consolidation,656.933143,Individual,ChargedOff
24,24.0,2278.04473,5 years,13.757983,C5,1 year,credit_card,719.038856,Individual,ChargedOff


In [7]:
# Chapter3
# def pre_process(input_data):
# データ前処理
# One-Hotエンコーディング
# pre_X = input_data.drop(["loan_status", "id"], axis=1).replace({
pre_X = data.drop(["loan_status"], axis=1).replace({
    "A1": 1, "A2": 2, "A3": 3, "A4": 4, "A5": 5,
    "B1": 6, "B2": 7, "B3": 8, "B4": 9, "B5": 10,
    "C1": 11, "C2": 12, "C3": 13, "C4": 14, "C5": 15,
    "D1": 16, "D2": 17, "D3": 18, "D4": 19, "D5": 20,
    "E1": 21, "E2": 22, "E3": 23, "E4": 24, "E5": 25,
    "F1": 26, "F2": 27, "F3": 28, "F4": 29, "F5": 30,
    "0 year": 0, "0 years": 0, "1 years": 1, "1 year": 1, "2 years": 2, "3 years": 3, "4 years": 4, "5 years": 5,
    "6 years": 6, "7 years": 7, "8 years": 8, "9 years": 9, "10 years": 10
}).astype({
    "term": "int8",
    "grade": "int8",
    "employment_length": "int8",
})
pre_y = data["loan_status"]
X = pd.get_dummies(pre_X)
y = pre_y.replace({"ChargedOff": 1, "FullyPaid": 0})
# pre_X_train, pre_X_test = train_test_split(X, test_size=0.1, shuffle=False)
# y_train, y_test = train_test_split(y, test_size=0.1, shuffle=False)

# X_train = X
# X_test = X
y_train = y
y_test = y

# 加工が終わった時点でCSVに出すと、ほかのモデルでの使いまわしがきく。
# # Min-Maxスケーリング
# X_train = ((pre_X_train - pre_X_train.min()) / (pre_X_train.max() - pre_X_train.min()))
X_train = ((X - X.min()) / (X.max() - X.min()))
# X_train = pre_X_train
# # Min-Maxスケーリング
# X_test = ((pre_X_test - pre_X_test.min()) / (pre_X_test.max() - pre_X_test.min()))
X_test = ((X - X.min()) / (X.max() - X.min()))
# X_test = pre_X_test
    # return X_train, X_test, y_train, y_test
# X_train, X_test, y_train, y_test = pre_process(data)

In [8]:
# pre_X_train.head()

In [9]:
print("訓練データの特徴量", X_train.shape)
print("訓練データのターゲット", y_train.shape)


訓練データの特徴量 (84712, 18)
訓練データのターゲット (84712,)


In [10]:
from tensorflow import keras  # Keras
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.metrics import binary_accuracy, Accuracy
# Keras以外を利用するのも一つの手段。
model = keras.models.Sequential()
model.add(Dense(units=17, activation=tf.nn.relu)) 
model.add(keras.layers.BatchNormalization())  # バッチ正規化層
model.add(Dense(units=100, activation=tf.nn.relu)) 
model.add(keras.layers.BatchNormalization())  # バッチ正規化層
model.add(Dense(units=2, activation=tf.nn.softmax))  # 出力層（活性化関数はソフトマックス関数）
model.compile(optimizer = keras.optimizers.SGD(lr=0.1),
           loss="sparse_categorical_crossentropy",
           metrics="accuracy")
        #    f1scoreにするのがよい

In [11]:
history = model.fit(x=X_train,
       y=y_train,
       batch_size=100,  # バッチサイズ
       epochs=100,  # エポック数
       validation_split=0.2,  # 検証データの割合
       verbose=1)  # 進捗の確認を行うか（0:行わない, 1:行う）

Epoch 1/100


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 

In [12]:
# 混同行列による評価
y_proba = model.predict(X_test)
y_pred = np.argmax(y_proba, axis=1)
confmat = confusion_matrix(y_test, y_pred)
print(confmat)

[[16187 26169]
 [ 5594 36762]]


In [13]:
# 正答率による評価
print("正解率:", accuracy_score(y_test, y_pred))

正解率: 0.6250472188119747


In [14]:
print(pd.DataFrame({'y': y_test, 'y_pred': y_pred}).head())  # 実際のクラスと分類結果を上から5つだけ表示

    y  y_pred
1   1       0
7   1       1
9   1       1
12  1       1
24  1       1


In [15]:
# 重回帰
from sklearn import linear_model  # 線形回帰を行うためのモジュール
# 変数の準備
# X = data_oh.drop(["loan_status_ChargedOff", "loan_status_FullyPaid"], axis=1)  # 説明変数の設定
# y = data_oh["loan_status_FullyPaid"]  # 目的変数の設定

# 学習
lr = linear_model.LinearRegression()  # 線形回帰モデルのインスタンスを作成
lr.fit(X_train, y_train)  # 回帰の実行

# 結果の確認
print("回帰係数:")
print(pd.DataFrame({"Name": X_train.columns,
                    "Coefficients": lr.coef_}).sort_values(by='Coefficients'))  # 回帰係数
print("切片:", lr.intercept_)  # 切片
print("決定係数:", lr.score(X_train, y_train))  # 決定係数

回帰係数:
                           Name  Coefficients
6                  credit_score -1.918146e-01
0                            id -1.080256e-02
5             employment_length  8.147682e-03
4                         grade  3.802709e-02
2                          term  5.125555e-02
1                     loan_amnt  5.847499e-02
3                 interest_rate  6.008798e-01
11                purpose_house  2.795343e+10
15       purpose_small_business  2.795343e+10
7                   purpose_car  2.795343e+10
10     purpose_home_improvement  2.795343e+10
8           purpose_credit_card  2.795343e+10
14                purpose_other  2.795343e+10
9    purpose_debt_consolidation  2.795343e+10
13              purpose_medical  2.795343e+10
12       purpose_major_purchase  2.795343e+10
17   application_type_Joint App  5.214775e+12
16  application_type_Individual  5.214775e+12
切片: -5242728544542.257
決定係数: 0.12068743619739596


In [16]:
X_train.head()

Unnamed: 0,id,loan_amnt,term,interest_rate,grade,employment_length,credit_score,purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_other,purpose_small_business,application_type_Individual,application_type_Joint App
1,0.0,0.242699,1.0,0.206435,0.172414,1.0,0.27814,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,2.5e-05,0.516423,1.0,0.814364,0.068966,1.0,0.008612,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,3.3e-05,0.106075,0.0,0.253206,0.275862,0.0,0.007425,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12,4.5e-05,0.260113,0.0,0.597179,0.586207,0.0,0.009559,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
24,9.5e-05,0.553377,1.0,0.363311,0.482759,0.1,0.418657,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [17]:
# # ランダムフォレスト
# # データ前処理
# # データ分割
# X = data.drop(["id", "term", "grade", "employment_length", "purpose", "application_type", "loan_status"], axis=1)
# y_train = data["loan_status"].values
# # # Min-Maxスケーリング
# X = ((X - X.min()) / (X.max() - X.min()))

In [18]:
from sklearn.ensemble import RandomForestClassifier  # ランダムフォレストを実行するためのクラス
# 学習
RFC = RandomForestClassifier(max_depth=3, random_state=1)  # ランダムフォレストのインスタンスを作成
RFC.fit(X_train, y_train)  # ランダムフォレストの学習

# 分類結果の確認
y_pred = RFC.predict(X_train)  # 分類結果
print(pd.DataFrame({'y': y_train, 'y_pred': y_pred}).head())  # 実際のクラスと分類結果を上から5つだけ表示

    y  y_pred
1   1       0
7   1       1
9   1       0
12  1       1
24  1       1


In [19]:
# コード例2
# モデルの精度（正解率）の確認
print('正解率:', RFC.score(X_train, y_train))

正解率: 0.6392836906223439


In [20]:
# 混同行列による評価
y_proba = model.predict(X_train)
y_pred = np.argmax(y_proba, axis=1)
confmat = confusion_matrix(y_train, y_pred)
print(confmat)

[[16187 26169]
 [ 5594 36762]]


In [21]:
# history.history["val_accuracy"]
np.savetxt("C:\work\AI\hoge.dat", y_pred)

In [22]:
# # データ前処理

# data_test = pd.read_csv("test.csv")
# # 事前処理
# X_train, X_test, y_train, y_test = pre_process(data_test)
# data_test_oh.head()

In [23]:
X_train.head()

Unnamed: 0,id,loan_amnt,term,interest_rate,grade,employment_length,credit_score,purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_other,purpose_small_business,application_type_Individual,application_type_Joint App
1,0.0,0.242699,1.0,0.206435,0.172414,1.0,0.27814,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,2.5e-05,0.516423,1.0,0.814364,0.068966,1.0,0.008612,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,3.3e-05,0.106075,0.0,0.253206,0.275862,0.0,0.007425,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12,4.5e-05,0.260113,0.0,0.597179,0.586207,0.0,0.009559,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
24,9.5e-05,0.553377,1.0,0.363311,0.482759,0.1,0.418657,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
