# 健診データによる肝疾患判定

##### 健康診断（血液検査）のデータを使って、肝疾患の有無を判定するモデルを構築。

- データ概要
  - 課題種別：分類
  - データ種別：多変量
  - 学習データサンプル数：891
  - 説明変数の数：10
  - 欠損値：有り

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

import lightgbm as lgb

In [2]:
# Load the labelled training data and the test data to predict on
# (both CSVs carry a header row).
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))
# The submission template is read without a header row, so its columns are
# addressed by integer position (0, 1, ...).
submit = pd.read_csv('sample_submit.csv', header=None)

In [3]:
# One-hot encode the categorical columns of both frames (Gender becomes
# Gender_Female / Gender_Male).
# NOTE(review): each frame is encoded independently, so the resulting dummy
# columns only line up if both files contain the same category values.
train, test = pd.get_dummies(train), pd.get_dummies(test)


In [4]:
# Impute missing AG_ratio values with the training-set mean of that column.
# The fill is restricted to AG_ratio: passing a bare scalar to fillna() would
# write the AG_ratio mean into missing cells of every other column as well.
ag_ratio_mean = train['AG_ratio'].mean()
train = train.fillna({'AG_ratio': ag_ratio_mean})
# Apply the same training-derived value to the test frame so that both sets
# are preprocessed identically (no statistic is computed from the test data).
test = test.fillna({'AG_ratio': ag_ratio_mean})
# Show the number of missing values left in each training column.
train.isnull().sum()

id               0
Age              0
T_Bil            0
D_Bil            0
ALP              0
ALT_GPT          0
AST_GOT          0
TP               0
Alb              0
AG_ratio         0
disease          0
Gender_Female    0
Gender_Male      0
dtype: int64

In [5]:
# Columns used as model inputs: everything except `id` and the `disease`
# target. Defined once so train and test are guaranteed to use the same
# columns in the same order.
feature_cols = [
    'Age', 'T_Bil', 'D_Bil', 'ALP', 'ALT_GPT', 'AST_GOT',
    'TP', 'Alb', 'AG_ratio', 'Gender_Female', 'Gender_Male',
]

sc = MinMaxScaler()
# Learn each feature's min/max from the training data and scale it.
X_train = sc.fit_transform(train[feature_cols])
# Scale the test data with the min/max learned from the training data.
# Calling fit_transform() here would re-fit the scaler on the test set, so
# the same raw value would map to different scaled values in train and test.
test = sc.transform(test[feature_cols])
# Target labels for the training rows.
Y_train = train['disease']

In [6]:
# Sanity check: the feature matrix and the label vector must have the same
# number of rows. Each shape is printed on its own line.
print(X_train.shape, Y_train.shape, sep='\n')

(891, 11)
(891,)


In [7]:
# Hold out 30% of the labelled rows for validation. From here on X_test/Y_test
# are this hold-out set, while `test` remains the unlabelled competition data.
# The fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_train,
    Y_train,
    test_size=0.3,
    random_state=1,
)

In [29]:
# LightGBM classifier using the GOSS boosting type, with tree depth capped at
# 10 and a fixed seed; all other hyperparameters keep their defaults.
model = lgb.LGBMClassifier(
    boosting_type='goss',
    max_depth=10,
    random_state=0,
)

In [30]:
# Validation data monitored during training: the 30% hold-out split.
eval_set = [(X_test, Y_test)]
callbacks = [
    # Stop once the validation score has not improved for 100 rounds.
    lgb.early_stopping(stopping_rounds=100),
    # Print the validation metric during training.
    lgb.log_evaluation(),
]
model.fit(X_train, Y_train, eval_set=eval_set, callbacks=callbacks)

[1]	valid_0's binary_logloss: 0.652314
Training until validation scores don't improve for 100 rounds
[2]	valid_0's binary_logloss: 0.613836
[3]	valid_0's binary_logloss: 0.58265
[4]	valid_0's binary_logloss: 0.553831
[5]	valid_0's binary_logloss: 0.533418
[6]	valid_0's binary_logloss: 0.512729
[7]	valid_0's binary_logloss: 0.493213
[8]	valid_0's binary_logloss: 0.478757
[9]	valid_0's binary_logloss: 0.46604
[10]	valid_0's binary_logloss: 0.451123
[11]	valid_0's binary_logloss: 0.445293
[12]	valid_0's binary_logloss: 0.439121
[13]	valid_0's binary_logloss: 0.430958
[14]	valid_0's binary_logloss: 0.428249
[15]	valid_0's binary_logloss: 0.426883
[16]	valid_0's binary_logloss: 0.425719
[17]	valid_0's binary_logloss: 0.423844
[18]	valid_0's binary_logloss: 0.419396
[19]	valid_0's binary_logloss: 0.418157
[20]	valid_0's binary_logloss: 0.413765
[21]	valid_0's binary_logloss: 0.410778
[22]	valid_0's binary_logloss: 0.412603
[23]	valid_0's binary_logloss: 0.409278
[24]	valid_0's binary_logloss

LGBMClassifier(boosting_type='goss', max_depth=10, random_state=0)

In [31]:
# Predict labels for the scaled competition test features and store them in
# column 1 of the submission template.
Y_pred = model.predict(test)
submit[1] = Y_pred
# Write the submission file with neither an index column nor a header row.
submit.to_csv(
    'submit_lightgbm.csv',
    header=None,
    index=None,
)

In [32]:
# Accuracy on the hold-out split. Y_pred is reused here and now holds the
# hold-out predictions rather than the submission predictions.
Y_pred = model.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=Y_pred)

0.8544776119402985