# 健診データによる肝疾患判定

##### 健康診断（血液検査）のデータを使って、肝疾患の有無を判定するモデルを構築。

- データ概要
- 課題種別：分類
- データ種別：多変量
- 学習データサンプル数：891
- 説明変数の数：10
- 欠損値：有り

In [148]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [159]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submit = pd.read_csv('sample_submit.csv', header=None)

In [160]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)


In [161]:
train = train.fillna(train['AG_ratio'].mean())
train.isnull().sum()

id               0
Age              0
T_Bil            0
D_Bil            0
ALP              0
ALT_GPT          0
AST_GOT          0
TP               0
Alb              0
AG_ratio         0
disease          0
Gender_Female    0
Gender_Male      0
dtype: int64

In [162]:
sc = MinMaxScaler()
X_train = sc.fit_transform(train[['Age', 'T_Bil', 'D_Bil', 'ALP', 'ALT_GPT', 'AST_GOT', 'TP', 'Alb', 'AG_ratio', 'Gender_Female', 'Gender_Male']])
test = sc.fit_transform(test[['Age', 'T_Bil', 'D_Bil', 'ALP', 'ALT_GPT', 'AST_GOT', 'TP', 'Alb', 'AG_ratio', 'Gender_Female', 'Gender_Male']])
Y_train = train['disease']

In [163]:
print(X_train.shape)
print(Y_train.shape)

(891, 11)
(891,)


In [164]:
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=0.3, random_state=1)

In [165]:
model_tree = DecisionTreeClassifier()
model_tree.fit(X_train, Y_train)
Y_pred = model_tree.predict(X_test)
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.74      0.82      0.78       125
           1       0.82      0.75      0.78       143

    accuracy                           0.78       268
   macro avg       0.78      0.78      0.78       268
weighted avg       0.78      0.78      0.78       268



In [166]:
from sklearn.model_selection import GridSearchCV

model = DecisionTreeClassifier()
params = {'criterion': ['gini', 'entropy'], 'max_depth': [10, 20, 50, 100], 'min_samples_leaf': [2, 4, 8, 16, 32, 64]}
gcv = GridSearchCV(model, param_grid=params, cv=3)
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=0.3, random_state=0)
gcv.fit(X_train, Y_train)


GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [10, 20, 50, 100],
                         'min_samples_leaf': [2, 4, 8, 16, 32, 64]})

In [167]:
gcv.best_params_

{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 32}

In [168]:
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=0.3, random_state=0)
model_tree = DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=32)
model_tree.fit(X_train, Y_train)
Y_pred = model_tree.predict(X_test)
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.80      0.86      0.83        74
           1       0.80      0.72      0.76        57

    accuracy                           0.80       131
   macro avg       0.80      0.79      0.80       131
weighted avg       0.80      0.80      0.80       131



In [169]:
Y_pred = model_tree.predict(test)
submit[1] = Y_pred
submit.to_csv('submit2.csv', index=None, header=None)