# 健診データによる肝疾患判定

##### 健康診断（血液検査）のデータを使って、肝疾患の有無を判定するモデルを構築。

- データ概要
- 課題種別：分類
- データ種別：多変量
- 学習データサンプル数：891
- 説明変数の数：10
- 欠損値：有り

In [29]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [30]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submit = pd.read_csv('sample_submit.csv', header=None)

In [31]:
enc = LabelEncoder()
train['Gender'] = enc.fit_transform(train[['Gender']])
test['Gender'] = enc.fit_transform(test[['Gender']])

  return f(*args, **kwargs)


In [32]:
train = train.fillna(train['AG_ratio'].mean())
train.isnull().sum()

id          0
Age         0
Gender      0
T_Bil       0
D_Bil       0
ALP         0
ALT_GPT     0
AST_GOT     0
TP          0
Alb         0
AG_ratio    0
disease     0
dtype: int64

In [33]:
sc = MinMaxScaler()
X_train = sc.fit_transform(train[['Age', 'T_Bil', 'D_Bil', 'ALP', 'ALT_GPT', 'AST_GOT', 'TP', 'Alb', 'AG_ratio','Gender']])
test = sc.fit_transform(test[['Age', 'T_Bil', 'D_Bil', 'ALP', 'ALT_GPT', 'AST_GOT', 'TP', 'Alb', 'AG_ratio','Gender']])
Y_train = train['disease']

In [34]:
print(X_train.shape)
print(Y_train.shape)

(2546, 10)
(2546,)


In [35]:
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=0.3, random_state=0)

In [36]:
model_rfc = RandomForestClassifier()
model_rfc.fit(X_train, Y_train)
Y_pred = model_rfc.predict(X_test)
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96       438
           1       0.93      0.96      0.95       326

    accuracy                           0.95       764
   macro avg       0.95      0.95      0.95       764
weighted avg       0.95      0.95      0.95       764



In [37]:
from sklearn.model_selection import GridSearchCV
model = RandomForestClassifier()
params = {'n_estimators': [10, 20, 50, 100], 'max_depth': [5, 10, 50]}
gcv = GridSearchCV(model, param_grid=params, cv=3)
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=0.3, random_state=0)
gcv.fit(X_train, Y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [5, 10, 50],
                         'n_estimators': [10, 20, 50, 100]})

In [38]:
gcv.best_params_

{'max_depth': 10, 'n_estimators': 100}

In [39]:
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=0.3, random_state=0)

model_rfc = RandomForestClassifier(n_estimators=100, max_depth=10)
model_rfc.fit(X_train, Y_train)
Y_pred = model_rfc.predict(X_test)
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89       196
           1       0.90      0.85      0.87       179

    accuracy                           0.88       375
   macro avg       0.88      0.88      0.88       375
weighted avg       0.88      0.88      0.88       375



In [40]:
Y_pred = model_rfc.predict(test)
submit[1] = Y_pred
submit.to_csv('submit.csv', index = None, header = None)