In [60]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import lightgbm as lgb
import seaborn as sns
from sklearn.preprocessing import StandardScaler


# Load the training set and derive the engineered features in one chain.
# Every derived column is computed from the columns read from the CSV;
# keyword order in `assign` fixes the order of the appended columns.
data = pd.read_csv('data/train.csv').assign(
    # Encode the gender label as an integer flag (Male -> 0, Female -> 1).
    Gender=lambda df: df['Gender'].map({'Male': 0, 'Female': 1}),
    # Age relative to each bilirubin measurement.
    Age_Tbill_ratio=lambda df: df['Age'] / df['T_Bil'],
    Age_Dbill_ratio=lambda df: df['Age'] / df['D_Bil'],
    # Pairwise differences.
    ALTgpt_ASTgot_diff=lambda df: df['ALT_GPT'] - df['AST_GOT'],
    Tbil_Dbil_diff=lambda df: df['T_Bil'] - df['D_Bil'],
    # Pairwise products.
    ALTgpt_ASTgot_prod=lambda df: df['ALT_GPT'] * df['AST_GOT'],
    Tbil_Dbil_prod=lambda df: df['T_Bil'] * df['D_Bil'],
)

In [39]:
# Display pandas' summary statistics for the full training frame.
# NOTE(review): the saved output for this cell has an earlier execution count
# than the feature-engineering cell and does not list the most recently added
# feature columns -- re-run the notebook top to bottom to refresh it.
data.describe()

Unnamed: 0,Age,Gender,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,disease,Age_Tbill_ratio,Age_Dbill_ratio,ALTgpt_ASTgot_diff
count,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0
mean,46.610588,0.174118,1.607337,0.606737,272.211316,41.978476,61.01812,7.03229,3.517458,1.14457,0.444706,48.452393,246.836534,-19.039644
std,16.453672,0.379434,2.443585,1.569032,200.348515,145.927366,110.12539,0.840664,0.559942,0.227237,0.497226,24.98737,182.699783,111.146954
min,10.0,0.0,0.585961,0.034861,163.261838,3.924518,11.283497,4.858679,2.180884,0.627133,0.0,1.129599,1.80871,-764.72605
25%,32.0,0.0,0.781258,0.144889,213.991414,13.644658,21.239157,6.730612,3.139153,1.004466,0.0,28.893981,104.017485,-32.133928
50%,48.0,0.0,0.835591,0.194085,220.092503,16.638001,27.056026,6.91558,3.621339,1.205016,0.0,45.313821,233.474962,-9.450084
75%,62.0,0.0,1.196759,0.335447,229.455927,23.056081,56.461568,7.536151,3.712524,1.28788,1.0,71.082653,345.318383,-5.243752
max,78.0,1.0,23.017027,17.692164,2108.483728,1423.186473,814.439397,8.739,5.01697,1.821496,1.0,114.370398,1864.571939,1069.522757


In [40]:
# Partition the training frame by label value: disease == 1 vs. disease == 0.
df_disease = data.loc[data['disease'].eq(1)]
df_good = data.loc[data['disease'].eq(0)]
# Show summary statistics for the disease == 1 subset.
df_disease.describe()

Unnamed: 0,Age,Gender,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,disease,Age_Tbill_ratio,Age_Dbill_ratio,ALTgpt_ASTgot_diff
count,378.0,378.0,378.0,378.0,378.0,378.0,378.0,378.0,378.0,378.0,378.0,378.0,378.0,378.0
mean,47.055556,0.12963,2.583358,1.101431,319.63437,74.448314,99.937924,6.963035,3.407023,1.066434,1.0,37.111727,165.020609,-25.48961
std,16.64912,0.336341,3.418749,2.172383,284.042393,214.391912,153.54267,0.810228,0.545244,0.234328,0.0,23.977268,143.229487,164.431329
min,16.0,0.0,0.642805,0.074207,185.491189,8.570383,11.601122,4.971703,2.180884,0.627133,1.0,1.129599,1.80871,-764.72605
25%,32.0,0.0,0.857066,0.187951,220.623552,17.188434,27.864092,6.750529,3.12603,0.968234,1.0,17.848987,46.347572,-43.863747
50%,48.0,0.0,1.204637,0.277124,227.268717,22.22816,56.648308,6.894533,3.219547,1.017616,1.0,34.644953,118.409957,-26.829799
75%,64.0,0.0,1.940341,0.788026,238.110473,33.368001,67.604461,7.510301,3.690715,1.250521,1.0,54.650655,263.003533,-3.90297
max,75.0,1.0,23.017027,17.692164,2108.483728,1423.186473,814.439397,8.739,5.01472,1.813936,1.0,101.119249,801.98257,1069.522757


In [41]:
# Display summary statistics for the disease == 0 subset, for comparison with
# the disease == 1 subset described in the previous cell.
df_good.describe()

Unnamed: 0,Age,Gender,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,disease,Age_Tbill_ratio,Age_Dbill_ratio,ALTgpt_ASTgot_diff
count,472.0,472.0,472.0,472.0,472.0,472.0,472.0,472.0,472.0,472.0,472.0,472.0,472.0,472.0
mean,46.254237,0.209746,0.825693,0.210563,234.232684,15.975088,29.849293,7.087753,3.6059,1.207145,0.0,57.534537,312.35861,-13.874205
std,16.304396,0.407559,0.176583,0.553654,67.266382,8.397985,28.30319,0.861106,0.556395,0.200756,0.0,21.895343,184.588118,23.797045
min,10.0,0.0,0.585961,0.034861,163.261838,3.924518,11.283497,4.858679,2.295068,0.732968,0.0,12.680416,3.518967,-365.528287
25%,32.0,0.0,0.746191,0.120786,211.830825,12.533705,20.259545,6.714659,3.165267,1.029563,0.0,38.117816,185.371185,-13.939178
50%,48.0,0.0,0.797032,0.16668,215.761897,14.825259,22.265808,7.35096,3.63084,1.280918,0.0,56.800183,294.458193,-7.770661
75%,61.0,0.0,0.836842,0.201156,221.424607,17.02888,27.981309,7.538761,4.067706,1.294483,0.0,75.722682,396.03856,-5.447164
max,78.0,1.0,1.734959,11.935322,657.616053,116.407037,481.935324,8.733841,5.01697,1.821496,0.0,114.370398,1864.571939,51.362947


In [61]:
# Standardise every column: z = (value - column mean) / column std.
z_scores = (data - data.mean()) / data.std()

# Select rows where any column lies more than `threshold` standard deviations
# from its mean. The absolute value is used so that extreme low values
# (large negative z-scores, e.g. in the signed *_diff features) are flagged
# as well as extreme high values.
threshold = 3.2
outliers = data[(z_scores.abs() > threshold).any(axis=1)]
# Drop the rows that contain at least one outlying value.
cleaned_data = data.drop(outliers.index)

# Renumber the remaining rows 0..n-1 (optional); reassignment instead of
# `inplace=True` keeps the cell a plain expression chain.
cleaned_data = cleaned_data.reset_index(drop=True)


In [62]:
# Re-split by label value, this time on the outlier-filtered frame.
df_disease = cleaned_data.loc[cleaned_data['disease'].eq(1)]
df_good = cleaned_data.loc[cleaned_data['disease'].eq(0)]
# Show summary statistics for the disease == 1 subset after filtering.
df_disease.describe()

Unnamed: 0,Age,Gender,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,disease,Age_Tbill_ratio,Age_Dbill_ratio,ALTgpt_ASTgot_diff,Tbil_Dbil_diff,ALTgpt_ASTgot_prod,Tbil_Dbil_prod
count,318.0,318.0,318.0,318.0,318.0,318.0,318.0,318.0,318.0,318.0,318.0,318.0,318.0,318.0,318.0,318.0,318.0
mean,47.465409,0.141509,1.886469,0.628332,277.819636,30.046092,55.91731,6.967922,3.423295,1.067771,1.0,39.693065,181.857551,-25.871218,1.258137,1824.494807,2.828057
std,16.436391,0.349096,2.040124,0.862468,117.020751,35.258286,40.678829,0.809312,0.558593,0.234572,0.0,22.851876,141.700527,51.070603,1.268912,2717.399544,7.359063
min,16.0,0.0,0.642805,0.074207,185.491189,8.570383,11.601122,4.971703,2.180884,0.627521,1.0,2.750665,5.087875,-287.048871,-1.740654,105.054849,0.05101
25%,32.0,0.0,0.846158,0.177669,220.769876,17.259848,26.747173,6.760032,3.123672,0.968999,1.0,22.210208,66.211946,-40.195566,0.664102,595.290033,0.164211
50%,48.0,0.0,0.989421,0.232373,226.96005,21.883381,52.845918,6.895087,3.256592,1.019172,1.0,36.515264,138.874671,-24.362845,0.789874,1051.864823,0.259763
75%,64.0,0.0,1.82512,0.678903,236.335393,28.915865,62.13638,7.504117,3.699029,1.256223,1.0,57.419897,287.827815,-3.90297,1.230103,1791.822396,1.083556
max,75.0,1.0,9.310985,4.520551,669.527004,370.984154,368.629116,8.739,5.01472,1.813936,1.0,101.119249,801.98257,335.11895,6.153313,30072.853508,38.545148


In [63]:
# Display summary statistics for the disease == 0 subset of the
# outlier-filtered frame.
df_good.describe()

Unnamed: 0,Age,Gender,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,disease,Age_Tbill_ratio,Age_Dbill_ratio,ALTgpt_ASTgot_diff,Tbil_Dbil_diff,ALTgpt_ASTgot_prod,Tbil_Dbil_prod
count,463.0,463.0,463.0,463.0,463.0,463.0,463.0,463.0,463.0,463.0,463.0,463.0,463.0,463.0,463.0,463.0,463.0
mean,46.0,0.209503,0.825933,0.186271,234.071548,15.857849,29.067109,7.091178,3.60676,1.209492,0.0,57.136652,302.816413,-13.20926,0.639662,516.924895,0.166962
std,16.279751,0.407394,0.174817,0.115548,67.418585,7.031901,19.267245,0.854855,0.55751,0.200689,0.0,21.655515,157.97744,17.563195,0.132762,730.771582,0.163108
min,10.0,0.0,0.597674,0.050686,163.261838,3.924518,11.283497,4.858679,2.295068,0.732968,0.0,12.680416,30.734509,-149.437286,0.00085,80.279145,0.039279
25%,32.0,0.0,0.747727,0.124476,211.84248,12.643604,20.282788,6.731189,3.164493,1.030264,0.0,37.570471,185.182413,-14.195588,0.582772,274.976262,0.095623
50%,48.0,0.0,0.797133,0.168114,215.755528,14.839739,22.275446,7.390437,3.630881,1.281929,0.0,56.518294,292.993902,-7.69424,0.620604,324.739112,0.13413
75%,61.0,0.0,0.836977,0.201278,221.28558,17.057939,28.064698,7.538652,4.069081,1.294581,0.0,75.345432,392.055985,-5.452709,0.679488,490.153513,0.163572
max,78.0,1.0,1.734959,0.806529,657.616053,86.911871,170.010177,8.733841,5.01697,1.821496,0.0,108.686899,822.548931,51.362947,1.537253,11712.270797,1.073553
