In [1]:
import sys
sys.path.append('../input/iterativestratification')  # add the dataset path so that iterstrat.ml_stratifiers can be imported

import pandas as pd  # data manipulation and analysis
import numpy as np  # multi-dimensional arrays and numerical functions
import matplotlib.pyplot as plt  # plotting
import seaborn as sns  # statistical plotting built on top of matplotlib

from scipy import stats  # statistical analysis helpers
from scipy.stats import norm, skew, kurtosis  # normal distribution, skewness and kurtosis helpers
import warnings  # control of warning messages

from sklearn.preprocessing import LabelEncoder  # encodes a categorical column as integers
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold  # stratified K-fold splitting on several label columns at once
from tqdm.auto import tqdm  # progress bars for loops
import catboost as cb  # gradient-boosting library used for the model

from sklearn.impute import KNNImputer  # nearest-neighbour imputation of missing values

#from statsmodels.tools.sm_exceptions import ConvergenceWarning # (disabled) ConvergenceWarning class from statsmodels, used to filter convergence warnings
#warnings.simplefilter("ignore", category=ConvergenceWarning) # (disabled) ignore only ConvergenceWarning
warnings.filterwarnings('ignore')  # suppress every warning message for the rest of the session

# Start from the ggplot style sheet; the rcParams below override parts of it.
plt.style.use('ggplot')

# Custom colour palette; entry 3 is reused as the grid colour below.
cust_color = ['#EDC7B7', '#EEE2DC', '#BAB2B5', '#123C69', '#AC3B61']

# Apply all overrides in one call, in the same order as the original assignments.
plt.rcParams.update({
    # Default figure size (inches) and resolution.
    'figure.figsize': (12, 4),
    'figure.dpi': 300,
    # Grid: switched off by default, but styled in case a plot enables it.
    "axes.grid": False,
    "grid.color": cust_color[3],
    "grid.alpha": 0.5,
    "grid.linestyle": '--',
    # Font family for all text.
    "font.family": "monospace",
    # Axes frame: thin black left/bottom spines, no top/right spines.
    'axes.edgecolor': 'black',
    'figure.frameon': True,
    'axes.spines.left': True,
    'axes.spines.bottom': True,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.linewidth': 0.5,
})

In [2]:
# Read the three competition CSV files.
train = pd.read_csv('../input/icr-identify-age-related-conditions/train.csv')
greeks = pd.read_csv('../input/icr-identify-age-related-conditions/greeks.csv')
test = pd.read_csv('../input/icr-identify-age-related-conditions/test.csv')

# Strip leading/trailing whitespace from the column names of train and test
# (greeks is left as read).
for frame in (train, test):
    frame.columns = frame.columns.str.strip()

In [3]:
# Feature bookkeeping: every column except the first and the last one
# (presumably Id and Class -- confirm against the CSV) is a feature, and 'EJ'
# is the single column handled as categorical.
cat_cols = 'EJ'
num_cols = list(train.columns[1:-1])
num_cols.remove(cat_cols)

# Integer-encode the categorical column: the mapping is learned on train and
# then applied unchanged to test.
encoder = LabelEncoder()
train[cat_cols] = encoder.fit_transform(train[cat_cols])
test[cat_cols] = encoder.transform(test[cat_cols])

In [4]:
# Remove the Id column from the training frame and display the result.
train = train.drop(columns='Id')
train

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,5496.92824,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000,0
2,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,5135.78024,...,7.709560,0.97556,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941,0
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,4169.67738,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,5728.73412,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,4157.68439,...,0.173229,1.26092,0.067730,8.967128,217.148554,8095.932828,24.640462,69.191944,21.978000,0
613,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,5654.07556,...,10.223150,1.24236,0.426699,35.896418,496.994214,3085.308063,29.648928,124.808872,0.145340,0
614,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,5888.87769,...,0.173229,0.49706,0.067730,19.962092,128.896894,6474.652866,26.166072,119.559420,21.978000,0
615,0.363205,1263.53524,85.200147,23.685856,8.138688,7.981959,0.025578,7.524588,1.229900,4517.86560,...,9.256996,0.78764,0.670527,24.594488,72.611063,1965.343176,25.116750,37.155112,0.184622,0


In [5]:
# Fill missing feature values with KNNImputer (default settings).
# The target is set aside first so that it is not passed to the imputer.
labels = train["Class"]
train = train.drop(columns="Class")

imp = KNNImputer()
data = imp.fit_transform(train)

# fit_transform returns a plain array: rebuild the frame with the original
# column names, then re-attach the target as the last column.
tmp = pd.DataFrame(data, columns=train.columns)
train = pd.concat([tmp, labels], axis=1)
train

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,5496.92824,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000,0
2,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,5135.78024,...,7.709560,0.97556,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941,0
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,4169.67738,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,5728.73412,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,4157.68439,...,0.173229,1.26092,0.067730,8.967128,217.148554,8095.932828,24.640462,69.191944,21.978000,0
613,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,5654.07556,...,10.223150,1.24236,0.426699,35.896418,496.994214,3085.308063,29.648928,124.808872,0.145340,0
614,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,5888.87769,...,0.173229,0.49706,0.067730,19.962092,128.896894,6474.652866,26.166072,119.559420,21.978000,0
615,0.363205,1263.53524,85.200147,23.685856,8.138688,7.981959,0.025578,7.524588,1.229900,4517.86560,...,9.256996,0.78764,0.670527,24.594488,72.611063,1965.343176,25.116750,37.155112,0.184622,0


In [6]:
from sklearn.preprocessing import RobustScaler

# Robust-scale the feature columns with RobustScaler (the earlier comment here
# said StandardScaler, which is not what the code uses).
#
# The target column is deliberately kept out of the scaler: fitting on it
# would let the labels be rescaled and would fit `robscaler` on one more
# column than any feature-only frame has, so it could not be reused on one.
#
# NOTE(review): the training call visible in this notebook passes `train`,
# not `train_rob` -- confirm whether the scaled frame is meant to be used.
scale_cols = [col for col in train.columns if col != "Class"]

robscaler = RobustScaler()
scaled_features = pd.DataFrame(
    robscaler.fit_transform(train[scale_cols].values),
    columns=scale_cols,
    index=train.index,
)

# Re-attach the untouched target and restore the original column order.
train_rob = pd.concat([scaled_features, train[["Class"]]], axis=1)[train.columns]

In [7]:
def catboost_model(df):
    """Train one CatBoost classifier per CV fold and predict the test set.

    The frame is split into 5 folds with ``MultilabelStratifiedKFold``,
    stratifying on every ``greeks`` column except the first and the last.
    For each fold a ``CatBoostClassifier`` is fitted on the training part,
    with the validation part as ``eval_set``, and then used to predict both
    the validation rows and the test set.

    Relies on the module-level names ``num_cols``, ``cat_cols``, ``greeks``
    and ``test``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame containing the feature columns
        ``num_cols + [cat_cols]`` and the target column ``'Class'``.
        Assumes its rows are in the same order as those of ``greeks`` --
        TODO confirm.

    Returns
    -------
    final_preds : list of numpy.ndarray
        One ``predict_proba`` result on the test set per fold.
    oof : numpy.ndarray of shape (len(df), 2)
        Out-of-fold ``predict_proba`` results for the rows of ``df``.
    """
    feature_cols = num_cols + [cat_cols]

    # Size the out-of-fold buffer from the frame actually passed in rather
    # than from the global `train`.
    oof = np.zeros((len(df), 2))

    skf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    final_preds = []

    params = {
        'iterations': 10000,
        'learning_rate': 0.005,
        'early_stopping_rounds': 1000,
        'auto_class_weights': 'Balanced',
        'loss_function': 'MultiClass',
        'eval_metric': 'MultiClass:use_weights=True',
        'random_seed': 42,
        'use_best_model': True,
        'l2_leaf_reg': 1,
        'max_ctr_complexity': 15,
        'max_depth': 10,
        "grow_policy": 'Lossguide',
        'max_leaves': 64,
        "min_data_in_leaf": 40,
    }

    X, y = df[feature_cols], df['Class']
    # Select the test features by name so that their column order matches the
    # training matrix (`test.iloc[:, 1:]` keeps the CSV order, where the
    # categorical column is not last).
    X_test = test[feature_cols]

    for train_index, val_index in skf.split(df, greeks.iloc[:, 1:-1]):
        # The splitter yields positional indices, so index by position; this
        # also keeps the rows consistent with `oof[val_index]` below even if
        # `df` does not have a default RangeIndex.
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        model = cb.CatBoostClassifier(**params)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=1000)

        oof[val_index, :] = model.predict_proba(X_val)
        final_preds.append(model.predict_proba(X_test))

    return final_preds, oof

In [8]:
# Run the cross-validated training on the imputed training frame.
# `pred` receives the per-fold test-set probabilities and `oof` the
# out-of-fold probabilities returned by catboost_model.
pred, oof = catboost_model(train)

0:	learn: 0.6909702	test: 0.6910470	best: 0.6910470 (0)	total: 64.8ms	remaining: 10m 48s
1000:	learn: 0.1025459	test: 0.3787499	best: 0.3769441 (832)	total: 8.09s	remaining: 1m 12s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 0.376944102
bestIteration = 832

Shrink model to first 833 iterations.
0:	learn: 0.6907749	test: 0.6910755	best: 0.6910755 (0)	total: 9.92ms	remaining: 1m 39s
1000:	learn: 0.1069505	test: 0.2707415	best: 0.2706503 (999)	total: 8.14s	remaining: 1m 13s
2000:	learn: 0.0238237	test: 0.2593047	best: 0.2546250 (1559)	total: 16.2s	remaining: 1m 4s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 0.2546249877
bestIteration = 1559

Shrink model to first 1560 iterations.
0:	learn: 0.6905853	test: 0.6912215	best: 0.6912215 (0)	total: 10.1ms	remaining: 1m 41s
1000:	learn: 0.1064676	test: 0.2527038	best: 0.2527038 (1000)	total: 8.09s	remaining: 1m 12s
2000:	learn: 0.0247469	test: 0.1925793	best: 0.1925660 (1999)	total: 16s	remaining: 1

In [9]:
def balance_logloss(y_true, y_pred):
    """Balanced binary log loss: the mean of the two per-class log losses.

    Each row of ``y_pred`` is first rescaled to sum to one and then clipped to
    ``[1e-15, 1 - 1e-15]`` so the logarithm stays finite. The log loss is
    averaged separately over the rows of class 0 and the rows of the other
    class, and the two averages are averaged.

    Parameters
    ----------
    y_true : array-like of int, shape (n,)
        True labels; 0 is the negative class, any other value counts as
        positive. Both classes must be present.
    y_pred : array-like of float, shape (n, 2)
        Scores for class 0 (column 0) and class 1 (column 1). Rows need not
        sum to one.

    Returns
    -------
    float
        The balanced log loss.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred, dtype=float)

    # Rescale every row to sum to one. The previous version computed this
    # quotient but discarded the result, so unnormalised inputs were scored
    # as-is. Normalising before clipping matters: clipping first would squash
    # a row such as [1, 3] to roughly [1, 1].
    y_pred = y_pred / np.sum(y_pred, axis=1)[:, None]
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)

    # nc[0] / nc[1] are the number of rows with label 0 / label 1.
    nc = np.bincount(y_true)

    is_negative = y_true == 0
    loss_negative = -np.sum(np.log(y_pred[is_negative, 0])) / nc[0]
    loss_positive = -np.sum(np.log(y_pred[~is_negative, 1])) / nc[1]

    logloss = (loss_negative + loss_positive) / 2

    return logloss

# Score the out-of-fold predictions against the training labels.
# NOTE(review): the label says "robust scaling", but the training call visible
# in this notebook uses `train` rather than `train_rob` -- confirm which frame
# this score refers to.
print("robust scaling loss", balance_logloss(train['Class'], oof))

robust scaling loss 0.2766153911362194


In [10]:
# Build the submission: average the per-fold test probabilities and write the
# result into the two class columns of the sample-submission template.
sample_submission = pd.read_csv('../input/icr-identify-age-related-conditions/sample_submission.csv')
sample_submission[['class_0','class_1']] = np.asarray(pred).mean(axis=0)
sample_submission.to_csv('submission.csv', index=False)
sample_submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.689191,0.310809
1,010ebe33f668,0.689191,0.310809
2,02fa521e1838,0.689191,0.310809
3,040e15f562a2,0.689191,0.310809
4,046e85c7cc7f,0.689191,0.310809
