In [1]:
!pip install catboost



In [2]:
import warnings
warnings.filterwarnings('ignore')

import os, sys, gc, warnings, random

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import GradientBoostingRegressor

import lightgbm as lgb
from catboost import CatBoostClassifier, Pool

from tqdm.notebook import tqdm

pd.set_option('display.max_row', 200)
pd.set_option('display.max_columns', 100)
%matplotlib inline

In [3]:
def seed_everything(SEED=9):
    """Seed the global random number generators used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both Python's
    ``random`` module and NumPy's legacy global RNG with the same value.

    Note: ``PYTHONHASHSEED`` is read at interpreter start-up, so setting it
    here only affects processes launched afterwards.
    """
    os.environ['PYTHONHASHSEED'] = str(SEED)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(SEED)


# Single seed shared by the RNGs above, the K-fold split and the model.
SEED = 9
seed_everything(SEED)

### 데이터 가져오기

In [4]:
# Load the symptom dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('cleaned_symptom2.csv') 

### NULL, 필요없는 데이터 제거

In [5]:
df.corona_result.unique()

array(['שלילי', 'אחר', 'חיובי'], dtype=object)

In [6]:
df.isna().sum()

test_date                    0
cough                        0
fever                        0
sore_throat                  0
shortness_of_breath          0
head_ache                    0
corona_result                0
age_60_and_above       1789274
gender                  601128
test_indication              0
dtype: int64

In [7]:
df.shape

(7861471, 10)

In [8]:
# Keep only rows with a known age group. Testing the single column avoids
# building a null mask for every column of the frame.
df = df[df['age_60_and_above'].notna()]

In [9]:
# Keep only rows with a known gender. Testing the single column avoids
# building a null mask for every column of the frame.
df = df[df['gender'].notna()]

In [10]:
# Discard rows whose test result is 'אחר' ("other"), leaving only the
# negative / positive outcomes for binary classification.
is_other_result = df['corona_result'] == 'אחר'
df = df.loc[~is_other_result]

### 코로나 결과 인코딩

In [11]:
df.corona_result.unique() # 'שלילי' = negative, 'חיובי' = positive

array(['שלילי', 'חיובי'], dtype=object)

In [12]:
dict2 = {key:value for (key,value) in zip(df.corona_result.unique(), range(2))}

In [13]:
dict2

{'שלילי': 0, 'חיובי': 1}

In [14]:
# Vectorised dict lookup instead of a per-row Python lambda. A value missing
# from dict2 becomes NaN, and the integer cast then raises, so unexpected
# labels still fail loudly.
df['corona_result'] = df['corona_result'].map(dict2).astype('int64')


### Gender 인코딩

In [15]:
df.gender.unique() # 'נקבה' = female (encoded 0), 'זכר' = male (encoded 1)

array(['נקבה', 'זכר'], dtype=object)

In [16]:
dict_gender = {key:value for (key,value) in zip(df.gender.unique(), range(2))}

In [17]:
dict_gender

{'נקבה': 0, 'זכר': 1}

In [18]:
# Vectorised dict lookup instead of a per-row Python lambda. A value missing
# from dict_gender becomes NaN, and the integer cast then raises, so
# unexpected values still fail loudly.
df['gender'] = df['gender'].map(dict_gender).astype('int64')

In [19]:
df.test_indication.unique()

array(['Other', 'Abroad', 'Contact with confirmed'], dtype=object)

### 60이상 인코딩

In [20]:
dict_60 = {key:value for (key,value) in zip(df.age_60_and_above.unique(), range(1,-1,-1))}
dict_60

{'Yes': 1, 'No': 0}

In [21]:
# Vectorised dict lookup instead of a per-row Python lambda. A value missing
# from dict_60 becomes NaN, and the integer cast then raises, so unexpected
# values still fail loudly.
df['age_60_and_above'] = df['age_60_and_above'].map(dict_60).astype('int64')

In [22]:
df

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
322580,2021-10-11,0,0,0,0,0,0,1,0,Other
322581,2021-10-11,0,0,0,0,0,0,0,1,Other
322582,2021-10-11,0,0,0,0,0,0,1,0,Other
322583,2021-10-11,0,0,0,0,0,0,1,1,Other
322584,2021-10-11,0,0,0,0,0,0,1,0,Other
...,...,...,...,...,...,...,...,...,...,...
7850517,2020-03-20,1,0,0,0,0,0,1,0,Other
7850518,2020-03-20,0,0,0,0,0,0,1,1,Other
7850519,2020-03-20,0,0,0,0,0,0,1,0,Other
7850520,2020-03-20,0,0,0,0,0,0,0,0,Other


In [23]:
# Remove the test_date column; it is not used as a model feature.
df = df.drop(columns=['test_date'])

### 테스트 인코딩 - 확진자접촉, 해외, 기타

In [28]:
# Explicit encoding of the test indication. A literal mapping keeps the codes
# stable regardless of the order in which the categories appear in the data.
dict_test = {'Other': 2, 'Abroad': 1, 'Contact with confirmed': 0}
dict_test

{'Other': 2, 'Abroad': 1, 'Contact with confirmed': 0}

In [29]:
# Vectorised dict lookup instead of a per-row Python lambda. A value missing
# from dict_test becomes NaN, and the integer cast then raises, so unexpected
# categories still fail loudly.
df['test_indication'] = df['test_indication'].map(dict_test).astype('int64')

In [30]:
df

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
322580,0,0,0,0,0,0,1,0,2
322581,0,0,0,0,0,0,0,1,2
322582,0,0,0,0,0,0,1,0,2
322583,0,0,0,0,0,0,1,1,2
322584,0,0,0,0,0,0,1,0,2
...,...,...,...,...,...,...,...,...,...
7850517,1,0,0,0,0,0,1,0,2
7850518,0,0,0,0,0,0,1,1,2
7850519,0,0,0,0,0,0,1,0,2
7850520,0,0,0,0,0,0,0,0,2


In [31]:
# Feature matrix: every column except the target label.
train_df = df.drop(columns=['corona_result'])

## Feature 생성

In [32]:
# Engineered feature: number of reported symptoms per row, summed over the
# columns from 'cough' through 'head_ache' (label-based slice, both ends inclusive).
symptom_flags = train_df.loc[:, 'cough':'head_ache']
train_df['sum_symptom'] = symptom_flags.sum(axis=1)

In [33]:
train_df

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,age_60_and_above,gender,test_indication,sum_symptom
322580,0,0,0,0,0,1,0,2,0
322581,0,0,0,0,0,0,1,2,0
322582,0,0,0,0,0,1,0,2,0
322583,0,0,0,0,0,1,1,2,0
322584,0,0,0,0,0,1,0,2,0
...,...,...,...,...,...,...,...,...,...
7850517,1,0,0,0,0,1,0,2,1
7850518,0,0,0,0,0,1,1,2,0
7850519,0,0,0,0,0,1,0,2,0
7850520,0,0,0,0,0,0,0,2,0


In [34]:
# Select the target by name rather than by position, so the label does not
# silently change if columns are added, dropped or reordered upstream.
# Double brackets keep it a single-column DataFrame, as before.
label_df = df[['corona_result']]
label_df

Unnamed: 0,corona_result
322580,0
322581,0
322582,0
322583,0
322584,0
...,...
7850517,0
7850518,0
7850519,0
7850520,0


In [37]:
# Hyper-parameters forwarded as keyword arguments to CatBoostClassifier.
model_params = dict(
    n_estimators=5000,
    learning_rate=0.01,
    eval_metric='AUC',
    loss_function='CrossEntropy',
    od_wait=500,
    metric_period=100,
    depth=8,
    rsm=0.8,  # column subsampling
    random_seed=SEED,
    subsample=0.8,  # row subsampling
    max_ctr_complexity=1,
    # task_type='GPU',  # left disabled
)
def train_model(model_params, train_df, label_df, ratio):
    """Fit a CatBoost classifier on the first fold of a shuffled 5-fold split.

    Parameters
    ----------
    model_params : dict
        Keyword arguments forwarded to ``CatBoostClassifier``.
    train_df : pandas.DataFrame
        Feature matrix. Every column is passed to CatBoost as a categorical
        feature.
    label_df : pandas.DataFrame
        Target column(s), row-aligned with ``train_df``.
    ratio : float
        Fraction of the data to use. The leading ``int(len(train_df) * ratio)``
        rows are taken; rows are not sampled.

    Returns
    -------
    CatBoostClassifier
        The model fitted on the training part of fold 0, with the held-out
        part of that fold used as the evaluation set.

    Notes
    -----
    Only one fold is trained, as before; the remaining four splits are never
    used. The unused LightGBM dataset, the placeholder prediction arrays, the
    feature-importance frame and the discarded ``predict_proba`` pass were
    removed, since none of them affected the returned model.
    """
    n_rows = int(len(train_df) * ratio)
    # Slice first, then copy, so only the selected rows are duplicated.
    features = train_df.iloc[:n_rows, :].copy()
    labels = label_df.iloc[:n_rows, :].copy()

    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    # Take just the first (train, validation) split.
    train_index, test_index = next(kf.split(features))
    x_train, x_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

    clf = CatBoostClassifier(**model_params)
    clf.fit(x_train, y_train, eval_set=(x_test, y_test),
            cat_features=list(features.columns),
            use_best_model=True,
            verbose=True)

    gc.collect()
    return clf

In [None]:
# Train on the full dataset (ratio=1.0); train_model fits a single fold only.
clf = train_model(model_params, train_df, label_df, 1.0)

# Experiment log. Each entry builds on the previous configuration; the score is
# the evaluation-set AUC (eval_metric='AUC').
# The numbers in parentheses are presumably the best iteration - TODO confirm.
# A - base, 0.4 data -> 0.78223 (512)
# B - A + sum_symptom -> 0.78223
# C - B + depth 8 -> 0.78224 (759)
# D - C + 0.6 data -> 0.7901
# E - D + 0.8 data -> 0.7903
# F - E + max_ctr_complexity 1 -> 0.7904
# G - F + 1.0 data -> 0.7864



0:	test: 0.6828957	best: 0.6828957 (0)	total: 999ms	remaining: 1h 23m 11s
100:	test: 0.7837786	best: 0.7837786 (100)	total: 1m 19s	remaining: 1h 4m 5s
200:	test: 0.7853651	best: 0.7853651 (200)	total: 2m 44s	remaining: 1h 5m 20s
300:	test: 0.7860484	best: 0.7860484 (300)	total: 4m 6s	remaining: 1h 4m 13s
400:	test: 0.7863273	best: 0.7863273 (400)	total: 5m 25s	remaining: 1h 2m 8s
500:	test: 0.7864021	best: 0.7864039 (487)	total: 6m 44s	remaining: 1h 33s
600:	test: 0.7864097	best: 0.7864187 (578)	total: 8m 5s	remaining: 59m 12s
700:	test: 0.7864180	best: 0.7864187 (578)	total: 9m 26s	remaining: 57m 54s
800:	test: 0.7864611	best: 0.7864611 (799)	total: 10m 42s	remaining: 56m 6s
900:	test: 0.7864611	best: 0.7864611 (799)	total: 11m 55s	remaining: 54m 14s
1000:	test: 0.7864613	best: 0.7864614 (997)	total: 13m 15s	remaining: 52m 57s
1100:	test: 0.7864615	best: 0.7864615 (1079)	total: 14m 31s	remaining: 51m 27s
1200:	test: 0.7864620	best: 0.7864620 (1162)	total: 15m 46s	remaining: 49m 53s
13

In [None]:
gc.collect()

50

## 모델 저장

In [None]:
# clf.save_model('/content/drive/MyDrive/yeardream/model/fold0.cbm', format='cbm')