In [None]:
# Install PyCaret with its optional "full" extras into the notebook runtime.
! pip install pycaret[full]
# Pin markupsafe to 2.0.1.
# NOTE(review): presumably needed for compatibility with PyCaret's dependencies — confirm.
! pip install markupsafe==2.0.1

In [None]:
# Install CatBoost, which the CatBoostRegressor section of this notebook imports.
! pip install catboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import StratifiedKFold, KFold
# Submission template; later cells write predictions into its 'frequency' column.
submit = pd.read_csv('/content/drive/MyDrive/ML_projects/weather/2-2_검증데이터셋.csv', encoding='cp949')

# Train / validation / test split by year:
#   year == 2016 -> test (target column removed)
#   year == 2015 -> validation
#   other years  -> training
final = pd.read_csv('/content/drive/MyDrive/ML_projects/weather/final_ver2.csv', encoding='cp949')

# .copy() makes `df` an independent frame. Later cells assign new columns to `df`
# ('weekday', 'risk_ratio'); without the copy those assignments target a filtered
# slice of `final` and raise pandas' SettingWithCopyWarning.
df = final[final['year'] != 2016].copy()
train = df[df['year'] != 2015]
val = df[df['year'] == 2015]
test = final[final['year'] == 2016].drop('frequency', axis=1)

X_train = train.drop('frequency', axis=1)
y_train = train['frequency']
X_val = val.drop('frequency', axis=1)
y_val = val['frequency']

# Year-over-year change of the mean frequency.
# groupby sorts its keys, so the positions follow ascending year order.
# NOTE(review): positions 3 and 2 are hard-coded; which calendar years they refer to
# depends on the years present in the data — confirm.
year_frequency = final.groupby(['year'])['frequency'].mean()
rate = year_frequency.values[3] / year_frequency.values[2]

# Model evaluation metric
def RMSE(true, pred):
    """Return the root-mean-squared error between ``true`` and ``pred``.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays or
    pandas Series); the result is the square root of the mean squared difference.
    """
    squared_error = np.square(true - pred)
    mean_squared_error = np.mean(squared_error)
    return np.sqrt(mean_squared_error)

# K-Fold cross-validation
def kfold(model, tr, n_splits=5, target='frequency', **fit_params):
    """Return the mean validation RMSE of ``model`` over K-Fold splits of ``tr``.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y, ...)`` and ``predict(X)``. The same
        instance is refit on every fold.
    tr : pandas.DataFrame
        Frame holding the feature columns together with the ``target`` column.
    n_splits : int, default 5
        Number of folds passed to ``KFold`` (no shuffling, as before).
    target : str, default 'frequency'
        Name of the column to predict; every other column is used as a feature.
    **fit_params
        Extra keyword arguments forwarded unchanged to ``model.fit``.

    Returns
    -------
    float
        Mean of the per-fold RMSE values.
    """
    cv = KFold(n_splits=n_splits)
    fold_scores = []

    # KFold splits purely by row position, so the target is not needed for splitting.
    for train_idx, val_idx in cv.split(tr):
        train_cv = tr.iloc[train_idx]
        val_cv = tr.iloc[val_idx]

        model.fit(train_cv.drop(target, axis=1), train_cv[target], **fit_params)
        fold_pred = model.predict(val_cv.drop(target, axis=1))

        fold_scores.append(RMSE(val_cv[target], fold_pred))

    return np.mean(fold_scores)

In [None]:
import seaborn as sns

# One histogram per column of `df` on a 6x6 grid of axes. zip() stops at the shorter
# of the two sequences, so surplus axes stay empty and surplus columns are not drawn.
fig, axes = plt.subplots(nrows=6, ncols=6, figsize=(25, 25))
fig.suptitle('Distribution of quantitative features', fontsize=40)

for ax, feature in zip(axes.ravel(), df.columns):
    sns.histplot(
        data=df,
        x=feature,
        ax=ax,
        color='#f55354',
        edgecolor='#f15354',
    )
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of `df`.
f, ax = plt.subplots(figsize=(25, 25))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, ax=ax, annot=True, fmt='.2f', linewidths=0.5)

In [None]:
# Parse the 'yyyymmdd' column (format YYYYMMDD) and keep the `.dt` accessor in `dates`;
# note that `dates` is the accessor, not the datetime Series itself.
dates = pd.to_datetime(df['yyyymmdd'], format='%Y%m%d').dt
# Day of week as an integer (pandas convention: Monday=0 ... Sunday=6).
df['weekday']= dates.weekday

In [None]:
# Ratio of 'num_risk_age' to 'tot_person', stored as a new feature column.
df['risk_ratio'] = df['num_risk_age'] / df['tot_person']

## 1) pycaret

In [None]:
from pycaret.regression import *

# Initialise the PyCaret regression experiment on the training split:
# target column 'frequency', 5 folds, GPU enabled, fixed session seed.
# The star import above is what brings setup / compare_models / tune_model /
# blend_models / finalize_model into scope for the following cells.
model = setup(data=train, target='frequency', fold=5, use_gpu=True, session_id=42)

In [None]:
# Compare PyCaret's candidate regressors, sorted by MSE (scores rounded to 3 decimals),
# and keep the five best as a list.
top5_models = compare_models(n_select=5, sort='MSE', round=3)

In [None]:
# Tune each selected model with 15 search iterations.
# The selection is bound to `top5_models` by the compare_models cell; the name
# `top6_models` used here before is never defined and raised NameError.
tuned_models = [tune_model(candidate, n_iter=15) for candidate in top5_models]

In [None]:
# Combine the tuned models into a single blended estimator.
blended_model = blend_models(tuned_models)

In [None]:
# Finalize the blended model for use on unseen data.
# NOTE(review): PyCaret documents finalize_model as refitting on the full dataset
# passed to setup, including its hold-out part — confirm for the installed version.
final_model = finalize_model(blended_model)

## 2) CatBoostRegressor

In [3]:
# Imports for the CatBoost section. This cell repeats the notebook's base imports
# so it can be run on its own.
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.preprocessing import RobustScaler
import warnings
# Hide FutureWarning messages for everything executed after this line.
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import StratifiedKFold, KFold
import random
# Seed Python's and NumPy's global random number generators.
random.seed(42)
np.random.seed(42)
# Submission template and the full feature table (both cp949-encoded CSV files).
submit = pd.read_csv('/content/drive/MyDrive/ML_projects/weather/2-2_검증데이터셋.csv', encoding='cp949')
final = pd.read_csv('/content/drive/MyDrive/ML_projects/weather/final_ver2.csv', encoding='cp949')

In [6]:
# Display the column index of `final`.
final.columns

Index(['yyyymmdd', 'area', 'sex', 'frequency', 'tot_person', 'year', 'month',
       'day', 'day_differ', 'month_differ', 'avg_tca_mean', 'ssrate_mean',
       'avg_rhm_mean', 'min_rhm_mean', 'avg_ws_mean', 'avg_min_tg_mean',
       'min_tg_mean', 'avg_ta_mean', 'max_ta_mean', 'min_ta_mean', 'SO2_mean',
       'CO_mean', 'O3_mean', 'NO2_mean', 'PM10_mean', 'weekday', 'risk_ratio',
       'day_diff', 'temp_humi', 'temp_threshold', 'season', 'diff_risk'],
      dtype='object')

In [None]:
# Reload the feature table so this cell always starts from the file contents.
final = pd.read_csv('/content/drive/MyDrive/ML_projects/weather/final_ver2.csv', encoding='cp949')

# --- Create new features ---
dates = pd.to_datetime(final['yyyymmdd'], format='%Y%m%d').dt
final['weekday'] = dates.weekday
final['risk_ratio'] = final['num_risk_age'] / final['tot_person']
final['day_diff'] = final['max_ta_mean'] - final['min_ta_mean']       # max minus min temperature column
final['temp_humi'] = final['avg_rhm_mean'] * final['avg_ta_mean']     # humidity * temperature
# https://www.karger.com/Article/Fulltext/505122

# Flag rows whose minimum temperature is below -1.5 or whose average is at least 27.5.
final['temp_threshold'] = np.where((final['min_ta_mean'] < -1.5) |
                                   (final['avg_ta_mean'] >= 27.5), 1, 0)
# https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0094070

# Season code from the month: 1 = Mar-May, 2 = Jun-Aug, 3 = Sep-Nov, 4 = Dec-Feb.
conditionlist = [
    (final['month'].isin([3,4,5])),
    (final['month'].isin([6,7,8])),
    (final['month'].isin([9,10,11])),
    (final['month'].isin([12,1,2]))]
choicelist = [1,2,3,4]
# The default must share a dtype with the integer choices. The former string default
# ('Not Specified') turned the whole column into strings on older NumPy and raises a
# dtype-promotion TypeError on recent NumPy. NaN keeps the column numeric (float) and
# still marks rows whose month matches none of the conditions.
final['season'] = np.select(conditionlist, choicelist, default=np.nan)
final['diff_risk'] = np.where(final['day_diff'] >= 10, 1, 0)

# Drop the columns excluded from modelling. Other candidates that were tried but are
# currently kept: avg_tca_mean, min_rhm_mean, min_tg_mean, max_ta_mean, min_ta_mean,
# SO2_mean, CO_mean, O3_mean, NO2_mean, PM10_mean, temp_humi.
final = final.drop(columns=[
    'num_risk_age', 'sum_rn_mean', 'mi10_max_rn_mean', 'hr1_max_rn_mean',
    'hr6_max_rn_mean', 'sum_ss_hr_mean', 'max_ws_mean',
])

# --- Train / test split: 2016 is the test year, everything else is used for fitting ---
df = final[final['year'] != 2016].drop('yyyymmdd', axis=1)
test = final[final['year'] == 2016].drop(['frequency', 'yyyymmdd'], axis=1)

X = df.drop('frequency', axis=1)
y = df['frequency']

from sklearn.model_selection import train_test_split
# Random 90/10 split of the pre-2016 rows, used as the evaluation set during fitting.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# --- Modelling ---
# cat_features=[0] marks the first feature column as categorical.
# NOTE(review): per the column listing shown earlier in the notebook that column is
# 'area' once 'yyyymmdd' and 'frequency' are removed — confirm if the column order changes.
CAT = CatBoostRegressor(verbose=2, random_state=42, cat_features=[0])
CAT.fit(X_train, y_train,
        cat_features=[0],
        eval_set=(X_val, y_val))
pred = CAT.predict(test)

In [None]:
# CatBoost submission: refit on all pre-2016 rows (X, y), predict the 2016 rows in
# `test`, and write the predictions into the submission template.
submit = pd.read_csv('/content/drive/MyDrive/ML_projects/weather/2-2_검증데이터셋.csv', encoding='cp949')

CAT = CatBoostRegressor(cat_features=[0], random_state=42, verbose=2)
CAT.fit(X, y)

pred = CAT.predict(test)
submit['frequency'] = pred
submit.to_csv('/content/drive/MyDrive/ML_projects/weather/220139.csv', index=False, encoding='cp949')

In [None]:
# CatBoost configuration to cross-validate: first column categorical, one-hot
# encoding for categorical features with at most 17 distinct values.
CAT = CatBoostRegressor(verbose=1, random_seed=42, cat_features=[0], one_hot_max_size=17)


# K-Fold cross-validation (CatBoost variant: passes cat_features=[0] to fit).
# This definition replaces the earlier `kfold` from the top of the notebook.
def kfold(model, tr):
    """Return the mean 5-fold validation RMSE of ``model`` on frame ``tr``.

    ``tr`` must contain the target column 'frequency'; all remaining columns are
    used as features. The model is refit on every fold with ``cat_features=[0]``.
    """
    splitter = KFold(n_splits=5)
    fold_rmse = []

    for fit_rows, holdout_rows in splitter.split(tr, tr['frequency']):
        fit_part = tr.iloc[fit_rows]
        holdout_part = tr.iloc[holdout_rows]

        model.fit(
            fit_part.drop('frequency', axis=1),
            fit_part['frequency'],
            cat_features=[0],
        )
        holdout_pred = model.predict(holdout_part.drop('frequency', axis=1))
        fold_rmse.append(RMSE(holdout_part['frequency'], holdout_pred))

    return np.mean(fold_rmse)


kfold(CAT, df)

## 3) LGBM

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Reload the feature table so this cell always starts from the file contents.
final = pd.read_csv('/content/drive/MyDrive/ML_projects/weather/final_ver2.csv', encoding='cp949')

# --- Create new features ---
dates = pd.to_datetime(final['yyyymmdd'], format='%Y%m%d').dt
final['weekday'] = dates.weekday
final['risk_ratio'] = final['num_risk_age'] / final['tot_person']
final['day_diff'] = final['max_ta_mean'] - final['min_ta_mean']

# Season code from the month: 1 = Mar-May, 2 = Jun-Aug, 3 = Sep-Nov, 4 = Dec-Feb.
conditionlist = [
    (final['month'].isin([3,4,5])),
    (final['month'].isin([6,7,8])),
    (final['month'].isin([9,10,11])),
    (final['month'].isin([12,1,2]))]
choicelist = [1,2,3,4]
# The default must share a dtype with the integer choices. The former string default
# ('Not Specified') turned the column into strings on older NumPy and raises a
# dtype-promotion TypeError on recent NumPy. NaN keeps the column numeric, which the
# scaler below requires.
final['season'] = np.select(conditionlist, choicelist, default=np.nan)
final['diff_risk'] = np.where(final['day_diff'] >= 10, 1, 0)

# Drop the columns excluded from modelling. Other candidates that were tried but are
# currently kept: avg_tca_mean, min_rhm_mean, min_tg_mean, max_ta_mean, min_ta_mean,
# SO2_mean, CO_mean, O3_mean, NO2_mean, PM10_mean.
final = final.drop(columns=[
    'num_risk_age', 'sum_rn_mean', 'mi10_max_rn_mean', 'hr1_max_rn_mean',
    'hr6_max_rn_mean', 'sum_ss_hr_mean', 'max_ws_mean',
])

# --- Min-max scaling ---
# `a`: columns to scale; `b`: columns kept as-is (categorical id, date parts, target).
# NOTE(review): the scaler is fitted on every row, including the 2016 test rows, and
# 'yyyymmdd' is scaled and kept as a feature here (the CatBoost cell drops it) — confirm
# both are intended.
scaler_ = MinMaxScaler()
a = final.drop(['area', 'year', 'month', 'day', 'frequency'], axis=1)
b = final[['area', 'year', 'month', 'day', 'frequency']]
scaler_.fit(a)
X_train_scaled = scaler_.transform(a)

# Rebuild the scaled frame on the original index so the column-wise concat below
# aligns row-for-row with `b` even if `final` does not carry a default RangeIndex.
wow = pd.DataFrame(X_train_scaled, columns=a.columns, index=a.index)
holi = pd.concat([b, wow], axis=1)

# Cast categorical features to the pandas 'category' dtype.
cat_features = ['area']
for ca in cat_features:
    holi[ca] = holi[ca].astype('category')

# --- Train / test split: 2016 is the test year ---
df = holi[holi['year'] != 2016]
test = holi[holi['year'] == 2016].drop('frequency', axis=1)
X = df.drop('frequency', axis=1)
y = df['frequency']

from sklearn.model_selection import train_test_split
# Random 90/10 split of the pre-2016 rows, used as the evaluation set during fitting.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
from lightgbm import LGBMRegressor

# Gradient-boosted regressor: MSE objective, RMSE reported on the evaluation set.
LGBM = LGBMRegressor(
    objective='mse',
    metric='rmse',
    learning_rate=0.1,
    n_estimators=2000,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)
LGBM.fit(X_train, y_train, eval_set=(X_val, y_val))

In [None]:
# LightGBM submission: predict the 2016 rows with the LGBM model fitted above.
# The earlier code called CAT.predict here, i.e. the CatBoost model from the previous
# section, which was trained on a differently prepared feature set than this `test`.
submit = pd.read_csv('/content/drive/MyDrive/ML_projects/weather/2-2_검증데이터셋.csv', encoding='cp949')
pred = LGBM.predict(test)
submit['frequency'] = pred
submit