# Setting

## GPU

In [None]:
# Show the GPU attached to this runtime (device name, driver/CUDA version, memory usage).
!nvidia-smi

Sun Nov 13 07:30:25 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   29C    P0    46W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Install

In [None]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.5 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.1.1


## Library

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold

from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm
import gc

from sklearn.linear_model import LogisticRegression

# Load Data

In [None]:
def csv_to_parquet(csv_path, save_name):
    """Convert a CSV file into a Parquet file in the current directory.

    Args:
        csv_path: Path of the CSV file to read.
        save_name: Base name of the output; the file is written to
            ``./<save_name>.parquet``.

    Returns:
        None. Prints ``<save_name> Done.`` once the file has been written.
    """
    frame = pd.read_csv(csv_path)
    parquet_path = f'./{save_name}.parquet'
    frame.to_parquet(parquet_path)

    # Drop the only reference to the frame and force a collection so the
    # memory is released before the next file is converted.
    del frame
    gc.collect()

    print(save_name, 'Done.')

In [None]:
# Mount Google Drive at /content/drive so the data files referenced by the
# path variables below are readable (this cell only works on Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Single place to change when the data directory moves; every input path
# below is derived from it.
DATA_DIR = '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/제주도 도로 교통량 예측/data'

df_submission_path = f'{DATA_DIR}/sample_submission.csv'
df_train_path = f'{DATA_DIR}/df_train_V10.csv'
df_test_path = f'{DATA_DIR}/df_test_V10.csv'

In [None]:
# Convert both CSV inputs to local Parquet files, train first and then test.
for source_csv, parquet_name in ((df_train_path, 'train'), (df_test_path, 'test')):
    csv_to_parquet(source_csv, parquet_name)

train Done.
test Done.


In [None]:
# Load the local Parquet copies of the train and test sets.
train_parquet, test_parquet = './train.parquet', './test.parquet'
df_train = pd.read_parquet(train_parquet)
df_test = pd.read_parquet(test_parquet)

In [None]:
# Print the test frame's columns, dtypes and non-null counts.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291241 entries, 0 to 291240
Data columns (total 40 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   Unnamed: 0               291241 non-null  int64 
 1   base_date                291241 non-null  int64 
 2   day_of_week              291241 non-null  object
 3   base_hour                291241 non-null  object
 4   lane_count               291241 non-null  object
 5   road_rating              291241 non-null  object
 6   multi_linked             291241 non-null  object
 7   connect_code             291241 non-null  object
 8   road_type                291241 non-null  object
 9   maximum_speed_limit      291241 non-null  object
 10  weight_restricted        291241 non-null  object
 11  start_turn_restricted    291241 non-null  object
 12  end_turn_restricted      291241 non-null  object
 13  road_name_is_start       291241 non-null  object
 14  road_name_is_end    

# Preprocessing

In [None]:
# Remove the 'Unnamed: 0' column (presumably the row index that was written
# out with the CSV — confirm). Rebinding instead of dropping in place, with
# errors='ignore', keeps this cell re-runnable: a second execution no longer
# raises KeyError because the column is already gone.
df_train = df_train.drop(columns='Unnamed: 0', errors='ignore')
df_test = df_test.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Separate the label series from the feature matrix.
y = df_train['target']
X = df_train.drop('target', axis=1)

In [None]:
# Every object-dtype feature column is treated as categorical.
cat_cols = [column for column, dtype in X.dtypes.items() if dtype == "object"]
print("Number of Categorical features: ", len(cat_cols))

Number of Categorical features:  36


# Stacking

In [None]:
def get_stacking_base_datasets(X, y, df_test, n_splits, cat_features=None):
    """Build out-of-fold and test predictions from K-fold CatBoost models.

    One CatBoostRegressor is trained per fold. Its predictions on the held-out
    fold fill the matching rows of the out-of-fold array, and its predictions
    on ``df_test`` fill one column of the test matrix, which is averaged over
    the folds at the end.

    Args:
        X: Training features as a pandas DataFrame.
        y: Training target as a pandas Series, positionally aligned with ``X``.
        df_test: Test features passed to ``predict`` of every fold model.
        n_splits: Number of folds (and therefore of trained models).
        cat_features: Names of the categorical columns. When ``None`` (the
            default) every object-dtype column of ``X`` is used, so the
            function no longer depends on a notebook-level ``cat_cols``.

    Returns:
        Tuple ``(train_fold_pred, test_pred_mean)``:
        ``train_fold_pred`` has shape ``(len(X), 1)`` and holds the
        out-of-fold predictions; ``test_pred_mean`` has shape
        ``(len(df_test), 1)`` and holds the fold-averaged test predictions.
    """
    if cat_features is None:
        cat_features = X.dtypes[X.dtypes == "object"].index.tolist()

    train_fold_pred = np.zeros((X.shape[0], 1))
    test_pred = np.zeros((df_test.shape[0], n_splits))

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=2022)

    for num_model, (train_index, valid_index) in enumerate(kf.split(X)):

        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        print(f'\n----------------- {num_model} 번 모델 학습 -----------------\n')

        CAT = CatBoostRegressor(
            depth=14,
            task_type='GPU',
        )

        # use_best_model needs the eval_set, so it is set here next to it
        # rather than duplicated on the constructor.
        CAT.fit(X_train, y_train,
                eval_set=(X_valid, y_valid),
                cat_features=cat_features,
                early_stopping_rounds=100,
                use_best_model=True,
                verbose=100)

        y_pred = CAT.predict(X_valid)

        # scikit-learn's convention is (y_true, y_pred).
        MAE = mean_absolute_error(y_valid, y_pred)
        print(f'\n{num_model} 번 Model Vaild MAE : {MAE}\n')

        # Rows are addressed by position, matching the iloc-based split above.
        train_fold_pred[valid_index, :] = y_pred.reshape(-1, 1)
        test_pred[:, num_model] = CAT.predict(df_test)

    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

In [None]:
# Train one model per fold (10 folds) and collect the out-of-fold train
# predictions plus the fold-averaged test predictions.
train_10, test_10 = get_stacking_base_datasets(X, y, df_test, n_splits=10)


----------------- 0 번 모델 학습 -----------------

Learning rate set to 0.153027
0:	learn: 13.9687603	test: 13.9487120	best: 13.9487120 (0)	total: 424ms	remaining: 7m 3s
100:	learn: 4.8538606	test: 4.8518834	best: 4.8518834 (100)	total: 45.7s	remaining: 6m 46s
200:	learn: 4.5954245	test: 4.6239673	best: 4.6239673 (200)	total: 1m 34s	remaining: 6m 13s
300:	learn: 4.4602181	test: 4.5274241	best: 4.5274241 (300)	total: 2m 22s	remaining: 5m 30s
400:	learn: 4.3532147	test: 4.4626009	best: 4.4626009 (400)	total: 3m 11s	remaining: 4m 45s
500:	learn: 4.2700544	test: 4.4210609	best: 4.4210609 (500)	total: 4m	remaining: 3m 59s
600:	learn: 4.1974105	test: 4.3896443	best: 4.3896443 (600)	total: 4m 49s	remaining: 3m 11s
700:	learn: 4.1358349	test: 4.3657384	best: 4.3657384 (700)	total: 5m 37s	remaining: 2m 23s
800:	learn: 4.0788873	test: 4.3464594	best: 4.3464594 (800)	total: 6m 26s	remaining: 1m 35s
900:	learn: 4.0247945	test: 4.3299046	best: 4.3299046 (900)	total: 7m 14s	remaining: 47.8s
999:	learn:

# Submission

In [None]:
# Out-of-fold predictions (column 0) next to the true target.
CV_Stacking_train = pd.DataFrame(train_10)
# train_10 rows are positional, so attach y by position as well: plain
# `= y` would align on index labels and insert NaN wherever y's index is not
# the default 0..n-1 range.
CV_Stacking_train['target'] = y.to_numpy()

# Fold-averaged test predictions (column 0).
CV_Stacking_test = pd.DataFrame(test_10)

In [None]:
# Display the out-of-fold prediction frame as the cell output.
CV_Stacking_train

In [None]:
# Display the averaged test prediction frame as the cell output.
CV_Stacking_test

In [None]:
# Save both stacking frames to Drive without the index column.
# NOTE(review): the file names say "verson_9_depth_15_15", but this notebook
# reads the *_V10.csv inputs and trains with depth=14 — confirm the names are
# not stale ("verson" also looks like a typo for "version").
CV_Stacking_train.to_csv('/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/제주도 도로 교통량 예측/data/submission/cv_stacking_train_verson_9_depth_15_15.csv', index=False)
CV_Stacking_test.to_csv('/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/제주도 도로 교통량 예측/data/submission/cv_stacking_test_verson_9_depth_15_15.csv', index=False)

In [None]:
# Fill the sample submission with the test predictions in column 0.
sub = pd.read_csv(df_submission_path)
# Assign by position: .to_numpy() skips index alignment, so a row-count
# mismatch raises immediately instead of silently writing NaN targets.
sub['target'] = CV_Stacking_test[0].to_numpy()

In [None]:
# Write the submission file to Drive without the index column.
# NOTE(review): the file name says "verson_9_depth_15_15", but this notebook
# reads the *_V10.csv inputs and trains with depth=14 — confirm it is not stale.
sub.to_csv('/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/제주도 도로 교통량 예측/data/submission/test_verson_9_depth_15_15.csv', index=False)