# Setting

## 라이브러리

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb
import gc

# Load Data

In [2]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df_submission_path = '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/제주도 도로 교통량 예측/data/sample_submission.csv'
df_train_path = '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/제주도 도로 교통량 예측/data/df_train_V1.csv'
df_test_path = '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/제주도 도로 교통량 예측/data/df_test_V1.csv'

In [5]:
csv_to_parquet(df_train_path, 'train')
csv_to_parquet(df_test_path, 'test')

train Done.
test Done.


In [6]:
df_train = pd.read_parquet('./train.parquet')
df_test = pd.read_parquet('./test.parquet')

# Preprocessing

In [7]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291241 entries, 0 to 291240
Data columns (total 36 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   Unnamed: 0             291241 non-null  int64 
 1   base_date              291241 non-null  int64 
 2   day_of_week            291241 non-null  object
 3   base_hour              291241 non-null  object
 4   lane_count             291241 non-null  object
 5   road_rating            291241 non-null  object
 6   multi_linked           291241 non-null  object
 7   connect_code           291241 non-null  object
 8   road_type              291241 non-null  object
 9   maximum_speed_limit    291241 non-null  object
 10  weight_restricted      291241 non-null  object
 11  start_node_name        291241 non-null  object
 12  end_node_name          291241 non-null  object
 13  start_latitude         291241 non-null  object
 14  end_latitude           291241 non-null  object
 15  

In [8]:
df_train.drop(columns='Unnamed: 0', inplace=True)
df_test.drop(columns='Unnamed: 0', inplace=True)

In [9]:
obj_feat = df_test.dtypes[df_test.dtypes == "object"].index.tolist()
print("Number of Categorical features: ", len(obj_feat))

Number of Categorical features:  34


In [10]:
for feature in obj_feat:
    df_train[feature] = pd.Series(df_train[feature], dtype="category")
    df_test[feature] = pd.Series(df_test[feature], dtype="category")

In [11]:
X = df_train.drop(columns='target')
y = df_train['target']

# LGBM

In [12]:
num_model = 0

n_KFold = 10
early_stopping_rounds = 100
max_depth=20

ensemble_array = np.zeros(df_test.shape[0])

In [None]:
kf = KFold(n_splits=n_KFold, shuffle=True, random_state=2022)

for train_index, vaild_index in kf.split(X):
  
  X_train, X_vaild = X.iloc[train_index], X.iloc[vaild_index]
  y_train, y_vaild = y.iloc[train_index], y.iloc[vaild_index]

  print(f'\n----------------- {num_model} 번 모델 학습 -----------------\n')

  LR = lgb.LGBMRegressor(
                         n_estimators=10000,
                         learning_rate=0.19,
                         num_leaves=2^20,
                         max_depth=-1,
                         n_jobs=-1,
                         class_weight='balanced',
                         device='cpu',
                         early_stopping_round=early_stopping_rounds,
                         )

  LR.fit(X_train, y_train,
         eval_set=[(X_train, y_train), (X_vaild, y_vaild)],
         eval_metric='rmse',
         categorical_feature=obj_feat,
         verbose=100,
         )

  y_pred = LR.predict(X_vaild)

  MAE = mean_absolute_error(y_pred, y_vaild)

  print(f'\n{num_model} 번 Model Vaild MAE : {MAE}\n')

  ensemble_array += LR.predict(df_test)/n_KFold

  num_model += 1


----------------- 0 번 모델 학습 -----------------

Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 24.8441	training's rmse: 4.98438	valid_1's l2: 32.8392	valid_1's rmse: 5.73055
[200]	training's l2: 21.0916	training's rmse: 4.59256	valid_1's l2: 29.1108	valid_1's rmse: 5.39544
[300]	training's l2: 19.2904	training's rmse: 4.39208	valid_1's l2: 27.426	valid_1's rmse: 5.23698
[400]	training's l2: 18.3332	training's rmse: 4.28173	valid_1's l2: 26.4928	valid_1's rmse: 5.14711
[500]	training's l2: 17.5226	training's rmse: 4.186	valid_1's l2: 25.7612	valid_1's rmse: 5.07555
[600]	training's l2: 17.0137	training's rmse: 4.12477	valid_1's l2: 25.3236	valid_1's rmse: 5.03225
[700]	training's l2: 16.5451	training's rmse: 4.06757	valid_1's l2: 24.9098	valid_1's rmse: 4.99097
[800]	training's l2: 16.1646	training's rmse: 4.02052	valid_1's l2: 24.612	valid_1's rmse: 4.96105
[900]	training's l2: 15.8335	training's rmse: 3.97913	valid_1's l2: 24.301	valid_1's rmse: 4.

# Submission

In [None]:
sample_submission = pd.read_csv(df_submission_path)

sample_submission.iloc[:,1:] = 0

sample_submission['target'] = ensemble_array

In [None]:
sample_submission.to_csv("/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/제주도 도로 교통량 예측/data/submission/LGBM_KFold_10_V1_lr_0.16.csv", index = False)

In [None]:
sample_submission