# Setting

## GPU

In [1]:
!nvidia-smi

Fri Nov 18 05:35:50 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0    45W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Library

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.utils import shuffle

import tensorflow as tf
from tensorflow.keras import *

import gc
import glob
from tqdm import tqdm

# Load Data

In [3]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
df_submission_path = '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/제주도 도로 교통량 예측/data/sample_submission.csv'
df_train_path = '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/제주도 도로 교통량 예측/data/train.csv'
df_test_path = '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/제주도 도로 교통량 예측/data/test.csv'

In [12]:
csv_to_parquet(df_train_path, 'train')
csv_to_parquet(df_test_path, 'test')

train Done.
test Done.


In [13]:
train = pd.read_parquet('/content/train.parquet')
test = pd.read_parquet('/content/test.parquet')

# Preprocessing

## LabelEncoder

In [14]:
str_col = ['day_of_week',
           'base_hour',
           'lane_count',
           'maximum_speed_limit',
           'start_latitude',
           'start_longitude',
           'end_latitude',
           'end_longitude',
           'road_rating',
           'weight_restricted',
           'start_turn_restricted',
           'end_turn_restricted',
           'start_node_name', 
           'end_node_name', 
           'road_type',
           'road_name', 
           'connect_code', 
           'multi_linked']

In [16]:
for i in tqdm(str_col):

    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])

100%|██████████| 18/18 [00:14<00:00,  1.22it/s]


## Split X / y 

In [18]:
y_train = train['target'] 

X_train = train.drop(['id', 'target', 'vehicle_restricted', 'height_restricted'], axis=1)

test = test.drop(['id', 'vehicle_restricted', 'height_restricted'], axis=1)

print(X_train.shape)
print(y_train.shape)
print(test.shape)

(4701217, 19)
(4701217,)
(291241, 19)


## array

In [19]:
X_train = np.array(X_train)
y_train = np.array(y_train)
test = np.array(test)

In [20]:
X_train.shape

(4701217, 19)

## External Data

### 제주시 월별 평년 값 중 최고기온 (NOAA 1991년 ~ 2022년)

In [21]:
max_temp = np.array([8, 9, 12, 17, 21, 24, 29, 29, 25, 21, 16, 10]) 

In [22]:
max_temp_max = np.max(max_temp)
max_temp = max_temp / max_temp_max 

## 휴일 (네이버 2021년 ~ 2022년)

In [23]:
holiday = np.array([
                    20210920, 
                    20210921, 
                    20210922, 
                    20211003, 
                    20211004, 
                    20211009, 
                    20211011, 
                    20211225,
                    20220101, 
                    20220131, 
                    20220201, 
                    20220202, 
                    20220301, 
                    20220309, 
                    20220505, 
                    20220508,
                    20220601, 
                    20220606, 
                    20220815
                    ]) 

### 최고 기온 및 휴일 데이터 삽입

In [24]:
x_train = []

for k, d in tqdm(enumerate(X_train)):

    month_i = np.int((d[0]%10000)/100) - 1

    d = np.concatenate([
        d[1:], [np.minimum(3,np.min(np.abs(holiday - d[0]))), max_temp[month_i]] #input data에 휴무일과의 차이(0일, 1일, 2일, 3일 이상), 최고기온에 대한 월별 평년값 추가 
        ])
    
    x_train.append([d, y_train[k]])

x_test = []

for d in tqdm(test):

    month_i = np.int((d[0]%10000)/100) - 1
    
    d = np.concatenate([
        d[1:], [np.minimum(3, np.min(np.abs(holiday - d[0]))), max_temp[month_i]]
        ])
    
    x_test.append(d)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """
4701217it [01:44, 44880.17it/s]
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
100%|██████████| 291241/291241 [00:05<00:00, 49184.20it/s]


In [25]:
x_train = np.array(x_train)
x_test = np.array(x_test)

  """Entry point for launching an IPython kernel.


# Modeling

In [26]:
def known_data_model(input_layer, start_neurons):

    #각각의 input값에 대한 embedding
    input_dims = [7, 24, 3, 3, 61, 2, 2, 6, 4, 2, 487, 586, 586, 2, 487, 586, 586, 2, 4]

    for i in range(20):

        if i==0:
            input_embedding = layers.Embedding(input_dim=input_dims[i], output_dim=start_neurons)(input_layer[:, i]) 

        elif i >= 19:
            input_embedding = layers.concatenate([input_embedding, layers.Dense(start_neurons)(input_layer[:, i:i+1])])

        else :
            input_embedding = layers.concatenate([input_embedding, layers.Embedding(input_dim=input_dims[i], output_dim=start_neurons)(input_layer[:, i])])
    
    
    
    print(input_embedding.get_shape().as_list())
    
    all_layer = input_embedding

    for layer_num in range(5):

        all_layer_d = layers.Dropout(0.2)(all_layer)
        all_layer_d_gate = layers.Dense(all_layer_d.get_shape().as_list()[-1])(all_layer_d)
        all_layer_ = all_layer * tf.math.sigmoid(all_layer_d_gate) #weighted sigmoid gate unit
        all_layer_c = layers.concatenate([all_layer, all_layer_]) 
        all_layer += layers.Dense(20*start_neurons, activation='relu')(all_layer_c)
    
    output1 = tf.squeeze(layers.Dense(1)(all_layer), axis=-1)
    output2 = tf.squeeze(layers.Dense(1)(all_layer), axis=-1)
    output3 = tf.squeeze(layers.Dense(1)(all_layer), axis=-1)
    output4 = tf.squeeze(layers.Dense(1)(all_layer), axis=-1)
    output5 = tf.squeeze(layers.Dense(1)(all_layer), axis=-1)
    output6 = tf.squeeze(layers.Dense(1)(all_layer), axis=-1)
    output7 = tf.squeeze(layers.Dense(1)(all_layer), axis=-1)
    output8 = tf.squeeze(layers.Dense(1)(all_layer), axis=-1)
    output9 = tf.squeeze(layers.Dense(1)(all_layer), axis=-1)
    output10 = tf.squeeze(layers.Dense(1)(all_layer), axis=-1)

    output = (output1 + output2 + output3 + output4 + output5 + output6 + output7 + output8 + output9 + output10) / 10 #average output

    return output 

In [27]:
strategy = tf.distribute.MirroredStrategy() # multi GPU parallelization strategy

## Training

In [33]:
mkdir models

In [None]:
#fold include day (need test w/o day), include output_dim, 256, layer 6, w/ all BatchNorm (need test w/o BatchNorm), 


# Ensemble codes in public discussion were used.
def trainGenerator():
    for data in train_data_:
        target = data[1]
        feature = data[0]
        yield (feature, target)

def valGenerator():
    for data in val_data:
        target = data[1]
        feature = data[0]
        yield (feature, target)

kfold_list = [2, 3, 4, 5, 6, 10, 20]

for kfold in kfold_list:

    kf = KFold(n_splits=kfold, random_state=42, shuffle=True)

    for fold, (train, val) in enumerate(kf.split(x_train)):


        val_data = np.array(x_train)[val]
        train_data_ = np.array(x_train)[train]


    
        tr_ds = tf.data.Dataset.from_generator(trainGenerator, (tf.float32, tf.float32), (tf.TensorShape([20]), tf.TensorShape([])))
        tr_ds = tr_ds.cache()
        tr_ds = tr_ds.shuffle(100000).padded_batch(4096)
        tr_ds = tr_ds.prefetch(tf.data.experimental.AUTOTUNE)

        val_ds = tf.data.Dataset.from_generator(valGenerator, (tf.float32, tf.float32), (tf.TensorShape([20]), tf.TensorShape([])))
        val_ds = val_ds.cache()
        val_ds = val_ds.batch(4096).prefetch(tf.data.experimental.AUTOTUNE)
    
        with strategy.scope():

            input_layer = Input((20))
  
            outputs = known_data_model(input_layer, 32)
            model = Model(input_layer, outputs)

            adam = tf.keras.optimizers.Adam()

            model.compile(optimizer=adam,
                  loss=tf.keras.losses.MeanAbsoluteError())
            callbacks = tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss', factor=.1, patience=2, verbose=0, mode='min', min_delta=1e-4, cooldown=0, min_lr=0
            )

            sv = tf.keras.callbacks.ModelCheckpoint(
            f'/content/models/ehfehf-{fold}-road_all_org_fold_{kfold}.h5', monitor='val_loss', verbose=0, save_best_only=True,
            save_weights_only=True, mode='min', save_freq='epoch'
            )

            es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, mode='min')

            model.fit(tr_ds, epochs=15000, verbose=1, validation_data=val_ds, callbacks=[callbacks, sv, es])

            del model 

            gc.collect()

print('Training complete.')

[None, 640]
Epoch 1/15000
    574/Unknown - 284s 465ms/step - loss: 4.7092

# Prediction

In [None]:
pred = []

for kfold in kfold_list:

    for n_fold in range(kfold):
      
        input_layer = Input((20))
        outputs = known_data_model(input_layer, 32)
        model = Model(input_layer, outputs)
        model.load_weights(f'./models/ehfehf-{n_fold}-road_all_org_fold_{kfold}.h5')
        val_pred = model.predict(np.array(x_test))
        pred.append(val_pred)

In [None]:
pred_sum = sum(pred)    

In [None]:
pred_sum /= len(pred)
pred_sum

In [None]:
val_pred = np.round(pred_sum) 
val_pred

# Submission

In [None]:
sample_submission = pd.read_csv('./sample_submission.csv')
sample_submission['target'] = val_pred
sample_submission.to_csv("./submit.csv", index = False)