In [1]:
import pandas as pd 
import numpy as np 
import itertools
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import xgboost
import tensorflow as tf


In [2]:
SEED = 42
tf.random.set_seed(SEED)

In [3]:
# Define the function to return the SMAPE value
def smape(actual, predicted) -> float:
  
    # Convert actual and predicted to numpy
    # array data type if not already
    if not all([isinstance(actual, np.ndarray), 
                isinstance(predicted, np.ndarray)]):
        actual, predicted = np.array(actual),
        np.array(predicted)
  
    return round(
        np.mean(
            np.abs(predicted - actual) / 
            ((np.abs(predicted) + np.abs(actual))/2)
        )*100, 6
    )

In [4]:
df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,waktu_setempat,id_jalan,id_titik_mulai,id_titik_akhir,rerata_kecepatan
0,2020-02-01 01:00:00+00:00,691007296,21390008,1425033102,29.126
1,2020-02-01 01:00:00+00:00,47010584,1677092762,579493410,46.576
2,2020-02-01 01:00:00+00:00,22932408,26486694,1930267566,36.587
3,2020-02-01 01:00:00+00:00,142479648,1111592522,3775231113,34.063
4,2020-02-01 01:00:00+00:00,8504977,5940503398,5940503394,38.336


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398648 entries, 0 to 398647
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   waktu_setempat    398648 non-null  object 
 1   id_jalan          398648 non-null  int64  
 2   id_titik_mulai    398648 non-null  int64  
 3   id_titik_akhir    398648 non-null  int64  
 4   rerata_kecepatan  398648 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 15.2+ MB


In [6]:
for col in df.columns:
    print(f'column: {col}; n_unique: {df[col].unique().shape[0]}')

column: waktu_setempat; n_unique: 527
column: id_jalan; n_unique: 20
column: id_titik_mulai; n_unique: 488
column: id_titik_akhir; n_unique: 488
column: rerata_kecepatan; n_unique: 29023


In [7]:
id_titik = np.sort(np.unique(np.concatenate((df['id_titik_mulai'].unique(), df['id_titik_akhir'].unique()), axis=0)), axis=0, kind='mergesort')
id_jalan = np.sort(df['id_jalan'].unique(), axis=0, kind='mergesort')
waktu_setempat = np.sort(df['waktu_setempat'].unique(), axis=0, kind='mergesort')

In [8]:
cols = ['waktu_setempat','id_jalan', 'id_titik_mulai', 'id_titik_akhir']
for col in cols:
    tmp = []
    for data in df[col].to_numpy():
        if col == 'id_jalan':
            idx = np.where(id_jalan == data)
            idx = idx[0][0] / id_jalan.shape[0]
        elif col == 'waktu_setempat':
            idx = np.where(waktu_setempat == data)
            idx = idx[0][0]
        else:
            idx = np.where(id_titik == data)
            idx = idx[0][0] / id_titik.shape[0]
        tmp.append(idx)
    df[col] = tmp
tmp = df['rerata_kecepatan'].to_numpy()
max_avg = df['rerata_kecepatan'].max() * 1.2
df['rerata_kecepatan'] = tmp / max_avg
df.head()

Unnamed: 0,waktu_setempat,id_jalan,id_titik_mulai,id_titik_akhir,rerata_kecepatan
0,0,0.95,0.209016,0.706967,0.401889
1,0,0.6,0.77459,0.571721,0.642669
2,0,0.4,0.272541,0.79918,0.504838
3,0,0.8,0.67418,0.866803,0.470011
4,0,0.35,0.956967,0.954918,0.528971


In [9]:
df.shape

(398648, 5)

In [11]:
def create_dataset(df, window_len, forecast_len):
    X = []
    X2 = []
    Y = []
    no_prev = np.array([[-1, -1, -1, -1]]).astype(np.float32)
    n_waktu_setempat = 527
    n_jalan = 20
    n_titik_mulai = 488
    n_titik_akhir = 488
    split_by_jalan = []
    df = df.sort_values(by=['waktu_setempat', 'id_jalan', 'id_titik_mulai', 'id_titik_akhir'])
    for i in range(n_jalan):
        tmp = df[(df['id_jalan'] == i/20)]
        tmp = tmp.reset_index(drop=True)
        split_by_jalan.append(tmp)
    for i in tqdm(range(n_jalan)):
        data = split_by_jalan[i]
        for t in range(n_waktu_setempat):
            # for j in range(n_titik_mulai):
            tmp = []
            for k in range(window_len):
                tmp2 = data[(data['waktu_setempat'] == t+k)]
                tmp2 = tmp2.reset_index(drop=True)
                tmp2 = tmp2.drop(['waktu_setempat'], axis=1)
                tmp2 = tmp2.to_numpy().astype(np.float32)
                tmp2 = np.concatenate((tmp2, no_prev), axis=0)
                tmp.append(tmp2)
            tmp2 = data[(data['waktu_setempat'] == t+window_len)]
            tmp2 = tmp2.reset_index(drop=True)
            tmp4 = tmp2['rerata_kecepatan'].to_numpy()
            tmp2 = tmp2.drop(['waktu_setempat', 'rerata_kecepatan'], axis=1)
            tmp2 = tmp2.to_numpy()
            if window_len > 1:
                tmp3 = list(itertools.product(*tmp))
                
            else:
                tmp3 = tmp[0]
            tmp3 = np.asarray(tmp3)
            tmpX = []
            tmpY = []
            for _tmp in tmp3:
                for idx, __tmp in enumerate(tmp2):
                    _tmp = np.asarray(_tmp)
                    __tmp = np.asarray(__tmp)
                    __tmp2 = tmp4[idx]
                    X.append(_tmp)
                    X2.append(__tmp)
                    Y.append(__tmp2)
            # X.append(tmpX)
            # Y.append(tmpY)
            # X = X + tmpX
            # Y = Y + tmpY

    
    # X = np.concatenate(X, axis=0)
    # Y = np.concatenate(Y, axis=0)
    X = np.asarray(X)
    X2 = np.asarray(X2)
    Y = np.asarray(Y)
    return X, X2,  Y




In [12]:
# tmp2 = tmp2.to_numpy()

In [13]:
window_len=1
forecast_len = 1

In [14]:
X, X2, Y = create_dataset(df, window_len=window_len, forecast_len=forecast_len)

100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


In [15]:
X.shape

(16645036, 4)

In [16]:
Y.shape

(16645036,)

In [17]:
x1_train, x1_test, x2_train, x2_test, y_train, y_test = train_test_split(X, X2, Y, test_size=0.2, random_state=SEED)

Model

In [None]:
def encoder_decoder_block(past_inputs, future_inputs):
    # Encoding the past
    encoder = tf.keras.layers.LSTM(128, return_state=True)
    encoder_outputs, state_h, state_c = encoder(past_inputs)
    state_h = state_h + tf.keras.layers.Dense(128, activation='relu')(state_h)
    state_c = state_c + tf.keras.layers.Dense(128, activation='relu')(state_c)
    
    # x = tf.keras.layers.Dense(1)(future_inputs)
    # 
    # Combining future inputs with recurrent branch output
    decoder_lstm = tf.keras.layers.LSTM(128, return_sequences=True)
    x = decoder_lstm(future_inputs,
                    initial_state=[state_h, state_c])
    return x

past_inputs = tf.keras.Input(
    shape=(window_len, 4), name='past_inputs')
future_inputs = tf.keras.Input(
        shape=(forecast_len, 3), name='future_inputs')

x_1 = encoder_decoder_block(past_inputs, future_inputs)
x = tf.keras.layers.Dense(4, activation='linear')(x_1) + past_inputs
x = tf.keras.layers.ReLU()(x)
x = encoder_decoder_block(x, future_inputs)
x_2 = tf.keras.layers.Dense(256, activation='relu')(future_inputs)
x = tf.keras.layers.Concatenate(axis=2)([x, x_1, x_2])
x_1 = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(512, activation='relu')(x_1)
x = tf.keras.layers.Concatenate(axis=1)([x, x_1])
x = tf.keras.layers.Dense(1024, activation='relu')(x)
x = tf.keras.layers.BatchNormalization()(x)
# x = tf.keras.layers.Dense(256, activation='relu')(x)
# x = tf.keras.layers.Dropout(0.2)(x)
output = tf.keras.layers.Dense(1, activation='linear')(x)
output = tf.keras.layers.LeakyReLU(alpha=1e-5, name='outputs')(output)
model = tf.keras.models.Model(
    inputs=[past_inputs, future_inputs], outputs=output)
model.summary()

In [18]:
past_inputs = tf.keras.Input(
    shape=(window_len, 4), name='past_inputs')
# Encoding the past
encoder = tf.keras.layers.LSTM(128, return_state=True)
encoder_outputs, state_h, state_c = encoder(past_inputs)
state_h = state_h + tf.keras.layers.Dense(128, activation='relu')(state_h)
state_c = state_c + tf.keras.layers.Dense(128, activation='relu')(state_c)
future_inputs = tf.keras.Input(
    shape=(forecast_len, 3), name='future_inputs')
# x = tf.keras.layers.Dense(1)(future_inputs)
# 
# Combining future inputs with recurrent branch output
decoder_lstm = tf.keras.layers.LSTM(128, return_sequences=True)
x = decoder_lstm(future_inputs,
                 initial_state=[state_h, state_c])

x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
# x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dense(256, activation='relu')(x)
# x = tf.keras.layers.Dropout(0.2)(x)
output = tf.keras.layers.Dense(1, activation='linear')(x)
output = tf.keras.layers.LeakyReLU(alpha=1e-5, name='outputs')(output)
model = tf.keras.models.Model(
    inputs=[past_inputs, future_inputs], outputs=output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 past_inputs (InputLayer)    [(None, 1, 4)]               0         []                            
                                                                                                  
 lstm (LSTM)                 [(None, 128),                68096     ['past_inputs[0][0]']         
                              (None, 128),                                                        
                              (None, 128)]                                                        
                                                                                                  
 dense (Dense)               (None, 128)                  16512     ['lstm[0][1]']                
                                                                                              

In [19]:
import tensorflow.keras.backend as K

def smape_loss(y_true, y_pred):
    epsilon = 0.1
    summ = K.maximum(K.abs(y_true) + K.abs(y_pred) + epsilon, 0.5 + epsilon)
    smape = K.abs(y_pred - y_true) / summ * 2.0
    return smape
optimizer = tf.keras.optimizers.legacy.Adam()
loss = tf.keras.losses.MeanAbsoluteError()
model.compile(loss=loss, optimizer=optimizer, metrics=["mae", 'mse', 'mean_absolute_percentage_error'])

In [20]:
model.fit([np.expand_dims(x1_train, axis=1), np.expand_dims(x2_train, axis=1)], y_train, epochs=20, validation_split=0.2, batch_size=512)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x299b603d0>

In [21]:
pred = []
tmpx1 = []
tmpx2 = []
for idx, (x1, x2) in enumerate(zip(np.expand_dims(x1_test, axis=1), np.expand_dims(x2_test, axis=1))):
    tmpx1.append(x1)
    tmpx2.append(x2)
    if (idx+1) % 2048000 == 0 or (idx+1) == len(x1_test):
        tmpx1 = np.asarray(tmpx1)
        tmpx2 = np.asarray(tmpx2)
        tmp = model.predict([tmpx1, tmpx2], batch_size=2048)
        tmp = np.asarray(tmp).astype(np.float32)
        pred.append(tmp)
        tmpx1 = []
        tmpx2 = []
pred = np.concatenate(pred, axis=0)
pred = pred.flatten()
print(f'test_data: {smape(y_test, pred)}')

test_data: 11.779328


In [22]:
pred = []
tmpx1 = []
tmpx2 = []
for idx, (x1, x2) in enumerate(zip(np.expand_dims(x1_train, axis=1), np.expand_dims(x2_train, axis=1))):
    tmpx1.append(x1)
    tmpx2.append(x2)
    if (idx+1) % (2*4096000) == 0 or (idx+1) == len(x1_train):
        tmpx1 = np.asarray(tmpx1)
        tmpx2 = np.asarray(tmpx2)
        tmp = model.predict([tmpx1, tmpx2], batch_size=4096)
        tmp = np.asarray(tmp).astype(np.float32)
        pred.append(tmp)
        tmpx1 = []
        tmpx2 = []
pred = np.concatenate(pred, axis=0)
pred = pred.flatten()
print(f'train_data: {smape(y_train, pred)}')

train_data: 11.767992


In [59]:
model.predict([np.expand_dims(x1_train, axis=1)[0:5], np.expand_dims(x2_train, axis=1)[0:5]]).mean()



0.44399634

In [23]:
model.save("modelV2")

INFO:tensorflow:Assets written to: modelV2/assets


INFO:tensorflow:Assets written to: modelV2/assets


In [24]:
df_test = pd.read_csv('data/test.csv')
df_test.head()

Unnamed: 0,id,waktu_setempat,id_jalan,id_titik_mulai,id_titik_akhir
0,0,2020-02-23 00:00:00+00:00,4004732,32046542,6454026544
1,1,2020-02-23 00:00:00+00:00,182210371,1314925464,1314925496
2,2,2020-02-23 00:00:00+00:00,22932408,1482086782,26481020
3,3,2020-02-23 00:00:00+00:00,182210371,3892883,267337489
4,4,2020-02-23 00:00:00+00:00,66924592,266041030,2592978110


In [25]:
cols = ['waktu_setempat','id_jalan', 'id_titik_mulai', 'id_titik_akhir']
waktu_setempat_test = np.sort(df_test['waktu_setempat'].unique(), axis=0, kind='mergesort')
for col in cols:
    tmp = []
    for data in df_test[col].to_numpy():
        if col == 'id_jalan':
            idx = np.where(id_jalan == data)
            idx = idx[0][0] / id_jalan.shape[0]
        elif col == 'waktu_setempat':
            idx = np.where(waktu_setempat_test == data)
            idx = idx[0][0]
        else:
            idx = np.where(id_titik == data)
            idx = idx[0][0] / id_titik.shape[0]
        tmp.append(idx)
    df_test[col] = tmp
df_test.head()

Unnamed: 0,id,waktu_setempat,id_jalan,id_titik_mulai,id_titik_akhir
0,0,0,0.15,0.313525,0.997951
1,1,0,0.9,0.70082,0.702869
2,2,0,0.4,0.719262,0.268443
3,3,0,0.9,0.139344,0.497951
4,4,0,0.7,0.493852,0.836066


In [26]:
len(waktu_setempat_test)

168

In [33]:
def predict_test(df, df_test, model, max_avg):
    last_data = df[(df['waktu_setempat'] == 526)]
    last_data = last_data.reset_index(drop=True)
    n_time = 168
    predicted_data = {'id':[], 'rerata_kecepatan':[]}
    no_prev = np.array([[-1, -1, -1, -1]]).astype(np.float32)
    for i in tqdm(range(n_time)):
        last_data_per_road = []
        for j in range(20):
            tmp_road = last_data[(last_data['id_jalan'] == j/20)]
            tmp_road = tmp_road.reset_index(drop=True)
            last_data_per_road.append(tmp_road)
        current_test = df_test[(df_test['waktu_setempat'] == i)]
        current_test = current_test.reset_index(drop=True)
        tmp_pred_ = []
        for index, row in current_test.iterrows():
            predicted_data['id'].append(row['id'])
            tmp_x2_test = np.array([row['id_jalan'], row['id_titik_mulai'], row['id_titik_akhir']])
            tmp_x2_test = np.expand_dims(tmp_x2_test, axis=0)
            idx_road = int(row['id_jalan']*20)
            tmp_x1_test = last_data_per_road[idx_road]
            n_x1 = len(tmp_x1_test)
            tmp_x1_test = tmp_x1_test.drop(['waktu_setempat'], axis=1)
            tmp_x1_test = tmp_x1_test.to_numpy().astype(np.float32)
            if n_x1 > 0:
                tmp_x2_test = [tmp_x2_test for aa in range(n_x1+1)]
                tmp_x2_test = np.concatenate(tmp_x2_test, axis=0)
                tmp_x1_test = np.concatenate((tmp_x1_test, no_prev), axis=0)
            else:
                tmp_x1_test = np.concatenate((tmp_x1_test, no_prev), axis=0)
            # tmp_pred = []
            # for i in range(n_x1):
            # try:
            tmp_pred = model.predict([np.expand_dims(tmp_x1_test, axis=1), np.expand_dims(tmp_x2_test, axis=1)], verbose=0, batch_size=n_x1).mean()
            # except:
            #     tmp_data__ = df[(df['id_jalan'] == row['id_jalan'])]
            #     tmp_pred = tmp_data__['rerata_kecepatan'].mean()
                # tmp_pred.append(tmp_pred2)
            # tmp_pred = sum(tmp_pred)/len(tmp_pred)
            predicted_data['rerata_kecepatan'].append(tmp_pred*max_avg)
            tmp_pred_.append(tmp_pred*max_avg)
        last_data = current_test[['waktu_setempat','id_jalan', 'id_titik_mulai', 'id_titik_akhir']]
        last_data['rerata_kecepatan'] = tmp_pred_
        tmp_pred_ = []
    
    return predicted_data


            


In [34]:
predicted_data = predict_test(df, df_test, model, max_avg)

100%|██████████| 168/168 [1:03:37<00:00, 22.72s/it]


In [35]:
tmp = predicted_data['id']
predicted_data['id'] = np.asarray(tmp).astype(np.uint32)

In [43]:
subm = pd.DataFrame.from_dict(predicted_data)
subm = subm.sort_values(by=['id'])
subm.head()

Unnamed: 0,id,rerata_kecepatan
0,0,41.504017
1,1,41.519714
2,2,34.982294
3,3,40.739993
4,4,32.622729


In [44]:
subm.to_csv('submissionV2.csv', index=False)

: 

In [10]:
model = xgboost.XGBRegressor(n_jobs=-1, random_state=42)
model.fit(X, y)

In [11]:
model.predict(X)

array([29.29272 , 46.12336 , 35.993378, ..., 35.50384 , 33.308434,
       37.70065 ], dtype=float32)

In [12]:
waktu_setempat

array(['2020-02-01 01:00:00+00:00', '2020-02-01 02:00:00+00:00',
       '2020-02-01 03:00:00+00:00', '2020-02-01 04:00:00+00:00',
       '2020-02-01 05:00:00+00:00', '2020-02-01 06:00:00+00:00',
       '2020-02-01 07:00:00+00:00', '2020-02-01 08:00:00+00:00',
       '2020-02-01 09:00:00+00:00', '2020-02-01 10:00:00+00:00',
       '2020-02-01 11:00:00+00:00', '2020-02-01 12:00:00+00:00',
       '2020-02-01 13:00:00+00:00', '2020-02-01 14:00:00+00:00',
       '2020-02-01 15:00:00+00:00', '2020-02-01 16:00:00+00:00',
       '2020-02-01 17:00:00+00:00', '2020-02-01 18:00:00+00:00',
       '2020-02-01 19:00:00+00:00', '2020-02-01 20:00:00+00:00',
       '2020-02-01 21:00:00+00:00', '2020-02-01 22:00:00+00:00',
       '2020-02-01 23:00:00+00:00', '2020-02-02 00:00:00+00:00',
       '2020-02-02 01:00:00+00:00', '2020-02-02 02:00:00+00:00',
       '2020-02-02 03:00:00+00:00', '2020-02-02 04:00:00+00:00',
       '2020-02-02 05:00:00+00:00', '2020-02-02 06:00:00+00:00',
       '2020-02-02 07:00:

In [12]:
df = pd.read_csv('data/test.csv')
df.head()

Unnamed: 0,id,waktu_setempat,id_jalan,id_titik_mulai,id_titik_akhir
0,0,2020-02-23 00:00:00+00:00,4004732,32046542,6454026544
1,1,2020-02-23 00:00:00+00:00,182210371,1314925464,1314925496
2,2,2020-02-23 00:00:00+00:00,22932408,1482086782,26481020
3,3,2020-02-23 00:00:00+00:00,182210371,3892883,267337489
4,4,2020-02-23 00:00:00+00:00,66924592,266041030,2592978110


In [13]:
for col in df.columns:
    print(f'column: {col}; n_unique: {df[col].unique().shape[0]}')

column: id; n_unique: 127489
column: waktu_setempat; n_unique: 168
column: id_jalan; n_unique: 20
column: id_titik_mulai; n_unique: 488
column: id_titik_akhir; n_unique: 488
