In [1]:
# Mount Google Drive into the Colab runtime at /content/gdrive so that paths
# under that directory become readable from this notebook.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, GroupKFold

import xgboost
import lightgbm

import tensorflow as tf

import bisect

In [2]:
class CFG:
  """Notebook-wide settings, read as class attributes (e.g. ``CFG.folds``)."""

  # Directory prefix that the CSV file names are appended to when loading data.
  PATH = '/content/gdrive/MyDrive/Dacon/HD AI Challenge/datasets/'

  # Number of K-fold splits; also the divisor used to average fold scores.
  folds = 5

  # Dense network settings: layer count, units per layer, activation name.
  hidden_layers, hidden_size, activation = 10, 24, 'relu'

In [3]:
# Read the three CSV files that live under CFG.PATH, in the same order as the
# names they are bound to.
train, test, sample_submission = (
    pd.read_csv(CFG.PATH + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [4]:
#train = train.dropna(axis=0).reset_index(drop=True)
#train = train.sort_values('ATA').reset_index(drop=True)

In [5]:
# Parse ATA into datetimes on both frames, then copy each calendar component
# of ATA into its own column (created in the order listed below).
ata_parts = ['year', 'month', 'day', 'hour', 'minute', 'weekday']

for df in (train, test):
  df['ATA'] = pd.to_datetime(df['ATA'])
  for part in ata_parts:
    df[part] = getattr(df['ATA'].dt, part)

#train.drop(columns='ATA', inplace=True)
#test.drop(columns='ATA', inplace=True)

In [6]:
#train.fillna(train.mean(), inplace=True)
#test.fillna(test.mean(), inplace=True)

In [7]:
# Label-encode the categorical columns of train and test with ONE shared
# mapping per column, and keep the fitted encoders in `encoders`.
categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER', 'FLAG']
encoders = {}

# Placeholder category assigned to test values that never occur in train.
UNSEEN_LABEL = '-1'

for feature in tqdm(categorical_features):
  # Compare and encode string forms on both sides; checking raw test values
  # against string classes would flag every non-string value as unseen.
  train_values = train[feature].astype(str)
  test_values = test[feature].astype(str)

  # Fit on the train categories plus the placeholder *before* encoding either
  # frame. Adding the placeholder to classes_ after train has been encoded
  # shifts the integer codes used for test relative to those used for train.
  le = LabelEncoder()
  le.fit(sorted(set(train_values) | {UNSEEN_LABEL}))

  # Route categories that only exist in test to the placeholder.
  known_categories = set(le.classes_)
  test_values = test_values.where(test_values.isin(known_categories), UNSEEN_LABEL)

  train[feature] = le.transform(train_values)
  test[feature] = le.transform(test_values)
  encoders[feature] = le

100%|██████████| 6/6 [00:04<00:00,  1.22it/s]


In [8]:
# Discard train rows whose DIST is 0 while CI_HOUR is not 0, then renumber the
# index from 0.
zero_dist_nonzero_target = train['DIST'].eq(0) & train['CI_HOUR'].ne(0)
train = train[~zero_dist_nonzero_target].reset_index(drop=True)
#train.drop(train[train['DIST'] == 0].index, inplace=True)

In [9]:
# Display the column labels of train as they stand after the cells above.
train.columns

Index(['SAMPLE_ID', 'ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'DIST', 'ATA',
       'ID', 'BREADTH', 'BUILT', 'DEADWEIGHT', 'DEPTH', 'DRAUGHT', 'GT',
       'LENGTH', 'SHIPMANAGER', 'FLAG', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE',
       'BN', 'ATA_LT', 'DUBAI', 'BRENT', 'WTI', 'BDI_ADJ', 'PORT_SIZE',
       'CI_HOUR', 'year', 'month', 'day', 'hour', 'minute', 'weekday'],
      dtype='object')

In [15]:
# Columns passed to the models as inputs (ATA_LT is not in this list).
features = [
    'ARI_CO',
    'ARI_PO',
    'SHIP_TYPE_CATEGORY',
    'DIST',
    'BREADTH',
    'DEADWEIGHT',
    'DEPTH',
    'DRAUGHT',
    'GT',
    'LENGTH',
    'FLAG',
    'WTI',
    'BDI_ADJ',
    'PORT_SIZE',
    'year',
    'month',
]

# Column the models are trained to predict.
target = 'CI_HOUR'

In [11]:
# Columns kept in train by the following cell: the model inputs, ATA_LT and
# the CI_HOUR target.
extract = [
    'ARI_CO',
    'ARI_PO',
    'SHIP_TYPE_CATEGORY',
    'DIST',
    'BREADTH',
    'DEADWEIGHT',
    'DEPTH',
    'DRAUGHT',
    'GT',
    'LENGTH',
    'FLAG',
    'ATA_LT',
    'WTI',
    'BDI_ADJ',
    'PORT_SIZE',
    'year',
    'month',
    'CI_HOUR',
]

In [12]:
# Keep only the columns named in `extract`, drop every row that still holds a
# missing value, and renumber the index from 0.
train = (
    train.loc[:, extract]
    .dropna(axis=0)
    .reset_index(drop=True)
)

In [13]:
# Test-set inputs: the same feature columns the models are trained on.
X_test = test.loc[:, features]

In [18]:
# K-fold cross-validation of an equal-weight XGBoost + LightGBM blend,
# reported as MAE per fold and averaged over CFG.folds.
kf = KFold(n_splits=CFG.folds, shuffle=True, random_state=42)
#gkf = GroupKFold(n_splits=CFG.folds)
xgb = xgboost.XGBRegressor(n_estimators=1000, max_depth=9, learning_rate=0.2)
lgbm = lightgbm.LGBMRegressor()
#gru = build_model()
#mlp = build_model()

mae_score = 0  # running sum of the per-fold MAE values
#test_predictions = []
for fold, (train_idx, valid_idx) in enumerate(kf.split(train)):
  X_train = train.loc[train_idx, features]
  y_train = train.loc[train_idx, target]
  X_valid = train.loc[valid_idx, features]
  y_valid = train.loc[valid_idx, target]

  # Both regressors are fitted on this fold's training rows.
  for regressor in (xgb, lgbm):
    regressor.fit(X_train, y_train)
  #xgb_pred = xgb.predict(X_valid)
  #xgb_pred[X_valid['DIST'] == 0] = 0

  #gru.fit(X_train, y_train, validation_data = (X_valid, y_valid), batch_size = 16, epochs = 1)
  #gru_pred = gru.predict(X_valid)

  # Average the two models, then force a prediction of 0 for rows with
  # DIST == 0 and for any negative prediction.
  pred = (lgbm.predict(X_valid) + xgb.predict(X_valid)) / 2
  zero_distance = (X_valid['DIST'] == 0).to_numpy()
  pred[zero_distance | (pred < 0)] = 0

  fold_mae = mean_absolute_error(y_valid, pred)
  print(f'Fold{fold} MAE: ', fold_mae)
  mae_score += fold_mae

  #test_pred = (lgbm.predict(X_test) + xgb.predict(X_test)) / 2
  #test_pred = np.where(test_pred < 0, 0, test_pred)
  #test_predictions.append(test_pred)

#test_predictions = np.mean(test_predictions, axis=0)

print('MAE score: ', mae_score / CFG.folds)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1689
[LightGBM] [Info] Number of data points in the train set: 293931, number of used features: 16
[LightGBM] [Info] Start training from score 61.745688
Fold0 MAE:  43.20186665304422


KeyboardInterrupt: ignored

In [None]:
# Per-port experiment: run K-fold cross-validation of a small XGBoost model
# separately on the rows of each ARI_PO value and report the MAE per port.
kf = KFold(n_splits=CFG.folds, shuffle=True, random_state=42)
xgb = xgboost.XGBRegressor(n_estimators=100,max_depth=3,learning_rate=0.2,subsample=0.9,colsample_bytree=0.8)
lgbm = lightgbm.LGBMRegressor()

port_mae_score = []
for port in train['ARI_PO'].unique():
  print('='*10 + f'{port}' + '='*10)
  train_ = train[train['ARI_PO'] == port].reset_index(drop=True)

  # KFold.split raises ValueError when there are fewer rows than splits, which
  # would abort the whole loop on the first small port; skip such ports.
  if len(train_) < CFG.folds:
    print(f'Skipped: {len(train_)} rows is fewer than {CFG.folds} folds')
    print('='*30)
    continue

  mae_score = 0  # running sum of the per-fold MAE values for this port
  for fold, (train_idx, valid_idx) in enumerate(kf.split(train_)):
    X_train, y_train = train_.loc[train_idx, features], train_.loc[train_idx, target]
    X_valid, y_valid = train_.loc[valid_idx, features], train_.loc[valid_idx, target]

    xgb.fit(X_train, y_train)
    pred = xgb.predict(X_valid)
    #lgbm.fit(X_train, y_train)
    #pred = lgbm.predict(X_valid)
    pred = np.where(pred < 0, 0, pred)  # negative predictions are set to 0

    fold_mae = mean_absolute_error(y_valid, pred)
    print(f'Fold{fold} MAE: ', fold_mae)
    mae_score += fold_mae

  print('MAE score: ', mae_score / CFG.folds)
  port_mae_score.append(mae_score / CFG.folds)

  print('='*30)

# Unweighted mean over the evaluated ports: every port counts equally,
# regardless of how many rows it has.
if port_mae_score:
  print('Total MAE score: ', np.mean(port_mae_score))
else:
  print('Total MAE score: no port had enough rows to evaluate')

In [None]:
def build_model():
    """Build and compile a stacked-GRU regression model.

    The input has shape ``(len(features), 1)``, i.e. each feature is presented
    as one step of a length-``len(features)`` sequence with a single value per
    step. Three GRU layers of 8 units feed a single linear output unit.

    Note: a later cell defines another ``build_model`` (a dense network); once
    that cell has run, the name refers to that function instead of this one.

    Returns:
        A ``tf.keras.Model`` compiled with mean-absolute-error loss and an
        Adam optimizer (learning rate 1e-5).
    """
    inp = tf.keras.Input(shape=(len(features), 1))

    # The first two GRU layers return the full sequence so the next GRU can
    # consume it; the last one returns only its final state.
    x = tf.keras.layers.GRU(units=8, return_sequences=True)(inp)
    x = tf.keras.layers.GRU(units=8, return_sequences=True)(x)
    x = tf.keras.layers.GRU(units=8, return_sequences=False)(x)
    x = tf.keras.layers.Dense(1,activation='linear')(x)
    model = tf.keras.Model(inputs=inp, outputs=x)

    opt = tf.keras.optimizers.Adam(learning_rate=1e-5)
    loss = tf.keras.losses.MeanAbsoluteError()
    model.compile(loss=loss, optimizer = opt)

    return model

In [None]:
# Instantiate the network returned by build_model() and print its layer
# summary.
model = build_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20, 1)]           0         
                                                                 
 gru (GRU)                   (None, 20, 8)             264       
                                                                 
 gru_1 (GRU)                 (None, 20, 8)             432       
                                                                 
 gru_2 (GRU)                 (None, 8)                 432       
                                                                 
 dense (Dense)               (None, 1)                 9         
                                                                 
Total params: 1137 (4.44 KB)
Trainable params: 1137 (4.44 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
def build_model():
    """Build and compile a fully connected regression network.

    The network takes one flat vector of ``len(features)`` values, passes it
    through ``CFG.hidden_layers`` dense layers of ``CFG.hidden_size`` units
    using the ``CFG.activation`` activation, and ends in one linear output
    unit.

    Returns:
        A ``tf.keras.Model`` compiled with mean-absolute-error loss and an
        Adam optimizer (learning rate 1e-3).
    """
    # The shape must be a one-element tuple. `(len(features,))` is just the
    # integer `len(features)` wrapped in redundant parentheses.
    inp = tf.keras.Input(shape=(len(features),))

    x = tf.keras.layers.Dense(CFG.hidden_size,activation=CFG.activation)(inp)
    # The first hidden layer is created above, hence hidden_layers - 1 here.
    for _ in range(CFG.hidden_layers-1):
        x = tf.keras.layers.Dense(CFG.hidden_size)(x)
        x = tf.keras.layers.Activation(CFG.activation)(x)
    x = tf.keras.layers.Dense(1,activation='linear')(x)

    model = tf.keras.Model(inputs=[inp], outputs=[x])
    opt = tf.keras.optimizers.Adam(learning_rate=1e-3)
    loss = tf.keras.losses.MeanAbsoluteError()
    model.compile(loss=loss, optimizer = opt)

    return model

In [None]:
# Instantiate the network returned by build_model() and print its layer
# summary.
model = build_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 23)]              0         
                                                                 
 dense (Dense)               (None, 24)                576       
                                                                 
 dense_1 (Dense)             (None, 24)                600       
                                                                 
 activation (Activation)     (None, 24)                0         
                                                                 
 dense_2 (Dense)             (None, 24)                600       
                                                                 
 activation_1 (Activation)   (None, 24)                0         
                                                                 
 dense_3 (Dense)             (None, 24)                600   

In [None]:
# K-fold cross-validation of the network returned by build_model(), reported
# as MAE per fold and averaged over CFG.folds.
kf = KFold(n_splits=CFG.folds, shuffle=True, random_state=42)
#gkf = GroupKFold(n_splits=CFG.folds)
#xgb = xgboost.XGBRegressor()
#gru = build_model()

mae_score = 0  # running sum of the per-fold MAE values
for fold, (train_idx, valid_idx) in enumerate(kf.split(train)):
  X_train, y_train = train.loc[train_idx, features], train.loc[train_idx, target]
  X_valid, y_valid = train.loc[valid_idx, features], train.loc[valid_idx, target]

  #xgb.fit(X_train, y_train)
  #xgb_pred = xgb.predict(X_valid)

  #gru.fit(X_train, y_train, validation_data = (X_valid, y_valid), batch_size = 16, epochs = 1)
  #gru_pred = gru.predict(X_valid)

  # Build a new model for every fold. Keras `fit` continues from the current
  # weights, so a single shared model would already have been trained on this
  # fold's validation rows in earlier folds and its MAE would be optimistic.
  mlp = build_model()
  mlp.fit(X_train, y_train, validation_data = (X_valid, y_valid), verbose=2, batch_size = 32, epochs = 12)

  # The model has one output unit, so predict() yields one column per row;
  # flatten it to line up with the 1-D target.
  mlp_pred = mlp.predict(X_valid).ravel()

  pred = mlp_pred
  # Same post-processing as the gradient-boosting cell: rows with DIST == 0
  # are predicted as 0, and negative predictions are set to 0.
  pred[(X_valid['DIST'] == 0).to_numpy()] = 0
  pred = np.where(pred < 0, 0, pred)

  fold_mae = mean_absolute_error(y_valid, pred)
  print(f'Fold{fold} MAE: ', fold_mae)

  mae_score += fold_mae

print('MAE score: ', mae_score / CFG.folds)