In [1]:
%load_ext autoreload
%autoreload
import lightgbm as lgb
import pandas as pd
import numpy as np
import optuna
import merging
import preprocess
import scores
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn import model_selection

In [2]:
#Load new file
df = pd.read_csv('df_preprocessed_2015-2019.csv')

In [3]:
cols = ['Flight Datetime', 'AOBT', 'ATOT']
for col in cols:
    df[col] = pd.to_datetime(df[col])

In [4]:
df.drop(['Flight Datetime', 'AOBT', 'ATOT'],axis=1,inplace=True)

In [5]:
# Removing special characters from variable names
df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
df.drop(['Unnamed0','AircraftModel_x', 'Stand', 'Runway', 'summary', 'Manufacturer',
       'Model', 'WakeCategory', 'Final', 'AircraftModel_y', 'OldMovementType'],axis=1,inplace=True)

In [6]:
cols=['aobt_month', 'aobt_day', 'aobt_hour']
for col in cols:
    df[col]=df[col].astype('category')

In [7]:
df = preprocess.ohe(df,['aobt_month','aobt_hour','aobt_day'])

#### LightGBM hyperparameter tuning

In [8]:
features_var = ['flight_hour','flight_weekday','cloudCover','windSpeed','windBearing','traffic', 'Q',
        'TO1', 'TO2', 'TO3',  'TORunway1',
       'TORunway2', 'TORunway3',
       'aobt_year', 'aobt_month_1', 'aobt_month_2',
       'aobt_month_3', 'aobt_month_4', 'aobt_month_5', 'aobt_month_6',
       'aobt_month_7', 'aobt_month_8', 'aobt_month_9', 'aobt_month_10',
       'aobt_month_11', 'aobt_month_12', 'aobt_hour_0', 'aobt_hour_1',
       'aobt_hour_2', 'aobt_hour_3', 'aobt_hour_4', 'aobt_hour_5',
       'aobt_hour_6', 'aobt_hour_7', 'aobt_hour_8', 'aobt_hour_9',
       'aobt_hour_10', 'aobt_hour_11', 'aobt_hour_12', 'aobt_hour_13',
       'aobt_hour_14', 'aobt_hour_15', 'aobt_hour_16', 'aobt_hour_17',
       'aobt_hour_18', 'aobt_hour_19', 'aobt_hour_20', 'aobt_hour_21',
       'aobt_hour_22', 'aobt_hour_23', 'aobt_day_0', 'aobt_day_1',
       'aobt_day_2', 'aobt_day_3', 'aobt_day_4', 'aobt_day_5', 'aobt_day_6',
       'precipAccumulation',   'Lengthft','TO']
df_lightGBM = df[features_var]

In [9]:
def create_train_test(df):
    df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_ ]+', '', x))
    train=df[df['aobt_year']!=2019]
    test=df[df['aobt_year']==2019]
    X_train=train.drop('TO',axis=1)
    X_test=test.drop('TO',axis=1)
    y_train=train['TO']
    y_test=test['TO']
    
    return X_train,y_train,X_test,y_test

In [10]:
def objective(trial: optuna.Trial, fast_check=True, target_meter=0, return_info=False):
    folds = 5
    seed = 666
    shuffle = False
    kf = model_selection.KFold(n_splits=folds, shuffle=shuffle, random_state=seed)
    
    X_train,y_train,X_test,y_test = create_train_test(df_lightGBM)
    
    y_valid_pred_total = np.zeros(X_train.shape[0])

    models = []
    valid_score = 0
    for train_idx, valid_idx in kf.split(X_train, y_train):
        train_data = X_train.iloc[train_idx,:], y_train.iloc[train_idx]
        valid_data = X_train.iloc[valid_idx,:], y_train.iloc[valid_idx]

        print('train', len(train_idx), 'valid', len(valid_idx))
        model, y_pred_valid, log = fit_lgbm(trial, train_data, valid_data,num_rounds=1000) 
                                            
        y_valid_pred_total[valid_idx] = y_pred_valid
        models.append(model)
        valid_score += log["valid/l2"]
        if fast_check:
            break
    valid_score /= len(models)
    if return_info:
        return valid_score, models, y_pred_valid, y_train
    else:
        return valid_score

In [11]:
def fit_lgbm(trial, train, val, devices=(-1,), seed=None, cat_features=None, num_rounds=1500):
    """Train Light GBM model"""
    X_train, y_train = train
    X_valid, y_valid = val
    metric = 'l2'
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'objective': 'regression',
#               'max_depth': -1,
        'learning_rate': 0.1,
        "boosting": "gbdt",
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        "bagging_freq": 5,
        "bagging_fraction": trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
        "feature_fraction": trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        "metric": metric,
        "verbosity": -1,
    }
    device = devices[0]
    if device == -1:
        # use cpu
        pass
    else:
        # use gpu
        print(f'using gpu device_id {device}...')
        params.update({'device': 'gpu', 'gpu_device_id': device})

    params['seed'] = seed

    early_stop = 20
    verbose_eval = 20

    d_train = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_features)
    d_valid = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_features)
    watchlist = [d_train, d_valid]

    print('training LGB:')
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval,
                      early_stopping_rounds=early_stop)

    # predictions
    y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
    
    print('best_score', model.best_score)
    log = {'train/l2': model.best_score['training']['l2'],
           'valid/l2': model.best_score['valid_1']['l2']}
    return model, y_pred_valid, log

In [12]:
study = optuna.create_study()
study.optimize(objective,n_trials=10)

[I 2020-10-08 12:09:15,388] A new study created in memory with name: no-name-b17fad60-6909-41c6-8fd8-9e51caede5eb


train 435048 valid 108763
training LGB:
Training until validation scores don't improve for 20 rounds
[20]	training's l2: 189027	valid_1's l2: 167926
[40]	training's l2: 184099	valid_1's l2: 163284
[60]	training's l2: 182462	valid_1's l2: 162008
[80]	training's l2: 181626	valid_1's l2: 161511
[100]	training's l2: 181010	valid_1's l2: 161124
[120]	training's l2: 180593	valid_1's l2: 160968
[140]	training's l2: 180221	valid_1's l2: 160859
[160]	training's l2: 179814	valid_1's l2: 160780
[180]	training's l2: 179441	valid_1's l2: 160691
[200]	training's l2: 179116	valid_1's l2: 160603
[220]	training's l2: 178797	valid_1's l2: 160566
Early stopping, best iteration is:
[215]	training's l2: 178872	valid_1's l2: 160539
best_score defaultdict(<class 'collections.OrderedDict'>, {'training': OrderedDict([('l2', 178872.3772212358)]), 'valid_1': OrderedDict([('l2', 160538.93583405932)])})


[I 2020-10-08 12:09:26,172] Trial 0 finished with value: 160538.93583405932 and parameters: {'num_leaves': 13, 'lambda_l1': 0.0950974134553322, 'lambda_l2': 0.18423816547058833, 'bagging_fraction': 0.40067438403946143, 'feature_fraction': 0.4129784003250001}. Best is trial 0 with value: 160538.93583405932.


train 435048 valid 108763
training LGB:
Training until validation scores don't improve for 20 rounds
[20]	training's l2: 179664	valid_1's l2: 163577
[40]	training's l2: 172208	valid_1's l2: 160806
[60]	training's l2: 167238	valid_1's l2: 159832
[80]	training's l2: 163833	valid_1's l2: 159576
[100]	training's l2: 160918	valid_1's l2: 159330
[120]	training's l2: 158206	valid_1's l2: 159236
[140]	training's l2: 155753	valid_1's l2: 159140
[160]	training's l2: 153281	valid_1's l2: 158991
[180]	training's l2: 151267	valid_1's l2: 158901
[200]	training's l2: 149335	valid_1's l2: 158950
Early stopping, best iteration is:
[189]	training's l2: 150352	valid_1's l2: 158880
best_score defaultdict(<class 'collections.OrderedDict'>, {'training': OrderedDict([('l2', 150351.63865899766)]), 'valid_1': OrderedDict([('l2', 158879.63328458526)])})


[I 2020-10-08 12:09:53,779] Trial 1 finished with value: 158879.63328458526 and parameters: {'num_leaves': 166, 'lambda_l1': 8.05868866329061e-08, 'lambda_l2': 0.01653533616154787, 'bagging_fraction': 0.6646506053400946, 'feature_fraction': 0.7896426302626525}. Best is trial 1 with value: 158879.63328458526.


train 435048 valid 108763
training LGB:
Training until validation scores don't improve for 20 rounds
[20]	training's l2: 179484	valid_1's l2: 163743
[40]	training's l2: 171708	valid_1's l2: 160653
[60]	training's l2: 166627	valid_1's l2: 159918
[80]	training's l2: 163009	valid_1's l2: 159637
[100]	training's l2: 159766	valid_1's l2: 159514
[120]	training's l2: 156884	valid_1's l2: 159581
Early stopping, best iteration is:
[102]	training's l2: 159412	valid_1's l2: 159473
best_score defaultdict(<class 'collections.OrderedDict'>, {'training': OrderedDict([('l2', 159411.76325349574)]), 'valid_1': OrderedDict([('l2', 159472.7296134298)])})


[I 2020-10-08 12:10:20,938] Trial 2 finished with value: 159472.7296134298 and parameters: {'num_leaves': 219, 'lambda_l1': 1.1027592302417265e-07, 'lambda_l2': 1.3909909107875533e-05, 'bagging_fraction': 0.4675775476590517, 'feature_fraction': 0.6315956055266557}. Best is trial 1 with value: 158879.63328458526.


train 435048 valid 108763
training LGB:
Training until validation scores don't improve for 20 rounds
[20]	training's l2: 183745	valid_1's l2: 164232
[40]	training's l2: 178359	valid_1's l2: 160814
[60]	training's l2: 175327	valid_1's l2: 159774
[80]	training's l2: 173259	valid_1's l2: 159325
[100]	training's l2: 171698	valid_1's l2: 158955
[120]	training's l2: 170190	valid_1's l2: 158749
[140]	training's l2: 168864	valid_1's l2: 158539
[160]	training's l2: 167568	valid_1's l2: 158338
[180]	training's l2: 166417	valid_1's l2: 158193
[200]	training's l2: 165326	valid_1's l2: 158042
[220]	training's l2: 164209	valid_1's l2: 157958
[240]	training's l2: 163137	valid_1's l2: 157902
[260]	training's l2: 162088	valid_1's l2: 157823
[280]	training's l2: 160964	valid_1's l2: 157743
[300]	training's l2: 159948	valid_1's l2: 157681
[320]	training's l2: 158858	valid_1's l2: 157578
[340]	training's l2: 157926	valid_1's l2: 157483
[360]	training's l2: 156916	valid_1's l2: 157457
[380]	training's l2: 

[I 2020-10-08 12:11:01,965] Trial 3 finished with value: 157208.2454841379 and parameters: {'num_leaves': 67, 'lambda_l1': 1.0034340430533997, 'lambda_l2': 6.273266822048027, 'bagging_fraction': 0.9441048263180452, 'feature_fraction': 0.7741800625243584}. Best is trial 3 with value: 157208.2454841379.


train 435048 valid 108763
training LGB:
Training until validation scores don't improve for 20 rounds
[20]	training's l2: 180635	valid_1's l2: 165089
[40]	training's l2: 171972	valid_1's l2: 161073
[60]	training's l2: 166805	valid_1's l2: 160135
[80]	training's l2: 162863	valid_1's l2: 159940
[100]	training's l2: 159484	valid_1's l2: 159952
Early stopping, best iteration is:
[88]	training's l2: 161411	valid_1's l2: 159844
best_score defaultdict(<class 'collections.OrderedDict'>, {'training': OrderedDict([('l2', 161410.5713909665)]), 'valid_1': OrderedDict([('l2', 159843.84958591513)])})


[I 2020-10-08 12:11:17,934] Trial 4 finished with value: 159843.84958591513 and parameters: {'num_leaves': 248, 'lambda_l1': 0.17610495706218532, 'lambda_l2': 2.5109871580421584e-05, 'bagging_fraction': 0.4848740602653556, 'feature_fraction': 0.4681727404526955}. Best is trial 3 with value: 157208.2454841379.


train 435048 valid 108763
training LGB:
Training until validation scores don't improve for 20 rounds
[20]	training's l2: 184634	valid_1's l2: 164746
[40]	training's l2: 179536	valid_1's l2: 161247
[60]	training's l2: 177086	valid_1's l2: 160230
[80]	training's l2: 175406	valid_1's l2: 159754
[100]	training's l2: 174011	valid_1's l2: 159506
[120]	training's l2: 172708	valid_1's l2: 159378
[140]	training's l2: 171702	valid_1's l2: 159297
[160]	training's l2: 170551	valid_1's l2: 159145
[180]	training's l2: 169503	valid_1's l2: 159061
[200]	training's l2: 168585	valid_1's l2: 159040
[220]	training's l2: 167680	valid_1's l2: 158934
[240]	training's l2: 166723	valid_1's l2: 158940
[260]	training's l2: 165918	valid_1's l2: 158862
Early stopping, best iteration is:
[255]	training's l2: 166103	valid_1's l2: 158837
best_score defaultdict(<class 'collections.OrderedDict'>, {'training': OrderedDict([('l2', 166102.70436217808)]), 'valid_1': OrderedDict([('l2', 158837.26153437182)])})


[I 2020-10-08 12:11:38,653] Trial 5 finished with value: 158837.26153437182 and parameters: {'num_leaves': 50, 'lambda_l1': 1.986473985560066, 'lambda_l2': 1.2512121971823337e-06, 'bagging_fraction': 0.6631345865246341, 'feature_fraction': 0.6069137138167517}. Best is trial 3 with value: 157208.2454841379.


train 435048 valid 108763
training LGB:
Training until validation scores don't improve for 20 rounds
[20]	training's l2: 183704	valid_1's l2: 165185
[40]	training's l2: 177126	valid_1's l2: 161139
[60]	training's l2: 173713	valid_1's l2: 160160
[80]	training's l2: 171203	valid_1's l2: 159674
[100]	training's l2: 168894	valid_1's l2: 159413
[120]	training's l2: 167041	valid_1's l2: 159246
[140]	training's l2: 165475	valid_1's l2: 159086
[160]	training's l2: 163937	valid_1's l2: 158931
[180]	training's l2: 162547	valid_1's l2: 158883
[200]	training's l2: 161162	valid_1's l2: 158847
[220]	training's l2: 159703	valid_1's l2: 158792
[240]	training's l2: 158441	valid_1's l2: 158684
[260]	training's l2: 157279	valid_1's l2: 158509
[280]	training's l2: 156095	valid_1's l2: 158503
[300]	training's l2: 154936	valid_1's l2: 158443
[320]	training's l2: 153740	valid_1's l2: 158448
Early stopping, best iteration is:
[301]	training's l2: 154872	valid_1's l2: 158431
best_score defaultdict(<class 'coll

[I 2020-10-08 12:12:06,342] Trial 6 finished with value: 158431.36699270862 and parameters: {'num_leaves': 101, 'lambda_l1': 9.09282953743888e-08, 'lambda_l2': 1.5840166087287617, 'bagging_fraction': 0.7275757935403158, 'feature_fraction': 0.49965218490455243}. Best is trial 3 with value: 157208.2454841379.


train 435048 valid 108763
training LGB:
Training until validation scores don't improve for 20 rounds
[20]	training's l2: 179751	valid_1's l2: 163842
[40]	training's l2: 172692	valid_1's l2: 162102
[60]	training's l2: 167938	valid_1's l2: 161876
[80]	training's l2: 164450	valid_1's l2: 161977
Early stopping, best iteration is:
[63]	training's l2: 167325	valid_1's l2: 161820
best_score defaultdict(<class 'collections.OrderedDict'>, {'training': OrderedDict([('l2', 167325.06404465047)]), 'valid_1': OrderedDict([('l2', 161820.36003140526)])})


[I 2020-10-08 12:12:21,060] Trial 7 finished with value: 161820.36003140526 and parameters: {'num_leaves': 249, 'lambda_l1': 0.009726573140172322, 'lambda_l2': 0.028273591813719103, 'bagging_fraction': 0.25821292511459376, 'feature_fraction': 0.8084095954374562}. Best is trial 3 with value: 157208.2454841379.


train 435048 valid 108763
training LGB:
Training until validation scores don't improve for 20 rounds
[20]	training's l2: 183471	valid_1's l2: 164402
[40]	training's l2: 178390	valid_1's l2: 161535
[60]	training's l2: 175658	valid_1's l2: 160845
[80]	training's l2: 173773	valid_1's l2: 160554
[100]	training's l2: 172206	valid_1's l2: 160352
[120]	training's l2: 170885	valid_1's l2: 160335
Early stopping, best iteration is:
[113]	training's l2: 171300	valid_1's l2: 160283
best_score defaultdict(<class 'collections.OrderedDict'>, {'training': OrderedDict([('l2', 171299.57839214543)]), 'valid_1': OrderedDict([('l2', 160282.90001383825)])})


[I 2020-10-08 12:12:32,660] Trial 8 finished with value: 160282.90001383825 and parameters: {'num_leaves': 71, 'lambda_l1': 7.509068976635221e-07, 'lambda_l2': 3.065867424603642e-05, 'bagging_fraction': 0.3958660013536853, 'feature_fraction': 0.7867925122959177}. Best is trial 3 with value: 157208.2454841379.


train 435048 valid 108763
training LGB:
Training until validation scores don't improve for 20 rounds
[20]	training's l2: 187834	valid_1's l2: 166630
[40]	training's l2: 183580	valid_1's l2: 162813
[60]	training's l2: 182055	valid_1's l2: 161766
[80]	training's l2: 181276	valid_1's l2: 161367
[100]	training's l2: 180656	valid_1's l2: 161101
[120]	training's l2: 180254	valid_1's l2: 160971
[140]	training's l2: 179850	valid_1's l2: 160913
[160]	training's l2: 179499	valid_1's l2: 160822
[180]	training's l2: 179185	valid_1's l2: 160816
[200]	training's l2: 178823	valid_1's l2: 160699
[220]	training's l2: 178414	valid_1's l2: 160651
Early stopping, best iteration is:
[213]	training's l2: 178530	valid_1's l2: 160602
best_score defaultdict(<class 'collections.OrderedDict'>, {'training': OrderedDict([('l2', 178530.21694196048)]), 'valid_1': OrderedDict([('l2', 160601.60062476317)])})


[I 2020-10-08 12:12:45,569] Trial 9 finished with value: 160601.60062476317 and parameters: {'num_leaves': 13, 'lambda_l1': 0.0006440142119142004, 'lambda_l2': 1.3462347168821946, 'bagging_fraction': 0.36726729033819483, 'feature_fraction': 0.621796541410032}. Best is trial 3 with value: 157208.2454841379.


In [13]:
print('Best trial: score {}, params {}'.format(study.best_trial.value, study.best_trial.params))

Best trial: score 157208.2454841379, params {'num_leaves': 67, 'lambda_l1': 1.0034340430533997, 'lambda_l2': 6.273266822048027, 'bagging_fraction': 0.9441048263180452, 'feature_fraction': 0.7741800625243584}


In [18]:
params = {'num_leaves': 67, 'lambda_l1': 1.0034340430533997, 'lambda_l2': 6.273266822048027, 'bagging_fraction': 0.9441048263180452, 'feature_fraction': 0.7741800625243584,'n_jobs':1}

In [19]:
X_train,y_train,X_test,y_test = create_train_test(df_lightGBM)
train_set = lgb.Dataset(X_train, label=y_train)
model = lgb.train(params,train_set)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1800
[LightGBM] [Info] Number of data points in the train set: 543811, number of used features: 59
[LightGBM] [Info] Start training from score 1125.316185


In [20]:
preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 430.290247
