In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
challenge_df = pd.read_csv("../data/challenge_set.csv")
submission_df = pd.read_csv("../data/submission_set.csv")

In [4]:
challenge_df.describe()

Unnamed: 0,flight_id,flight_duration,taxiout_time,flown_distance,tow
count,369013.0,369013.0,369013.0,369013.0,369013.0
mean,253522000.0,145.876779,13.489709,1021.728581,79482.257229
std,2688565.0,139.337587,5.779555,1128.171163,53250.919631
min,248750600.0,8.0,0.0,19.0,14944.0
25%,251229600.0,59.0,10.0,338.0,55836.0
50%,253620000.0,100.0,12.0,647.0,63852.0
75%,255905900.0,164.0,16.0,1113.0,73756.0
max,258074500.0,1013.0,90.0,7272.0,351327.0


In [5]:
challenge_df.head()

Unnamed: 0,flight_id,date,callsign,adep,name_adep,country_code_adep,ades,name_ades,country_code_ades,actual_offblock_time,arrival_time,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow
0,248763780,2022-01-01,3840d84f25d3f5fcc0a1be3076bb4039,EGLL,London Heathrow,GB,EICK,Cork,IE,2022-01-01T13:46:00Z,2022-01-01T15:04:56Z,A320,M,a73f82288988b79be490c6322f4c32ed,61,18,321,54748.0
1,248760618,2022-01-01,f6f610e73002b8892a239a81321f7f1d,LEBL,Barcelona,ES,KMIA,Miami,US,2022-01-01T09:55:00Z,2022-01-01T19:37:56Z,B772,H,5543e4dc327359ffaf5b9c0e6faaf0e1,570,13,4193,185441.0
2,248753824,2022-01-01,139670936660762c230ca92556ba842b,ESSA,Stockholm Arlanda,SE,KORD,Chicago O'Hare,US,2022-01-01T09:39:00Z,2022-01-01T19:08:13Z,A333,H,8be5c854fd664bcb97fb543339f74770,554,15,3770,230396.0
3,248753852,2022-01-01,509dc61bb54fbab0e5406067c95603e2,LSZH,Zurich,CH,KPHL,Philadelphia,US,2022-01-01T11:04:00Z,2022-01-01T19:32:13Z,B788,H,5543e4dc327359ffaf5b9c0e6faaf0e1,497,11,3607,157615.0
4,248755934,2022-01-01,d0610d000dcf26b1d7bba8103ecc393d,EIDW,Dublin,IE,EGLL,London Heathrow,GB,2022-01-01T12:36:00Z,2022-01-01T13:44:32Z,A21N,M,a73f82288988b79be490c6322f4c32ed,55,14,305,70318.447226


In [6]:
list(challenge_df)

['flight_id',
 'date',
 'callsign',
 'adep',
 'name_adep',
 'country_code_adep',
 'ades',
 'name_ades',
 'country_code_ades',
 'actual_offblock_time',
 'arrival_time',
 'aircraft_type',
 'wtc',
 'airline',
 'flight_duration',
 'taxiout_time',
 'flown_distance',
 'tow']

# Feature Encoding:

In [7]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [8]:
le.fit(challenge_df['adep'])
challenge_df['adep_category'] = le.transform(challenge_df['adep']) + 1
challenge_df[['adep','adep_category']]

Unnamed: 0,adep,adep_category
0,EGLL,68
1,LEBL,211
2,ESSA,134
3,LSZH,329
4,EIDW,90
...,...,...
369008,LFPG,246
369009,LTFM,375
369010,EDDL,30
369011,LFPG,246


In [9]:
le.fit(challenge_df['country_code_adep'])
challenge_df['country_code_adep_category'] = le.transform(challenge_df['country_code_adep']) + 1
challenge_df[['country_code_adep','country_code_adep_category']]

Unnamed: 0,country_code_adep,country_code_adep_category
0,GB,37
1,ES,33
2,SE,87
3,CH,20
4,IE,44
...,...,...
369008,FR,36
369009,TR,95
369010,DE,27
369011,FR,36


In [10]:
le.fit(challenge_df['ades'])
challenge_df['ades_category'] = le.transform(challenge_df['ades']) + 1
challenge_df[['ades','adep_category']]

Unnamed: 0,ades,adep_category
0,EICK,68
1,KMIA,211
2,KORD,134
3,KPHL,329
4,EGLL,90
...,...,...
369008,KMIA,246
369009,EDDB,375
369010,EIDW,30
369011,EIDW,246


In [11]:
le.fit(challenge_df['country_code_ades'])
challenge_df['country_code_ades_category'] = le.transform(challenge_df['country_code_ades']) + 1
challenge_df[['country_code_ades','country_code_ades_category']]

Unnamed: 0,country_code_ades,country_code_ades_category
0,IE,34
1,US,77
2,US,77
3,US,77
4,GB,27
...,...,...
369008,US,77
369009,DE,19
369010,IE,34
369011,IE,34


In [12]:
def encode_datetime(data, col, max_val):
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
    return data

In [13]:
challenge_df['actual_offblock_time'] = pd.to_datetime(challenge_df['actual_offblock_time'], utc=True)
challenge_df['actual_offblock_time_month'] = challenge_df["actual_offblock_time"].dt.month
challenge_df['actual_offblock_time_day'] = challenge_df["actual_offblock_time"].dt.day
challenge_df['actual_offblock_time_hour'] = challenge_df["actual_offblock_time"].dt.hour
challenge_df['actual_offblock_time_minue'] = challenge_df["actual_offblock_time"].dt.minute
encode_datetime(challenge_df, 'actual_offblock_time_month', 12)
encode_datetime(challenge_df, 'actual_offblock_time_day', 31)
encode_datetime(challenge_df, 'actual_offblock_time_hour', 24)
encode_datetime(challenge_df, 'actual_offblock_time_minue', 60)
challenge_df

Unnamed: 0,flight_id,date,callsign,adep,name_adep,country_code_adep,ades,name_ades,country_code_ades,actual_offblock_time,...,actual_offblock_time_hour,actual_offblock_time_minue,actual_offblock_time_month_sin,actual_offblock_time_month_cos,actual_offblock_time_day_sin,actual_offblock_time_day_cos,actual_offblock_time_hour_sin,actual_offblock_time_hour_cos,actual_offblock_time_minue_sin,actual_offblock_time_minue_cos
0,248763780,2022-01-01,3840d84f25d3f5fcc0a1be3076bb4039,EGLL,London Heathrow,GB,EICK,Cork,IE,2022-01-01 13:46:00+00:00,...,13,46,5.000000e-01,0.866025,2.012985e-01,0.97953,-2.588190e-01,-0.965926,-0.994522,0.104528
1,248760618,2022-01-01,f6f610e73002b8892a239a81321f7f1d,LEBL,Barcelona,ES,KMIA,Miami,US,2022-01-01 09:55:00+00:00,...,9,55,5.000000e-01,0.866025,2.012985e-01,0.97953,7.071068e-01,-0.707107,-0.500000,0.866025
2,248753824,2022-01-01,139670936660762c230ca92556ba842b,ESSA,Stockholm Arlanda,SE,KORD,Chicago O'Hare,US,2022-01-01 09:39:00+00:00,...,9,39,5.000000e-01,0.866025,2.012985e-01,0.97953,7.071068e-01,-0.707107,-0.809017,-0.587785
3,248753852,2022-01-01,509dc61bb54fbab0e5406067c95603e2,LSZH,Zurich,CH,KPHL,Philadelphia,US,2022-01-01 11:04:00+00:00,...,11,4,5.000000e-01,0.866025,2.012985e-01,0.97953,2.588190e-01,-0.965926,0.406737,0.913545
4,248755934,2022-01-01,d0610d000dcf26b1d7bba8103ecc393d,EIDW,Dublin,IE,EGLL,London Heathrow,GB,2022-01-01 12:36:00+00:00,...,12,36,5.000000e-01,0.866025,2.012985e-01,0.97953,1.224647e-16,-1.000000,-0.587785,-0.809017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369008,258058361,2022-12-31,85ee68e5b7b5acf24ba00d1318eca1e8,LFPG,Paris Charles de Gaulle,FR,KMIA,Miami,US,2022-12-31 09:38:00+00:00,...,9,38,-2.449294e-16,1.000000,-2.449294e-16,1.00000,7.071068e-01,-0.707107,-0.743145,-0.669131
369009,258071247,2022-12-31,570cf7d5ebbd691bcba63e7466607da7,LTFM,iGA Istanbul,TR,EDDB,Berlin Brandenburg,DE,2022-12-31 09:27:00+00:00,...,9,27,-2.449294e-16,1.000000,-2.449294e-16,1.00000,7.071068e-01,-0.707107,0.309017,-0.951057
369010,258059152,2022-12-31,5a7e43e4f981539ae3d3b1cb31591b7c,EDDL,Dusseldorf,DE,EIDW,Dublin,IE,2022-12-31 09:52:00+00:00,...,9,52,-2.449294e-16,1.000000,-2.449294e-16,1.00000,7.071068e-01,-0.707107,-0.743145,0.669131
369011,258072276,2022-12-31,a1c078516f9f9e90cacec61854cad45b,LFPG,Paris Charles de Gaulle,FR,EIDW,Dublin,IE,2022-12-31 09:37:00+00:00,...,9,37,-2.449294e-16,1.000000,-2.449294e-16,1.00000,7.071068e-01,-0.707107,-0.669131,-0.743145


In [14]:
challenge_df['arrival_time'] = pd.to_datetime(challenge_df['arrival_time'], utc=True)
challenge_df['arrival_time_month'] = challenge_df["arrival_time"].dt.month
challenge_df['arrival_time_day'] = challenge_df["arrival_time"].dt.day
challenge_df['arrival_time_hour'] = challenge_df["arrival_time"].dt.hour
challenge_df['arrival_time_minue'] = challenge_df["arrival_time"].dt.minute
encode_datetime(challenge_df, 'arrival_time_month', 12)
encode_datetime(challenge_df, 'arrival_time_day', 31)
encode_datetime(challenge_df, 'arrival_time_hour', 12)
encode_datetime(challenge_df, 'arrival_time_minue', 60)
challenge_df

Unnamed: 0,flight_id,date,callsign,adep,name_adep,country_code_adep,ades,name_ades,country_code_ades,actual_offblock_time,...,arrival_time_hour,arrival_time_minue,arrival_time_month_sin,arrival_time_month_cos,arrival_time_day_sin,arrival_time_day_cos,arrival_time_hour_sin,arrival_time_hour_cos,arrival_time_minue_sin,arrival_time_minue_cos
0,248763780,2022-01-01,3840d84f25d3f5fcc0a1be3076bb4039,EGLL,London Heathrow,GB,EICK,Cork,IE,2022-01-01 13:46:00+00:00,...,15,4,5.000000e-01,0.866025,2.012985e-01,0.97953,1.000000e+00,1.194340e-15,0.406737,0.913545
1,248760618,2022-01-01,f6f610e73002b8892a239a81321f7f1d,LEBL,Barcelona,ES,KMIA,Miami,US,2022-01-01 09:55:00+00:00,...,19,37,5.000000e-01,0.866025,2.012985e-01,0.97953,-5.000000e-01,-8.660254e-01,-0.669131,-0.743145
2,248753824,2022-01-01,139670936660762c230ca92556ba842b,ESSA,Stockholm Arlanda,SE,KORD,Chicago O'Hare,US,2022-01-01 09:39:00+00:00,...,19,8,5.000000e-01,0.866025,2.012985e-01,0.97953,-5.000000e-01,-8.660254e-01,0.743145,0.669131
3,248753852,2022-01-01,509dc61bb54fbab0e5406067c95603e2,LSZH,Zurich,CH,KPHL,Philadelphia,US,2022-01-01 11:04:00+00:00,...,19,32,5.000000e-01,0.866025,2.012985e-01,0.97953,-5.000000e-01,-8.660254e-01,-0.207912,-0.978148
4,248755934,2022-01-01,d0610d000dcf26b1d7bba8103ecc393d,EIDW,Dublin,IE,EGLL,London Heathrow,GB,2022-01-01 12:36:00+00:00,...,13,44,5.000000e-01,0.866025,2.012985e-01,0.97953,5.000000e-01,8.660254e-01,-0.994522,-0.104528
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369008,258058361,2022-12-31,85ee68e5b7b5acf24ba00d1318eca1e8,LFPG,Paris Charles de Gaulle,FR,KMIA,Miami,US,2022-12-31 09:38:00+00:00,...,19,3,-2.449294e-16,1.000000,-2.449294e-16,1.00000,-5.000000e-01,-8.660254e-01,0.309017,0.951057
369009,258071247,2022-12-31,570cf7d5ebbd691bcba63e7466607da7,LTFM,iGA Istanbul,TR,EDDB,Berlin Brandenburg,DE,2022-12-31 09:27:00+00:00,...,12,29,-2.449294e-16,1.000000,-2.449294e-16,1.00000,-2.449294e-16,1.000000e+00,0.104528,-0.994522
369010,258059152,2022-12-31,5a7e43e4f981539ae3d3b1cb31591b7c,EDDL,Dusseldorf,DE,EIDW,Dublin,IE,2022-12-31 09:52:00+00:00,...,11,41,-2.449294e-16,1.000000,-2.449294e-16,1.00000,-5.000000e-01,8.660254e-01,-0.913545,-0.406737
369011,258072276,2022-12-31,a1c078516f9f9e90cacec61854cad45b,LFPG,Paris Charles de Gaulle,FR,EIDW,Dublin,IE,2022-12-31 09:37:00+00:00,...,11,13,-2.449294e-16,1.000000,-2.449294e-16,1.00000,-5.000000e-01,8.660254e-01,0.978148,0.207912


In [15]:
le.fit(challenge_df['aircraft_type'])
challenge_df['aircraft_type_category'] = le.transform(challenge_df['aircraft_type']) + 1
challenge_df[['aircraft_type','aircraft_type_category']]

Unnamed: 0,aircraft_type,aircraft_type_category
0,A320,5
1,B772,19
2,A333,8
3,B788,22
4,A21N,2
...,...,...
369008,B788,22
369009,A21N,2
369010,A320,5
369011,A21N,2


In [16]:
le.fit(challenge_df['wtc'])
challenge_df['wtc_category'] = le.transform(challenge_df['wtc']) + 1
challenge_df[['wtc','wtc_category']]

Unnamed: 0,wtc,wtc_category
0,M,2
1,H,1
2,H,1
3,H,1
4,M,2
...,...,...
369008,H,1
369009,M,2
369010,M,2
369011,M,2


In [17]:
le.fit(challenge_df['airline'])
challenge_df['airline_category'] = le.transform(challenge_df['airline']) + 1
challenge_df[['airline','airline_category']]

Unnamed: 0,airline,airline_category
0,a73f82288988b79be490c6322f4c32ed,21
1,5543e4dc327359ffaf5b9c0e6faaf0e1,11
2,8be5c854fd664bcb97fb543339f74770,19
3,5543e4dc327359ffaf5b9c0e6faaf0e1,11
4,a73f82288988b79be490c6322f4c32ed,21
...,...,...
369008,5543e4dc327359ffaf5b9c0e6faaf0e1,11
369009,6351ec1b849adacc0cbb3b1313d8d39b,15
369010,a73f82288988b79be490c6322f4c32ed,21
369011,a73f82288988b79be490c6322f4c32ed,21


In [18]:
import lightgbm as lgb

# Splitting the data

In [19]:
from sklearn.model_selection import train_test_split
import gc

In [20]:
list(challenge_df)

['flight_id',
 'date',
 'callsign',
 'adep',
 'name_adep',
 'country_code_adep',
 'ades',
 'name_ades',
 'country_code_ades',
 'actual_offblock_time',
 'arrival_time',
 'aircraft_type',
 'wtc',
 'airline',
 'flight_duration',
 'taxiout_time',
 'flown_distance',
 'tow',
 'adep_category',
 'country_code_adep_category',
 'ades_category',
 'country_code_ades_category',
 'actual_offblock_time_month',
 'actual_offblock_time_day',
 'actual_offblock_time_hour',
 'actual_offblock_time_minue',
 'actual_offblock_time_month_sin',
 'actual_offblock_time_month_cos',
 'actual_offblock_time_day_sin',
 'actual_offblock_time_day_cos',
 'actual_offblock_time_hour_sin',
 'actual_offblock_time_hour_cos',
 'actual_offblock_time_minue_sin',
 'actual_offblock_time_minue_cos',
 'arrival_time_month',
 'arrival_time_day',
 'arrival_time_hour',
 'arrival_time_minue',
 'arrival_time_month_sin',
 'arrival_time_month_cos',
 'arrival_time_day_sin',
 'arrival_time_day_cos',
 'arrival_time_hour_sin',
 'arrival_time_hou

In [21]:
x, y = challenge_df[[
    'flight_duration',
    'taxiout_time',
    'flown_distance', 
    'adep_category',
    'country_code_adep_category',
    'ades_category',
    'country_code_ades_category',
    'actual_offblock_time_month',
    'actual_offblock_time_day',
    'actual_offblock_time_hour',
    'actual_offblock_time_minue',
    'actual_offblock_time_month_sin',
    'actual_offblock_time_month_cos',
    'actual_offblock_time_day_sin',
    'actual_offblock_time_day_cos',
    'actual_offblock_time_hour_sin',
    'actual_offblock_time_hour_cos',
    'actual_offblock_time_minue_sin',
    'actual_offblock_time_minue_cos',
    'arrival_time_month',
    'arrival_time_day',
    'arrival_time_hour',
    'arrival_time_minue',
    'arrival_time_month_sin',
    'arrival_time_month_cos',
    'arrival_time_day_sin',
    'arrival_time_day_cos',
    'arrival_time_hour_sin',
    'arrival_time_hour_cos',
    'arrival_time_minue_sin',
    'arrival_time_minue_cos',
    'aircraft_type_category',
    'wtc_category',
    'airline_category']], challenge_df['tow']

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## Proper model
source: https://datascience.stackexchange.com/questions/103766/lightgbm-regressor-score-function

In [25]:
from lightgbm import LGBMRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import root_mean_squared_error, r2_score


model = LGBMRegressor(learning_rate=0.001,force_row_wise=true)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print("model score", model.score(x_train, y_train))
# 0.9863556751160256

print("rmse:", root_mean_squared_error(y_test, y_pred))
print("r2 score", r2_score(y_test, y_pred))

NameError: name 'true' is not defined

# Rate of climb(roc) and speed profile