In [None]:
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the labelled challenge set (used for feature engineering and training
# below) and the submission set. Paths are relative to the notebook's working
# directory.
challenge_df = pd.read_csv("../data/challenge_set.csv")
# NOTE(review): submission_df is not referenced again in the cells visible
# here; a later cell re-reads the same file into `sample`.
submission_df = pd.read_csv("../data/submission_set.csv")

In [None]:
# Summary statistics for the challenge set as loaded.
challenge_df.describe()

In [None]:
# Peek at the first rows to see the raw column values.
challenge_df.head()

In [None]:
# Column names of the raw challenge set, as a plain Python list.
challenge_df.columns.tolist()

## Feature Encoding:

In [None]:
from sklearn import preprocessing
# One shared LabelEncoder instance. The cells below re-fit it for every
# categorical column, so after each fit it only remembers the classes of the
# most recently encoded column.
le = preprocessing.LabelEncoder()

In [None]:
# Integer-encode the `adep` column. LabelEncoder yields codes starting at 0;
# the `+ 1` shifts them so the smallest category id is 1.
challenge_df['adep_category'] = le.fit_transform(challenge_df['adep']) + 1
# Show the raw value next to its code to check the mapping.
challenge_df.loc[:, ['adep', 'adep_category']]

In [None]:
# Integer-encode `country_code_adep` (re-fits the shared encoder); codes are
# shifted by one so they start at 1.
challenge_df['country_code_adep_category'] = le.fit_transform(challenge_df['country_code_adep']) + 1
# Show the raw value next to its code to check the mapping.
challenge_df.loc[:, ['country_code_adep', 'country_code_adep_category']]

In [None]:
# Integer-encode the `ades` column. The shared encoder is re-fitted on this
# column, and the 0-based codes are shifted by one so they start at 1.
le.fit(challenge_df['ades'])
challenge_df['ades_category'] = le.transform(challenge_df['ades']) + 1
# Show the raw value next to the code that was just created for it. This
# previously displayed `adep_category`, which belongs to a different column
# and made the check misleading.
challenge_df[['ades','ades_category']]

In [None]:
# Integer-encode `country_code_ades` (re-fits the shared encoder); codes are
# shifted by one so they start at 1.
challenge_df['country_code_ades_category'] = le.fit_transform(challenge_df['country_code_ades']) + 1
# Show the raw value next to its code to check the mapping.
challenge_df.loc[:, ['country_code_ades', 'country_code_ades_category']]

In [None]:
def encode_datetime(data, col, max_val):
    """Add cyclical sine/cosine encodings of a periodic column.

    The values in ``data[col]`` are mapped onto a circle of period ``max_val``
    and stored in two new columns, ``<col>_sin`` and ``<col>_cos``.

    Args:
        data: DataFrame to extend. It is modified in place.
        col: Name of the existing numeric column to encode.
        max_val: Period of the cycle (e.g. 24 for an hour-of-day column).

    Returns:
        The same ``data`` object, with the two new columns added.
    """
    # Angle on the unit circle for every row.
    angle = 2 * np.pi * data[col]/max_val
    for suffix, wave in (('_sin', np.sin), ('_cos', np.cos)):
        data[col + suffix] = wave(angle)
    return data

In [None]:
# Parse the off-block timestamp as timezone-aware UTC, then derive calendar
# components and their cyclical encodings.
challenge_df['actual_offblock_time'] = pd.to_datetime(challenge_df['actual_offblock_time'], utc=True)
_offblock = challenge_df['actual_offblock_time'].dt
# (new column, values, cycle length). The 'minue' spelling is kept on purpose:
# later cells select these columns by that exact name.
_offblock_parts = (
    ('actual_offblock_time_month', _offblock.month, 12),
    ('actual_offblock_time_day', _offblock.day, 31),
    ('actual_offblock_time_hour', _offblock.hour, 24),
    ('actual_offblock_time_minue', _offblock.minute, 60),
)
# Add all raw components first, then all sin/cos pairs, so the resulting
# column order is: month, day, hour, minue, then the encoded pairs.
for _name, _values, _period in _offblock_parts:
    challenge_df[_name] = _values
for _name, _values, _period in _offblock_parts:
    encode_datetime(challenge_df, _name, _period)
challenge_df

In [None]:
# Parse the arrival timestamp as timezone-aware UTC, then derive calendar
# components and their cyclical (sin/cos) encodings.
challenge_df['arrival_time'] = pd.to_datetime(challenge_df['arrival_time'], utc=True)
challenge_df['arrival_time_month'] = challenge_df["arrival_time"].dt.month
challenge_df['arrival_time_day'] = challenge_df["arrival_time"].dt.day
challenge_df['arrival_time_hour'] = challenge_df["arrival_time"].dt.hour
# The 'minue' spelling is kept on purpose: later cells select these columns
# by that exact name.
challenge_df['arrival_time_minue'] = challenge_df["arrival_time"].dt.minute
encode_datetime(challenge_df, 'arrival_time_month', 12)
encode_datetime(challenge_df, 'arrival_time_day', 31)
# `.dt.hour` is 0-23, so the cycle length is 24. This was 12, which gave
# hours h and h+12 identical sin/cos values and disagreed with the
# off-block hour encoding.
encode_datetime(challenge_df, 'arrival_time_hour', 24)
encode_datetime(challenge_df, 'arrival_time_minue', 60)
challenge_df

In [None]:
# Integer-encode `aircraft_type` (re-fits the shared encoder); codes are
# shifted by one so they start at 1.
challenge_df['aircraft_type_category'] = le.fit_transform(challenge_df['aircraft_type']) + 1
# Show the raw value next to its code to check the mapping.
challenge_df.loc[:, ['aircraft_type', 'aircraft_type_category']]

In [None]:
# Integer-encode `wtc` (re-fits the shared encoder); codes are shifted by one
# so they start at 1.
challenge_df['wtc_category'] = le.fit_transform(challenge_df['wtc']) + 1
# Show the raw value next to its code to check the mapping.
challenge_df.loc[:, ['wtc', 'wtc_category']]

In [None]:
# Integer-encode `airline` (re-fits the shared encoder); codes are shifted by
# one so they start at 1.
challenge_df['airline_category'] = le.fit_transform(challenge_df['airline']) + 1
# Show the raw value next to its code to check the mapping.
challenge_df.loc[:, ['airline', 'airline_category']]

In [None]:
import lightgbm as lgb

# Splitting the data

In [None]:
from sklearn.model_selection import train_test_split
import gc

In [None]:
# Column names after feature engineering, as a plain Python list.
challenge_df.columns.tolist()

In [None]:
# Columns fed to the model, in the order they appear in the design matrix.
# NOTE(review): `flight_id` looks like a row identifier and the arrival-time
# features may not be known at prediction time — confirm both belong here.
feature_columns = [
    'flight_id',
    'flight_duration',
    'taxiout_time',
    'flown_distance',
    # Label-encoded categoricals for origin / destination.
    'adep_category',
    'country_code_adep_category',
    'ades_category',
    'country_code_ades_category',
    # Off-block time: raw components, then their sin/cos encodings.
    'actual_offblock_time_month',
    'actual_offblock_time_day',
    'actual_offblock_time_hour',
    'actual_offblock_time_minue',
    'actual_offblock_time_month_sin',
    'actual_offblock_time_month_cos',
    'actual_offblock_time_day_sin',
    'actual_offblock_time_day_cos',
    'actual_offblock_time_hour_sin',
    'actual_offblock_time_hour_cos',
    'actual_offblock_time_minue_sin',
    'actual_offblock_time_minue_cos',
    # Arrival time: raw components, then their sin/cos encodings.
    'arrival_time_month',
    'arrival_time_day',
    'arrival_time_hour',
    'arrival_time_minue',
    'arrival_time_month_sin',
    'arrival_time_month_cos',
    'arrival_time_day_sin',
    'arrival_time_day_cos',
    'arrival_time_hour_sin',
    'arrival_time_hour_cos',
    'arrival_time_minue_sin',
    'arrival_time_minue_cos',
    # Label-encoded aircraft / operator categoricals.
    'aircraft_type_category',
    'wtc_category',
    'airline_category',
]
# Design matrix and regression target (`tow`).
x = challenge_df[feature_columns]
y = challenge_df['tow']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Wrap the training split in a LightGBM Dataset, with the `tow` values as labels.
d_train = lgb.Dataset(x_train, label=y_train)

In [None]:
# LightGBM training configuration. Several keys are LightGBM aliases:
#   sub_feature -> feature_fraction, min_data -> min_data_in_leaf,
#   min_hessian -> min_sum_hessian_in_leaf.
# NOTE(review): no num_boost_round is passed to lgb.train, so the library
# default applies; combined with a 0.001 learning rate the model may be
# heavily under-fitted — confirm this is intended.
params = {
    'max_bin': 10,
    'learning_rate': 0.001,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l1',            # absolute-error metric
    'sub_feature': 0.5,
    'bagging_fraction': 0.85,
    'bagging_freq': 40,
    'num_leaves': 512,
    'min_data': 500,
    'min_hessian': 0.05,
    'verbose': 0,
}
print("\nFitting LightGBM model ...")
clf = lgb.train(params, d_train)

In [None]:
# Load the submission set into `sample`.
# The former `sample['flight_id'] = sample['flight_id']` line assigned the
# column to itself and had no effect, so it has been removed.
sample = pd.read_csv('../data/submission_set.csv')

In [None]:
# Predict on the held-out split and print the first few predictions.
p_test = clf.predict(x_test)
print("\nUnadjusted LightGBM predictions:", pd.DataFrame(p_test).head(), sep="\n")

In [None]:
# Render the hold-out predictions as a DataFrame for inspection.
pd.DataFrame(p_test)

In [None]:
# Round the predictions to 4 decimal places in one vectorised call. This
# replaces a Python-level loop that appended element by element and carried
# an unused enumerate index.
y_pred = np.round(np.asarray(p_test), 4)
# Convert the target to a plain NumPy array so the metrics below compare two
# arrays positionally rather than by the Series index.
y_test = np.array(y_test)

In [None]:
# NOTE(review): root_mean_squared_error only exists in recent scikit-learn
# releases (1.4+, to my knowledge) — confirm the environment provides it.
from sklearn.metrics import root_mean_squared_error, r2_score
# Coefficient of determination on the held-out split.
r2_score(y_test, y_pred)

In [None]:
# Root-mean-squared error on the held-out split, in the same units as `tow`.
root_mean_squared_error(y_test, y_pred)

# Rate of climb (ROC) and speed profile