In [1]:
import numpy as np
import pandas as pd
import os
import datetime
# import pandas_profiling
import gc
import time
import random
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from tqdm import tqdm_notebook as tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score, accuracy_score, confusion_matrix, recall_score, precision_score, f1_score,mean_absolute_error
from math import sqrt

from sklearn import metrics                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, CatBoostClassifier

pd.set_option('display.max_columns', 250)
pd.set_option('display.max_rows', 250)
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
from multiprocessing import  Pool
def parallelize_dataframe(df, func, n_cores=12):
    """Apply ``func`` to consecutive row chunks of ``df`` and concatenate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is cut into ``n_cores`` consecutive row slices.
    func : callable
        Takes one chunk (a DataFrame) and must RETURN the processed chunk.
        When ``n_cores > 1`` it runs in worker processes, so it has to be
        picklable and any in-place change it makes to its argument is lost
        unless the chunk is returned.
    n_cores : int, default 12
        Number of chunks / worker processes. ``1`` runs ``func`` in the
        current process without creating a pool.

    Returns
    -------
    pandas.DataFrame
        ``pd.concat`` of the per-chunk results, in the original row order.

    Raises
    ------
    ValueError
        If ``n_cores`` is smaller than 1.
    """
    if n_cores < 1:
        raise ValueError("n_cores must be at least 1")

    # Chunk sizes follow np.array_split: the first (n_rows % n_cores) chunks
    # receive one extra row. Slicing with iloc avoids calling np.array_split on
    # a DataFrame, which goes through the deprecated DataFrame.swapaxes.
    base, extra = divmod(len(df), n_cores)
    bounds = [0]
    for chunk_no in range(n_cores):
        bounds.append(bounds[-1] + base + (1 if chunk_no < extra else 0))
    df_split = [df.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]

    if n_cores == 1:
        # Nothing to parallelise: skip process start-up and pickling.
        results = [func(chunk) for chunk in df_split]
    else:
        # The context manager releases the workers even when func raises.
        with Pool(n_cores) as pool:
            results = pool.map(func, df_split)
    return pd.concat(results)

def mape(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (not multiplied by 100).

    Both arguments must support element-wise arithmetic (NumPy arrays or
    pandas Series). Entries where ``y_true`` is zero produce ``inf``/``nan``
    because no zero guard is applied.
    """
    relative_error = (y_true - y_pred) / (y_true)
    return np.abs(relative_error).mean()

def preprocessing(train,test):
    """Label-encode every object-dtype column of ``train`` in both frames.

    For each such column, missing values become the string "unseen", one
    ``LabelEncoder`` is fitted on the train and test values together (so test
    never contains an unknown label), and the integer codes are stored back
    as ``category`` dtype. Both frames are modified in place and returned as
    ``(train, test)``.
    """
    object_columns = train.select_dtypes("object").columns
    for name in tqdm(object_columns):
        if name not in train.columns:
            continue

        for frame in (train, test):
            frame[name] = frame[name].fillna("unseen").astype(str)

        # Fit on the union of both frames so the codes are shared.
        encoder = LabelEncoder()
        encoder.fit(list(train[name]) + list(test[name]))

        for frame in (train, test):
            frame[name] = encoder.transform(frame[name])
            frame[name] = frame[name].astype("category")
    return train, test

def kfold_lightgbm(params, train_df, test_df, FEATS_EXCLUDED,n_estimators, LOCAL_TEST=True):
    """Train LightGBM with 3-fold cross-validation on the ``PSGR_COUNT`` target.

    Parameters
    ----------
    params : dict
        Passed unchanged to ``lgb.train``.
    train_df, test_df : pandas.DataFrame
        ``train_df`` must contain ``PSGR_COUNT``; ``test_df`` must contain
        every feature column when ``LOCAL_TEST`` is False.
    FEATS_EXCLUDED : list
        Column names that must not be used as features (``PSGR_COUNT`` is
        always excluded as well).
    n_estimators : int
        Upper bound on boosting rounds; early stopping uses a patience of 200.
    LOCAL_TEST : bool, default True
        When False, each fold also predicts ``test_df`` and the fold
        predictions are averaged.

    Returns
    -------
    tuple
        ``(sub_preds, oof_preds, feature_importance)``: averaged test
        predictions (all zeros when ``LOCAL_TEST`` is True), out-of-fold
        train predictions, and a long DataFrame with columns ``feature``,
        ``importance`` (log1p of gain) and ``fold``.
    """
    print("Starting LightGBM. Train shape: {}".format(train_df.shape))
    splitter = KFold(n_splits=3, shuffle=True, random_state=326)

    # Out-of-fold predictions for train, fold-averaged predictions for test.
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance = pd.DataFrame()

    not_features = FEATS_EXCLUDED + ["PSGR_COUNT"]
    feats = [name for name in train_df.columns if name not in not_features]

    fold_indices = splitter.split(train_df[feats], train_df['PSGR_COUNT'])
    for fold_no, (train_idx, valid_idx) in enumerate(fold_indices, start=1):
        train_x = train_df[feats].iloc[train_idx]
        train_y = train_df['PSGR_COUNT'].iloc[train_idx]
        valid_x = train_df[feats].iloc[valid_idx]
        valid_y = train_df['PSGR_COUNT'].iloc[valid_idx]

        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)

        # NOTE(review): early_stopping_rounds / verbose_eval are keyword
        # arguments of older LightGBM releases; newer ones expect callbacks.
        # Confirm against the installed version.
        reg = lgb.train(
            params,
            lgb_train,
            num_boost_round=n_estimators,
            valid_sets=[lgb_train, lgb_test],
            valid_names=['train', 'valid'],
            early_stopping_rounds=200,
            verbose_eval=1000,
        )

        oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
        fold_pred = oof_preds[valid_idx]

        # Only an exact False triggers test-set scoring.
        if LOCAL_TEST == False:
            test_pred = reg.predict(test_df[feats], num_iteration=reg.best_iteration)
            sub_preds += test_pred / splitter.n_splits

        gain = reg.feature_importance(importance_type='gain', iteration=reg.best_iteration)
        imp = pd.DataFrame({
            "feature": train_x.columns,
            "importance": np.log1p(gain),
            "fold": fold_no,
        })
        feature_importance = pd.concat([feature_importance, imp], axis=0)

        curr_rmse = sqrt(mean_squared_error(valid_y, fold_pred))
        curr_mape = mape(valid_y, fold_pred)
        curr_mae = mean_absolute_error(valid_y, fold_pred)
        print('Fold %2d -> rmse : %.6f -- mape : %.6f -- mae : %.6f' % (fold_no, curr_rmse,curr_mape,curr_mae))

        # Drop the fold's booster and data before the next fold starts.
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Overall scores are computed on the complete out-of-fold vector.
    target = train_df['PSGR_COUNT']
    final_rmse = sqrt(mean_squared_error(target, oof_preds))
    final_mape = mape(target, oof_preds)
    final_mae = mean_absolute_error(target, oof_preds)

    print('Overall RMSE : %.6f - Overall MAPE : %.6f - Overall MAE : %.6f' % (final_rmse,final_mape,final_mae))

    return sub_preds, oof_preds, feature_importance

In [3]:
def day_features(df):
    """Add calendar features derived from ``LEG1_DEP_DATE_GMT`` and return ``df``.

    ``LEG1_DEP_DATE_GMT`` must already be a datetime column. Five integer
    columns are added in place: month, ISO week, day of year, day of week
    (Monday = 0) and day of month.

    The frame is returned so the function can be used with
    ``parallelize_dataframe``: changes made inside a worker process are only
    visible to the caller through the return value.
    """
    departure = df['LEG1_DEP_DATE_GMT'].dt

    # `.dt.weekofyear` does not exist in recent pandas; `.isocalendar().week`
    # is the replacement. Fall back to the old accessor on old installations.
    if hasattr(departure, 'isocalendar'):
        week = departure.isocalendar().week
    else:
        week = departure.weekofyear

    df['LEG1_DEP_DATE_GMT_MONTH'] = departure.month.astype(np.int8)
    df['LEG1_DEP_DATE_GMT_WEEK'] = week.astype(np.int8)
    df['LEG1_DEP_DATE_GMT_DAYOFYEAR'] = departure.dayofyear.astype(np.int16)
    df['LEG1_DEP_DATE_GMT_DAYOFWEEK'] = departure.dayofweek.astype(np.int16)
    df['LEG1_DEP_DATE_GMT_DAYOFMONTH'] = departure.day.astype(np.int16)
    return df

def time_features(df):
    """Split HHMM times into hour/minute and build full departure/arrival timestamps.

    For each of ``LEG1_DEP``, ``LEG2_DEP``, ``LEG1_ARR`` and ``LEG2_ARR``:

    * ``<leg>_TIME_GMT`` (an HHMM number such as 930 or 2359) yields
      ``<leg>_TIME_GMT_HOUR`` and ``<leg>_TIME_GMT_MINUTE``;
    * ``<leg>_DATE_GMT`` (a YYYYMMDD value) is replaced by a datetime that
      includes that hour and minute.

    The frame is modified in place and returned. The work is vectorized
    instead of calling ``int`` / ``strptime`` once per row.
    """
    for leg in ("LEG1_DEP", "LEG2_DEP", "LEG1_ARR", "LEG2_ARR"):
        time_col = leg + "_TIME_GMT"
        date_col = leg + "_DATE_GMT"
        hour_col = time_col + "_HOUR"
        minute_col = time_col + "_MINUTE"

        hhmm = df[time_col].astype(int)
        df[hour_col] = hhmm // 100
        df[minute_col] = hhmm % 100

        day = pd.to_datetime(df[date_col].astype(str), format="%Y%m%d")
        df[date_col] = (
            day
            + pd.to_timedelta(df[hour_col], unit='h')
            + pd.to_timedelta(df[minute_col], unit='m')
        )
    return df

In [None]:
# https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features
def add_noise(series, noise_level):
    """Scale each value by ``1 + noise_level * z`` with ``z`` drawn from N(0, 1).

    Draws come from NumPy's global random state, so results are reproducible
    only if the caller seeds it. ``noise_level == 0`` leaves values unchanged.
    """
    gaussian = np.random.randn(len(series))
    scale = 1 + noise_level * gaussian
    return series * scale

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothed target-mean encoding of a categorical feature.

    The smoothing follows the paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf

    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series (same name as trn_series)
    target : target data as a pd.Series, aligned with trn_series
    min_samples_leaf (int) : category count at which the category mean and the
        global mean are weighted equally
    smoothing (int) : controls how fast the weight moves from the global mean
        to the category mean as the count grows
    noise_level : relative standard deviation of the multiplicative noise
        applied to both outputs (0 disables it)

    Returns a pair (encoded train series, encoded test series), both named
    "<feature>_mean" and indexed like their inputs. Categories without an
    encoding are filled with the global target mean.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    key = trn_series.name

    # Per-category mean and number of observations of the target.
    stats = pd.concat([trn_series, target], axis=1).groupby(by=key)[target.name].agg(["mean", "count"])

    # Sigmoid weight in (0, 1): the larger the count, the more the category
    # mean is trusted over the global prior.
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()
    stats[target.name] = prior * (1 - weight) + stats["mean"] * weight
    encoding = stats.drop(["mean", "count"], axis=1)

    # Lookup table: one row per category, encoded value in column 'PSGR_COUNT'.
    lookup = encoding.reset_index().rename(columns={'index': target.name, target.name: 'PSGR_COUNT'})

    def _encode(series):
        merged = pd.merge(series.to_frame(series.name), lookup, on=series.name, how='left')
        encoded = merged['PSGR_COUNT'].rename(key + '_mean').fillna(prior)
        # pd.merge resets the index, so restore the caller's one.
        encoded.index = series.index
        return encoded

    ft_trn_series = _encode(trn_series)
    ft_tst_series = _encode(tst_series)
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [4]:
# Columns to be read as pandas categoricals when the CSV files are loaded.
dtypes_dict = dict.fromkeys(
    ['CARRIER', 'FLIGHT_NO', 'AIRCRAFT_TYPE', 'OND', 'OND_SELL_CLASS', 'LEG1_SELL_CLASS',
     'OND_CABIN_CLASS', 'LEG1_CABIN_CLASS', 'ORIGIN', 'HUB', 'DESTINATION'],
    "category",
)

In [7]:
# Load the training file and the file to be scored, using the categorical
# dtypes declared in dtypes_dict.
train, test = (
    pd.read_csv(path, dtype=dtypes_dict)
    for path in ("datathon_case_2/case_2_train_data.csv",
                 "datathon_case_2/case_2_result.csv")
)
print(train.shape, test.shape)

(90562838, 23) (100000, 23)


In [8]:
# Build hour/minute columns and full timestamps for every leg, in parallel
# (parallelize_dataframe's default number of workers).
train = parallelize_dataframe(train, time_features)
test = parallelize_dataframe(test, time_features)

In [10]:
# train.to_pickle('train.pkl')
# test.to_pickle('test.pkl')

# train = pickle.load( open( "train.pkl", "rb" ) )
# test = pickle.load( open( "test.pkl", "rb" ) )

In [6]:
# Replace missing FLIGHT_NO / AIRCRAFT_TYPE values with the label "UNSEEN".
# The fill has to happen BEFORE the string cast: astype(str) turns NaN into
# the literal text "nan", after which fillna has nothing left to fill.
# Going through object dtype first also lets fillna insert a label that is
# not yet one of the column's categories.
for frame in (train, test):
    for col in ("FLIGHT_NO", "AIRCRAFT_TYPE"):
        frame[col] = (
            frame[col]
            .astype(object)
            .fillna("UNSEEN")
            .astype(str)
            .astype("category")
        )

### Target Encoding

In [None]:
# Smoothed target-mean encoding of each categorical column, stored as
# "<col>_MEAN" in both frames. The noise uses NumPy's global random state,
# which this notebook does not seed.
for col in ['CARRIER', 'FLIGHT_NO', 'AIRCRAFT_TYPE', 'OND', 'ORIGIN','HUB', 'DESTINATION']:
    train[col+"_MEAN"], test[col+"_MEAN"] = target_encode(
        train[col],
        test[col],
        target=train["PSGR_COUNT"],
        min_samples_leaf=100,
        smoothing=10,
        noise_level=0.01,
    )

### Frequency Encoding

In [8]:
# Frequency encoding: "<col>_FREQ" holds how often the row's category occurs
# in train and test combined.
# NOTE(review): the source columns are categoricals, so .map may return a
# categorical result rather than plain integers - confirm the dtype.
for col in ['CARRIER', 'FLIGHT_NO', 'AIRCRAFT_TYPE', 'OND', 'ORIGIN','HUB', 'DESTINATION']:
    print(col)
    freq_dict = pd.concat([train[col],test[col]]).value_counts().to_dict()
    for frame in (train, test):
        frame[col+"_FREQ"] = frame[col].map(freq_dict)
    
#     agg_opts = ['mean']
#     agg_dict = train.groupby(col).agg({'PSGR_COUNT':agg_opts}).to_dict()
#     for opt in agg_opts:
#         train[col+"_"+opt] = train[col].map(agg_dict[('PSGR_COUNT', opt)])


CARRIER
FLIGHT_NO
AIRCRAFT_TYPE
OND
ORIGIN
HUB
DESTINATION


In [11]:
# for col in ['CARRIER', 'FLIGHT_NO', 'AIRCRAFT_TYPE', 'OND', 'ORIGIN','HUB', 'DESTINATION']:
#     print(col)
#     temp = train[[col,col+'_mean']].drop_duplicates()
#     test = test.merge(temp,'left',col)

CARRIER
FLIGHT_NO
AIRCRAFT_TYPE
OND
ORIGIN
HUB
DESTINATION


In [None]:
# train.to_pickle('train2.pkl')
# test.to_pickle('test2.pkl')

In [None]:
# Add the calendar features of the first-leg departure, in parallel.
train = parallelize_dataframe(train, day_features)
test = parallelize_dataframe(test, day_features)
# train.to_pickle('train3.pkl')
# test.to_pickle('test3.pkl')

In [4]:
# train = pickle.load( open( "train3.pkl", "rb" ) )
# test = pickle.load( open( "test3.pkl", "rb" ) )

In [5]:
# OND_ALL: origin, hub and destination codes concatenated into one
# categorical key.
for frame in (train, test):
    frame["OND_ALL"] = (
        frame["ORIGIN"].astype("str")
        + frame["HUB"].astype(str)
        + frame["DESTINATION"].astype(str)
    ).astype("category")

In [6]:
# Durations in whole minutes (fractions truncated by the int cast):
# each leg on its own, and first departure to second arrival.
ONE_MINUTE = np.timedelta64(60, 's')
DURATION_SPANS = (
    ("LEG1_DURATION", "LEG1_DEP_DATE_GMT", "LEG1_ARR_DATE_GMT"),
    ("LEG2_DURATION", "LEG2_DEP_DATE_GMT", "LEG2_ARR_DATE_GMT"),
    ("LEG_DURATION", "LEG1_DEP_DATE_GMT", "LEG2_ARR_DATE_GMT"),
)
for frame in (train, test):
    for duration_col, start_col, end_col in DURATION_SPANS:
        frame[duration_col] = ((frame[end_col] - frame[start_col]) / ONE_MINUTE).astype(int)

In [7]:
# Frequency encoding of the combined route key, counted over train + test.
col = "OND_ALL"
freq_dict = pd.concat([train[col],test[col]]).value_counts().to_dict()
for frame in (train, test):
    frame[col+"_FREQ"] = frame[col].map(freq_dict)

In [8]:
# Display (rows, columns) of the training frame after feature engineering.
train.shape

(90562838, 55)

In [10]:
# for col in ['CARRIER', 'FLIGHT_NO', 'AIRCRAFT_TYPE', 'OND', 'OND_SELL_CLASS', 'LEG1_SELL_CLASS', 'OND_CABIN_CLASS', 'LEG1_CABIN_CLASS', 'ORIGIN', 'HUB', 'DESTINATION', 'OND_ALL']:
#     print(col,test.loc[test[col].isin(train[col])==False,col].nunique())

CARRIER 0
FLIGHT_NO 0
AIRCRAFT_TYPE 0
OND 39
OND_SELL_CLASS 0
LEG1_SELL_CLASS 0
OND_CABIN_CLASS 0
LEG1_CABIN_CLASS 0
ORIGIN 0
HUB 0
DESTINATION 0
OND_ALL 127


In [11]:
# Columns that must never reach the model: the record id, the raw date/time
# columns and the raw categoricals that already have encoded versions.
# ("LEG1_DEP_TIME_GMT" was listed twice; the duplicate is removed.)
FEATS_EXCLUDED = ["ID_REC","LEG1_DEP_DATE_GMT","LEG1_ARR_DATE_GMT","LEG2_DEP_DATE_GMT","LEG2_ARR_DATE_GMT",
                  "LEG1_DEP_TIME_GMT","LEG1_ARR_TIME_GMT","LEG2_DEP_TIME_GMT","LEG2_ARR_TIME_GMT",
                  'CARRIER', 'FLIGHT_NO', 'AIRCRAFT_TYPE', 'OND', 'ORIGIN','HUB', 'DESTINATION']

# Remove them in place (no copy of the large frame). Each frame is checked
# separately so that a column missing from one frame cannot raise KeyError
# after the other frame has already been modified; re-running is a no-op.
for frame in (train, test):
    for col in FEATS_EXCLUDED:
        if col in frame:
            del frame[col]

gc.collect()

41

In [23]:
# Force plain numeric dtypes on the encoded columns of the three listed
# features. The target-encoding cell stores its result as "<col>_MEAN";
# the previous "<col>_mean" spelling is not created by any cell and raised
# KeyError.
for frame in (train, test):
    for col in ("CARRIER", "FLIGHT_NO", "AIRCRAFT_TYPE"):
        frame[col + "_FREQ"] = frame[col + "_FREQ"].astype(int)
        frame[col + "_MEAN"] = frame[col + "_MEAN"].astype(float)

In [None]:
# LightGBM settings. The dict used to contain 'num_leaves' twice (10, then
# 31 - only the last value of a repeated key survives) and both 'min_data'
# and 'min_data_in_leaf' (two names for the same setting, both 50). The
# effective values are kept and each setting now appears once.
# NOTE(review): 'bagging_fraction' is set without 'bagging_freq'; check the
# LightGBM docs on whether bagging is active in that case.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'verbose': -1,
    'nthread': 32,
    'max_depth': 10,
    'num_leaves': 31,
    'min_data_in_leaf': 50,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
}

# kfold_lightgbm returns exactly three values; unpacking four raised
# ValueError only after the whole training run had finished.
sub_preds, oof_preds, feature_importance = kfold_lightgbm(params,train,test,FEATS_EXCLUDED,9999,LOCAL_TEST=False)

Starting LightGBM. Train shape: (90562838, 39)
Training until validation scores don't improve for 200 rounds.
[1000]	train's rmse: 2.60389	valid's rmse: 2.63329
[2000]	train's rmse: 2.57422	valid's rmse: 2.60941
[3000]	train's rmse: 2.55596	valid's rmse: 2.59551
[4000]	train's rmse: 2.54272	valid's rmse: 2.58583
[5000]	train's rmse: 2.53129	valid's rmse: 2.57785
[6000]	train's rmse: 2.52148	valid's rmse: 2.57084
[7000]	train's rmse: 2.51352	valid's rmse: 2.56566
[8000]	train's rmse: 2.50573	valid's rmse: 2.5606
[9000]	train's rmse: 2.49892	valid's rmse: 2.55645
Did not meet early stopping. Best iteration is:
[9999]	train's rmse: 2.49246	valid's rmse: 2.55233


In [None]:
# Raise every prediction below 1 up to 1, then re-score the out-of-fold
# predictions. (Presumably a passenger count is never below 1 - confirm.)
oof_preds1, sub_preds1 = (
    np.where(preds<1,1,preds) for preds in (oof_preds, sub_preds)
)

y_true = train['PSGR_COUNT']
final_rmse = sqrt(mean_squared_error(y_true, oof_preds1))
final_mape = mape(y_true, oof_preds1)
final_mae = mean_absolute_error(y_true, oof_preds1)
print('Final rmse : %.6f -- mape : %.6f -- mae : %.6f' % (final_rmse,final_mape,final_mae))

In [None]:
# ID_REC was deleted from train/test with the other excluded columns, so
# read just that column again from the source files.
train_id, test_id = (
    pd.read_csv(path, usecols=["ID_REC"])
    for path in ("datathon_case_2/case_2_train_data.csv",
                 "datathon_case_2/case_2_result.csv")
)

In [37]:
# Save the clipped out-of-fold predictions next to their record ids.
# Rows are paired by position, so train must still be in file order.
train_pred = pd.DataFrame({
    "ID_REC": train_id["ID_REC"],
    "PSGR_COUNT": oof_preds1,
})
train_pred.to_csv("model_target_oof_v1.csv",index=False)

In [None]:
# Save the clipped test predictions next to their record ids.
# Rows are paired by position, so test must still be in file order.
test_pred = pd.DataFrame({
    "ID_REC": test_id["ID_REC"],
    "PSGR_COUNT": sub_preds1,
})
test_pred.to_csv("Result.csv",index=False)