In [None]:
__author__ = 'Nick Sarris (ngs5st)'

import os
import random
import time
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix, hstack

import xgboost as xgb
import lightgbm as lgb
import catboost as ctb

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from keras import backend as K

# Show which files are present in the ./data/ directory.
print(os.listdir("./data/"))

In [None]:
def seed_everything(seed=1235):
    """Apply one seed to PYTHONHASHSEED, the stdlib RNG and NumPy's global RNG.

    seed: integer seed used for all three (default 1235).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(1235)

In [None]:
# Read the train and test CSV files and report how long loading took.
start_time = time.time()
print("Loading Data ...")

directory = "./data/"
train_df, test_df = (pd.read_csv(directory + name) for name in ('train.csv', 'test.csv'))

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Parse the timestamp columns and turn the store-and-forward flag into 0/1.
start_time = time.time()
print("Feature Engineering [1] ...")

train_df['pickup_datetime'] = pd.to_datetime(train_df['pickup_datetime'])
test_df['pickup_datetime'] = pd.to_datetime(test_df['pickup_datetime'])
# Only the training frame has its dropoff timestamp parsed here.
train_df['dropoff_datetime'] = pd.to_datetime(train_df['dropoff_datetime'])

# 'Y' becomes 1, every other value becomes 0.
train_is_stored = train_df['store_and_fwd_flag'].values == 'Y'
train_df['store_and_fwd_flag'] = 1 * train_is_stored
test_is_stored = test_df['store_and_fwd_flag'].values == 'Y'
test_df['store_and_fwd_flag'] = 1 * test_is_stored

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Derive calendar and time-of-day columns from pickup_datetime for both frames.
start_time = time.time()
print("Feature Engineering [2] ...")


def _add_pickup_time_parts(frame, origin):
    """Add pickup date/time component columns to `frame` in place.

    frame: DataFrame holding a datetime `pickup_datetime` column.
    origin: timestamp subtracted from each pickup to obtain `pickup_dt` in seconds.
    """
    stamps = frame['pickup_datetime']
    frame.loc[:, 'pickup_date'] = stamps.dt.date
    frame.loc[:, 'pickup_weekday'] = stamps.dt.weekday
    frame.loc[:, 'pickup_day'] = stamps.dt.day
    frame.loc[:, 'pickup_month'] = stamps.dt.month
    frame.loc[:, 'pickup_hour'] = stamps.dt.hour
    frame.loc[:, 'pickup_minute'] = stamps.dt.minute
    elapsed = stamps - origin
    frame.loc[:, 'pickup_dt'] = elapsed.map(lambda delta: delta.total_seconds())


_add_pickup_time_parts(train_df, train_df['pickup_datetime'].min())
# The test frame is measured from the earliest *training* pickup as well.
_add_pickup_time_parts(test_df, train_df['pickup_datetime'].min())

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
def haversine_array(lat1, lng1, lat2, lng2):
    """Haversine distance in kilometres between (lat1, lng1) and (lat2, lng2).

    Inputs are in degrees and may be scalars or NumPy arrays; a mean Earth
    radius of 6371 km is used.
    """
    earth_radius_km = 6371
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lng1, lat2, lng2))
    half_dphi = (phi2 - phi1) * 0.5
    half_dlam = (lam2 - lam1) * 0.5
    inner = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(inner))

def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    """Sum of two haversine legs: one changing only longitude, one only latitude.

    Both legs start at (lat1, lng1); inputs are in degrees, result in km.
    """
    longitude_leg = haversine_array(lat1, lng1, lat1, lng2)
    latitude_leg = haversine_array(lat1, lng1, lat2, lng1)
    return longitude_leg + latitude_leg

def bearing_array(lat1, lng1, lat2, lng2):
    """Initial bearing in degrees from (lat1, lng1) towards (lat2, lng2).

    Inputs are in degrees and may be scalars or NumPy arrays. The result comes
    from arctan2, so it lies in [-180, 180]: 0 is north, 90 east, -90 west.
    """
    # Only the longitude difference and the two latitudes enter the formula,
    # so nothing else is converted to radians.
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    y = np.sin(lng_delta_rad) * np.cos(lat2_rad)
    x = np.cos(lat1_rad) * np.sin(lat2_rad) - np.sin(lat1_rad) * np.cos(lat2_rad) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

In [None]:
# Haversine and "dummy Manhattan" trip distances for both frames.
start_time = time.time()
print("Feature Engineering [3] ...")


def _trip_endpoints(frame):
    """Return (pickup lat, pickup lng, dropoff lat, dropoff lng) as NumPy arrays."""
    return (frame['pickup_latitude'].values, frame['pickup_longitude'].values,
            frame['dropoff_latitude'].values, frame['dropoff_longitude'].values)


train_df['distance_haversine'] = haversine_array(*_trip_endpoints(train_df))
train_df['distance_dummy_manhattan'] = dummy_manhattan_distance(*_trip_endpoints(train_df))

test_df['distance_haversine'] = haversine_array(*_trip_endpoints(test_df))
test_df['distance_dummy_manhattan'] = dummy_manhattan_distance(*_trip_endpoints(test_df))

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Midpoint between pickup and dropoff coordinates for both frames.
start_time = time.time()
print("Feature Engineering [4] ...")


def _midpoint(frame, pickup_col, dropoff_col):
    """Element-wise average of two coordinate columns, returned as a NumPy array."""
    return (frame[pickup_col].values + frame[dropoff_col].values) / 2


train_df['center_latitude'] = _midpoint(train_df, 'pickup_latitude', 'dropoff_latitude')
train_df['center_longitude'] = _midpoint(train_df, 'pickup_longitude', 'dropoff_longitude')
test_df['center_latitude'] = _midpoint(test_df, 'pickup_latitude', 'dropoff_latitude')
test_df['center_longitude'] = _midpoint(test_df, 'pickup_longitude', 'dropoff_longitude')

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Coarse bins: coordinates rounded to 2 decimals, pickup_dt in 3-hour buckets.
start_time = time.time()
print("Feature Engineering [5] ...")


def _add_bins(frame):
    """Add the rounded-coordinate and 3-hour time bin columns to `frame` in place."""
    rounded_columns = (
        ('pickup_lat_bin', 'pickup_latitude'),
        ('pickup_long_bin', 'pickup_longitude'),
        ('center_lat_bin', 'center_latitude'),
        ('center_long_bin', 'center_longitude'),
    )
    for bin_col, source_col in rounded_columns:
        frame[bin_col] = np.round(frame[source_col], 2)
    frame['pickup_dt_bin'] = (frame['pickup_dt'] // (3 * 3600))


_add_bins(train_df)
_add_bins(test_df)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Bearing from pickup to dropoff, in degrees, for both frames.
start_time = time.time()
print("Feature Engineering [6] ...")

endpoint_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

train_df.loc[:, 'direction'] = bearing_array(*(train_df[col].values for col in endpoint_cols))
test_df.loc[:, 'direction'] = bearing_array(*(test_df[col].values for col in endpoint_cols))

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# PCA-rotated coordinates and KMeans cluster ids for pickup and dropoff points.
# Both estimators are fit on every pickup and dropoff coordinate of train + test.
start_time = time.time()
print("Feature Engineering [7] ...")

pickup_cols = ['pickup_latitude', 'pickup_longitude']
dropoff_cols = ['dropoff_latitude', 'dropoff_longitude']

full = pd.concat([train_df, test_df]).reset_index(drop=True)
coords = np.vstack((full[pickup_cols].values, full[dropoff_cols].values))


def _add_pca_columns(frame, fitted_pca):
    """Add pickup/dropoff PCA component columns to `frame` in place.

    Each coordinate pair is transformed once and both components are read from
    that single result. Plain arrays are passed so the input matches what the
    estimator was fitted on.
    """
    pickup_pca = fitted_pca.transform(frame[pickup_cols].values)
    dropoff_pca = fitted_pca.transform(frame[dropoff_cols].values)
    frame['pickup_pca0'] = pickup_pca[:, 0]
    frame['pickup_pca1'] = pickup_pca[:, 1]
    frame['dropoff_pca0'] = dropoff_pca[:, 0]
    frame['dropoff_pca1'] = dropoff_pca[:, 1]


def _add_cluster_columns(frame, fitted_kmeans):
    """Add pickup/dropoff cluster id columns to `frame` in place."""
    frame.loc[:, 'pickup_cluster'] = fitted_kmeans.predict(frame[pickup_cols].values)
    frame.loc[:, 'dropoff_cluster'] = fitted_kmeans.predict(frame[dropoff_cols].values)


pca = PCA().fit(coords)
_add_pca_columns(train_df, pca)
_add_pca_columns(test_df, pca)

# No random_state is passed, so cluster assignment depends on NumPy's global RNG state.
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000).fit(coords)
_add_cluster_columns(train_df, kmeans)
_add_cluster_columns(test_df, kmeans)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Add a rolling 60-minute trip count (count_60min) to both frames.
start_time = time.time()
print("Feature Engineering [8] ...")

group_freq = '60min'
# Stack train and test so the rolling count below sees trips from both sets.
df_all = pd.concat((train_df, test_df))[['id', 'pickup_datetime', 'pickup_cluster', 'dropoff_cluster']]
# Pickup time rounded to group_freq; this column appears in cols_to_drop later in the file.
train_df.loc[:, 'pickup_datetime_group'] = train_df['pickup_datetime'].dt.round(group_freq)
test_df.loc[:, 'pickup_datetime_group'] = test_df['pickup_datetime'].dt.round(group_freq)

# Index the ids by pickup time, sorted, so a time-based rolling window can be applied.
df_counts = df_all.set_index('pickup_datetime')[['id']].sort_index()
# isnull() turns every id into a boolean, so count() over each window gives the
# number of rows in it. NOTE(review): presumably a trailing 60-minute window
# ending at each row — confirm against the pandas rolling documentation.
df_counts['count_60min'] = df_counts.isnull().rolling(group_freq).count()['id']
# Attach count_60min to each frame by trip id.
train_df = train_df.merge(df_counts, on='id', how='left')
test_df = test_df.merge(df_counts, on='id', how='left')

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Manhattan distance in PCA space plus coarse north/south and east/west flags.
start_time = time.time()
print("Feature Engineering [9] ...")


def _direction_flag(pickup, dropoff):
    """Encode the relative position of pickup and dropoff along one axis.

    pickup, dropoff: Series of the SAME coordinate (both latitudes or both longitudes).
    Returns an integer Series: 2 where pickup > dropoff, 1 otherwise, and 0 where
    the two are equal and the pickup coordinate is non-zero.
    """
    flag = (pickup > dropoff) * 1 + 1
    flag[(pickup == dropoff) & (pickup != 0)] = 0
    return flag


def _add_direction_features(frame):
    """Add pca_manhattan, direction_ns and direction_ew to `frame` in place."""
    frame['pca_manhattan'] = np.abs(frame['dropoff_pca1'] - frame['pickup_pca1']) + \
                             np.abs(frame['dropoff_pca0'] - frame['pickup_pca0'])
    # North/south compares latitude with latitude. The previous code tested
    # pickup_latitude against dropoff_longitude, so the 0 flag never marked
    # trips with no north/south movement.
    frame['direction_ns'] = _direction_flag(frame['pickup_latitude'], frame['dropoff_latitude'])
    frame['direction_ew'] = _direction_flag(frame['pickup_longitude'], frame['dropoff_longitude'])


_add_direction_features(train_df)
_add_direction_features(test_df)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Build the log-transformed target, keep only model features, and label-encode
# any remaining object columns.
start_time = time.time()
print("Processing Features ...")

cols_to_drop = ['id', 'log_trip_duration', 'pickup_datetime', 'dropoff_datetime',
                'trip_duration', 'check_trip_duration', 'pickup_date', 'pickup_lat_bin', 'pickup_long_bin',
                'center_lat_bin', 'center_long_bin', 'pickup_dt_bin', 'pickup_datetime_group']

ids = test_df["id"].values
# Target is log(trip_duration + 1); predictions must be mapped back before submission.
labels = np.log(train_df['trip_duration'].values + 1)
features = [f for f in train_df.columns if f not in cols_to_drop]

# Explicit copies so the column assignments below write to independent frames
# instead of triggering SettingWithCopyWarning on a slice.
train_df = train_df[features].copy()
test_df = test_df[features].copy()

for f in train_df.columns:
    if train_df[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        # Fit on train and test values together so a category that appears only
        # in the test set does not make transform() raise.
        lbl.fit(list(train_df[f].values) + list(test_df[f].values))
        train_df[f] = lbl.transform(list(train_df[f].values))
        test_df[f] = lbl.transform(list(test_df[f].values))

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
class XgbWrapper(object):
    """Thin train/predict adapter around xgb.train for use with get_oof."""

    def __init__(self, seed=2017, params=None):
        """Store a private copy of `params` with 'seed' set.

        seed: value written to the 'seed' entry of the parameter dict.
        params: optional dict of XGBoost parameters. It is copied, so the
            caller's dict is never modified; None is treated as an empty dict.
        """
        self.param = dict(params) if params is not None else {}
        self.param['seed'] = seed

    def train(self, xtra, ytra, xte, yte):
        """Fit on (xtra, ytra), using (xte, yte) as the evaluation set.

        Runs at most 200 boosting rounds with early_stopping_rounds=10 and
        logs every 20 rounds. The booster is stored on self.gbdt.
        """
        dtrain = xgb.DMatrix(xtra, label=ytra)
        dvalid = xgb.DMatrix(xte, label=yte)
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        self.gbdt = xgb.train(self.param, dtrain, 200,
            watchlist, early_stopping_rounds=10, verbose_eval=20)

    def predict(self, x):
        """Return the trained booster's predictions for feature matrix `x`."""
        return self.gbdt.predict(xgb.DMatrix(x))

class LgbWrapper(object):
    """Thin train/predict adapter around lgb.train for use with get_oof."""

    def __init__(self, seed=2017, params=None):
        """Store a private copy of `params` with 'seed' set.

        seed: value written to the 'seed' entry of the parameter dict.
        params: optional dict of LightGBM parameters. It is copied, so the
            caller's dict is never modified; None is treated as an empty dict.
        """
        self.param = dict(params) if params is not None else {}
        self.param['seed'] = seed

    def train(self, xtra, ytra, xte, yte):
        """Fit on (xtra, ytra), using (xte, yte) as the validation set.

        Labels are flattened with ravel() first. Runs at most 400 boosting
        rounds with early_stopping_rounds=10 and logs every 20 rounds. The
        booster is stored on self.gbdt.
        """
        ytra = ytra.ravel()
        yte = yte.ravel()
        dtrain = lgb.Dataset(xtra, label=ytra)
        dvalid = lgb.Dataset(xte, label=yte)
        watchlist = [dvalid]
        self.gbdt = lgb.train(self.param, dtrain, 400, 
            watchlist, early_stopping_rounds=10, verbose_eval=20)

    def predict(self, x):
        """Return the trained booster's predictions for feature matrix `x`."""
        return self.gbdt.predict(x)

class CtbWrapper(object):
    """CatBoost regressor with fixed hyper-parameters and a train/predict API."""

    def __init__(self, seed=2017, params=None):
        # `params` is accepted but not used by this wrapper; only the seed is kept.
        self.seed = seed

    def train(self, xtra, ytra, xte, yte):
        """Fit a new CatBoostRegressor on (xtra, ytra), evaluating on (xte, yte)."""
        train_features, train_target = pd.DataFrame(xtra), pd.DataFrame(ytra)
        valid_features, valid_target = pd.DataFrame(xte), pd.DataFrame(yte)

        regressor_options = dict(
            depth=14,
            iterations=250,
            random_seed=self.seed,
            use_best_model=True,
            loss_function='RMSE',
            thread_count=16,
            eval_metric='RMSE',
        )
        self.gbdt = ctb.CatBoostRegressor(**regressor_options)
        self.gbdt.fit(X=train_features, y=train_target,
                      eval_set=(valid_features, valid_target),
                      use_best_model=True, verbose_eval=20)

    def predict(self, x):
        """Return the fitted regressor's predictions for `x`."""
        fitted = self.gbdt
        return fitted.predict(x)

In [None]:
def get_oof(clf, ntrain, ntest, kf, train, labels, test):
    """Produce out-of-fold train predictions and fold-averaged test predictions.

    clf: object exposing train(x_tr, y_tr, x_te, y_te) and predict(x).
    ntrain, ntest: number of rows in `train` and `test`.
    kf: iterable of (train_index, test_index) pairs, e.g. KFold(...).split(train).
        Any number of folds is supported.
    train, labels, test: NumPy arrays indexed by the fold indices.

    Returns (oof_train, oof_test) with shapes (ntrain, 1) and (ntest, 1).
    Raises ValueError if `kf` yields no folds.
    """
    oof_train = np.zeros((ntrain,))
    # Collect one test prediction per fold instead of pre-allocating five rows,
    # so the average is over exactly the folds that were run.
    fold_test_preds = []

    for train_index, test_index in kf:
        x_tr = train[train_index]
        y_tr = labels[train_index]
        x_te = train[test_index]
        y_te = labels[test_index]

        clf.train(x_tr, y_tr, x_te, y_te)
        oof_train[test_index] = clf.predict(x_te)
        fold_test_preds.append(np.asarray(clf.predict(test)).reshape(ntest))

    if not fold_test_preds:
        raise ValueError("kf yielded no folds; cannot build out-of-fold predictions")

    oof_test = np.mean(fold_test_preds, axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
# Convert the feature frames and labels to NumPy arrays and record row counts.
start_time = time.time()
print("Splitting Data ...")

train_x, test_x = np.array(train_df), np.array(test_df)
labels = np.array(labels)

ntrain, ntest = train_x.shape[0], test_x.shape[0]

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Hyper-parameters for the LightGBM and XGBoost base models.
start_time = time.time()
print("Establishing Parameters ...")

# NOTE(review): 'early_stopping_round' is 20 here while LgbWrapper.train passes
# early_stopping_rounds=10 — confirm which value LightGBM actually applies.
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'num_leaves': 96,
    'max_depth': 10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'learning_rate': 0.1,
    'early_stopping_round': 20,
}

xgb_params = {
    'booster': 'gbtree',
    'objective': 'reg:linear',
    'learning_rate': 0.1,
    'max_depth': 14,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.7,
    'silent': 1,
}

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# 5-fold out-of-fold predictions from the CatBoost wrapper.
start_time = time.time()
print("Training Catboost ...")

cg = CtbWrapper()
cg_splitter = KFold(n_splits=5, shuffle=True, random_state=2018)
kf = cg_splitter.split(train_x)
cg_oof_train, cg_oof_test = get_oof(cg, ntrain, ntest, kf, train_x, labels, test_x)
# Mean squared error of the out-of-fold predictions against the log-scale labels.
cg_cv_score = mean_squared_error(labels, cg_oof_train)
print("CG-CV: {}".format(cg_cv_score))

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# 5-fold out-of-fold predictions from the XGBoost wrapper.
start_time = time.time()
print("Training XGBoost ...")

xg = XgbWrapper(seed=2017, params=xgb_params)
xg_splitter = KFold(n_splits=5, shuffle=True, random_state=2018)
kf = xg_splitter.split(train_x)
xg_oof_train, xg_oof_test = get_oof(xg, ntrain, ntest, kf, train_x, labels, test_x)
# Mean squared error of the out-of-fold predictions against the log-scale labels.
xg_cv_score = mean_squared_error(labels, xg_oof_train)
print("XG-CV: {}".format(xg_cv_score))

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# 5-fold out-of-fold predictions from the LightGBM wrapper.
start_time = time.time()
print("Training LightGBM ...")

lg = LgbWrapper(seed=2017, params=lgb_params)
lg_splitter = KFold(n_splits=5, shuffle=True, random_state=2018)
kf = lg_splitter.split(train_x)
lg_oof_train, lg_oof_test = get_oof(lg, ntrain, ntest, kf, train_x, labels, test_x)
# Mean squared error of the out-of-fold predictions against the log-scale labels.
lg_cv_score = mean_squared_error(labels, lg_oof_train)
print("LG-CV: {}".format(lg_cv_score))

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Place the three models' out-of-fold columns side by side as stacking inputs.
start_time = time.time()
print("Combining Data ...")

train_conc = np.hstack((cg_oof_train, xg_oof_train, lg_oof_train))
test_conc = np.hstack((cg_oof_test, xg_oof_test, lg_oof_test))

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Second-level XGBoost model trained on the base models' out-of-fold predictions.
start_time = time.time()
print("Training Stacking Ensemble ...")

dtrain = xgb.DMatrix(train_conc, label=labels)
dtest = xgb.DMatrix(test_conc)

xgb_params = {}
xgb_params["objective"] = "reg:linear"
xgb_params["eta"] = 0.1
xgb_params["subsample"] = 0.9
xgb_params["silent"] = 1
xgb_params["max_depth"] = 5
xgb_params['eval_metric'] = 'rmse'
xgb_params['min_child_weight'] = 10
xgb_params['seed'] = 2017

res = xgb.cv(xgb_params, dtrain, num_boost_round=500, nfold=5, seed=2017, stratified=False,
             early_stopping_rounds=25, verbose_eval=10, show_stdv=True)

# With early stopping the history ends at the best iteration, so its length is
# the number of boosting rounds to train.
best_nrounds = res.shape[0]
# Read the held-out metric by name; column order differs between xgboost
# versions, so position 0 is not reliably the test score.
cv_mean = res['test-rmse-mean'].iloc[-1]
cv_std = res['test-rmse-std'].iloc[-1]

print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))
bst = xgb.train(xgb_params, dtrain, best_nrounds)
# Labels are log(trip_duration + 1), so the inverse transform is exp(y) - 1.
stacking_preds = np.expm1(bst.predict(dtest))

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
def nn_model(input_dim=28):
    """Build and compile the feed-forward regression network.

    input_dim: number of input features. Defaults to 28, the value that was
        previously hard-coded, so existing nn_model() calls are unaffected.

    Returns a compiled Sequential model: two hidden Dense blocks (400 and 100
    units), each followed by BatchNormalization, PReLU and Dropout, then a
    single-unit output, trained with mean squared error and SGD.
    """
    model = Sequential()
    model.add(Dense(units=400, input_dim=input_dim, kernel_initializer='uniform', activation='relu'))
    model.add(BatchNormalization())
    model.add(PReLU())
    model.add(Dropout(0.6))

    model.add(Dense(units=100, kernel_initializer='uniform', activation='relu'))
    model.add(BatchNormalization())
    model.add(PReLU())
    model.add(Dropout(0.4))

    # The ReLU output keeps predictions non-negative.
    model.add(Dense(units=1, kernel_initializer='uniform', activation='relu'))
    model.compile(loss='mean_squared_error', optimizer='sgd')
    return model

In [None]:
# Standardise train and test features together, then split them back apart.
start_time = time.time()
print("Preparing Data for NN ...")

ntrain, ntest = train_df.shape[0], test_df.shape[0]

tr_te = pd.concat([train_df, test_df], ignore_index=True)

# The scaler is fit on the stacked train + test rows.
scaler = StandardScaler()
tr_te = scaler.fit_transform(tr_te)

train_x, test_x = tr_te[:ntrain, :], tr_te[ntrain:, :]

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Fit the neural net on the scaled features and predict the test set.
start_time = time.time()
print("Training Neural Net ...")

model = nn_model()
model.fit(train_x, labels, batch_size=64, epochs=20)
# Keep the first output column as a flat vector of predictions.
test_outputs = model.predict(test_x)
neural_preds = test_outputs[:, 0]

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Blend the stacking and neural-net predictions and write the submission file.
start_time = time.time()
print("Generate Submission ...")

# The neural net is fit on labels = log(trip_duration + 1), so its raw output
# is on the log scale. stacking_preds has already been back-transformed with
# an exponential, so convert the net's output to seconds before blending.
neural_preds_seconds = np.expm1(neural_preds)
combined_preds = (stacking_preds * 0.85) + (neural_preds_seconds * 0.15)

submission = pd.DataFrame()
submission['id'] = ids
submission['trip_duration'] = combined_preds
submission.to_csv("output_data.csv", index=False)

print("--- %s seconds ---" % (time.time() - start_time))