In [None]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from geopy import distance
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, KFold
from sklearn.metrics import mean_squared_log_error, mean_absolute_error
from lightgbm import LGBMRegressor
# Use seaborn's dark-grid theme for the figures drawn in this notebook.
sns.set_style("darkgrid")

In [None]:
# Read the labelled training trips from disk.
train_df = pd.read_csv("train.csv")

# Preview the first five rows (head()'s default).
train_df.head(5)

In [None]:
# Read the test trips from disk.
# Note: the test file does not have a dropoff datetime column.
test_dataset = pd.read_csv("test.csv")

# Preview the first five rows (head()'s default).
test_dataset.head(5)

In [None]:
# Summary statistics of the training data, used to eyeball value ranges.
train_df.describe()

In [None]:
# Parse the timestamp columns, then derive day-of-week (Monday = 0) and
# hour-of-day features. The vectorised .dt accessor replaces a per-row Python
# lambda, which is much slower on a large frame.
# Can extend to include month and minutes
train_df['pickup_datetime'] = pd.to_datetime(train_df['pickup_datetime'])
train_df['dropoff_datetime'] = pd.to_datetime(train_df['dropoff_datetime'])

train_df['pickup_day_of_week'] = train_df['pickup_datetime'].dt.weekday
train_df['dropoff_day_of_week'] = train_df['dropoff_datetime'].dt.weekday
train_df['pickup_time_of_day'] = train_df['pickup_datetime'].dt.hour
train_df['dropoff_time_of_day'] = train_df['dropoff_datetime'].dt.hour

In [None]:
# Implement a faster way to calculate the distance
def get_distance(pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude):
    """Return the distance in miles between the pickup and dropoff points.

    Note the argument order: longitude comes first for each point. The
    coordinates are re-packed as (latitude, longitude) tuples before being
    handed to geopy's ``distance.distance``.
    """
    start_point = (pickup_latitude, pickup_longitude)
    end_point = (dropoff_latitude, dropoff_longitude)
    return distance.distance(start_point, end_point).miles


# Implement a way to calculate the direction of travel
def get_bearing(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude):
    """Return the initial bearing from pickup to dropoff, in degrees.

    The result lies in (-180, 180]: 0 is due north, 90 due east, -90 due west.

    Argument order is latitude first, then longitude, for each point. This is
    the opposite of ``get_distance``, which takes longitude first, so passing
    the coordinates by keyword is the safest way to call it.

    Implemented with NumPy ufuncs only, so the inputs may be scalars, NumPy
    arrays or pandas Series (broadcast element-wise). All inputs are in
    decimal degrees.
    """
    lat_start = np.radians(pickup_latitude)
    lat_end = np.radians(dropoff_latitude)
    delta_lon = np.radians(np.subtract(dropoff_longitude, pickup_longitude))

    # East-west (x) and north-south (y) components of the initial heading.
    x = np.cos(lat_end) * np.sin(delta_lon)
    y = np.cos(lat_start) * np.sin(lat_end) - np.sin(lat_start) * np.cos(lat_end) * np.cos(delta_lon)

    return np.degrees(np.arctan2(x, y))


In [None]:
# Distance in miles between pickup and dropoff.
# get_distance takes longitude first for each point.
train_df['geo_dist'] = train_df.apply(
    lambda row: get_distance(
        row['pickup_longitude'], row['pickup_latitude'],
        row['dropoff_longitude'], row['dropoff_latitude'],
    ),
    axis=1,
)

# Direction of travel in degrees.
# get_bearing takes LATITUDE first (unlike get_distance), so the coordinates
# are passed by keyword; passing them positionally in longitude-first order
# silently swaps latitude and longitude.
train_df['bearing'] = train_df.apply(
    lambda row: get_bearing(
        pickup_latitude=row['pickup_latitude'],
        pickup_longitude=row['pickup_longitude'],
        dropoff_latitude=row['dropoff_latitude'],
        dropoff_longitude=row['dropoff_longitude'],
    ),
    axis=1,
)

In [None]:
def get_sample_df(df, sample_ratio = 0.5, random_state = None):
    """Return a random subset of the rows of ``df``, without replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    sample_ratio : float, default 0.5
        Fraction of rows to keep; the row count is ``int(len(df) * sample_ratio)``.
    random_state : int or None, default None
        Seed for a dedicated random generator, giving the same sample on every
        run. When None, NumPy's global random state is used (the previous
        behaviour), so an earlier ``np.random.seed`` call is still honoured.

    Returns
    -------
    pandas.DataFrame
        The selected rows, in shuffled order, keeping their original index labels.
    """
    df_len = len(df)
    sample_size = int(df_len * sample_ratio)

    if random_state is None:
        shuffled_positions = np.random.permutation(df_len)
    else:
        shuffled_positions = np.random.default_rng(random_state).permutation(df_len)

    return df.iloc[shuffled_positions[:sample_size]]

In [None]:
# Constraints based on duration and distance, using quantile:
# keep only rows strictly between the 1st and 99th percentile of both columns.

low_lim_duration, upp_lim_duration = train_df['trip_duration'].quantile([0.01, 0.99])
low_lim_distance, upp_lim_distance = train_df['geo_dist'].quantile([0.01, 0.99])

duration_in_range = (
    (train_df['trip_duration'] > low_lim_duration)
    & (train_df['trip_duration'] < upp_lim_duration)
)
distance_in_range = (
    (train_df['geo_dist'] > low_lim_distance)
    & (train_df['geo_dist'] < upp_lim_distance)
)

train_df_constrained = train_df[duration_in_range & distance_in_range]

In [None]:
# Get a sample of the constrained data (10% of the rows) to keep plots fast

sample_train_df = get_sample_df(train_df_constrained, sample_ratio=0.1)

In [None]:
# Trip duration against distance for the sampled rows, on log-log axes.
duration_vs_distance_ax = sns.scatterplot(
    data=sample_train_df,
    x='geo_dist',
    y='trip_duration',
    alpha=0.1,
)
duration_vs_distance_ax.set(yscale="log", xscale="log");

In [None]:
# Box plot of trip duration after the quantile-based filtering.
sns.boxplot(
    data=train_df_constrained,
    y='trip_duration',
)

In [None]:
# Distribution of trip duration for the sampled rows on a log axis, with a KDE overlay.
sns.displot(
    data=sample_train_df,
    x='trip_duration',
    kde=True,
    log_scale=True,
);

In [None]:
# Number of trips per pickup hour.
sns.countplot(
    data=train_df_constrained,
    x='pickup_time_of_day',
)

In [None]:
# Number of trips per pickup day of the week.
sns.countplot(
    data=train_df_constrained,
    x='pickup_day_of_week',
)

In [None]:
# Median trip duration for each pickup hour.
# The column is selected BEFORE aggregating: calling median() on the whole
# grouped frame also tries the non-numeric columns (e.g. id,
# store_and_fwd_flag), which raises TypeError on recent pandas versions.
trip_duration_by_hour = (
    train_df_constrained
    .groupby('pickup_time_of_day')['trip_duration']
    .median()
    .to_frame()
)
sns.lineplot(data = trip_duration_by_hour, x = 'pickup_time_of_day', y = 'trip_duration')


In [None]:
# Median trip duration for each pickup day of the week.
# The column is selected BEFORE aggregating: calling median() on the whole
# grouped frame also tries the non-numeric columns (e.g. id,
# store_and_fwd_flag), which raises TypeError on recent pandas versions.
trip_duration_by_day = (
    train_df_constrained
    .groupby('pickup_day_of_week')['trip_duration']
    .median()
    .to_frame()
)
sns.lineplot(data = trip_duration_by_day, x = 'pickup_day_of_week', y = 'trip_duration')

In [None]:
# Number of trips per vendor, on the unfiltered training data.
sns.countplot(
    data=train_df,
    x='vendor_id',
)

In [None]:
# Pickup locations (longitude vs latitude); far-away points show up as outliers.
sns.scatterplot(
    data=train_df_constrained,
    x='pickup_longitude',
    y='pickup_latitude',
    alpha=0.5,
);

In [None]:
# Dropoff locations (longitude vs latitude); far-away points show up as outliers.
sns.scatterplot(
    data=train_df_constrained,
    x='dropoff_longitude',
    y='dropoff_latitude',
    alpha=0.5,
);

In [None]:
# Box plot of the passenger count per trip.
sns.boxplot(
    data=train_df_constrained,
    y='passenger_count',
);

In [None]:
# Remove rows which are way outside of NYC or with passenger count = 0
# NOTE: despite the "_outlier" names, each mask below is True for the rows
# that are KEPT (valid passenger count / coordinates inside the bounding box).
passenger_outlier = train_df_constrained['passenger_count'] > 0
pickup_outlier = (
    (train_df_constrained['pickup_longitude'] < -73.0)
    & (train_df_constrained['pickup_longitude'] > -74.5)
    & (train_df_constrained['pickup_latitude'] > 40)
    & (train_df_constrained['pickup_latitude'] < 42)
)
dropoff_outlier = (
    (train_df_constrained['dropoff_longitude'] < -73.0)
    & (train_df_constrained['dropoff_longitude'] > -74.5)
    & (train_df_constrained['dropoff_latitude'] > 40)
    & (train_df_constrained['dropoff_latitude'] < 42)
)

# .copy() makes train_df_clean an independent frame, so assigning columns on it
# later does not write into a slice of train_df_constrained (which triggers
# SettingWithCopyWarning and may not update the data as intended).
train_df_clean = train_df_constrained[passenger_outlier & dropoff_outlier & pickup_outlier].copy()


In [None]:
# Histogram of trip duration on the cleaned data (100 bins).
sns.histplot(
    data=train_df_clean,
    x='trip_duration',
    stat='count',
    bins=100,
);

In [None]:
# Log transformation due to skewed distribution.
# assign() returns a new frame instead of writing a column into what may be a
# slice of another frame (avoids SettingWithCopyWarning), and np.log is applied
# to the whole column at once rather than through a per-row lambda.
# NOTE: run this cell once per freshly built train_df_clean; re-running it
# takes the log of already-logged values.
train_df_clean = train_df_clean.assign(trip_duration=np.log(train_df_clean['trip_duration']))
sns.histplot(data = train_df_clean, x = 'trip_duration', stat = 'count', bins = 100);

In [None]:
# Encode a categorical column: one-hot encode store_and_fwd_flag (dropping the
# first level), append the result, then drop the columns that are not used as
# model features.
store_fwd_encoded = pd.get_dummies(train_df_clean['store_and_fwd_flag'], drop_first=True)

unused_columns = [
    'store_and_fwd_flag',
    'id',
    'pickup_datetime',
    'dropoff_datetime',
    'dropoff_day_of_week',
    'dropoff_time_of_day',
]
train_df_clean = pd.concat([train_df_clean, store_fwd_encoded], axis=1).drop(columns=unused_columns)
train_df_clean.head()

In [None]:
# Pairwise correlations between the remaining columns, drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 6))
corr_df = train_df_clean.corr()
sns.heatmap(data=corr_df, annot=True, ax=ax)

In [None]:
# Split the cleaned frame into the feature matrix X and the target Y
X = train_df_clean.drop(columns='trip_duration')
Y = train_df_clean['trip_duration']

In [None]:
# Train and test datasets: hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Light GBM model

# lgb_train = lgb.Dataset(X_train, y_train)
# lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)
# params = {
#     'task': 'train',
#     'objective': 'regression',
#     'boosting': 'gbdt',
#     'learning_rate': 0.1,
#     'metric' : 'root_mean_squared_error'
# }

# cv_results = lgb.cv(
#     params = params,
#     train_set = lgb_train, 
#     nfold = 5,
#     shuffle = True,
#     stratified = False
# )

# reg = lgb.train(params,
#     train_set = lgb_train, 
#     valid_sets  = lgb_eval)

# Baseline: 5-fold shuffled cross-validation of an untuned LightGBM regressor.
fold = KFold(n_splits = 5, shuffle = True, random_state=42)
lgbm = LGBMRegressor()

# cv=fold: the splitter above was created but never handed to cross_val_score,
# so the default (unshuffled) splitter was being used instead.
# scoring: the target is already log(trip_duration), so plain squared error on
# it is the squared log error of the durations. 'neg_mean_squared_log_error'
# would take a second logarithm of the already-logged values.
cv_scores = cross_val_score(
    estimator=lgbm,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=fold,
)
print("CV Scores: ", cv_scores)
print("Mean CV Score: %.3f" % cv_scores.mean())


In [None]:
# Candidate values for the LightGBM hyper-parameter search.
hyper_params = {
    'num_leaves' : np.arange(30, 40, 2),
    # geomspace takes the endpoint VALUES, giving 0.01, ~0.032 and 0.1.
    # np.logspace takes EXPONENTS, so logspace(0.01, 0.1, 3) produced learning
    # rates of roughly 1.02-1.26, far too large for gradient boosting.
    'learning_rate' : np.geomspace(0.01, 0.1, 3),
    'max_bins' : np.arange(200, 300, 25)
}

In [None]:
# Randomised hyper-parameter search with 3-fold cross-validation.
lgbm = LGBMRegressor()

# scoring: the target is already log(trip_duration), so squared error on it is
# the squared log error of the durations; 'neg_mean_squared_log_error' would
# apply a second logarithm.
# random_state: fixes which parameter combinations are drawn, so the search is
# repeatable between runs.
grid_search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=hyper_params,
    scoring='neg_mean_squared_error',
    cv=3,
    random_state=42,
)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print(best_params)
print(grid_search.best_score_)

In [None]:
# prediction: refit with the best parameters found and score on the held-out split

lgbm = LGBMRegressor(**best_params)
lgbm.fit(X_train, y_train)

y_pred = lgbm.predict(X_test)

# accuracy check
# The model is trained on log(trip_duration), so both the truth and the
# predictions are mapped back to the original scale with exp() first.
# Scoring the log-scale values directly makes MSLE take a log of a log and
# reports MAE in log units rather than in the units of trip_duration.
y_test_orig = np.exp(y_test)
y_pred_orig = np.exp(y_pred)

msle = mean_squared_log_error(y_test_orig, y_pred_orig)
rmsle = msle**(0.5)
mae = mean_absolute_error(y_test_orig, y_pred_orig)

print("MAE: %.2f" % mae)
print("MSLE: %.2f" % msle)
print("RMSLE: %.2f" % rmsle)

In [None]:
# Importance score of each feature in the fitted model, in the column order of X_train.
lgbm.feature_importances_

In [None]:
# Re-read the test trips so the preprocessing that follows starts from the raw file.
test_dataset = pd.read_csv("test.csv")

# Preview the first five rows (head()'s default).
test_dataset.head(5)

In [None]:
# Treat the test data in the same way as train data

# Convert the datetime column into time of day and day of week
# (vectorised .dt accessor instead of a per-row lambda).
test_dataset['pickup_datetime'] = pd.to_datetime(test_dataset['pickup_datetime'])
test_dataset['pickup_day_of_week'] = test_dataset['pickup_datetime'].dt.weekday
test_dataset['pickup_time_of_day'] = test_dataset['pickup_datetime'].dt.hour

# Add distance and bearing columns
# get_distance takes longitude first for each point.
test_dataset['geo_dist'] = test_dataset.apply(
    lambda row: get_distance(
        row['pickup_longitude'], row['pickup_latitude'],
        row['dropoff_longitude'], row['dropoff_latitude'],
    ),
    axis=1,
)

# get_bearing takes LATITUDE first (unlike get_distance), so the coordinates
# are passed by keyword; passing them positionally in longitude-first order
# silently swaps latitude and longitude.
test_dataset['bearing'] = test_dataset.apply(
    lambda row: get_bearing(
        pickup_latitude=row['pickup_latitude'],
        pickup_longitude=row['pickup_longitude'],
        dropoff_latitude=row['dropoff_latitude'],
        dropoff_longitude=row['dropoff_longitude'],
    ),
    axis=1,
)

# Dummy encode the store_and_fwd_flag column
test_store_fwd_encoded = pd.get_dummies(test_dataset['store_and_fwd_flag'], drop_first=True)
test_dataset = pd.concat([test_dataset, test_store_fwd_encoded], axis = 1)

# Drop columns that are not needed, then align with the training features:
# same columns in the same order as X_train. A dummy column that is absent here
# (e.g. the flag takes only one value in the test file) is filled with 0.
test_dataset_clean = (
    test_dataset
    .drop(columns=['store_and_fwd_flag', 'id', 'pickup_datetime'])
    .reindex(columns=X_train.columns, fill_value=0)
)
test_dataset_clean.head()

In [None]:
# Make prediction
# The model was fit on log(trip_duration), so these values are on the log scale.
predictions = lgbm.predict(test_dataset_clean)


In [None]:
# Build the submission table; exp() maps the predictions back to the original trip_duration scale.
submission = pd.DataFrame(
    {
        'id': test_dataset['id'],
        'trip_duration': np.exp(predictions),
    }
)
submission.head()

In [None]:
# Write the submission to CSV, without the DataFrame index column
submission.to_csv(
    "submission1.csv",
    index=False,
)