# Import Libraries


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import xgboost as xgb
import optuna
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [None]:
# Default canvas size (inches) for every figure created without an explicit figsize.
plt.rcParams.update({'figure.figsize': [10, 10]})

# Data Checking and Preparation

In [None]:
# Alternative dataset locations (presumably a mounted Google Drive), kept for reference only.
# training_file_path = '/content/drive/MyDrive/MEC_dataset/training_WeeklyAggregate.xlsx'
# testing_file_path = '/content/drive/MyDrive/MEC_dataset/testing_dataset.csv'
# london_file_path = '/content/drive/MyDrive/MEC_dataset/london.json'

# Load the training table; the first CSV column is used as the row index.
training_data = pd.read_csv("../MEC_dataset/training_dataset.csv",index_col=0)
# NOTE(review): `testing_data` and `london` are referenced by the test-set
# cells further down, but both loads below are commented out and `gpd` is not
# imported in this file. They must be defined before those cells can run.
# testing_data = pd.read_csv(testing_file_path)
# london = gpd.read_file(london_file_path)

In [None]:
# Display the column labels of the loaded training table.
training_data.columns

# Data Analysis

In [None]:
# Work on a copy so the columns added later do not modify `training_data`.
df = training_data.copy()

In [None]:
def hist_plot(df):
    """Draw one histogram per column of ``df`` on a vertically stacked figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one subplot row per column.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the histograms. It is closed in pyplot before being
        returned, so it is rendered only where the caller displays it.

    Notes
    -----
    The white-grid style is applied globally (as before), but now *before* the
    figure is created so that it actually affects this figure.
    """
    columns = list(df.columns)

    # matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
    # later dropped the old names; use whichever one this installation has.
    for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    # squeeze=False always yields a 2-D axes array, so a single-column frame
    # no longer fails on `.flatten()` of a bare Axes object.
    figure, axes = plt.subplots(
        len(columns), 1, figsize=(16, len(columns) * 2), squeeze=False
    )
    figure.subplots_adjust(hspace=1)
    for ax, name in zip(axes.ravel(), columns):
        sns.histplot(data=df[name], ax=ax)

    # Operate on this figure explicitly instead of pyplot's "current" figure.
    figure.tight_layout()
    plt.close(figure)

    return figure

In [None]:
# Render a histogram for every column of the working frame.
hist_plot(df)

In [None]:
# Skewness per column, most right-skewed first.
# numeric_only=True keeps this working on pandas >= 2.0, where non-numeric
# columns are no longer dropped silently and would raise instead.
df.skew(numeric_only=True).sort_values(ascending=False)

In [None]:
# Pairwise correlation of the numeric columns, shown as a diverging heatmap.
# Selecting numeric dtypes explicitly avoids the pandas >= 2.0 error raised by
# `DataFrame.corr()` when a non-numeric column is present.
corr = df.select_dtypes(include="number").corr()
sns.heatmap(corr, cmap="RdBu")

##### Strong positive correlation between mean travel time and distance

# Feature Transformation

In [None]:
# Display the columns available before engineering new features.
df.columns

In [None]:
# Reciprocal of the source / destination area columns.
reciprocal_targets = {'area_src_reci': 'area_src', 'area_dst_reci': 'area_dst'}
for new_name, base_name in reciprocal_targets.items():
    df[new_name] = 1 / df[base_name]

# Square root of the route length and fastest travel time columns.
sqrt_targets = {
    "oneway_length_sqrt": "oneway_length",
    "fastest_travel_time_sqrt": "fastest_travel_time",
}
for new_name, base_name in sqrt_targets.items():
    df[new_name] = np.sqrt(df[base_name])

In [None]:
# Preview the first rows, including the newly derived columns.
df.head()

In [None]:
# Skewness per column after the transformations, most right-skewed first.
# numeric_only=True keeps this working on pandas >= 2.0, where non-numeric
# columns are no longer dropped silently and would raise instead.
df.skew(numeric_only=True).sort_values(ascending=False)

# Modelling

In [None]:
# Model inputs. Currently disabled candidates: 'dow', 'area_src_reci',
# 'area_dst_reci', 'dist_geo_sqrt'.
features = [
    'sourceid',
    'dstid',
    'la_name_src',
    'la_name_dst',
    'direction',
    'maxspeed_traveltime',
    'fastest_travel_time_sqrt',
    'oneway_length_sqrt',
]

# Target and design matrix.
y = df['mean_travel_time']
X = df[features]

# Reproducible 80/20 hold-out split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=123,
)

## Model 1

In [None]:
# Baseline: XGBoost regressor with library defaults and a fixed seed.
my_model_1 = XGBRegressor(random_state=123)
my_model_1.fit(X_train, y_train)

# Score the baseline on the hold-out split.
predictions_1 = my_model_1.predict(X_valid)
mse_1 = metrics.mean_squared_error(y_valid, predictions_1)
print("Mean Squared Error:", mse_1)

In [None]:
# Plot feature importance for model 1, limited to at most 10 features.
xgb.plot_importance(my_model_1, max_num_features=10)

In [None]:
# Actual vs. predicted for model 1, with a diagonal reference line spanning
# the current x-axis limits.
plt.scatter(y_valid, predictions_1)
xpoints = ypoints = plt.gca().get_xlim()
plt.plot(xpoints, ypoints, color='k', linestyle='-', alpha=0.75, zorder=0)
plt.xlabel('Actual')
plt.ylabel('Pred')
plt.show()

## Model 2

In [None]:
# Model 2: more trees with a smaller learning rate, stopped early when the
# hold-out metric has not improved for 10 rounds.
# `early_stopping_rounds` is passed to the constructor: as a `fit()` argument
# it is deprecated since xgboost 1.6 and rejected by later releases.
my_model_2 = XGBRegressor(
    n_estimators=725,
    learning_rate=0.05,
    random_state=123,
    early_stopping_rounds=10,
)

my_model_2.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

predictions_2 = my_model_2.predict(X_valid)

# NOTE: the same split drives early stopping and scoring, so this error is
# an optimistic estimate.
mse_2 = metrics.mean_squared_error(y_valid, predictions_2)

print("Mean Squared Error:", mse_2)

In [None]:
# Re-print the hold-out MSE of model 2.
print("Mean Squared Error:" , mse_2)

In [None]:
# Plot feature importance for model 2, limited to at most 10 features.
xgb.plot_importance(my_model_2, max_num_features=10)

In [None]:
# Actual vs. predicted for model 2, with a diagonal reference line spanning
# the current x-axis limits.
plt.scatter(y_valid, predictions_2)
xpoints = ypoints = plt.gca().get_xlim()
plt.plot(xpoints, ypoints, color='k', linestyle='-', alpha=0.75, zorder=0)
plt.xlabel('Actual')
plt.ylabel('Pred')
plt.show()

## Model 3

In [None]:
# Model 3: fixed hyper-parameter set (presumably the output of an earlier
# tuning run — the values are hard-coded here).
params = {'learning_rate': 0.024704224090189218,
          'reg_lambda': 2.936733664947117,
          'reg_alpha': 57.40870727344214,
          'subsample': 0.6214411004415425,
          'colsample_bytree': 0.6251175460789683,
          'max_depth': 6}

# `early_stopping_rounds` is passed to the constructor: as a `fit()` argument
# it is deprecated since xgboost 1.6 and rejected by later releases.
my_model_3 = XGBRegressor(n_estimators=3642,
                          random_state=123,
                          early_stopping_rounds=10,
                          **params)

my_model_3.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

predictions_3 = my_model_3.predict(X_valid)
mse_3 = metrics.mean_squared_error(y_valid, predictions_3)

# This estimator is later refit without an eval_set (Final Model section) and
# cloned by cross_val_score; both would fail if early stopping stayed enabled,
# so switch it off again once the validation predictions are taken.
my_model_3.set_params(early_stopping_rounds=None)

In [None]:
# Report the hold-out error of model 3 as RMSE (square root of the MSE).
print("RMSE:" , np.sqrt(mse_3))

In [None]:
# Plot feature importance for model 3, limited to at most 10 features.
xgb.plot_importance(my_model_3, max_num_features=10)

In [None]:
# Actual vs. predicted for model 3, with a diagonal reference line spanning
# the current x-axis limits.
plt.scatter(y_valid, predictions_3)
xpoints = ypoints = plt.gca().get_xlim()
plt.plot(xpoints, ypoints, color='k', linestyle='-', alpha=0.75, zorder=0)
plt.xlabel('Actual')
plt.ylabel('Pred')
plt.show()

## Model 4

In [None]:
def run(trial):
    """Optuna objective: train one XGBoost candidate and return its hold-out MSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean squared error of the candidate on ``(X_valid, y_valid)``.

    Notes
    -----
    Reads the notebook globals ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid``. The validation split is used both for early stopping and as
    the objective value.
    """
    # Regularisation strengths are sampled on a log scale with
    # `suggest_float(..., log=True)`; `suggest_loguniform` is deprecated in
    # Optuna and this matches how `learning_rate` is sampled.
    candidate_params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 5000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.5, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.1, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "max_depth": trial.suggest_int("max_depth", 1, 12),
    }

    # `early_stopping_rounds` is passed to the constructor: as a `fit()`
    # argument it is deprecated since xgboost 1.6 and rejected by later releases.
    my_model_4 = XGBRegressor(
        random_state=123,
        early_stopping_rounds=100,
        **candidate_params,
    )

    my_model_4.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

    predictions_4 = my_model_4.predict(X_valid)
    return metrics.mean_squared_error(y_valid, predictions_4)
    

In [None]:
# Search for the hyper-parameters that minimise the value returned by `run`,
# using 500 trials.
study = optuna.create_study(direction="minimize")
study.optimize(func=run, n_trials=500)

In [None]:
# Show the hyper-parameters of the best trial found by the study.
study.best_params

# Result Analysis

In [None]:
# Build the error-analysis table on a copy of the validation features.
# Without the copy, `result` is the same object as `X_valid`, so the target
# and prediction columns added below would end up inside the validation
# feature matrix used by the model cells.
result = X_valid.copy()
result["mean_travel_time"] = y_valid
result["predicted"] = predictions_3

# Absolute error, computed once and reused for the derived columns.
abs_error = (result["mean_travel_time"] - result["predicted"]).abs()
result["diff"] = abs_error
# Absolute error as a percentage of the actual travel time.
result["diff_in_%"] = (abs_error / result["mean_travel_time"]) * 100
# Square root of the absolute error.
result['sqrt_diff'] = abs_error ** (1 / 2)

In [None]:
# Sort the analysis table in place by percentage error (ascending) and display it.
result.sort_values('diff_in_%',inplace=True)
result

In [None]:
# Percentage error plotted against the actual travel time.
actual_time = result["mean_travel_time"]
pct_error = result["diff_in_%"]
plt.scatter(actual_time, pct_error)
plt.show()

In [None]:
# Show the distance feature of the analysed rows in ascending order.
# "dist_geo_sqrt" is not part of the feature list the validation split was
# built from, so indexing it unconditionally raises KeyError; fall back to the
# distance-like feature that is in the model ("oneway_length_sqrt").
# NOTE(review): confirm this fallback is the column that was meant.
distance_col = "dist_geo_sqrt" if "dist_geo_sqrt" in result.columns else "oneway_length_sqrt"
result[distance_col].sort_values()

# Final Model

In [None]:
from sklearn.base import clone

# Refit on the full training table using the feature set that the test-set
# pipeline below is able to produce.
# NOTE(review): this feature list differs from the one used to validate
# model 3, and 'dow' / 'dist_geo_sqrt' are not created in this file — they are
# assumed to be present in the training CSV; confirm.
y = df.mean_travel_time
features = ['sourceid','dstid','dow','area_src_reci','area_dst_reci','dist_geo_sqrt','la_name_src','la_name_dst','direction']
X = df[features]

# Use an unfitted clone carrying model 3's hyper-parameters. Assigning
# `my_model_3` directly would alias it, and the refit would overwrite the
# validated model in place.
Final_model = clone(my_model_3)
Final_model.fit(X,y)
              

In [None]:
# Attach the zone attributes from `london` to each test row twice: first for
# the source zone, then for the destination zone. Coordinate columns get an
# explicit _src / _dst suffix after each join; the remaining duplicated
# columns receive pandas' default _x / _y suffixes and are renamed at the end.
testing1 = testing_data.merge(
    london, how="left", left_on="sourceid", right_on="MOVEMENT_ID"
)
testing2 = testing1.rename(
    columns={
        "geoeast": "geoeast_src",
        "geonorth": "geonorth_src",
        "popeast": "popeast_src",
        "popnorth": "popnorth_src",
    }
)
testing3 = testing2.merge(
    london, how="left", left_on="dstid", right_on="MOVEMENT_ID"
)
testing4 = testing3.rename(
    columns={
        "geoeast": "geoeast_dst",
        "geonorth": "geonorth_dst",
        "popeast": "popeast_dst",
        "popnorth": "popnorth_dst",
    }
)

# Keep only the identifiers and the per-zone attributes.
features = [
    "sourceid", "dstid", "dow",
    "la_name_x", "area_km2_x",
    "geoeast_src", "geonorth_src", "popeast_src", "popnorth_src",
    "la_name_y", "area_km2_y",
    "geoeast_dst", "geonorth_dst", "popeast_dst", "popnorth_dst",
]
testing5 = testing4[features]
Final_testing_data = testing5.rename(
    columns={
        "la_name_x": "la_name_src",
        "la_name_y": "la_name_dst",
        "area_km2_x": "area_src",
        "area_km2_y": "area_dst",
    }
)
Final_testing_data

In [None]:
# Euclidean distance between source and destination, once from the "geo"
# coordinate pair and once from the "pop" coordinate pair.
geo_src = Final_testing_data[['geoeast_src', 'geonorth_src']].values
geo_dst = Final_testing_data[['geoeast_dst', 'geonorth_dst']].values
Final_testing_data['dist_geo'] = np.linalg.norm(geo_src - geo_dst, axis=1)

pop_src = Final_testing_data[['popeast_src', 'popnorth_src']].values
pop_dst = Final_testing_data[['popeast_dst', 'popnorth_dst']].values
Final_testing_data['dist_pop'] = np.linalg.norm(pop_src - pop_dst, axis=1)

# Direction in degrees from source to destination. Note that `xDiff` holds the
# northing difference and `yDiff` the easting difference, so the angle is
# arctan2(east delta, north delta).
xDiff = Final_testing_data['geonorth_dst'].values - Final_testing_data['geonorth_src'].values
yDiff = Final_testing_data['geoeast_dst'].values - Final_testing_data['geoeast_src'].values
Final_testing_data['direction'] = np.degrees(np.arctan2(yDiff, xDiff))

In [None]:
# Replace the two local-authority name columns with ordinal codes.
# NOTE(review): the encoder is fitted on the test rows only; the codes match
# the training data only if the training columns were encoded from the same
# category set in the same way — confirm.
object_cols = ["la_name_src","la_name_dst"]
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(Final_testing_data[object_cols])
Final_testing_data[object_cols] = ordinal_encoder.transform(Final_testing_data[object_cols])
Final_testing_data

In [None]:
# The raw coordinate columns are no longer needed once distance and direction exist.
coordinate_columns = [
    'geoeast_src', 'geonorth_src', 'popeast_src', 'popnorth_src',
    'geoeast_dst', 'geonorth_dst', 'popeast_dst', 'popnorth_dst',
]
Final_testing_data = Final_testing_data.drop(columns=coordinate_columns)

In [None]:
# Square root of both distance columns.
sqrt_sources = {'dist_geo_sqrt': 'dist_geo', 'dist_pop_sqrt': 'dist_pop'}
for new_name, base_name in sqrt_sources.items():
    Final_testing_data[new_name] = np.sqrt(Final_testing_data[base_name])

# Reciprocal of the source / destination area columns.
reciprocal_sources = {'area_src_reci': 'area_src', 'area_dst_reci': 'area_dst'}
for new_name, base_name in reciprocal_sources.items():
    Final_testing_data[new_name] = 1 / Final_testing_data[base_name]

In [None]:
# Display the prepared test table.
Final_testing_data

In [None]:
# Columns fed to the final model, in the same order used when fitting it.
features = [
    'sourceid',
    'dstid',
    'dow',
    'area_src_reci',
    'area_dst_reci',
    'dist_geo_sqrt',
    'la_name_src',
    'la_name_dst',
    'direction',
]
test_X = Final_testing_data.loc[:, features]

In [None]:
# Predict the target for every prepared test row with the final model.
test_preds = Final_model.predict(test_X)

In [None]:
# Display the raw prediction array.
test_preds

In [None]:
# Assemble the submission: the identifying columns from the raw test table
# plus the model predictions, written without the index.
submission_columns = {
    key: testing_data[key] for key in ('sourceid', 'dstid', 'dow')
}
submission_columns['predicted_mean_travel_time'] = test_preds
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)

In [None]:
# Display the submission table that was written to disk.
output

# Cross Validation


In [None]:
# 5-fold cross-validation of model 3's configuration on the training split.
kfold = KFold(n_splits=5, random_state=123, shuffle=True)
# With no `scoring` given, a regressor is scored with R^2; spell that out so
# the metric is explicit rather than implied.
results = cross_val_score(my_model_3, X_train, y_train, cv=kfold, scoring="r2")
# The score is R^2, not an accuracy percentage, so label and format it as such.
print("R^2: %.4f (+/- %.4f)" % (results.mean(), results.std()))

In [None]:
# Display the individual per-fold scores.
results