# Import Libraries


In [None]:
!pip install seaborn
!pip install plotly
!pip install openpyxl
!pip install xgboost
!pip install optuna
!pip install osmnx

In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
import xgboost as xgb
import optuna
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor


In [None]:
# Make 10x10 the default figure size for every matplotlib figure created below.
plt.rcParams['figure.figsize'] = [10, 10]

# Data Checking and Preparation

In [None]:
# Input file locations, relative to the notebook's working directory.
training_file_path = '../MEC_dataset/training_dataset.csv'
testing_file_path = '../MEC_dataset/testing_dataset.csv'
london_file_path = '../MEC_dataset/london.json'
pd_file_path = '../MEC_dataset/population_density.xlsx'


# Read the train/test tables with pandas and the London zone file with geopandas.
training_data = pd.read_csv(training_file_path)
testing_data = pd.read_csv(testing_file_path)
london = gpd.read_file(london_file_path)
# NOTE(review): this rebinds `pd` from the pandas module to the DataFrame read
# from the spreadsheet. Until pandas is imported again further down, `pd.<func>`
# calls fail, and re-running this cell on its own fails at pd.read_csv.
# Renaming this variable (and its later uses in the merges) would remove the trap.
pd = pd.read_excel(pd_file_path)

## Training Data Handling

In [None]:
# Preview the first rows of the training table.
training_data.head()

In [None]:
# Column dtypes and non-null counts of the training table.
training_data.info()

In [None]:
# Summary statistics, transposed so that each column becomes one row.
training_data.describe().transpose()

In [None]:
# Number of distinct values per column.
training_data.nunique()

In [None]:
# Interactive box plot of the target column to inspect its spread.
data = training_data['mean_travel_time']
fig = go.Figure(data=[go.Box(y=data)])
fig.show()

In [None]:
#outliers = [x for x in training_data["mean_travel_time"] if x > 3704.49]
#outliers_removed = [x for x in training_data["mean_travel_time"] if x <= 3704.49]

## London Data Handling

In [None]:
# Preview the first rows of the London zone table.
london.head()

In [None]:
# Cast MOVEMENT_ID to int64; it is used as the join key against sourceid/dstid
# when the test set is merged with this table further down.
london["MOVEMENT_ID"] = london["MOVEMENT_ID"].astype("int64")
london.info()

# Data Analysis

In [None]:
# NOTE: `df` is a second name for the same object, not a copy -- columns added
# to `df` later also appear on `training_data`.
df = training_data



In [None]:
def hist_plot(df):
    """Draw one histogram per column of ``df``, stacked in a single column of subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one subplot per column.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the subplots. ``plt.close()`` is called before it is
        returned, so it is the returned object that the caller displays.
    """
    columns = list(df.columns)

    # squeeze=False always yields a 2-D array of Axes. Without it, a frame with
    # a single column gets a bare Axes object, which has no .flatten().
    figure, axes = plt.subplots(len(columns), 1,
                                figsize=(16, len(columns) * 2),
                                squeeze=False)
    figure.subplots_adjust(hspace=1)
    for ax, column in zip(axes.ravel(), columns):
        sns.histplot(data=df[column], ax=ax)
    plt.tight_layout()

    # Switches the global style (it is applied after this figure was drawn).
    # The 'seaborn-whitegrid' name was renamed to 'seaborn-v0_8-whitegrid' in
    # newer matplotlib releases, where the old name raises OSError.
    try:
        plt.style.use('seaborn-whitegrid')
    except OSError:
        plt.style.use('seaborn-v0_8-whitegrid')
    plt.close()

    return figure

In [None]:
# Histograms of every column before any feature transformation.
hist_plot(df)

In [None]:
# Per-column skewness, most positively skewed first.
df.skew().sort_values(ascending=False)

In [None]:
# Pairwise scatter plots of all columns.
sns.pairplot(df)

In [None]:
# Pairwise correlation matrix of the columns, drawn as an annotated heat map.
corr = df.corr()
sns.heatmap(corr, annot= True)

##### Strong positive correlation between mean travel time and distance

# Feature Transformation

In [None]:
# Add square-root versions of the two road features as new columns.
for target, source in (
    ('road_distance_by_traveltime_sqrt', 'road_distance_by_traveltime'),
    ('oneway_length_sqrt', 'oneway_length'),
):
    df[target] = np.sqrt(df[source])


In [None]:
# Confirm the new square-root columns are present.
df.head()

In [None]:
# Histograms again, now including the square-root columns.
hist_plot(df)

In [None]:
# Skewness after the transformation, most positively skewed first.
df.skew().sort_values(ascending=False)

# Modelling

In [None]:
# Model inputs; the same list is used again for the final fit and for the test set.
features = [
    'sourceid', 'dstid', 'dow',
    'area_reci_src', 'area_reci_dst',
    'fastest_travel_time', 'maxspeed_traveltime',
    'dist_geo_sqrt', 'oneway_length_sqrt',
    'la_name_src', 'la_name_dst',
    'pd_src', 'pd_dst',
    'direction',
]
X = df[features]
y = df['mean_travel_time']

# Fixed 80/20 hold-out split; random_state keeps it reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=123
)

In [None]:
# Hyper-parameters for the gradient-boosted model (they match the last entry of
# the tuning log kept further down in this notebook).
params = {
    'learning_rate': 0.024704224090189218,
    'reg_lambda': 2.936733664947117,
    'reg_alpha': 57.40870727344214,
    'subsample': 0.6214411004415425,
    'colsample_bytree': 0.6251175460789683,
    'max_depth': 6,
}

# early_stopping_rounds is configured on the estimator: XGBoost deprecated the
# fit() keyword in 1.6 and removed it in 2.0.
my_model_3 = XGBRegressor(n_estimators=3642, random_state=123,
                          early_stopping_rounds=10, **params)

# Boosting stops once the hold-out score has not improved for 10 rounds.
# NOTE(review): the same split is used for early stopping and for the reported
# error, so mse_3 is an optimistic estimate.
my_model_3.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

predictions_3 = my_model_3.predict(X_valid)
mse_3 = metrics.mean_squared_error(y_valid, predictions_3)

# This estimator is later handed to StackingRegressor, which fits clones of it
# without an eval_set; clear early stopping so those fits do not require one.
my_model_3.set_params(early_stopping_rounds=None)

In [None]:
# mse_3 is the mean squared error on the hold-out split (not its square root).
print("Mean Squared Error:" , mse_3)

RMSE = 195.39

In [None]:
# Feature-importance chart for the boosted model, limited to at most 15 features.
xgb.plot_importance(my_model_3, max_num_features=15)

In [None]:
# Predicted vs. actual for the boosted model; the black y = x line marks a perfect prediction.
ax = plt.gca()
ax.scatter(y_valid, predictions_3)
xpoints = ypoints = ax.get_xlim()
ax.plot(xpoints, ypoints, 'k-', alpha=0.75, zorder=0)
ax.set_xlabel('Actual')
ax.set_ylabel('Pred')
plt.show()

In [None]:
# Hyper-parameters for the random forest (they match the last entry of the
# random-forest tuning log kept further down in this notebook).
para = {
    'max_depth': 165,
    'max_samples': 0.984425471209481,
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For a
    # regressor it meant "consider every feature", which 1.0 states explicitly.
    'max_features': 1.0,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 1832,
}
rf2 = RandomForestRegressor(**para, random_state=123)

rf2.fit(X_train, y_train)

predictions_rf2 = rf2.predict(X_valid)

mse_rf2 = metrics.mean_squared_error(y_valid, predictions_rf2)

print("RMSE:" , np.sqrt(mse_rf2))



In [None]:
# Predicted vs. actual for the random forest; the black y = x line marks a perfect prediction.
ax = plt.gca()
ax.scatter(y_valid, predictions_rf2)
xpoints = ypoints = ax.get_xlim()
ax.plot(xpoints, ypoints, 'k-', alpha=0.75, zorder=0)
ax.set_xlabel('Actual')
ax.set_ylabel('Pred')
plt.show()

In [None]:
# Importance scores of the fitted forest, in the order of the `features` list.
rf2.feature_importances_

In [None]:
# Horizontal bar chart: one bar per feature name, length = its importance score.
plt.barh(features, rf2.feature_importances_)

In [None]:
# Hyper-parameters for the polynomial-kernel support vector regressor.
params = {
    'degree': 5,
    'coef0': 4.998232431157724,
    'tol': 0.8292520780334797,
    'C': 96.53621561878447,
    'epsilon': 0.20830073226294563,
}
sv = SVR(kernel='poly', gamma='scale', shrinking=True, cache_size=200,
         verbose=False, max_iter=-1, **params)

# Fit on the training split and score on the hold-out split.
sv.fit(X_train, y_train)
SVR_predictions = sv.predict(X_valid)
mse_svr = metrics.mean_squared_error(y_valid, SVR_predictions)
print("RMSE:" , np.sqrt(mse_svr))

In [None]:
# Predicted vs. actual for the SVR; the black y = x line marks a perfect prediction.
ax = plt.gca()
ax.scatter(y_valid, SVR_predictions)
xpoints = ypoints = ax.get_xlim()
ax.plot(xpoints, ypoints, 'k-', alpha=0.75, zorder=0)
ax.set_xlabel('Actual')
ax.set_ylabel('Pred')
plt.show()

In [None]:
def run_rf(trial):
    """Optuna objective: hold-out RMSE of a random forest built from the trial's suggestions.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators``, ``min_samples_split``,
        ``min_samples_leaf``, ``max_samples`` and ``max_depth``.

    Returns
    -------
    float
        Root mean squared error on the notebook-level ``X_valid`` / ``y_valid``
        after fitting on ``X_train`` / ``y_train``.
    """
    n_estimators = trial.suggest_int("n_estimators", 100, 2500)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 5)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 5)
    # A float max_samples must be strictly positive, so the range starts just
    # above zero instead of at 0 (which the forest rejects).
    max_samples = trial.suggest_float("max_samples", 0.01, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 300)

    rf3 = RandomForestRegressor(
        # 'auto' was removed in scikit-learn 1.3; for a regressor it meant
        # "consider every feature", which 1.0 states explicitly.
        max_features=1.0,
        n_jobs=-1,
        random_state=123,
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_samples=max_samples,
        max_depth=max_depth,
    )

    rf3.fit(X_train, y_train)

    predictions_rf3 = rf3.predict(X_valid)
    rmse_rf3 = np.sqrt(metrics.mean_squared_error(y_valid, predictions_rf3))

    return rmse_rf3

In [None]:
# Search for the random-forest settings with the lowest hold-out RMSE.
# Each of the 1000 trials fits a complete forest inside run_rf.
study = optuna.create_study(direction="minimize")
study.optimize(run_rf, n_trials=1000)

In [None]:
# Best parameter set found by the random-forest study.
study.best_params

{'n_estimators': 398,
 'min_samples_split': 2,
 'max_leaf_nodes': 100,
 'min_samples_leaf': 2,
 'max_samples': 0.3096678338818894,
 'max_depth': 19}
 
 rmse - 223.53955368449803
 {'n_estimators': 1499,
 'min_samples_split': 3,
 'min_samples_leaf': 1,
 'max_samples': 0.9807743661462416,
 'max_depth': 71}
 
 rmse - 223.34405822395667
 {'n_estimators': 1832,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_samples': 0.984425471209481,
 'max_depth': 165}

In [None]:
def run(trial):
    """Optuna objective: hold-out MSE of an XGBoost regressor built from the trial's suggestions.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the boosting hyper-parameters.

    Returns
    -------
    float
        Mean squared error (not its square root) on the notebook-level
        ``X_valid`` / ``y_valid`` after fitting on ``X_train`` / ``y_train``.
    """
    n_estimators = trial.suggest_int("n_estimators", 50, 5000)
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.5, log=True)
    # suggest_loguniform is deprecated; suggest_float(..., log=True) samples
    # the same log-uniform range and matches the learning_rate line above.
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 12)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 10)

    my_model_4 = XGBRegressor(
        random_state=123,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        # Configured on the estimator: XGBoost deprecated the fit() keyword
        # in 1.6 and removed it in 2.0.
        early_stopping_rounds=10,
    )

    # Boosting stops once the hold-out score has not improved for 10 rounds.
    my_model_4.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

    predictions_4 = my_model_4.predict(X_valid)
    mse_4 = metrics.mean_squared_error(y_valid, predictions_4)

    return mse_4
    

In [None]:
# Search for the XGBoost settings with the lowest value returned by run().
# This rebinds `study`, replacing the random-forest study created earlier.
study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=1000)

In [None]:
# Best parameter set found by the XGBoost study.
study.best_params

rmse - 267.99805
{'n_estimators': 2611,
 'learning_rate': 0.033936810024779974,
 'reg_lambda': 2.411226992822402e-06,
 'reg_alpha': 0.05280311765925585,
 'subsample': 0.7839141035249971,
 'colsample_bytree': 0.9059163697590156,
 'max_depth': 6}
 
 rmse - 267.48697
 {'n_estimators': 2702,
 'learning_rate': 0.024368124105536324,
 'reg_lambda': 1.0137028551773858e-05,
 'reg_alpha': 11.476304626451821,
 'subsample': 0.7237891878291387,
 'colsample_bytree': 0.8551575170389154,
 'max_depth': 6}
 
 rmse - 265.75238
{'n_estimators': 4231,
 'learning_rate': 0.020242333551370135,
 'reg_lambda': 0.5143599572024665,
 'reg_alpha': 3.81094005425332,
 'subsample': 0.8566966217796573,
 'colsample_bytree': 0.7878827596556054,
 'max_depth': 7}
 
 rmse - 257.63965
 {'n_estimators': 4830,
 'learning_rate': 0.020684652733365408,
 'reg_lambda': 6.281685725234899e-08,
 'reg_alpha': 0.23257186932668625,
 'subsample': 0.8271638666949983,
 'colsample_bytree': 0.9227151487800732,
 'max_depth': 7}
 
 rmse - 256.11658
 {'n_estimators': 2794,
 'learning_rate': 0.020436805917267438,
 'reg_lambda': 2.7212555280640823e-08,
 'reg_alpha': 0.007221325684119264,
 'subsample': 0.8306883017220584,
 'colsample_bytree': 0.9258892579933528,
 'max_depth': 7}
 
 rmse - 237.83017
 {'n_estimators': 3910,
 'learning_rate': 0.038222719197485336,
 'reg_lambda': 0.08992129176632746,
 'reg_alpha': 97.4677840263797,
 'subsample': 0.7958015466113378,
 'colsample_bytree': 0.9337331481974602,
 'max_depth': 6}
 
 rmse - 233.94916
 {'n_estimators': 1858,
 'learning_rate': 0.027406462066360184,
 'reg_lambda': 8.006824394058746e-05,
 'reg_alpha': 2.1554443619678322e-05,
 'subsample': 0.7226906014805654,
 'colsample_bytree': 0.9470933940693741,
 'max_depth': 6}
 
 rmse - 195.38582
 {'n_estimators': 3642,
 'learning_rate': 0.024704224090189218,
 'reg_lambda': 2.936733664947117,
 'reg_alpha': 57.40870727344214,
 'subsample': 0.6214411004415425,
 'colsample_bytree': 0.6251175460789683,
 'max_depth': 6}

In [None]:
# Base learners whose predictions feed the meta learner.
level0 = [
    ('xgb', my_model_3),
    ('svr', sv),
    ('rf', rf2),
]

# A linear regression combines the base predictions; cv=5 is the
# cross-validation setting passed to the stacking ensemble.
level1 = LinearRegression()
model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)

# Fit on the training split, then score the ensemble on the hold-out split.
model.fit(X_train, y_train)
predictions_ensemble = model.predict(X_valid)
rmse_ensemble = np.sqrt(metrics.mean_squared_error(y_valid, predictions_ensemble))
              
         

In [None]:
# Hold-out RMSE of the stacked ensemble.
print (rmse_ensemble)

In [None]:
# Predicted vs. actual for the stacked ensemble; the black y = x line marks a perfect prediction.
ax = plt.gca()
ax.scatter(y_valid, predictions_ensemble)
xpoints = ypoints = ax.get_xlim()
ax.plot(xpoints, ypoints, 'k-', alpha=0.75, zorder=0)
ax.set_xlabel('Actual')
ax.set_ylabel('Pred')
plt.show()

# Result Analysis

In [None]:
# Work on a copy: assigning to X_valid directly would write the target and the
# prediction columns into the validation features, so any later fit/predict
# that reuses X_valid would see extra columns.
result3a = X_valid.copy()
result3a["mean_travel_time"] = y_valid
result3a["predicted"] = predictions_ensemble

# Absolute error, the same error as a percentage of the actual value, and its square root.
result3a["diff"] = abs(result3a["mean_travel_time"] - result3a["predicted"])
result3a["diff_in_%"] = (result3a["diff"] / result3a["mean_travel_time"]) * 100
result3a['sqrt_diff'] = result3a["diff"] ** (1/2)



In [None]:
# Sort in place by percentage error (ascending by default), then display the table.
result3a.sort_values('diff_in_%',inplace=True)
result3a

# Final Model

In [None]:
# Rebuild X and y from the whole training frame (no hold-out split this time).
y = df.mean_travel_time
features = ['sourceid','dstid','dow','area_reci_src','area_reci_dst','fastest_travel_time','maxspeed_traveltime','dist_geo_sqrt','oneway_length_sqrt','la_name_src','la_name_dst','pd_src','pd_dst','direction']
X = df[features]

# NOTE: Final_model is a second name for `model`, not a copy -- this call
# refits the stacking ensemble itself on all rows.
Final_model = model
Final_model.fit(X,y)
              

In [None]:
# Attach the London zone attributes twice: once for the source zone, once for
# the destination zone. Each merge is followed by a rename so that the
# coordinate columns carry a _src / _dst suffix.
testing1 = testing_data.merge(london, left_on = "sourceid", right_on = "MOVEMENT_ID", how = "left")
testing2 = testing1.rename({"geoeast":"geoeast_src", "geonorth":"geonorth_src","popeast":"popeast_src","popnorth":"popnorth_src"}, axis = 1)
testing3 = testing2.merge(london, left_on = "dstid", right_on = "MOVEMENT_ID", how = "left")
testing4 = testing3.rename({"geoeast":"geoeast_dst", "geonorth":"geonorth_dst","popeast":"popeast_dst","popnorth":"popnorth_dst"}, axis = 1)
# NOTE: `features` is reused here as a temporary column selection; it is set
# back to the model's feature list before prediction.
features = ["sourceid","dstid","dow","la_name_x","area_km2_x","geoeast_src","geonorth_src","popeast_src","popnorth_src","la_name_y","area_km2_y","geoeast_dst","geonorth_dst","popeast_dst","popnorth_dst"]
testing5 = testing4[features]
# The _x / _y columns come from the first (source) and second (destination) merge.
Final_testing_data = testing5.rename({"la_name_x":"la_name_src","la_name_y":"la_name_dst","area_km2_x":"area_src","area_km2_y":"area_dst"},axis=1)
# NOTE: `pd` here is the DataFrame read from the population-density
# spreadsheet, not the pandas module (it was rebound when the data was loaded).
Final_testing_data = Final_testing_data.merge(pd, left_on = "la_name_src", right_on = "la_name", how = "left")
Final_testing_data = Final_testing_data.merge(pd, left_on = "la_name_dst", right_on = "la_name", how = "left")
Final_testing_data = Final_testing_data.rename({'population_density_x':'pd_src','population_density_y':'pd_dst'},axis =1)
# Drop the join keys left over from the two population-density merges.
Final_testing_data = Final_testing_data.drop(['la_name_x','la_name_y'], axis=1)
Final_testing_data

In [None]:
# Euclidean distance between source and destination, once per coordinate pair.
geo_delta = (Final_testing_data[['geoeast_src', 'geonorth_src']].values
             - Final_testing_data[['geoeast_dst', 'geonorth_dst']].values)
Final_testing_data['dist_geo'] = np.linalg.norm(geo_delta, axis=1)
pop_delta = (Final_testing_data[['popeast_src', 'popnorth_src']].values
             - Final_testing_data[['popeast_dst', 'popnorth_dst']].values)
Final_testing_data['dist_pop'] = np.linalg.norm(pop_delta, axis=1)

# Note the naming: xDiff is the north difference and yDiff the east difference,
# so the angle below is arctan2(east difference, north difference) in degrees,
# rounded up to a whole number.
xDiff = Final_testing_data['geonorth_dst'].values - Final_testing_data['geonorth_src'].values
yDiff = Final_testing_data['geoeast_dst'].values - Final_testing_data['geoeast_src'].values
Final_testing_data['direction'] = np.ceil(np.degrees(np.arctan2(yDiff, xDiff)))

In [None]:
# Replace the two local-authority name columns with ordinal codes.
# NOTE(review): the encoder is fitted on the test rows here, and each column
# gets its own category list. Confirm that the resulting codes match the
# encoding of la_name_src / la_name_dst in the training data -- if they do not,
# the model sees different codes at prediction time.
ordinal_encoder = OrdinalEncoder()
object_cols = ["la_name_src","la_name_dst"]
Final_testing_data[object_cols] = ordinal_encoder.fit_transform(Final_testing_data[object_cols])
Final_testing_data

In [None]:
# The raw coordinates are no longer needed once distance and direction are derived.
Final_testing_data = Final_testing_data.drop(['geoeast_src','geonorth_src','popeast_src','popnorth_src','geoeast_dst','geonorth_dst','popeast_dst','popnorth_dst'], axis=1)

In [None]:
# Square roots of the two distances.
for target, source in (('dist_geo_sqrt', 'dist_geo'), ('dist_pop_sqrt', 'dist_pop')):
    Final_testing_data[target] = np.sqrt(Final_testing_data[source])

# Reciprocals of the source and destination areas.
for target, source in (('area_reci_src', 'area_src'), ('area_reci_dst', 'area_dst')):
    Final_testing_data[target] = 1 / Final_testing_data[source]

In [None]:
# Inspect the test frame after feature engineering.
Final_testing_data

In [None]:
# Re-import pandas: the name `pd` was rebound to the population-density
# DataFrame when the data was loaded, so it must be restored before pd.read_csv.
import pandas as pd
# NOTE(review): absolute, machine-specific path, while every other input is read
# from '../MEC_dataset/'. Confirm where testing_route.csv lives and load it from there.
test_route = pd.read_csv('C:/Users/lowmi/Downloads/Compressed/MEC_dataset/testing_route.csv')
test_route

In [None]:
# Place the route columns next to the existing ones. With axis=1 the rows are
# matched on index labels -- presumably both frames hold the same rows in the
# same order; verify against how testing_route.csv was produced.
Final_testing_data = pd.concat([Final_testing_data,test_route],axis=1)
Final_testing_data

In [None]:
# Same square-root features that were added to the training frame.
for target, source in (
    ('road_distance_by_traveltime_sqrt', 'road_distance_by_traveltime'),
    ('oneway_length_sqrt', 'oneway_length'),
):
    Final_testing_data[target] = np.sqrt(Final_testing_data[source])

# Align the column name with the one in the model's feature list.
Final_testing_data = Final_testing_data.rename(columns={'fastest_traveltime': 'fastest_travel_time'})
Final_testing_data

In [None]:
# Same feature list, in the same order, that the final model was fitted on.
features = [
    'sourceid', 'dstid', 'dow',
    'area_reci_src', 'area_reci_dst',
    'fastest_travel_time', 'maxspeed_traveltime',
    'dist_geo_sqrt', 'oneway_length_sqrt',
    'la_name_src', 'la_name_dst',
    'pd_src', 'pd_dst',
    'direction',
]
test_X = Final_testing_data[features]

In [None]:
# Predict the target for every test row with the refitted ensemble.
test_preds = Final_model.predict(test_X)

In [None]:
# Inspect the raw predictions.
test_preds

In [None]:
# Submission table: the three identifying columns of the test set plus the prediction.
output = testing_data[['sourceid', 'dstid', 'dow']].copy()
output['predicted_mean_travel_time'] = test_preds
output.to_csv('submission8.csv', index=False)

In [None]:
# Inspect the submission table that was just written.
output