In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import geopandas as gpd
import requests
import shapely
import matplotlib.pyplot as plot
%matplotlib inline


# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reading Inputs

Here, we consider Bangalore travel time data for January 2020 to March 2020.

In [None]:
# Read every Uber Movement CSV in one pass; the names on the left line up
# one-to-one, in order, with the paths on the right.
(
    travel_times,
    travel_times_daily,
    travel_times_day,
    travel_times_week,
    bnglr_wards_hourly,
    bnglr_wards_weekly,
    bnglr_wards_monthly,
) = map(pd.read_csv, (
    '/kaggle/input/uber-movement-data/Travel_Times.csv',
    '/kaggle/input/uber-movement-data/Travel_Times_Daily.csv',
    '/kaggle/input/uber-movement-data/Travel_Times_time_of_day.csv',
    '/kaggle/input/uber-movement-data/Travel_Times_day_of_week.csv',
    '/kaggle/input/uber-movement-data/bangalore-wards-2020-1-All-HourlyAggregate.csv',
    '/kaggle/input/uber-movement-data/bangalore-wards-2020-1-WeeklyAggregate.csv',
    '/kaggle/input/uber-movement-data/bangalore-wards-2020-1-All-MonthlyAggregate.csv',
))

We'll model the hourly average travel time.

In [None]:
# Show the first two rows of the hourly aggregate table to check its columns.
bnglr_wards_hourly.head(2)

In [None]:
# Average `mean_travel_time` over all rows sharing the same hour of day (`hod`).
# The division by 60 matches the "mins" y-axis label below
# (presumably the raw column is in seconds -- TODO confirm against the data dictionary).
mean_travel_time_by_hour_of_day = bnglr_wards_hourly.groupby('hod')['mean_travel_time'].mean() / 60

# Keep the Axes under a descriptive name instead of `plt`, which is the
# conventional alias for matplotlib.pyplot and would be confusing to shadow.
travel_time_ax = mean_travel_time_by_hour_of_day.plot(kind="bar", figsize=(16, 7))
travel_time_ax.set_title('Mean travel times around Bangalore', fontsize=20)
travel_time_ax.set_xlabel('Hour of day', fontsize=16)
# Trailing semicolon suppresses the Text(...) repr in the cell output.
travel_time_ax.set_ylabel('Mean travel time in mins', fontsize=16);

# The ward data

In [None]:
# Load the Bangalore ward boundaries into a GeoDataFrame and draw them.
bglr=gpd.read_file('/kaggle/input/uber-movement-data/bangalore_wards.json')
bglr.plot()

In [None]:
# Print the geometry column of the ward GeoDataFrame.
print(bglr.geometry)

# Picking centroids to represent an area

In [None]:
# Work on a copy so `bglr` keeps the original ward polygons.
bglr_c = bglr.copy()
# Replace each ward polygon with its centroid point.
# NOTE(review): if the file's CRS is geographic (degrees), geopandas computes
# these centroids in degree space -- confirm that accuracy is acceptable here.
bglr_c.geometry= bglr_c['geometry'].centroid
# Overlay the centroids (red) on the ward polygons (grey) on a shared Axes.
fig, ax = plot.subplots(figsize=(9,9))
bglr.plot(color='grey',ax=ax)
bglr_c.plot(color='red',ax=ax)

In [None]:
# Lookup table from Movement ID to display name, built from the destination
# columns of `travel_times`.
# - `rename` returns a new frame instead of relabelling a slice in place.
# - One row per id is kept: this table is the right-hand side of two inner
#   merges, and a repeated id would silently multiply the merged rows.
id_to_dest = (
    travel_times[['Destination Movement ID', 'Destination Display Name']]
    .rename(columns={'Destination Movement ID': 'id', 'Destination Display Name': 'name'})
    .drop_duplicates(subset='id')
)
id_to_dest.head()


# Cleaning up data for analysis

We attach the source and destination names to the hourly data, drop trips whose source and destination are the same ward, and remove columns we do not need (`geometric_mean_travel_time` and `geometric_standard_deviation_travel_time`).

In [None]:
# Build the named trip table in a single chain:
#   1. join the source name onto each hourly row and drop unused columns,
#   2. join the destination name,
#   3. discard rows whose source and destination ids are equal,
#   4. order by source, destination and hour of day.
time_df = (
    bnglr_wards_hourly
    .merge(id_to_dest, left_on=['sourceid'], right_on=['id'], how='inner')
    .drop(columns=['id', 'geometric_mean_travel_time', 'geometric_standard_deviation_travel_time'])
    .rename(columns={'name': 'Source Name'})
    .merge(id_to_dest, left_on=['dstid'], right_on=['id'], how='inner')
    .loc[lambda trips: trips['sourceid'] != trips['dstid']]
    .drop(columns=['id'])
    .rename(columns={'name': 'Destination Name'})
    .sort_values(by=['sourceid', 'dstid', 'hod'])
)
time_df.tail(5)

In [None]:
# Inspect the centroid points that replaced the ward polygons in bglr_c.
bglr_c.geometry

In [None]:
# Inspect DISPLAY_NAME, the column the next cell joins against the trip names.
bglr_c.DISPLAY_NAME

In [None]:
# Attach ward name and centroid geometry for both ends of every trip.
# Each half joins bglr_c on the display name, removes the ward bookkeeping
# columns, then labels what remains as "Source ..." or "Destination ...".
time_df2 = (
    time_df
    .merge(bglr_c, left_on=['Source Name'], right_on=['DISPLAY_NAME'], how='inner')
    .drop(columns=['WARD_NO', 'MOVEMENT_ID', 'DISPLAY_NAME'])
    .rename(columns={'WARD_NAME': 'Source Ward Name', 'geometry': 'Source Geometry'})
    .merge(bglr_c, left_on=['Destination Name'], right_on=['DISPLAY_NAME'], how='inner')
    .drop(columns=['WARD_NO', 'MOVEMENT_ID', 'DISPLAY_NAME'])
    .rename(columns={'WARD_NAME': 'Destination Ward Name', 'geometry': 'Destination Geometry'})
)
time_df2.sample(3)

In [None]:
import pickle
def save_object(obj, filename):
    """Pickle `obj` to `filename` with the highest available pickle protocol.

    The file is opened in binary write mode, so existing content at
    `filename` is replaced.
    """
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL).dump(obj)

import os.path
def file_exists(filename):
    """Return True when `filename` refers to an existing path, else False."""
    is_present = os.path.exists(filename)
    return is_present

# Calculating distances to use as a feature

In [None]:
import geopy.distance

def calc_distance(x):
    """Return the geodesic distance in kilometres between a row's two points.

    `x` is a row exposing 'Source Geometry' and 'Destination Geometry' point
    objects with `.x` and `.y` attributes. Each point is handed to geopy as a
    (y, x) pair.
    """
    origin, target = x['Source Geometry'], x['Destination Geometry']
    separation = geopy.distance.geodesic((origin.y, origin.x), (target.y, target.x))
    return separation.kilometers

# Load the trip table with its 'Geodesic Distance' column from a cache if one
# exists; otherwise compute the column row by row (slow) and cache the result.
#
# Two cache locations are checked, in order:
#   1. the pre-computed file shipped as an input dataset, and
#   2. a copy in the current working directory written by an earlier run.
# A freshly computed frame is only ever written to (2): /kaggle/input is a
# read-only dataset mount, so saving there fails after the expensive
# computation has already been done.
#
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# point these paths at cache files you produced yourself.
filename = '/kaggle/input/calc-data/Df_with_geodesic_distance.bin'
working_filename = 'Df_with_geodesic_distance.bin'
cached_filename = next(
    (path for path in (filename, working_filename) if file_exists(path)),
    None,
)
if cached_filename is not None:
    with open(cached_filename, 'rb') as file:
        df = pickle.load(file)
else:
    time_df2['Geodesic Distance'] = time_df2.apply(func = calc_distance, axis=1)
    # Bind `df` before saving so a failed write cannot discard the result.
    df = time_df2
    save_object(df, working_filename)
df.sample(3)

In [None]:
# import osrm

# def calc_osrm_distance(x):
#     src_point = (x['Source Geometry'].y, x['Source Geometry'].x)
#     dest_point = (x['Destination Geometry'].y, x['Destination Geometry'].x)
#     result = osrm.simple_route(src_point, dest_point, output='route', overview="full", geometry='wkt')
#     return result[0]['distance']


# filename = 'Df_with_osrm_distance.bin'
# if file_exists(filename):
#     df = pickle.load(filename)
# else:
#     df['OSRM Distance'] = df.apply(func = calc_osrm_distance, axis=1)
#     save_object(df, filename)
# df.sample(3)

In [None]:
# !pip install osrm-py

In [None]:
# import osrm

# client = osrm.Client()

# response = client.route(
#     coordinates=[[13.102805, 77.560038], [13.121709, 77.580422]],
#     overview=osrm.overview.full)

# print(response)

In [None]:
def compare(actual, predicted):
    """Build a two-column table of actual versus predicted values.

    Parameters
    ----------
    actual : pandas.DataFrame or pandas.Series
        Observed values. For a DataFrame the first column is used, so the
        function no longer depends on the notebook-level `outcome` list being
        defined before it is called.
    predicted : array-like
        Predictions in the same row order as `actual`.

    Returns
    -------
    pandas.DataFrame
        Columns 'actual' and 'prediction' on a fresh 0..n-1 index.
    """
    # Wrapping in DataFrame lets a Series and a one-column frame share a path.
    actual_values = pd.DataFrame(actual).iloc[:, 0].tolist()
    # NumPy arrays expose tolist(); fall back to list() for plain sequences.
    predicted_values = predicted.tolist() if hasattr(predicted, 'tolist') else list(predicted)
    return pd.DataFrame(data = {'actual': actual_values, 'prediction': predicted_values})

In [None]:
# Start from a copy so `df` keeps its original columns.
final_df = df.copy()
# Unpack the point geometries into plain float columns the model can consume
# (y is stored as "lat" and x as "long", matching calc_distance's (y, x) order).
final_df['Source lat'] = final_df['Source Geometry'].apply(lambda pt: float(pt.y))
final_df['Source long'] = final_df['Source Geometry'].apply(lambda pt: float(pt.x))
final_df['Dest lat'] = final_df['Destination Geometry'].apply(lambda pt: float(pt.y))
final_df['Dest long'] = final_df['Destination Geometry'].apply(lambda pt: float(pt.x))

# Draw the 20 evaluation rows with a fixed seed so the reported error is
# reproducible across runs.
test_sample = final_df.sample(20, random_state=42)
# Hold the evaluation rows out of the frame used for training; otherwise the
# models are scored on rows they were fitted on.
# NOTE(review): this only helps for models trained in this session -- a model
# loaded from a pickle cache may already have seen these rows.
final_df = final_df.drop(index=test_sample.index)

In [None]:
# test_sample

# Modelling

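Model 1: an XGBoost regressor trained on source and destination coordinates, hour of day and geodesic distance.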

In [None]:
# Model 1 inputs: both end points, the hour of day and the geodesic distance.
features = ['Source lat', 'Source long', 'Dest lat', 'Dest long', 'hod', 'Geodesic Distance']
outcome = ['mean_travel_time']
final_df = final_df[features + outcome]

import xgboost as xgb

# Reuse a previously trained model when one is available. Two locations are
# checked, in order: the copy shipped as an input dataset, then a copy in the
# current working directory from an earlier run. A newly trained model is
# written to the working directory only, because /kaggle/input is a read-only
# dataset mount and saving there would abort the cell after training.
#
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# point these paths at model files you produced yourself.
filename = '/kaggle/input/calc-data/XGB_model_1.bin'
working_filename = 'XGB_model_1.bin'
cached_filename = next(
    (path for path in (filename, working_filename) if file_exists(path)),
    None,
)
if cached_filename is not None:
    with open(cached_filename, 'rb') as file:
        my_model = pickle.load(file)
else:
    my_model = xgb.XGBRegressor(n_estimators=500, learning_rate=0.05, n_jobs=4)
    my_model.fit(final_df[features], final_df[outcome],
             verbose=False)
    save_object(my_model, working_filename)

In [None]:
from sklearn.metrics import mean_squared_error

# Score Model 1 on the sampled rows: select its feature columns, predict,
# and report the mean squared error against the observed outcome.
test_sample1 = test_sample[features].copy()
actuals = test_sample[outcome]
prediction = my_model.predict(test_sample1)
mean_squared_error(actuals, prediction)

In [None]:
# Side-by-side table of observed and predicted values for the sampled rows.
compare(actuals, prediction)

Model 2: an XGBoost regressor trained on hour of day and geodesic distance only.

In [None]:
# Model 2 inputs: hour of day and geodesic distance only (no coordinates).
features = ['hod', 'Geodesic Distance']
outcome = ['mean_travel_time']
# Take the training columns into a separate frame rather than overwriting
# `final_df`: narrowing `final_df` here would drop the coordinate columns and
# make the Model 1 cell fail with a KeyError if it is run again.
model2_train_df = final_df[features + outcome]

# Reuse a model saved in the working directory by an earlier run, if any.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load model files you produced yourself.
filename = 'XGB_model_2.bin'
if file_exists(filename):
    with open(filename, 'rb') as file:
        my_model2 = pickle.load(file)
else:
    my_model2 = xgb.XGBRegressor(n_estimators=500, learning_rate=0.05, n_jobs=4)
    my_model2.fit(model2_train_df[features], model2_train_df[outcome],
             verbose=False)
    save_object(my_model2, filename)

In [None]:
from sklearn.metrics import mean_squared_error

# Score Model 2 on the same sampled rows, restricted to its own features.
test_sample2 = test_sample[features].copy()
actuals = test_sample[outcome]
prediction = my_model2.predict(test_sample2)
mean_squared_error(actuals, prediction)

In [None]:
# Side-by-side table of observed and predicted values for the sampled rows.
compare(actuals, prediction)

In [None]:
def get_distance(lat1, long1, lat2, long2):
    """Return the geodesic distance in kilometres between two coordinates.

    Each point is passed to geopy as a (lat, long) pair.
    """
    return geopy.distance.geodesic((lat1, long1), (lat2, long2)).kilometers

def prepare_df(lat1, long1, lat2, long2, hod):
    """Build a one-row feature frame for a single trip.

    The geodesic distance between the two coordinates is computed with
    `get_distance` and stored next to the raw coordinates and hour of day.
    """
    column_names = ['Source lat', 'Source long', 'Dest lat', 'Dest long', 'hod', 'Geodesic Distance']
    row = [lat1, long1, lat2, long2, hod, get_distance(lat1, long1, lat2, long2)]
    return pd.DataFrame(columns = column_names, data = [row])
    
def predict(df):
    """Return the predictions of the notebook-level `my_model` for `df`."""
    predictions = my_model.predict(df)
    return predictions

In [None]:
# Example query: predict the travel time for one trip at a given hour of day.
lat1 = 13.002385
long1 = 77.568491
lat2 = 13.061071
long2 = 77.597371
hod = 10

# Use a dedicated name for the one-row query frame. Rebinding `df` here would
# replace the full trip table, and re-running the earlier cells that start
# from `df` would then fail.
query_df = prepare_df(lat1, long1, lat2, long2, hod)
predict(query_df)