In [1]:
import numpy as np
import pandas as pd
import h3
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# %pip install "h3<4"   # this notebook calls h3.geo_to_h3, which is the h3-py v3 API name

In [3]:
# Load the raw observations, keep a single row per (trip, timestamp) pair,
# and discard the columns that the rest of the notebook never reads.
df = (
    pd.read_csv("./suratITMSDPtest/suratITMSDPtest.csv")
    .drop_duplicates(subset=["trip_id", "observationDateTime"], ignore_index=True)
    .drop(
        columns=[
            "trip_direction",
            "last_stop_id",
            "last_stop_arrival_time",
            "route_id",
            "actual_trip_start_time",
            "trip_delay",
            "vehicle_label",
            "id",
            "location.type",
            "trip_id",
        ]
    )
)

# Split the "[a, b]" coordinate text into its two components. The first
# component is used as the longitude and the second as the latitude.
lat_lon = df["location.coordinates"].astype(str).str.strip('[]').str.split(",")
lon = lat_lon.apply(lambda x: x[0])
lat = lat_lon.apply(lambda x: x[1])

# Map every observation to its H3 cell at the chosen resolution and store the
# result as a new column of df.
# The pairs are walked positionally with zip() instead of looking rows up by
# label (lat[i]); this avoids one Series lookup per row and no longer depends
# on df having a clean 0..n-1 index.
resolution = 7
h3index = [
    h3.geo_to_h3(lat=float(row_lat), lng=float(row_lon), resolution=resolution)
    for row_lat, row_lon in zip(lat, lon)
]

df["h3index"] = h3index

# Parse the timestamp column once and derive everything from that single result:
#   Date     - calendar date of the observation
#   Time     - time of day of the observation
#   Timeslot - hour of day (integer), used as the time bucket
observed_at = pd.to_datetime(df["observationDateTime"])
df["Date"] = observed_at.dt.date
df["Time"] = observed_at.dt.time
df["Timeslot"] = observed_at.dt.hour

# A HAT combines the timeslot and the H3 cell as "<hour> <h3index>".
df["HAT"] = (df["Timeslot"].astype(str) + " " + df["h3index"])

# Restrict the data to the timeslots from startTime to endTime (both included).
startTime = 9
endTime = 20
in_window = df["Timeslot"].between(startTime, endTime)
df = df[in_window]

# Keep only the H3 cells that reach the minimum activity level in the largest
# number of timeslots.

# Distinct vehicles seen in each (timeslot, day, cell).
vehicles_per_day = (
    df.groupby(["Timeslot", "Date", "h3index"])["license_plate"]
    .nunique()
    .reset_index()
)

# Sum those daily counts over all days for each (timeslot, cell).
vehicles_per_slot = (
    vehicles_per_day.groupby(["Timeslot", "h3index"])["license_plate"]
    .sum()
    .reset_index()
)

# Threshold: minEventOccurences per day, times the number of distinct days.
date = df["Date"].unique()
minEventOccurences = 20
limit = len(date) * minEventOccurences

# (timeslot, cell) pairs that meet the threshold, then the number of
# qualifying timeslots for each cell.
busy_slots = vehicles_per_slot[vehicles_per_slot["license_plate"] >= limit]
slots_per_cell = busy_slots.groupby("h3index")["Timeslot"].count().reset_index()

# Retain the cells whose qualifying-timeslot count equals the maximum observed.
maxTimeSlots = slots_per_cell["Timeslot"].max()
selected_cells = slots_per_cell.loc[slots_per_cell["Timeslot"] == maxTimeSlots, "h3index"]

df = df[df["h3index"].isin(selected_cells)]

print(df.head(10))

       speed        observationDateTime license_plate    location.coordinates  \
20418   27.0  2022-11-01 09:00:00+05:30    GJ05BX0587  [72.840505, 21.180951]   
20420   33.0  2022-11-01 09:00:01+05:30    GJ05BX2218  [72.802348, 21.181223]   
20422    0.0  2022-11-01 09:00:02+05:30    GJ05BX2389  [72.839214, 21.204881]   
20423   43.0  2022-11-01 09:00:03+05:30    GJ05BX2238  [72.839566, 21.172077]   
20425   24.0  2022-11-01 09:00:05+05:30    GJ05CU1303  [72.865474, 21.211831]   
20426   26.0  2022-11-01 09:00:05+05:30    GJ05BX2048  [72.835912, 21.205766]   
20427    0.0  2022-11-01 09:00:05+05:30    GJ05CU6899   [72.790628, 21.23196]   
20428    0.0  2022-11-01 09:00:05+05:30    GJ05BX1806  [72.840091, 21.204044]   
20429   37.0  2022-11-01 09:00:06+05:30    GJ05CU5972  [72.863654, 21.216128]   
20430   21.0  2022-11-01 09:00:06+05:30    GJ05CU5917  [72.799143, 21.192198]   

               h3index        Date      Time  Timeslot                HAT  
20418  8742d98b6ffffff  2022-11-

In [4]:
# Number of non-null speed readings for every (HAT, date, vehicle) combination.
df_gb_hdl = (
    df.groupby(["HAT", "Date", "license_plate"])["speed"]
    .count()
    .reset_index()
)
print(df_gb_hdl.head(10))

                  HAT        Date license_plate  speed
0  10 8742d98b0ffffff  2022-11-01    GJ05BV4080     31
1  10 8742d98b0ffffff  2022-11-01    GJ05BV4252     33
2  10 8742d98b0ffffff  2022-11-01    GJ05BV4449      1
3  10 8742d98b0ffffff  2022-11-01    GJ05BV4520     14
4  10 8742d98b0ffffff  2022-11-01    GJ05BX0867      6
5  10 8742d98b0ffffff  2022-11-01    GJ05BX1150     18
6  10 8742d98b0ffffff  2022-11-01    GJ05BX1314      1
7  10 8742d98b0ffffff  2022-11-01    GJ05BX1436      6
8  10 8742d98b0ffffff  2022-11-01    GJ05BX1583     23
9  10 8742d98b0ffffff  2022-11-01    GJ05BX1806      8


In [5]:
# Histogram of the per-(HAT, date, vehicle) observation counts.
x = df_gb_hdl["speed"].values

# Requested bin count: half of the value range, rounded down.
value_range = x.max() - x.min()
n_bins = int(value_range // 2)
print(n_bins)

fig = go.Figure(data=[go.Histogram(x=x, nbinsx=n_bins)])
fig.show()

170


In [6]:
# Distinct vehicles per HAT, then pick the first HAT that reaches the maximum.
df_gb_hdl = df.groupby("HAT")["license_plate"].nunique().reset_index()

peak_vehicle_count = df_gb_hdl["license_plate"].max()
print(peak_vehicle_count)

is_peak = df_gb_hdl["license_plate"] == peak_vehicle_count
max_hat = df_gb_hdl.loc[is_peak, "HAT"].iloc[0]
print(max_hat)

135
18 8742d98b4ffffff


In [7]:
hats = df["HAT"].unique()
test_hats = [max_hat]

# Seeded generator: the Gaussian sample below (and therefore the reported
# error percentage and the figure) is identical on every Restart & Run All.
rng = np.random.default_rng(42)

# For each HAT under test, compare the distribution of positive speeds with a
# Gaussian that has the same mean and standard deviation.
for hat in test_hats:
    # Only rows of this HAT with a strictly positive speed are considered.
    df_plot = df[df["HAT"] == hat]
    df_plot = df_plot[df_plot["speed"] > 0]
    speeds = df_plot["speed"]

    # Compute the summary statistics once and reuse them.
    mean_speed = speeds.mean()
    std_speed = speeds.std()
    print(df_plot.shape[0])
    print(mean_speed)
    print(std_speed)

    # Gaussian sample of the same size as the observed data.
    sample = rng.normal(mean_speed, std_speed, len(speeds))

    # Error metric: sum of absolute differences between the sorted observed
    # speeds and the sorted sample, as a percentage of the sum of |sample|.
    abs_diff = np.absolute(np.sort(speeds.to_numpy()) - np.sort(sample)).sum()
    error = (abs_diff / np.absolute(sample).sum()) * 100

    fig = go.Figure()
    fig.add_trace(go.Histogram(x=speeds, name="Speed"))
    fig.add_trace(go.Histogram(x=sample, name="Gaussian"))

    # Overlay both histograms
    fig.update_layout(title="Error: " + str(error) + "%", barmode='overlay')
    # Reduce opacity to see both histograms
    fig.update_traces(opacity=0.6)
    fig.show()



5245
18.569609151572937
10.332769254234044


In [8]:
# Upper bound applied to each vehicle's observation count.
# NOTE(review): the reason for choosing 8 is not visible in this notebook - confirm.
maxEventsPerVehicle = 8

# Non-null speed readings per (HAT, vehicle), restricted to the busiest HAT.
df_gb_max = df.groupby(["HAT", "license_plate"]).agg({"speed": "count"}).reset_index()
# .copy() makes the filtered frame independent, so the column assignment below
# is unambiguous and does not trigger pandas' SettingWithCopyWarning.
df_gb_max = df_gb_max[df_gb_max["HAT"] == max_hat].copy()

# Cap every count at maxEventsPerVehicle (vectorised instead of a row-wise apply).
df_gb_max["speed"] = df_gb_max["speed"].clip(upper=maxEventsPerVehicle)
print(df_gb_max["speed"].sum())


1029
