# Exploratory Data Analysis

This notebook aims to provide a better understanding of the data.<br>
Project goal: establishing an overview of the STIB quality of service (QoS).

## Files available

3 weeks of data have been collected, containing:
- The location of all the vehicles every 30 seconds
- ESRI shapefiles describing lines and stops
- GTFS files, i.e. the timetables -> 2 snapshots: 3 and 23 September


## Requirements

In [None]:
%pip install gtfs_functions partridge numpy pandas shapely gdal fiona pyproj six rtree geopandas branca defusedxml Pillow jupyter-packaging packaging keplergl

In [None]:
import gtfs_functions as gtfs

## GTFS

### Library 1 : gtfs_functions

In [None]:
# Load the GTFS archive with gtfs_functions; the result is unpacked into five tables.
# Presumably this is the 3 September snapshot (going by the file name).
routes, stops, stop_times, trips, shapes = gtfs.import_gtfs("data/timetables/gtfs3Sept.zip")

In [None]:
# First two rows of the routes table.
routes.head(2)

In [None]:
# Show the stops table returned by gtfs.import_gtfs.
stops

In [None]:
# Rows whose location_type is 0 (in the GTFS reference, 0 denotes a stop or platform).
stops.loc[stops["location_type"].eq(0)]

In [None]:
# First two rows of the stop_times table.
stop_times.head(2)

In [None]:
# Boundaries of the time windows used to bucket trips
# (presumably hours of the day, e.g. 15.5 for 15:30 — TODO confirm).
cutoffs = [0, 6, 9, 15.5, 19, 22, 24]
stop_freq = gtfs.stops_freq(stop_times, stops, cutoffs=cutoffs)

# Boolean masks: inbound direction, and the 6:00-9:00 window.
condition_dir = stop_freq["dir_id"] == 'Inbound'
condition_window = stop_freq["window"] == '6:00-9:00'

# Keep the rows matching both masks; reset_index() moves the old index into a column.
gdf = stop_freq[condition_dir & condition_window].reset_index()

# Map the filtered stops on the `ntrips` variable, with the frequency in the tooltip.
gtfs.map_gdf(
    gdf=gdf,
    variable='ntrips',
    colors=["#d13870", "#e895b3", '#55d992', '#3ab071', '#0e8955', '#066a40'],
    tooltip_var=['frequency'],
    tooltip_labels=['Frequency: '],
    breaks=[10, 20, 30, 40, 120, 200],
)

In [None]:
# Boundaries of the time windows used to bucket trips (same values as the stop-level cell).
cutoffs = [0,6,9,15.5,19,22,24]
line_freq = gtfs.lines_freq(stop_times, trips, shapes, routes, cutoffs = cutoffs)

# Map the line-level frame computed just above. The call previously received `gdf`,
# which still held the stop-level frame of the preceding cell, so `line_freq` was
# never used and the 'route_name' tooltip pointed at the wrong table.
# To restrict the map to one direction / time window, filter `line_freq` on its
# `dir_id` and `window` columns first, as done for the stop frequencies.
gtfs.map_gdf(gdf = line_freq, 
              variable = 'ntrips', 
              colors = ["#d13870", "#e895b3" ,'#55d992', '#3ab071', '#0e8955','#066a40'], 
              tooltip_var = ['route_name'] , 
              tooltip_labels = ['Route: '], 
              breaks = [5, 10, 20, 50])

In [None]:
seg_freq

In [None]:
import keplergl as kp

segments_gdf = gtfs.cut_gtfs(stop_times, stops, shapes)
segments_gdf.head(2)
cutoffs = [0,6,9,15.5,19,22,24]
seg_freq = gtfs.segments_freq(segments_gdf, stop_times, routes, cutoffs = cutoffs)
seg_freq.head(2)
#m = kp.KeplerGl(data=dict(data=seg_freq, name='Segment frequency'), height=400)
#m

gtfs.map_gdf(gdf = seg_freq, 
              variable = 'ntrips', 
              colors = ["#d13870", "#e895b3" ,'#55d992', '#3ab071', '#0e8955','#066a40'], 
              tooltip_var = ['route_name'] , 
              tooltip_labels = ['Route: '], 
              breaks = [5, 10, 20, 50])

### Library 2 : gtfs-kit

In [None]:
%pip install gtfs-kit

In [None]:
import gtfs_kit as gk

In [None]:
# Read the same GTFS archive with gtfs-kit, declaring kilometres as the distance unit.
feed = gk.read_feed("data/timetables/gtfs3Sept.zip", dist_units='km')

In [None]:
# The value returned by append_dist_to_stop_times() replaces `feed`
# (presumably a feed whose stop_times carry a distance column — TODO confirm).
feed = feed.append_dist_to_stop_times()
# Transposed head: each column of the first rows is shown as a row.
feed.stop_times.head().T

In [None]:
# Map routes together with their stops. `.iloc[:]` selects every route_id;
# narrow the slice (e.g. `.iloc[:5]`) to draw fewer routes.
feed.map_routes(feed.routes.route_id.iloc[:], include_stops=True)

### Pandas

In [None]:
import pandas as pd

In [None]:
# Read routes.txt directly with pandas.
# NOTE(review): this rebinds `routes`, which earlier held a table from gtfs.import_gtfs.
routes = pd.read_csv('data/timetables/gtfs3Sept/routes.txt', sep = ",")

In [None]:
# Read stops.txt directly with pandas.
# NOTE(review): this rebinds `stops`, which earlier held a table from gtfs.import_gtfs.
stops = pd.read_csv('data/timetables/gtfs3Sept/stops.txt', sep = ",")

In [None]:
# Show the stops table read from stops.txt.
stops

In [None]:
# Every stops row named "GARE CENTRALE".
stops.loc[stops["stop_name"].eq("GARE CENTRALE")]

In [None]:
# Show the routes table read from routes.txt.
routes

In [None]:
# Read trips.txt directly with pandas.
# NOTE(review): this rebinds `trips`, which earlier held a table from gtfs.import_gtfs.
trips = pd.read_csv('data/timetables/gtfs3Sept/trips.txt', sep = ",")

In [None]:
# Show the trips table read from trips.txt.
trips

In [None]:
# Trips whose block_id is 8902800.
trips.loc[trips["block_id"] == 8902800]

In [None]:
# Shape of the array of distinct trip_id values, i.e. (number of distinct trips,).
trips["trip_id"].unique().shape

In [None]:
# NOTE(review): same read as the earlier trips cell; it reloads trips.txt into `trips`.
trips = pd.read_csv('data/timetables/gtfs3Sept/trips.txt', sep = ",")

In [None]:
# Read shapes.txt directly with pandas.
# NOTE(review): this rebinds `shapes`, which earlier held a table from gtfs.import_gtfs.
shapes = pd.read_csv('data/timetables/gtfs3Sept/shapes.txt', sep = ",")

In [None]:
# Show the shapes table read from shapes.txt.
shapes

In [None]:
# Read calendar.txt from the extracted GTFS snapshot.
calendar = pd.read_csv('data/timetables/gtfs3Sept/calendar.txt', sep = ",")

In [None]:
# Show the calendar table.
calendar

In [None]:
# Calendar rows for service 237580031.
calendar.loc[calendar["service_id"] == 237580031]

In [None]:
# Shape of the array of distinct service_id values found in calendar.
calendar["service_id"].unique().shape

In [None]:
# Read calendar_dates.txt from the extracted GTFS snapshot.
calendar_dates = pd.read_csv('data/timetables/gtfs3Sept/calendar_dates.txt', sep = ",")

In [None]:
# Show the calendar_dates table.
calendar_dates

In [None]:
# Non-null entries per service_id, ordered by the count in the `date` column, largest first.
(
    calendar_dates
    .groupby("service_id")
    .count()
    .sort_values(by="date", ascending=False)
)

In [None]:
# calendar_dates rows for service 237580031.
calendar_dates.loc[calendar_dates["service_id"] == 237580031]

In [None]:
# Trips that run under service 237580031.
trips.loc[trips["service_id"] == 237580031]

In [None]:
# Shape of the array of distinct service_id values found in calendar_dates.
calendar_dates["service_id"].unique().shape

In [None]:
# Left-join calendar_dates onto calendar by service_id: every calendar row is kept,
# repeated once per matching calendar_dates row.
calendar.merge(calendar_dates, how="left", on="service_id")

In [None]:
# Read stop_times.txt directly with pandas.
# NOTE(review): this rebinds `stop_times`, which earlier held a table from gtfs.import_gtfs.
stop_times = pd.read_csv('data/timetables/gtfs3Sept/stop_times.txt', sep = ",")

In [None]:
# All stop_times rows belonging to trip 112377729235536070.
stop_times.loc[stop_times["trip_id"] == 112377729235536070]

## Esri - ShapeFiles

### Library 1 : Geopandas

https://geopandas.org/en/stable/docs/user_guide.html <br>
https://automating-gis-processes.github.io/CSC/notebooks/L2/geopandas-basics.html

In [None]:
%pip install folium mapclassify

In [None]:
import geopandas as gpd

# Load the line and stop layers of the network shapefiles as GeoDataFrames.
lines = gpd.read_file("data/map/2109_STIB_MIVB_Network/ACTU_LINES.shp")
stops_geom = gpd.read_file("data/map/2109_STIB_MIVB_Network/ACTU_STOPS.shp")


In [None]:
# Show the stops layer read from ACTU_STOPS.shp.
stops_geom

In [None]:
# Show the GTFS routes table again, next to the shapefile layers.
routes

In [None]:
# Non-null entries per route_id; a count above 1 means the route_id is repeated.
routes.groupby(by="route_id").count()

In [None]:
# GTFS routes rows whose route_id is "001m".
routes.loc[routes["route_id"] == "001m"]

In [None]:
# Show the lines layer read from ACTU_LINES.shp.
lines

In [None]:
# Shapefile line rows whose LIGNE value is "001m".
lines.loc[lines["LIGNE"] == "001m"]

In [None]:
# Non-null entries per LIGNE value in the shapefile lines layer.
lines.groupby(by="LIGNE").count()

In [None]:
# Show the GTFS stops table again, for comparison with the shapefile stops.
stops

In [None]:
# Shapefile stop rows whose stop_id is "0089".
stops_geom.loc[stops_geom["stop_id"] == "0089"]

In [None]:
# Rows per stop_id in the shapefile stops, ordered by the `coord_x` count, largest first;
# a count above 1 means the stop_id appears several times.
(
    stops_geom
    .groupby("stop_id")
    .count()
    .sort_values(by="coord_x", ascending=False)
)

In [None]:
# Shapefile stop rows whose stop_id is "2595".
stops_geom.loc[stops_geom["stop_id"] == "2595"]

In [None]:
# Rows per stop_id in the GTFS stops table, ordered by the `stop_lat` count, largest first.
(
    stops
    .groupby("stop_id")
    .count()
    .sort_values(by="stop_lat", ascending=False)
)

In [None]:
# GTFS stops with a location_type above 0 (in the GTFS reference, 0 is a plain
# stop/platform, so these are stations and other location kinds).
stops.loc[stops["location_type"] > 0]

In [None]:
# GTFS stops rows whose stop_id is "62".
stops.loc[stops["stop_id"] == "62"]

In [None]:
# GTFS stops rows named "BOURSE".
stops.loc[stops["stop_name"] == "BOURSE"]

In [None]:
# Shapefile stop rows whose stop_id is "62".
stops_geom.loc[stops_geom["stop_id"] == "62"]

In [None]:
# Left-join the GTFS stops table onto the shapefile stops by stop_id:
# every shapefile row is kept, with GTFS attributes filled in where the id matches.
stops_bis = stops_geom.merge(stops, how="left", on="stop_id")

In [None]:
# Show the merged shapefile + GTFS stops table.
stops_bis

In [None]:
# GTFS stops with a location_type above 0 (same selection as the earlier cell).
stops.loc[stops["location_type"] > 0]

In [None]:
# Merged rows whose stop_id is "0520161".
stops_bis.loc[stops_bis["stop_id"] == "0520161"]

In [None]:
# Interactive map of the merged stops, styled by the location_type column.
stops_bis.explore(column="location_type")

In [None]:
%matplotlib inline

# Static overview: network lines in grey, stops drawn on top in red.
base = lines.plot(color='grey', figsize=(15, 15))
# Plot `stops_geom`, not `stops`: at this point `stops` is the plain pandas table read
# from stops.txt, so DataFrame.plot would draw its numeric columns as line charts on
# the map axes. `stops_geom` holds the stop geometries and comes from the same
# shapefile set as `lines`.
stops_geom.plot(ax=base, marker='o', color='red', markersize=8)
# base.set_axis_off()  # uncomment to hide the axes

In [None]:
# Interactive map: lines styled by their LIGNE value using the "Set1" colormap...
m = lines.explore(column="LIGNE", cmap="Set1")
# ...with the merged stops added to the same map in red.
stops_bis.explore(m=m, color="red")

## Vehicle positions

In [None]:
from pprint import pprint
import pandas as pd

In [None]:
import json

# Load one raw vehicle-position dump. The path is relative to the project root, like
# every other data path in this notebook, instead of an absolute path that only exists
# on one machine. JSON is read explicitly as UTF-8 so the result does not depend on
# the platform's default encoding.
with open('data/vehicule_positions/vehiclePosition01.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Pretty-print the whole parsed file. NOTE(review): this prints everything that was loaded.
pprint(data)

In [None]:
# First element of the list stored under the top-level "data" key.
pprint(data["data"][0])

In [None]:
# Number of keys in that first element.
len(data["data"][0])

In [None]:
# The "Responses" entry of the first element.
pprint(data["data"][0]["Responses"])

In [None]:
# Number of items under "Responses" in the first element.
len(data["data"][0]["Responses"])

In [None]:
# First item of "Responses" in the first element.
pprint(data["data"][0]["Responses"][0])

In [None]:
# Second element of the top-level "data" list, shown with the default repr.
data["data"][1]

In [None]:
# Let pandas parse the same file into a DataFrame. The path is relative to the project
# root, like the other data paths in this notebook, rather than an absolute path that
# only exists on one machine.
pd.read_json('data/vehicule_positions/vehiclePosition01.json')

In [None]:
for 