<a href="https://colab.research.google.com/github/micazev/analiseDeDados_mexicoCityGTFS/blob/main/analiseDeDados_mexicoCityGTFS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Static GTFS (General Transit Feed Specification) feed for Mexico City
# https://datos.cdmx.gob.mx/dataset/gtfs

In [None]:
# import the folium, pandas libraries
import folium # folium documentation https://python-visualization.github.io/folium/
from folium.plugins import HeatMapWithTime # time-animated heat-map layer used by the heat-map cells
import pandas as pd # pandas documentation https://pandas.pydata.org/docs/

In [None]:
# Base map for the whole notebook: later cells add their layers to this same `m`.
map_center = [19.40, -99]
m = folium.Map(location=map_center, zoom_start=12)

# Write an HTML copy of the (still empty) base map next to the notebook.
m.save('my_map.html')

# Last expression of the cell, so the map is rendered inline.
m

In [None]:
# Load the five GTFS tables used in this notebook. The files are comma-separated,
# which is the default delimiter of pd.read_csv.
routes = pd.read_csv("routes.txt")
shapes = pd.read_csv("shapes.txt")
stops = pd.read_csv("stops.txt")
stop_times = pd.read_csv("stop_times.txt")
trips = pd.read_csv("trips.txt")

# Preview the first rows of the shape points table.
shapes.head()

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
0,SH01003A000_0,19.515728,-99.188722,1,0.0
1,SH01003A000_0,19.515742,-99.188628,2,0.01
2,SH01003A000_0,19.515815,-99.188589,3,0.02
3,SH01003A000_0,19.517328,-99.188834,4,0.19
4,SH01003A000_0,19.520009,-99.189307,5,0.49


Cheat sheet ("colinha") of the columns available in each table:
*   **routes:** route_id,agency_id,route_short_name,route_long_name,route_type,route_color,route_text_color;
*   **shapes:** shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled;
*   **stops:** stop_id,stop_name,stop_lat,stop_lon,zone_id,wheelchair_boarding;
*   **stop_times:** trip_id,arrival_time,departure_time,stop_id,stop_sequence,timepoint;

*   **trips:** route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,shape_id.





# Show the Stops

In [None]:
# Drop one blue pin on the map `m` for every row of `stops`.
for stop in stops.itertuples(index=False):
    pin = folium.Marker(
        location=[stop.stop_lat, stop.stop_lon],
        popup=f"Stop ID: {stop.stop_id}<br>Stop Name: {stop.stop_name}",
        tooltip=stop.stop_name,  # shown when hovering over the pin
        icon=folium.Icon(color='blue'),
    )
    pin.add_to(m)

# This cell does not render the map; evaluate `m` in a cell to display it.

In [None]:
# a bit messy... maybe we can color code it by zone

# Step 3 (Updated): derive a colour for each zone from a hash of its id.

def get_color(zone_id):
    """Return a deterministic ``#RRGGBB`` colour string for ``zone_id``.

    The id is converted with ``str()``, encoded, and hashed with MD5; the first
    six hex digits of the digest become the colour. The same id always maps to
    the same colour. Different ids usually get different colours, but a clash
    is possible because only 24 bits of the digest are kept.
    """
    import hashlib  # imported locally, so the function is self-contained

    digest = hashlib.md5(str(zone_id).encode()).hexdigest()
    return "#" + digest[:6]

# Step 4: Add one zone-coloured marker per stop.
# folium.Icon(color=...) only accepts folium's fixed list of colour names, so the
# '#RRGGBB' codes returned by get_color() trigger a warning there and are not
# applied. CircleMarker hands `color` / `fill_color` to Leaflet as path options,
# which accept any CSS colour, so the hash-based colours actually show up.
for stop in stops.itertuples(index=False):
    zone_color = get_color(stop.zone_id)

    folium.CircleMarker(
        location=[stop.stop_lat, stop.stop_lon],
        radius=5,
        popup=f"Stop ID: {stop.stop_id}<br>Stop Name: {stop.stop_name}",
        tooltip=stop.stop_name,  # shown when hovering over the marker
        color=zone_color,        # outline colour
        fill=True,
        fill_color=zone_color,
        fill_opacity=0.8,
    ).add_to(m)

# Note: markers that earlier cells already added to `m` stay on the map.
# This cell does not render the map; evaluate `m` in a cell to display it.


  icon=folium.Icon(color=get_color(zone_id))  # Use the get_color function to get the marker color


# Show the bus lines

In [None]:
# Step 1: Attach the shape id(s) of every route.
# `trips` can list the same (route_id, shape_id) pair once per trip, which would
# repeat every shape point once per trip after the joins below. Keep each pair once.
# Left join: routes without any trip are kept, with a missing shape_id.
route_shape_pairs = trips[['shape_id', 'route_id']].drop_duplicates()
df = routes.merge(route_shape_pairs, on='route_id', how='left')

# Step 2: Attach the points of each shape.
# Points are sorted by shape_pt_sequence first and the sequence column is kept,
# so the drawing order of every path stays available to later cells.
# Left join: rows whose shape_id has no points are kept with missing coordinates.
ordered_shape_points = shapes.sort_values(['shape_id', 'shape_pt_sequence'])[
    ['shape_id', 'shape_pt_lat', 'shape_pt_lon', 'shape_pt_sequence']
]
routes_with_coordinates = df.merge(ordered_shape_points, on='shape_id', how='left')

# One row per (route, shape, point): the route attributes plus the point's
# latitude, longitude and position along the shape.
routes_with_coordinates

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_type,route_color,route_text_color,shape_id,shape_pt_lat,shape_pt_lon
0,CMX05SE12,RTP,SE12,Constitución de 1917 a Tláhuac,3,76B729,FFFFFF,SH05SE12000_1,19.344980,-99.064620
1,CMX05SE12,RTP,SE12,Constitución de 1917 a Tláhuac,3,76B729,FFFFFF,SH05SE12000_1,19.344960,-99.064560
2,CMX05SE12,RTP,SE12,Constitución de 1917 a Tláhuac,3,76B729,FFFFFF,SH05SE12000_1,19.344900,-99.064450
3,CMX05SE12,RTP,SE12,Constitución de 1917 a Tláhuac,3,76B729,FFFFFF,SH05SE12000_1,19.344890,-99.064430
4,CMX05SE12,RTP,SE12,Constitución de 1917 a Tláhuac,3,76B729,FFFFFF,SH05SE12000_1,19.344870,-99.064410
...,...,...,...,...,...,...,...,...,...,...
229155,CMX020L12,METRO,L12,Tláhuac-Mixcoac,1,B99E51,FFFFFF,SH020L12000_0,19.288206,-99.014958
229156,CMX020L12,METRO,L12,Tláhuac-Mixcoac,1,B99E51,FFFFFF,SH020L12000_0,19.288059,-99.014878
229157,CMX020L12,METRO,L12,Tláhuac-Mixcoac,1,B99E51,FFFFFF,SH020L12000_0,19.287826,-99.014780
229158,CMX020L12,METRO,L12,Tláhuac-Mixcoac,1,B99E51,FFFFFF,SH020L12000_0,19.287010,-99.014518


In [None]:
# Step 3: Draw one coloured PolyLine per (route, shape) pair.
# A PolyLine needs the full, ordered list of points of a path. Building one
# PolyLine per single point draws no visible line and adds one map layer per
# point, so the points are grouped per shape instead.

# Ordered [lat, lon] path of every shape, taken from `shapes`.
shape_paths = {
    shape_id: points[['shape_pt_lat', 'shape_pt_lon']].values.tolist()
    for shape_id, points in (
        shapes
        .dropna(subset=['shape_pt_lat', 'shape_pt_lon'])
        .sort_values(['shape_id', 'shape_pt_sequence'])
        .groupby('shape_id')
    )
}

# One row per (route, shape) pair, carrying the route attributes used for styling.
route_shapes = (
    routes_with_coordinates
    .dropna(subset=['shape_id'])
    .drop_duplicates(subset=['route_id', 'shape_id'])
)

for route in route_shapes.itertuples(index=False):
    path = shape_paths.get(route.shape_id)
    if not path or len(path) < 2:
        continue  # a line needs at least two points

    # route_color holds hex digits without '#'. Fall back to Leaflet's default
    # blue when it is missing. NOTE(review): confirm the preferred fallback colour.
    line_color = f'#{route.route_color}' if pd.notna(route.route_color) else '#3388ff'

    folium.PolyLine(
        locations=path,
        color=line_color,
        weight=3,
        opacity=0.7,
        popup=f"{route.route_short_name} - {route.route_long_name}",
        line_cap='round',
        line_join='round',
    ).add_to(m)

# Step 4: Save the map as an HTML file to visualize the bus lines
m.save('bus_lines_map.html')

# and show it
m

# Something else: heat map of stop activity over time

In [None]:
# Step 4: Attach the stop attributes (including coordinates) to every stop_times row.
# Inner join on stop_id: rows without a match in the other table are dropped.
table_stoptimes_stop = pd.merge(stop_times, stops, on='stop_id', how='inner')

# Preview the first rows of the joined table.
table_stoptimes_stop.head()

In [None]:
# Step 5: Keep only the columns used by the following cells:
# arrival time, trip id and the stop coordinates.
df = table_stoptimes_stop.loc[:, ['arrival_time', 'trip_id', 'stop_lat', 'stop_lon']]

In [None]:
# Step 6: Remove exact duplicate rows.
df = df.drop_duplicates()

# Step 7: Convert 'arrival_time' to datetime.
# GTFS allows times of 24:00:00 and later for trips running past midnight, and
# parsing with format='%H:%M:%S' raises on those. Parse the text as a duration,
# wrap it onto a single day, and anchor it on 1900-01-01 (the date a time-only
# format yields), so in-range times get the same values as before.
# The dtype check keeps the cell re-runnable once the column is converted.
if not pd.api.types.is_datetime64_any_dtype(df['arrival_time']):
    time_of_day = pd.to_timedelta(df['arrival_time']) % pd.Timedelta(days=1)
    df['arrival_time'] = pd.Timestamp('1900-01-01') + time_of_day

# Step 8: Hour bucket 1..24 (clock hour + 1), matching range(1, 25) below.
df['hour'] = df['arrival_time'].dt.hour + 1

# Step 9: Latest arrival time of each trip within each hour bucket.
df2 = df.groupby(['hour', 'trip_id'])['arrival_time'].max().reset_index()

# Step 10: Join back to `df` to recover the coordinates of those latest arrivals.
df3 = pd.merge(df2, df, on=['hour', 'trip_id', 'arrival_time'])

# Step 11: One list of [lat, lon] pairs per hour bucket, in bucket order 1..24,
# which is the nested-list layout passed to HeatMapWithTime later.
lat_long_list = [
    df3.loc[df3['hour'] == hour, ['stop_lat', 'stop_lon']].values.tolist()
    for hour in range(1, 25)
]

In [None]:
# Step 11: Build the nested list for HeatMapWithTime:
# one entry per hour bucket (1..24), each a list of [lat, lon] pairs.
lat_long_list = []

for hour in range(1, 25):
    # Rows of `df3` that fall into the current hour bucket.
    in_this_hour = df3['hour'] == hour
    hour_points = df3.loc[in_this_hour, ['stop_lat', 'stop_lon']].values.tolist()
    lat_long_list.append(hour_points)

In [None]:
# Step 12: Add an animated heat map (one frame per hour bucket) to the map.
# HeatMapWithTime comes from folium.plugins and is imported in the import cell
# at the top of the notebook.
# `index` labels each frame with the clock hour it covers. The hour column is
# clock hour + 1, so bucket h holds the arrivals between (h - 1):00 and (h - 1):59.
hour_labels = [f"{hour - 1:02d}:00-{hour - 1:02d}:59" for hour in range(1, 25)]

HeatMapWithTime(
    lat_long_list,
    index=hour_labels,
    radius=5,
    auto_play=True,
    position='bottomright',
).add_to(m)

# Display the map with the time-animated heat map on top of the existing layers.
m

In [None]:
# Join stop_times with stops on stop_id so every stop time carries the
# coordinates of its stop (inner join: unmatched rows are dropped).
full_table = pd.merge(stop_times, stops, on='stop_id', how='inner')
full_table.head()

In [None]:
# Keep the arrival time, the trip id and the stop coordinates only.
df = full_table.loc[:, ['arrival_time', 'trip_id', 'stop_lat', 'stop_lon']]
df.head(5)

In [None]:
# Count duplicated rows before removing them. The counts are kept in a variable
# and placed last in the cell, because only the last expression of a cell is
# displayed; a bare expression on the first line is computed and thrown away.
duplicate_counts = df.duplicated().value_counts()
df = df.drop_duplicates()

# True = rows that were exact duplicates of an earlier row (now removed).
duplicate_counts


In [None]:
# Number of missing values in each column of `df`.
df.isna().sum()


In [None]:
# Convert 'arrival_time' to datetime.
# The column holds time-of-day text (the earlier conversion cell parses it with
# '%H:%M:%S'), so a '%Y-%m-%d %H:%M:%S' format cannot match it. GTFS also allows
# times of 24:00:00 and later for trips running past midnight. Parse the text as
# a duration, wrap it onto a single day, and anchor it on 1900-01-01 (the date a
# time-only format yields).
# The dtype check keeps the cell re-runnable once the column is converted.
if not pd.api.types.is_datetime64_any_dtype(df['arrival_time']):
    time_of_day = pd.to_timedelta(df['arrival_time']) % pd.Timedelta(days=1)
    df['arrival_time'] = pd.Timestamp('1900-01-01') + time_of_day

# Hour bucket 1..24 (clock hour + 1).
df['hour'] = df['arrival_time'].dt.hour + 1
df.head()

In [None]:
# Latest arrival time of every trip within every hour bucket, as a flat table
# with the columns hour, trip_id and arrival_time.
latest_arrival = df.groupby(['hour', 'trip_id'])['arrival_time'].max()
df2 = latest_arrival.reset_index()
df2.head()

In [None]:
# Join the per-hour latest arrivals back to `df` (inner join on all three keys)
# to recover the stop coordinates of those arrivals.
df3 = df2.merge(df, on=['hour', 'trip_id', 'arrival_time'])
df3.head()

In [None]:
# One list of [lat, lon] pairs per hour bucket, in bucket order 1..24.
lat_long_list = [
    df3.loc[df3['hour'] == hour, ['stop_lat', 'stop_lon']].values.tolist()
    for hour in range(1, 25)
]

In [None]:

# Add the time-animated heat map to the map `m` and display it.
# HeatMapWithTime comes from folium.plugins and is imported in the import cell
# at the top of the notebook.
# `index` labels each frame with the clock hour it covers. The hour column is
# clock hour + 1, so bucket h holds the arrivals between (h - 1):00 and (h - 1):59.
# NOTE(review): if the earlier heat-map cell has already run, `m` ends up with
# two heat-map layers. Confirm which of the two pipelines should be kept.
hour_labels = [f"{hour - 1:02d}:00-{hour - 1:02d}:59" for hour in range(1, 25)]

HeatMapWithTime(
    lat_long_list,
    index=hour_labels,
    radius=5,
    auto_play=True,
    position='bottomright',
).add_to(m)
m

# New Section