# MASTER - Notebook 2
### Matteo Grazioso 884055

In [35]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
from datetime import datetime
from pandas import Timestamp
import json
import warnings
warnings.filterwarnings('ignore')

import myfunctions as mf # Custom functions

In [36]:
# Disply all columns and all rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [37]:
# Find all txt files in the data folder
txt_files = mf.find_txt_files("data/processed")

# Choose a dataset from the list of txt files
selected_dataset = mf.choose_dataset(txt_files)

if selected_dataset:
    print(f"You selected the dataset {selected_dataset}")
else:
    print("No dataset selected.")

path  = selected_dataset

The following TXT files were found:
1. data/processed/dataset_cleaned_esportazioneCompleta.txt
2. data/processed/dataset_cleaned_esportazionePasqua23.txt
3. data/processed/dataset_cleaned_validazioni.txt
4. data/processed/old smaller/dataset_cleaned_esportazioneCompleta.txt
5. data/processed/old smaller/dataset_cleaned_validazioni.txt
You selected the dataset data/processed/dataset_cleaned_esportazioneCompleta.txt


In [38]:
# The file contains the data of the validation of tickets in the city of public transport of Venice.
# The file has been created by the Notebook 1.ipynb

df = pd.read_csv(path, header=0, sep='\t')

# Save the name of the file in a variable for future use extracting the name of the file from the path
file_name = path.split('_')[-1].split('.')[0]
subfolder = file_name

# Display the first 5 rows of the dataframe
df.head()

# Convert the column 'DATA' to datetime format
df['DATA'] = pd.to_datetime(df['DATA'], format='%Y-%m-%d')

In [39]:
# Open the dictionary of trajectories
def open_dict_trajectories(is_focus_on_ticket_code: bool = False, ticket_code: str = None) -> dict:
    """
        This function opens the dictionary of trajectories and returns it.
        :param is_focus_on_ticket_code: if True, the dictionary will be filtered by ticket code
        :param ticket_code: the ticket code to filter the dictionary
        :return: the dictionary of trajectories
    """
    if is_focus_on_ticket_code:
        with open('data/dictionaries/trajectories/' + subfolder + '/dict_trajectories_' + file_name + '_tc:' + ticket_code + '.json') as f:
            data = json.load(f)
        return data
    else:
        with open('data/dictionaries/trajectories/' + subfolder + '/dict_trajectories_' + file_name + '.json') as f:
            data = json.load(f)
        return data

# Open the dictionary of trajectories
dict_trajectories = open_dict_trajectories(is_focus_on_ticket_code=True, ticket_code='5')


In [40]:
def get_rows_from_key(df: pd.DataFrame, key: tuple) -> pd.DataFrame:
    """
        This function returns the rows of the dataframe for the specified key.
        Note that the key is in the format (serial, day), where day can be None that means that the seriale doesn't change over the days.
        :param df: the dataframe
        :param dict_trajectories: the dictionary with the trajectories
        :param key: the key
        :return: the row of the dataframe
    """
    # If the day is None, return the dataframe with the trajectories of the user
    if key[1] == None:
        #print('The key is: {}'.format(key))
        return df[df['SERIALE'] == key[0]]
    # Otherwise, return the dataframe with the trajectories of the user in the specified day
    else:
        # Notice that the data in the dataframe is a string while the data in the key is a Timestamp
        #print('The key is: {}'.format(key))
        # Convert the data in the dataframe to a Timestamp
        df['DATA'] = pd.to_datetime(df['DATA'], format='%Y-%m-%d %H:%M:%S')
        return df[(df['SERIALE'] == key[0]) & (df['DATA'].dt.date == key[1])]

In [140]:
# Using the function get_rows_from_key, we can get the rows of the dataframe for the specified key
# use the file stop_all to obtain the coordinates of the stops
# An example of element of the dictionary is:
# [{"5501": ["p.le roma",  "45.438667",  "12.319465"]
# Get the dictionary of stops
with open('MASTER/transformData/stop_all.json', 'r') as f:
    data = json.load(f)
# Convert the dictionary to a dataframe
df_stop_all = pd.DataFrame.from_dict(data, orient='index')

# The dictionary is {stop_id: [stop_name, stop_lat, stop_lon]}
# Add header to the dataframe
df_stop_all.columns = ['stop_name', 'stop_lat', 'stop_lon']
print(df_stop_all.head())

def get_most_similar_string(string: str, strings: list) -> str:
    """
        This function returns the most similar string in the list of strings.
        :param string: the string
        :param strings: the list of strings
        :return: the most similar string
    """
    # Import the library
    from difflib import get_close_matches
    # Get the most similar string
    most_similar_string = get_close_matches(string, strings, n=1, cutoff=0.0)[0]
    return most_similar_string

# Define a function that returns the coordinates of the stop given the stop_name
def get_coordinates(stop_name: str) -> list:
    """
        This function returns the coordinates of the stop given the stop_name.
        :param stop_name: the name of the stop
        :return: the coordinates of the stop
    """
    # Get the row of the dataframe
    row = df_stop_all[df_stop_all['stop_name'] == stop_name]

    if row.empty:
        # raise a warning
        # Find the most similar stop_name in the dataframe
        # Get the list of stop_names
        stop_names = df_stop_all['stop_name'].values
        # Get the most similar stop_name
        most_similar_stop_name = get_most_similar_string(stop_name, stop_names)
        # Get the row of the dataframe
        row = df_stop_all[df_stop_all['stop_name'] == most_similar_stop_name]
        # Print a warning
        print('The stop_name {} is not in the dataframe. The most similar stop_name is {}.'.format(stop_name, most_similar_stop_name))

    # Get the coordinates
    lat = row['stop_lat'].values[0]
    lon = row['stop_lon'].values[0]
    
    return [lat, lon]

# Call the function get_coordinates
get_coordinates('p.le roma')
get_coordinates('p.l')



       stop_name   stop_lat   stop_lon
5501   p.le roma  45.438667  12.319465
15060  f.te nove  45.443016  12.340888
15085  tre archi  45.446289  12.319780
15108   sabbioni  45.447338  12.424333
5001        lido  45.417992  12.368725
The stop_name p.l is not in the dataframe. The most similar stop_name is p.le roma.


['45.438667', '12.319465']

In [137]:
from geopy.geocoders import Nominatim
def get_coordinates_geopy(stop_name: str) -> list:
    """
        This function returns the coordinates of the stop given the stop_name.
        :param stop_name: the name of the stop
        :return: the coordinates of the stop
    """
    geolocator = Nominatim(user_agent="my-app")
    location = geolocator.geocode(stop_name + ', Venezia')
    if location == None:
        print('The stop {} is not in the dataframe.'.format(stop_name))
        # Try to retrieve the coordinates of the stop using the function get_coordinates
        coordinates = get_coordinates(stop_name)
        if coordinates == None:
            return None
        else:
            return coordinates
    return [location.latitude, location.longitude]

In [138]:
# Insert in the map all the stops of the stop_all
import folium

# Create the map of Venice
venice_latitude = 45.4371908
venice_longitude = 12.3345898
m = folium.Map(location=[venice_latitude, venice_longitude], zoom_start=13)

# Add the stops to the map
# Piazza Roma
# Stazione ferroviaria Santa Lucia
# Ca' Foscari
# Use the function get_coordinates_geopy
stop_names = ['p.le roma', 'stazione venezia s.lucia', 'santa marta']
for stop_name in stop_names:
    coordinates = get_coordinates_geopy(stop_name)
    if coordinates != None:
        folium.Marker(coordinates, popup=stop_name).add_to(m)
        
# Add the lines to the map
# Piazza Roma - Stazione ferroviaria Santa Lucia
# Stazione ferroviaria Santa Lucia - Ca' Foscari
# Ca' Foscari - Piazza Roma
# Use the function get_coordinates_geopy
stop_names = ['p.le roma', 'stazione venezia s.lucia', 'santa marta']
for i in range(len(stop_names)):
    if i == len(stop_names) - 1:
        coordinates1 = get_coordinates_geopy(stop_names[i])
        coordinates2 = get_coordinates_geopy(stop_names[0])
    else:
        coordinates1 = get_coordinates_geopy(stop_names[i])
        coordinates2 = get_coordinates_geopy(stop_names[i+1])
    if coordinates1 != None and coordinates2 != None:
        folium.PolyLine([coordinates1, coordinates2], color="red", weight=2.5, opacity=1).add_to(m)

m


In [95]:
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="my-app")
location = geolocator.geocode("p.le roma")
print(location.longitude, location.latitude)


12.318850957988698 45.438007


In [111]:
def get_trajectories_from_key(df: pd.DataFrame, key: tuple) -> pd.DataFrame:
    """
        This function returns the rows of the dataframe for the specified key.
        Note that the key is in the format (serial, day), where day can be None that means that the seriale doesn't change over the days.
        :param df: the dataframe
        :param dict_trajectories: the dictionary with the trajectories
        :param key: the key
        :return: the row of the dataframe
    """
    # If the day is None, return the dataframe with the trajectories of the user
    if key[1] == None:
        #print('The key is: {}'.format(key))
        return df[df['SERIALE'] == key[0]]
    # Otherwise, return the dataframe with the trajectories of the user in the specified day
    else:
        # Notice that the data in the dataframe is a string while the data in the key is a Timestamp
        #print('The key is: {}'.format(key))
        # Convert the data in the dataframe to a Timestamp
        df['DATA'] = pd.to_datetime(df['DATA'], format='%Y-%m-%d %H:%M:%S')
        return df[(df['SERIALE'] == key[0]) & (df['DATA'].dt.date == key[1])]

In [141]:
# Given a user, represent the trajectories of the user in the map
# key: 40834232622848516, None

# Get the rows of the dataframe for the specified key
df_key = get_trajectories_from_key(df, key=(40834232622848516, None))
print(df_key.head())

# Get the coordinates of the stops
stop_names = df_key['DESCRIZIONE'].unique()
# To each stop name, concatenate the string ', Venezia' to get the coordinates
stop_names = [stop_name + ', Venezia' for stop_name in stop_names]
print(stop_names)


# Map the stop names to the coordinates
stop_names_coordinates = {}
for stop_name in stop_names:
    stop_names_coordinates[stop_name] = get_coordinates_geopy(stop_name)
print(stop_names_coordinates)



# Create the map of Venice
venice_latitude = 45.4371908
venice_longitude = 12.3345898
m = folium.Map(location=[venice_latitude, venice_longitude], zoom_start=13)

# Add the stops to the map
stop_names_coordinates_copy = stop_names_coordinates.copy()
for stop_name, coordinates in stop_names_coordinates.items():
    if coordinates != None:
        folium.Marker(coordinates, popup=stop_name).add_to(m)
    else:
        print('The stop {} is not in the dataframe.'.format(stop_name))
        # remove the stop from the dictionary
        stop_names_coordinates_copy.pop(stop_name)

# Add the lines to the map
folium.PolyLine(list(stop_names_coordinates_copy.values()), color="red", weight=2.5, opacity=1).add_to(m)

m

            DATA       ORA     DATA_VALIDAZIONE            SERIALE  FERMATA  \
3514  2023-01-13  07:23:00  2023-01-13 07:23:00  40834232622848516     5132   
15650 2023-01-13  09:59:00  2023-01-13 09:59:00  40834232622848516     5013   
19171 2023-01-13  10:37:00  2023-01-13 10:37:00  40834232622848516     5049   
24250 2023-01-13  11:35:00  2023-01-13 11:35:00  40834232622848516     5022   
55948 2023-01-13  18:02:00  2023-01-13 18:02:00  40834232622848516     5132   

        DESCRIZIONE  TITOLO TICKET_CODE        DESCRIZIONE_TITOLO  
3514   S. MARCUOLA-   11261           1  DAILYP-TPL19,90-C.VE5,10  
15650  S. MARCO-SAN   11261           1  DAILYP-TPL19,90-C.VE5,10  
19171       ZATTERE   11261           1  DAILYP-TPL19,90-C.VE5,10  
24250  GIUDECCA PAL   11261           1  DAILYP-TPL19,90-C.VE5,10  
55948  S. MARCUOLA-   11261           1  DAILYP-TPL19,90-C.VE5,10  
['S. MARCUOLA-, Venezia', 'S. MARCO-SAN, Venezia', 'ZATTERE, Venezia', 'GIUDECCA PAL, Venezia', 'SALUTE, Venezia']
Th