# MASTER - Notebook 2
### Matteo Grazioso 884055

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import json
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Display all columns and all rows: None removes the limit pandas applies when it renders a dataframe
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# The file contains the ticket validations of the public transport of the city of Venice.
# The file has been created by Notebook 1.ipynb

# Load the tab-separated txt file into a dataframe (the first line of the file is the header)
path = 'data/processed/dataset_cleaned_validazioni.txt'
# path = 'data/processed/dataset_cleaned_esportazioneCompleta.txt'

df = pd.read_csv(path, header=0, sep='\t')

# Take the first 5 rows of the dataframe
# NOTE(review): this is not the last expression of the cell, so a notebook will not render it
df.head()

# Parse the 'DATA' column (strings formatted as '%Y-%m-%d') into pandas datetimes
df['DATA'] = pd.to_datetime(df['DATA'], format='%Y-%m-%d')


---------


## Focus on specific types of tickets

In [None]:
from datetime import datetime

def to_datetime(date: "np.datetime64 | str") -> datetime:
    """
      Converts a date to a python datetime object
      Input:
        date - a numpy datetime64 object (a pandas Timestamp or a python datetime are accepted too),
               or a string formatted as '%Y-%m-%d'
      Output:
        DATE - a python datetime object
      Raises:
        ValueError - if date is a string that does not match the format '%Y-%m-%d'

      Credit: Brian Blaylock on GitHub Gist https://gist.github.com/blaylockbk/1677b446bc741ee2db3e943ab7e4cabd
    """
    # Strings keep the strict '%Y-%m-%d' parsing
    if isinstance(date, str):
        return datetime.strptime(date, '%Y-%m-%d')
    # strptime only accepts strings, so datetime-like objects go through pandas:
    # Timestamp understands numpy datetime64 values and to_pydatetime returns a plain python datetime
    return pd.Timestamp(date).to_pydatetime()

In [None]:
def get_ticket_code_description(ticket_code: str) -> str:
    """
      Look up the description of a ticket code in the JSON dictionary of the ticket codes
      Input:
        ticket_code - a string that is a key of the dictionary stored in dict_ticket_codes.json
      Output:
        description - the value associated with that key in the dictionary
      Raises:
        KeyError - if the ticket code is not a key of the dictionary
    """
    dictionary_path = 'data/dictionaries/dict_ticket_codes.json'
    # The dictionary is read from disk at every call
    with open(dictionary_path) as json_file:
        return json.load(json_file)[ticket_code]

In [None]:
# Focus on TICKET_CODE and TICKET_TYPE
def focus_on_ticket_code(df_tc: pd.DataFrame, ticket_code: str) -> pd.DataFrame:
    """
        This function keeps only the validations made with the specified ticket code.
        :param df_tc: the dataframe; it must have the column 'TICKET_CODE'
        :param ticket_code: the ticket code to keep
        :return: the rows of df_tc whose 'TICKET_CODE' is equal to ticket_code
    """
    # Boolean mask, True on the rows of the requested ticket code
    has_requested_code = df_tc['TICKET_CODE'] == ticket_code
    return df_tc[has_requested_code]

# Focus on TICKET_TYPE
def focus_on_ticket_type(df_tt: pd.DataFrame, ticket_type: str) -> pd.DataFrame:
    """
        This function returns a dataframe with only the rows of the specified ticket type.
        :param df_tt: the dataframe; it must have the column 'DESCRIZIONE_TITOLO'
        :param ticket_type: the ticket type, compared with the values of 'DESCRIZIONE_TITOLO'
        :return: the rows of df_tt whose 'DESCRIZIONE_TITOLO' is equal to ticket_type
    """
    # Build the mask from the dataframe received as argument: a mask built from another
    # dataframe selects the wrong rows (or fails) whenever the two do not share the same index
    return df_tt[df_tt['DESCRIZIONE_TITOLO'] == ticket_type]

# Functions to focus on a specific ticket code or ticket type
def number_of_tickets_per_day(df_d: pd.DataFrame, target_ticket_code_or_type: str, is_ticket_code: bool) -> None:
    """
        This function plots the cumulative sum, day after day, of the number of validations contained in the dataframe.
        :param df_d: the dataframe with the validations to count; it must have the columns 'DATA' and 'SERIALE'
        :param target_ticket_code_or_type: the ticket code or ticket type shown in the title of the plot
        :param is_ticket_code: a boolean that specifies if the target is a ticket code or a ticket type
        :return: None
    """
    # Without validations there is nothing to plot and no maximum to build the y-axis from
    if df_d.shape[0] == 0:
        print('WARNING: There are no validations of "{}" to plot'.format(target_ticket_code_or_type))
        return

    # Count the validations of each day; groupby returns the days sorted in ascending order
    df_d = df_d.groupby('DATA').count()['SERIALE'].reset_index()
    # Running total over the days. The sum is taken over the whole column: grouping by 'DATA' again
    # would put every row in its own group and return the daily counts unchanged
    df_d['cumulative_sum'] = df_d['SERIALE'].cumsum()

    # Plot the cumulative sum of the number of validations of the target ticket code or type for each day
    plt.figure(figsize=(20, 10))
    plt.plot(df_d['DATA'], df_d['cumulative_sum'])

    if is_ticket_code:
        descr = get_ticket_code_description(target_ticket_code_or_type)
        plt.title('Cumulative sum of the number of validations of the ticket code "{}" - "{}" for each day'.format(target_ticket_code_or_type, descr), fontsize=20)
    else:
        plt.title('Cumulative sum of the number of validations of the ticket type "{}" for each day'.format(target_ticket_code_or_type), fontsize=20)
    plt.xlabel('Date - days', fontsize=15)
    plt.ylabel('Cumulative sum', fontsize=15)

    # Calculate the step of the y-axis: about ten ticks, but never less than 1
    # because np.arange cannot build the ticks with a step of 0
    total = df_d['cumulative_sum'].max()
    step = max(1, int(total / 10))
    # Manage the y-axis
    plt.yticks(ticks=np.arange(0, total + step, step))

    # Manage the x-axis: rotate the labels of the x-axis
    plt.xticks(rotation=45)

    plt.show()

def min_max_number_of_tickets_per_day(df_d: pd.DataFrame, target_ticket_code_or_type: str, is_ticket_code: bool) -> None:
    """
        This function prints the day with the highest and the day with the lowest number of validations contained in the dataframe.
        When several days share the extreme value, the first of them (in ascending date order) is printed.
        :param df_d: the dataframe with the validations to count; 'DATA' must hold datetime values
        :param target_ticket_code_or_type: the ticket code or ticket type shown in the message
        :param is_ticket_code: a boolean that specifies if the target is a ticket code or a ticket type
        :return: None
    """
    # One row per day with the number of validations of that day
    daily_counts = df_d.groupby('DATA').count()['SERIALE'].reset_index()
    validations = daily_counts['SERIALE']
    busiest_days = daily_counts[validations == validations.max()]
    quietest_days = daily_counts[validations == validations.min()]

    # The message names the kind of target and, for a ticket code, also its description
    if is_ticket_code:
        target_kind = 'ticket code'
        descr = ' - "' + get_ticket_code_description(target_ticket_code_or_type) + '"'
    else:
        target_kind = 'ticket type'
        descr = ''

    s = 'The {} number of validations of the {} "{}"' + descr + ' was {} with {} validations'
    for extreme, days in (('highest', busiest_days), ('lowest', quietest_days)):
        # Truncate the timestamp to the day and render it as '%Y-%m-%d'
        day = days['DATA'].values[0].astype('datetime64[D]').astype(str)
        print(s.format(extreme, target_kind, target_ticket_code_or_type, day, days['SERIALE'].values[0]))


def barplot_number_of_tickets_per_month(df_m: pd.DataFrame, target_ticket_code_or_type: str, is_ticket_code: bool) -> None:
    """
        This function plots, as a bar chart, the number of validations contained in the dataframe for each month.
        The dataframe received as argument is not modified.
        :param df_m: the dataframe with the validations to count; it must have the columns 'DATA' and 'SERIALE'
        :param target_ticket_code_or_type: the ticket code or ticket type shown in the title of the plot
        :param is_ticket_code: a boolean that specifies if the target is a ticket code or a ticket type
        :return: None
    """
    # Without validations there is nothing to plot and no maximum to build the y-axis from
    if df_m.shape[0] == 0:
        print('WARNING: There are no validations of "{}" to plot'.format(target_ticket_code_or_type))
        return

    # Month of every validation as a '%Y-%m' label. The labels are kept in a separate Series:
    # writing them into df_m['DATA'] would replace the dates of the caller's dataframe with strings
    months = pd.to_datetime(df_m['DATA']).dt.strftime('%Y-%m')
    # Count the validations of each month; the labels come out sorted, that is in chronological order
    monthly_counts = df_m['SERIALE'].groupby(months).count()
    labels = monthly_counts.index.tolist()

    # Plot the number of validations of the target ticket code or type for each month
    plt.figure(figsize=(20, 10))
    plt.bar(labels, monthly_counts.values)

    # The bars are monthly counts, not running totals, and the texts of the plot say so
    if is_ticket_code:
        descr = get_ticket_code_description(target_ticket_code_or_type)
        plt.title('Number of validations of the ticket code "{}" - "{}" for each month'.format(target_ticket_code_or_type, descr), fontsize=20)
    else:
        plt.title('Number of validations of the ticket type "{}" for each month'.format(target_ticket_code_or_type), fontsize=20)
    plt.xlabel('Date - months', fontsize=15)
    plt.ylabel('Number of validations', fontsize=15)

    # Calculate the step of the y-axis: about ten ticks, but never less than 1
    # because np.arange cannot build the ticks with a step of 0
    highest = monthly_counts.max()
    step = max(1, int(highest / 10))
    # Manage the y-axis
    plt.yticks(ticks=np.arange(0, highest + step, step))

    # Manage the x-axis: one rotated tick for each month
    plt.xticks(rotation=45, ticks=labels)

    plt.show()

def min_max_number_of_tickets_per_month(df_m: pd.DataFrame, target_ticket_code_or_type: str, is_ticket_code: bool) -> None:
    """
        This function prints the month with the highest and the month with the lowest number of validations contained in the dataframe.
        When several months share the extreme value, the earliest of them is printed.
        The dataframe received as argument is not modified.
        :param df_m: the dataframe with the validations to count; it must have the columns 'DATA' and 'SERIALE'
        :param target_ticket_code_or_type: the ticket code or ticket type shown in the message
        :param is_ticket_code: a boolean that specifies if the target is a ticket code or a ticket type
        :return: None
    """
    # Month of every validation as a '%Y-%m' label. No explicit format is passed to pd.to_datetime,
    # so both datetime values and '%Y-%m-%d' strings are accepted; the labels are kept in a separate
    # Series, so that the 'DATA' column of the caller's dataframe is left untouched
    months = pd.to_datetime(df_m['DATA']).dt.strftime('%Y-%m')
    # Count the validations of each month; the labels come out sorted, that is in chronological order
    monthly_counts = df_m['SERIALE'].groupby(months).count()

    if monthly_counts.shape[0] == 0:
        print('WARNING: There are no validations of "{}" to analyze'.format(target_ticket_code_or_type))
        return

    # The message names the kind of target and, for a ticket code, also its description
    if is_ticket_code:
        target_kind = 'ticket code'
        descr = ' - "' + get_ticket_code_description(target_ticket_code_or_type) + '"'
    else:
        target_kind = 'ticket type'
        descr = ''

    s = 'The {} number of validations of the {} "{}"' + descr + ' was the month {} with {} validations.'
    # idxmax/idxmin return the label of the first occurrence of the extreme value
    extremes = (
        ('highest', monthly_counts.idxmax(), monthly_counts.max()),
        ('lowest', monthly_counts.idxmin(), monthly_counts.min()),
    )
    for extreme, month, validations in extremes:
        print(s.format(extreme, target_kind, target_ticket_code_or_type, month, validations))

In [None]:
# Focus on all TICKET_CODEs
def focus_on_all_ticket_codes(df_tca: pd.DataFrame, dictionary: str) -> None:
    """
        This function runs the daily and monthly analyses for every ticket code listed in the dictionary.
        A warning is printed for the ticket codes that have no validations in the dataframe.
        :param df_tca: the dataframe; it must have the column 'TICKET_CODE'
        :param dictionary: the path of the JSON file with the ticket codes (keys) and their description (values)
        :return: None
    """
    import json

    # Load the ticket codes; they are processed in the order of the keys of the file
    with open(dictionary) as json_file:
        dict_ticket_code = json.load(json_file)

    for ticket_code in dict_ticket_code:
        validations_of_code = df_tca[df_tca['TICKET_CODE'] == ticket_code]
        if validations_of_code.shape[0] == 0:
            # Nothing to analyze for this ticket code: warn and move on to the next one
            print('WARNING: There are no validations of the ticket code "{}"'.format(ticket_code))
            continue

        df_tc = focus_on_ticket_code(validations_of_code, ticket_code)
        # The same dataframe goes through the four analyses, daily ones first
        number_of_tickets_per_day(df_tc, ticket_code, is_ticket_code=True)
        min_max_number_of_tickets_per_day(df_tc, ticket_code, is_ticket_code=True)
        barplot_number_of_tickets_per_month(df_tc, ticket_code, is_ticket_code=True)
        min_max_number_of_tickets_per_month(df_tc, ticket_code, is_ticket_code=True)

### Focus on the type of ticket named ***abbonamento 30 gg.PeopleMover***
The ticket is valid for 30 days and allows you to use the PeopleMover service

In [None]:
# Ticket type to analyze, compared with the values of the column 'DESCRIZIONE_TITOLO'
target_ticket = 'abbonamento 30 gg.PeopleMover'
# Keep only the validations of the target ticket type
df_PM = focus_on_ticket_type(df, target_ticket)
if df_PM.shape[0] == 0:
    # No validations found: the analyses are skipped
    print('WARNING: There are no validations of the ticket type "{}"'.format(target_ticket))
else:
    # Daily analyses (plot, then busiest and quietest day) followed by the monthly ones
    number_of_tickets_per_day(df_PM, target_ticket, is_ticket_code=False)
    min_max_number_of_tickets_per_day(df_PM, target_ticket, is_ticket_code=False)
    barplot_number_of_tickets_per_month(df_PM, target_ticket, is_ticket_code=False)
    min_max_number_of_tickets_per_month(df_PM, target_ticket, is_ticket_code=False)

### Focus on the ticket codes


In [None]:
# The TICKET_CODEs are the keys of the dictionary "dict_ticket_codes.json", created in Notebook 1
# Run the daily and monthly analyses for each of them
focus_on_all_ticket_codes(df, 'data/dictionaries/dict_ticket_codes.json')

### Focus on the ticket code ***1***


In [None]:
# Focus on TICKET_CODE = 1
target_ticket_code = '1'
# Keep only the validations of the target ticket code
df_tc1 = focus_on_ticket_code(df, target_ticket_code)
if df_tc1.shape[0] == 0:
    # No validations found: the analyses are skipped
    print('WARNING: There are no validations of the ticket code "{}"'.format(target_ticket_code))
else:
    # Daily plot, busiest and quietest day, then the monthly bar plot
    # NOTE(review): min_max_number_of_tickets_per_month is not called here, unlike in the other analyses - confirm it is intended
    number_of_tickets_per_day(df_tc1, target_ticket_code, is_ticket_code=True)
    min_max_number_of_tickets_per_day(df_tc1, target_ticket_code, is_ticket_code=True)
    barplot_number_of_tickets_per_month(df_tc1, target_ticket_code, is_ticket_code=True)

------

In [None]:
def focus_on_serial(df: pd.DataFrame, serial: str) -> pd.DataFrame:
    """
        This function keeps only the validations made with the specified SERIAL.
        :param df: the dataframe; it must have the column 'SERIALE'
        :param serial: the SERIAL to keep
        :return: the rows of df whose 'SERIALE' is equal to serial
    """
    # Boolean mask, True on the rows of the requested SERIAL
    has_requested_serial = df['SERIALE'] == serial
    return df[has_requested_serial]

# Focus on serial over time (all the validations of the SERIAL)
def focus_on_serial_over_time(df: pd.DataFrame, serial: str) -> None:
    """
        This function plots the number of validations per day of the specified SERIAL.
        A warning is printed, and nothing is plotted, when the SERIAL has no validations.
        The dataframe received as argument is not modified.
        :param df: the dataframe; it must have the columns 'SERIALE' and 'DATA'
        :param serial: the SERIAL of the user
        :return: None
    """
    # Select only the rows of the specified SERIAL
    df_s = df[df['SERIALE'] == serial]
    # If the dataframe is empty, skip the SERIAL but launch a warning
    if df_s.shape[0] == 0:
        print('WARNING: There are no validations of the SERIAL "{}"'.format(serial))
        return

    # Day of every validation as a '%Y-%m-%d' label. No explicit format is passed to pd.to_datetime,
    # so both datetime values and '%Y-%m-%d' strings are accepted; the labels are kept in a separate
    # Series instead of being written into a column of the filtered dataframe
    days = pd.to_datetime(df_s['DATA']).dt.strftime('%Y-%m-%d')
    # Count the validations of each day; the labels come out sorted, that is in chronological order
    daily_counts = df_s['SERIALE'].groupby(days).count()
    labels = daily_counts.index.tolist()
    counts = daily_counts.values

    # Plot the number of validations per day
    plt.figure(figsize=(20, 10))
    plt.plot(labels, counts)
    # Add points to the plot
    plt.scatter(labels, counts)
    plt.title('Number of validations of the SERIAL "{}" over time'.format(serial))
    plt.xlabel('Date')
    plt.ylabel('Number of validations')
    plt.xticks(rotation=45, ticks=labels)

    # Manage the y-axis: a tick on every integer for small counts, on every other integer otherwise
    highest = int(daily_counts.max())
    if highest < 10:
        plt.yticks(ticks=range(0, highest + 6))
    else:
        plt.yticks(ticks=range(0, highest + 6, 2))

    plt.show()

In [None]:
# Select the serial to inspect: value_counts() sorts the serials by decreasing number of validations,
# so index[90] is the serial in position 90 of that ranking (the 91st), not the one with the most validations
serial = df['SERIALE'].value_counts().index[90]

# Keep the validations of the SERIAL and plot how many of them there are on each day
df_s = focus_on_serial(df, serial)
focus_on_serial_over_time(df_s, serial)

We are interested in analyzing the trajectories of the users who use public transport in the city of Venice.

Note that the stops are identified by the *DESCRIZIONE* column, which contains the name of the stop. Each trajectory 
is therefore made of the names of the stops visited by the user, i.e. a trajectory is a sequence of strings.

## Trajectories

In [None]:
def create_dictionary_with_trajectories(df: pd.DataFrame) -> dict:
    """
        This function creates a dictionary with the trajectories of the users.
        The trajectory of a user is the list of the distinct stops ('DESCRIZIONE') of the user,
        in the order in which each stop first appears in the dataframe.
        :param df: the dataframe; it must have the columns 'SERIALE' and 'DESCRIZIONE'
        :return: the dictionary with the SERIAL of each user as key and the list of the stops as value;
                 the keys follow the order in which the users first appear in the dataframe
    """
    dict_trajectories = {}
    # A single groupby visits the rows of each user once, instead of filtering the whole
    # dataframe again for every user. sort=False keeps the users in order of first appearance;
    # rows with a missing SERIAL do not belong to any group and are left out
    for serial, stops in df.groupby('SERIALE', sort=False)['DESCRIZIONE']:
        # unique() keeps the first occurrence of each stop, in order of appearance
        dict_trajectories[serial] = stops.unique().tolist()

    return dict_trajectories

In [None]:
def average_length_of_trajectories(dict_trajectories: dict, is_focus_on_ticket_code: bool = False, ticket_code: str = '') -> float:
    """
        This function computes and prints the average length (number of stops) of the trajectories.
        :param dict_trajectories: the dictionary with the trajectories of the users
        :param is_focus_on_ticket_code: True if the message has to mention the ticket code, False otherwise
        :param ticket_code: the ticket code mentioned in the message
        :return: the average length of the trajectories, 0 if there are no trajectories
    """
    number_of_users = len(dict_trajectories)
    if number_of_users == 0:
        # No trajectories: warn and report a length of 0
        print('WARNING: There are no trajectories to analyze')
        return 0

    total_stops = sum(len(stops) for stops in dict_trajectories.values())
    average_length = total_stops / number_of_users

    if is_focus_on_ticket_code:
        print('The average length of the trajectories with the ticket code {} is: {}'.format(ticket_code, average_length))
    else:
        print('The average length of the trajectories is: {}'.format(average_length))
    return average_length

In [None]:
def average_lenght_of_trajectories_per_ticket_code_stmp(df: pd.DataFrame):
    """
        This function prints the average length of the trajectories for each ticket code of the dictionary "dict_ticket_codes.json".
        The ticket codes are processed in the order of the keys of the dictionary; a warning is printed for
        the ticket codes that have no validations in the dataframe.
        :param df: the dataframe
        :return: None
    """
    with open('data/dictionaries/dict_ticket_codes.json', 'r') as f:
        dict_ticket_codes = json.load(f)

    # Trajectories of the users, grouped by ticket code
    trajectories_by_ticket_code = {}
    for ticket_code in dict_ticket_codes:
        validations_of_code = focus_on_ticket_code(df, ticket_code)
        if validations_of_code.shape[0] == 0:
            # Nothing to analyze for this ticket code: warn and move on to the next one
            print('WARNING: There are no validations of the ticket code "{}"'.format(ticket_code))
            continue

        trajectories_by_ticket_code[ticket_code] = create_dictionary_with_trajectories(validations_of_code)
        # The average is printed by the callee
        average_length_of_trajectories(trajectories_by_ticket_code[ticket_code], is_focus_on_ticket_code=True, ticket_code=ticket_code)

In [None]:
def average_length_of_trajectories_by_ticket_code_plot(dict_trajectories: dict, df: pd.DataFrame, dictionary: str) -> None:
    """
        This function plots, as a bar chart, the average length of the trajectories by ticket code.
        The length of a trajectory is its number of stops: the dataset has neither the coordinates of the
        stations nor the distances between them.
        A warning is printed, and nothing is plotted, when no ticket code of the dictionary has validations.
        :param dict_trajectories: the dictionary with the trajectories of the users; it must contain every SERIAL of df
        :param df: the dataframe; it must have the columns 'TICKET_CODE' and 'SERIALE'
        :param dictionary: the path of the dictionary with the ticket codes and their description
        :return: None
    """
    # Open the dictionary
    with open(dictionary) as json_file:
        dict_ticket_code = json.load(json_file)

    # Create a dictionary with the average number of stations visited for each ticket code
    dict_number_of_stations = {}
    for ticket_code in dict_ticket_code:
        # Users of the ticket code; rows with a missing SERIAL have no trajectory and are ignored
        serials = df.loc[df['TICKET_CODE'] == ticket_code, 'SERIALE'].dropna().unique()
        # Ticket codes without users are left out of the plot
        if len(serials) == 0:
            continue
        number_of_stations = sum(len(dict_trajectories[serial]) for serial in serials)
        dict_number_of_stations[ticket_code] = number_of_stations / len(serials)

    # Without bars there is no maximum to build the y-axis from
    if not dict_number_of_stations:
        print('WARNING: There are no validations of the ticket codes of the dictionary to plot')
        return

    ticket_codes = list(dict_number_of_stations.keys())
    averages = list(dict_number_of_stations.values())

    # Plot the average length of the trajectories by ticket code
    plt.figure(figsize=(20, 10))
    plt.bar(ticket_codes, averages)
    plt.title('Average length of the trajectories by ticket code')
    plt.xlabel('Ticket code')
    plt.ylabel('Average length of the trajectories')

    # Manage the x-axis adding the description of the ticket code; note that it is possible that some ticket codes are not present in the plot
    plt.xticks(ticks=range(0, len(ticket_codes)), labels=[dict_ticket_code[ticket_code] for ticket_code in ticket_codes], rotation=45)

    # Manage the y-axis: the averages are float numbers, so the ticks are built with np.arange instead of range
    highest = max(averages)
    tick_step = 0.01 if highest < 0.1 else 0.5
    plt.yticks(ticks=np.arange(0, highest + 0.01, tick_step))

    plt.show()

In [None]:
def most_frequent_trajectories(dict_trajectories: dict, is_focus_on_ticket_code: bool = False, ticket_code: str = '', summary: bool = True) -> "dict | None":
    """
        This function finds the most frequent trajectories.
        :param dict_trajectories: the dictionary with the trajectories of the users
        :param is_focus_on_ticket_code: True if the analysis is focused on a specific ticket code, False otherwise
        :param ticket_code: the ticket code
        :param summary: True to print the 10 most frequent trajectories, False to return all of them
        :return: None if summary is True; otherwise the dictionary with each trajectory (as a tuple of stops)
                 as key and the number of users that share it as value, ordered by decreasing number of users
    """
    from collections import Counter

    # Count how many users share each trajectory; the lists of stops become tuples
    # because the keys of a dictionary must be hashable
    trajectory_counter = Counter(tuple(stops) for stops in dict_trajectories.values())

    # Order by decreasing number of users; the sort is stable, so the trajectories
    # with the same number of users keep the order in which they were first met
    dict_trajectories_number = dict(trajectory_counter.most_common())

    if not summary:
        return dict_trajectories_number

    # Print the most frequent trajectories
    if is_focus_on_ticket_code:
        print('The most frequent trajectories with the ticket code {} are:'.format(ticket_code))
    else:
        print('The most frequent trajectories are:')
    for trajectory in list(dict_trajectories_number)[:10]:
        print('The trajectory {} is present {} times'.format(trajectory, dict_trajectories_number[trajectory]))
    return None

In [None]:
def trajectories_with_at_least_k_stations(dict_trajectories: dict, k: int) -> None:
    """
        This function prints the trajectories made of at least k stations, together with the number of users that share them.
        The trajectories are printed by decreasing number of users.
        :param dict_trajectories: the dictionary with the trajectories of the users
        :param k: the minimum number of stations of a trajectory
        :return: None
    """
    # Count how many users share each trajectory (tuples, because the keys must be hashable)
    occurrences = {}
    for stops in dict_trajectories.values():
        trajectory = tuple(stops)
        occurrences[trajectory] = occurrences.get(trajectory, 0) + 1

    # Most shared trajectories first; the sort is stable, so ties keep the order of first appearance
    ranking = sorted(occurrences.items(), key=lambda pair: pair[1], reverse=True)

    print('The trajectories with at least {} stations visited are:'.format(k))
    for trajectory, times in ranking:
        if len(trajectory) < k:
            continue
        print('The trajectory {} is present {} times'.format(trajectory, times))

In [None]:
def longest_common_subsequence(trajectory_1: list, trajectory_2: list) -> list:
    """
        This function finds a Longest Common Subsequence (LCS) of two trajectories, that is a longest list of
        stops that appear in both trajectories in the same relative order, not necessarily one after the other.
        :param trajectory_1: the first trajectory
        :param trajectory_2: the second trajectory
        :return: the LCS, as a list
    """
    size_1, size_2 = len(trajectory_1), len(trajectory_2)

    # lengths[row][col] is the length of the LCS of the first `row` stops of trajectory_1
    # and the first `col` stops of trajectory_2; row 0 and column 0 stand for the empty prefix
    lengths = [[0] * (size_2 + 1) for _ in range(size_1 + 1)]
    for row, stop_1 in enumerate(trajectory_1, start=1):
        for col, stop_2 in enumerate(trajectory_2, start=1):
            if stop_1 == stop_2:
                lengths[row][col] = lengths[row - 1][col - 1] + 1
            else:
                lengths[row][col] = max(lengths[row - 1][col], lengths[row][col - 1])

    # Walk back from the bottom-right corner of the table, collecting the common stops from last to first
    row, col = size_1, size_2
    stops_from_the_end = []
    while row > 0 and col > 0:
        if trajectory_1[row - 1] == trajectory_2[col - 1]:
            stops_from_the_end.append(trajectory_1[row - 1])
            row -= 1
            col -= 1
        elif lengths[row - 1][col] > lengths[row][col - 1]:
            row -= 1
        else:
            # On a tie the walk moves along trajectory_2
            col -= 1

    # The stops were collected backwards
    return stops_from_the_end[::-1]

### Compute the average length of the trajectories with the entire dataset


In [None]:
# Create a dictionary with the trajectories of the users (SERIAL -> list of the distinct stops)
dict_trajectories = create_dictionary_with_trajectories(df)
# Compute and print the average length of the trajectories of the entire dataset
average_lenght = average_length_of_trajectories(dict_trajectories)
# Print the average length of the trajectories for each ticket code of the dictionary
average_lenght_of_trajectories_per_ticket_code_stmp(df)
# Plot the average length of the trajectories by ticket code
average_length_of_trajectories_by_ticket_code_plot(dict_trajectories, df, 'data/dictionaries/dict_ticket_codes.json')

### Compute the average length of the trajectories with the dataset with the ticket code 1

In [None]:
# Keep only the validations made with the ticket code '1'
df_sup = focus_on_ticket_code(df,'1')
# Create a dictionary with the trajectories of the users of the ticket code '1'
dict_trajectories = create_dictionary_with_trajectories(df_sup)
# Compute and print the average length of these trajectories
average_lenght = average_length_of_trajectories(dict_trajectories, is_focus_on_ticket_code=True, ticket_code='1')

### Find the most frequent trajectories with the entire dataset

In [None]:
# Create a dictionary with the trajectories of the users with the entire dataset
dict_trajectories = create_dictionary_with_trajectories(df)
# Find the most frequent trajectories and print the summary (at most 10 trajectories)
most_frequent_trajectories(dict_trajectories, summary=True)

### Find the most frequent trajectories with the dataset with a specified ticket code

In [None]:
# Keep only the validations made with the ticket code '1'
df_sup = focus_on_ticket_code(df,'1')
# Create a dictionary with the trajectories of the users with the ticket code '1'
dict_trajectories = create_dictionary_with_trajectories(df_sup)
# Find the most frequent trajectories with the ticket code '1' and print the summary
most_frequent_trajectories(dict_trajectories, is_focus_on_ticket_code=True, ticket_code='1', summary=True)

### Find the trajectories with at least k stations visited with the entire dataset

In [None]:
# Create a dictionary with the trajectories of the users with the entire dataset
dict_trajectories = create_dictionary_with_trajectories(df)
# Print the trajectories with at least 20 stations visited (k = 20)
trajectories_with_at_least_k_stations(dict_trajectories, 20)

### Find the trajectories with at least k stations visited with the dataset with a specified ticket code


In [None]:
# Keep only the validations made with the ticket code '1'
df_sup = focus_on_ticket_code(df,'1')
# Create a dictionary with the trajectories of the users with the ticket code '1'
dict_trajectories = create_dictionary_with_trajectories(df_sup)
# Print the trajectories with at least 20 stations visited with the ticket code '1'
trajectories_with_at_least_k_stations(dict_trajectories, 20)

### Longest Common Subsequence (LCS)

In [None]:
# Find the Longest Common Subsequence (LCS) between two hand-written trajectories
# The two trajectories differ only by the stop 'RIALTO', present in the first one only
trajectory_1 = ['SAN MARCO', 'P.LE ROMA', 'RIALTO', 'PUNTA SABBIO', 'BURANO']
trajectory_2 = ['SAN MARCO', 'P.LE ROMA', 'PUNTA SABBIO', 'BURANO']

lcs = longest_common_subsequence(trajectory_1, trajectory_2)
print('The Longest Common Subsequence (LCS) between the trajectories {} and {} is {}'.format(trajectory_1, trajectory_2, lcs))

In [None]:
# Find the Longest Common Subsequence (LCS) between two trajectories
# Trajectory 1 is the trajectory in position 8 (the 9th most frequent) of the dataset with the ticket code 1
# Trajectory 2 is the most frequent trajectory of the dataset with the ticket code 2
df_sup = focus_on_ticket_code(df,'1')
dict_trajectories = create_dictionary_with_trajectories(df_sup)
# summary=False returns the trajectories ordered by decreasing frequency instead of printing them
dict_trajectories_number = most_frequent_trajectories(dict_trajectories, is_focus_on_ticket_code=True, ticket_code='1', summary=False)
trajectory_1 = list(dict_trajectories_number.keys())[8]

df_sup = focus_on_ticket_code(df,'2')
dict_trajectories = create_dictionary_with_trajectories(df_sup)
dict_trajectories_number = most_frequent_trajectories(dict_trajectories, is_focus_on_ticket_code=True, ticket_code='2', summary=False)
trajectory_2 = list(dict_trajectories_number.keys())[0]

lcs = longest_common_subsequence(trajectory_1, trajectory_2)
print('The Longest Common Subsequence (LCS) between the trajectory {} and the trajectory {} is {}'.format(trajectory_1, trajectory_2, lcs))