# MASTER - Notebook 4
### Matteo Grazioso 884055

In [34]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
from datetime import datetime
import json
import warnings
warnings.filterwarnings('ignore')

import myfunctions as mf # Custom functions

In [35]:
# Display all columns and all rows (None removes pandas' truncation limit)
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [36]:
# Location of the raw tab-separated export
path = 'data/raw/2-esportazioneCompleta.txt'
df = pd.read_csv(path, header=0, sep='\t')

# Keep the bare file name (last component of the path) in a variable for future use.
# Taking the last '/'-separated component works for any directory depth and for
# names containing underscores, unlike splitting on '_' and picking a fixed index.
file_name = path.rsplit('/', 1)[-1]

# Display the first 5 rows of the dataframe
df.head()

Unnamed: 0,DATA_VALIDAZIONE,SERIALE,FERMATA,DESCRIZIONE,TITOLO,DESCRIZIONE_TITOLO
0,13/01/2023 00:00,40834866809772548,162,Stazione MES,12101,Bigl.Aut.75'Mestre/Lido-tsc
1,13/01/2023 00:00,42242241686217732,3625,Aeroporto MA,12106,Bigl Aer-Venezia TSC
2,13/01/2023 00:00,42242241686217476,3625,Aeroporto MA,12106,Bigl Aer-Venezia TSC
3,13/01/2023 00:00,-3604990320,5049,"Zattere ""B""",23301,Mens.Studente Rete Unica
4,13/01/2023 00:00,-2824230951,5043,"S. Toma' ""B""",23303,Abb stud. ReteUnica 12 mesi


In [37]:
# Show the file name that was extracted from the path
file_name

'2-esportazioneCompleta.txt'

In [38]:
# Date and time of each ticket validation are stored together in 'DATA_VALIDAZIONE'
# as 'dd/mm/YYYY HH:MM'. Parse the column once and derive both parts from the result:
#   'DATA' -> datetime64 value with the time set to midnight
#   'ORA'  -> datetime.time object
validation_ts = pd.to_datetime(df['DATA_VALIDAZIONE'], format='%d/%m/%Y %H:%M')

derived_columns = [
    ('DATA', validation_ts.dt.normalize()),
    ('ORA', validation_ts.dt.time),
]
for position, (column, values) in enumerate(derived_columns):
    # df.insert raises ValueError if the column already exists; remove a previous
    # copy first so that this cell can be re-run safely
    if column in df.columns:
        del df[column]
    df.insert(position, column, values)

# Display the first 5 rows of the dataframe
df.head()

Unnamed: 0,DATA,ORA,DATA_VALIDAZIONE,SERIALE,FERMATA,DESCRIZIONE,TITOLO,DESCRIZIONE_TITOLO
0,2023-01-13,00:00:00,13/01/2023 00:00,40834866809772548,162,Stazione MES,12101,Bigl.Aut.75'Mestre/Lido-tsc
1,2023-01-13,00:00:00,13/01/2023 00:00,42242241686217732,3625,Aeroporto MA,12106,Bigl Aer-Venezia TSC
2,2023-01-13,00:00:00,13/01/2023 00:00,42242241686217476,3625,Aeroporto MA,12106,Bigl Aer-Venezia TSC
3,2023-01-13,00:00:00,13/01/2023 00:00,-3604990320,5049,"Zattere ""B""",23301,Mens.Studente Rete Unica
4,2023-01-13,00:00:00,13/01/2023 00:00,-2824230951,5043,"S. Toma' ""B""",23303,Abb stud. ReteUnica 12 mesi


In [39]:
# Make sure 'DATA' is in datetime format ('DATA' was already produced by
# pd.to_datetime when it was inserted, so this re-parse leaves the values unchanged)
data_as_datetime = pd.to_datetime(df['DATA'], format='%Y-%m-%d')
df['DATA'] = data_as_datetime

In [40]:
# Preview the first 5 rows of the dataframe
df.head(n=5)

Unnamed: 0,DATA,ORA,DATA_VALIDAZIONE,SERIALE,FERMATA,DESCRIZIONE,TITOLO,DESCRIZIONE_TITOLO
0,2023-01-13,00:00:00,13/01/2023 00:00,40834866809772548,162,Stazione MES,12101,Bigl.Aut.75'Mestre/Lido-tsc
1,2023-01-13,00:00:00,13/01/2023 00:00,42242241686217732,3625,Aeroporto MA,12106,Bigl Aer-Venezia TSC
2,2023-01-13,00:00:00,13/01/2023 00:00,42242241686217476,3625,Aeroporto MA,12106,Bigl Aer-Venezia TSC
3,2023-01-13,00:00:00,13/01/2023 00:00,-3604990320,5049,"Zattere ""B""",23301,Mens.Studente Rete Unica
4,2023-01-13,00:00:00,13/01/2023 00:00,-2824230951,5043,"S. Toma' ""B""",23303,Abb stud. ReteUnica 12 mesi


In [41]:
# Preview the last 5 rows of the dataframe
df.tail(n=5)

Unnamed: 0,DATA,ORA,DATA_VALIDAZIONE,SERIALE,FERMATA,DESCRIZIONE,TITOLO,DESCRIZIONE_TITOLO
5537461,2023-03-14,23:58:00,14/03/2023 23:58,-2864643315,162,Stazione MES,11209,Bigl RETE UNICA 75'
5537462,2023-03-14,23:58:00,14/03/2023 23:58,-2854956628,5026,Tronchetto F,11209,Bigl RETE UNICA 75'
5537463,2023-03-14,23:59:00,14/03/2023 23:59,-2850025054,384,Mestre Centr,23101,Mensile ordinario Rete Unica
5537464,2023-03-14,23:59:00,14/03/2023 23:59,-2824225710,5024,"Tronchetto """,23101,Mensile ordinario Rete Unica
5537465,2023-03-14,23:59:00,14/03/2023 23:59,-3604916033,5039,"Rialto ""C""",23101,Mensile ordinario Rete Unica


---------


In [42]:
# Restrict the dataset to only the specified period given by the user
def restrict_dataset_to_period(df, start_date, end_date):
    '''
    Keep only the rows whose 'DATA' value lies within the given period
        :param df: the dataset to filter; it must contain a 'DATA' column
        :param start_date: lower bound of the period (inclusive)
        :param end_date: upper bound of the period (inclusive)
        :return: the rows of df that fall inside the period
    '''

    # Series.between is inclusive on both ends by default
    in_period = df['DATA'].between(start_date, end_date)

    return df[in_period]

In [43]:
# Period of interest (both bounds are inclusive)
start_date = '2023-02-04'
end_date = '2023-02-21'

# Size of the dataset before filtering
print('df.shape: ', df.shape)

# Keep only the validations inside the period, then remove the DATA and ORA columns
df = restrict_dataset_to_period(df, start_date, end_date).drop(columns=['DATA', 'ORA'])

print('start_date: ', start_date)
print('end_date: ', end_date)
print('df.shape: ', df.shape)
print('df.head(2):\n ', df.head(2))

df.shape:  (5537466, 8)
start_date:  2023-02-04
end_date:  2023-02-21
df.shape:  (1948517, 6)
df.head(2):
           DATA_VALIDAZIONE     SERIALE  FERMATA   DESCRIZIONE  TITOLO  \
1616055  04/02/2023 00:00 -3604916095     5001  Lido S.M.E.    23102   
1616056  04/02/2023 00:00 -2824231146     5031  P.le Roma "G   23102   

                   DESCRIZIONE_TITOLO  
1616055  Annuale ordinario Rete Unica  
1616056  Annuale ordinario Rete Unica  


In [44]:
# Write the restricted dataset to a tab-separated txt file in data/raw/
name_file = f'restricted_{start_date}_{end_date}_{file_name}'
df.to_csv(f'data/raw/{name_file}', sep='\t', index=False)

print(f'Data exported to {name_file}')

Data exported to restricted_2023-02-04_2023-02-21_2-esportazioneCompleta.txt.txt


In [None]:
# This dataset must be cleaned before it can be used. The cleaning process consists of the following step:
# 1. Execute the notebook 1_b_only_temp_cleaning.ipynb to clean the dataset, deleting useless stamps once the algorithm has determined the minimum temporal gap between two consecutive stamps.

# The result of the cleaning process is a new dataset, which must then be used to obtain the dataset with geographical coordinates:
# 2. Execute the Notebook 3 AUX.ipynb to obtain the dataset with geographical coordinates.