## Download dataset

In [2]:
import requests
import os
import zipfile

def download_file(url, filename, timeout=60, chunk_size=1 << 20):
    """Download ``url`` with an HTTP GET and save the body to ``filename``.

    The body is streamed to disk chunk by chunk instead of being held in
    memory, and an error status raises instead of being written to disk as
    if it were the requested file.

    Args:
        url: Address to fetch.
        filename: Path of the local file to create or overwrite.
        timeout: Timeout in seconds, passed straight to ``requests.get``.
        chunk_size: Number of bytes requested per chunk while streaming.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail here rather than later, when the saved file turns out not to
        # be what was asked for.
        response.raise_for_status()
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                file.write(chunk)

def extract_zip(zip_file, destination):
    """Unpack every member of the archive ``zip_file`` under ``destination``.

    Args:
        zip_file: Path to (or file object of) the ZIP archive to read.
        destination: Directory that receives the extracted members.
    """
    archive = zipfile.ZipFile(zip_file, 'r')
    try:
        archive.extractall(destination)
    finally:
        # Always release the archive handle, even if extraction fails.
        archive.close()

In [4]:
# Every archive sits under the same Zenodo record and differs only by the
# city name, so the (download URL, local ZIP filename) pairs are generated
# from the list of names instead of being spelled out one by one.
_ZENODO_FILES_URL = "https://zenodo.org/record/1186215/files"
_CITY_NAMES = [
    "adelaide", "belfast", "berlin", "bordeaux", "brisbane",
    "canberra", "detroit", "dublin", "grenoble", "helsinki",
    "kuopio", "lisbon", "luxembourg", "melbourne", "nantes",
    "palermo", "paris", "prague", "rennes", "rome",
    "sydney", "toulouse", "turku", "venice", "winnipeg",
]

urls = [
    (f"{_ZENODO_FILES_URL}/{city_name}.zip?download=1", f"{city_name}.zip")
    for city_name in _CITY_NAMES
]

In [5]:
# Extract into the directory that the cleanup and preparation cells of this
# notebook read from ("../extracted_files/<city>/..."); extracting into a
# different folder would leave those cells without any input.
destination_dir = "../extracted_files"

for url, filename in urls:
    try:
        # Download the archive into the working directory ...
        download_file(url, filename)

        # ... and unpack its contents below the destination directory.
        extract_zip(filename, destination_dir)
    finally:
        # The ZIP is only a temporary artefact: delete it whether or not the
        # download/extraction succeeded, so a failed run leaves no partial
        # archive behind. It may not exist if the download failed early.
        if os.path.exists(filename):
            os.remove(filename)

In [11]:
# Remove files that are not needed. For every city, delete
# week.gtfs.zip, week.sqlite and sections.geojson.
cities = [t[1].split(".zip")[0] for t in urls]
print(cities)

n_removed = 0
n_missing = 0
for city in cities:
    for useless_file in ("week.gtfs.zip", "week.sqlite", "sections.geojson"):
        try:
            os.remove("../extracted_files/" + city + "/" + useless_file)
            n_removed += 1
        except FileNotFoundError:
            # Already gone (for instance the cell was run before): count it
            # so the summary below still reveals a wrong path.
            n_missing += 1
        except OSError as e:
            # Best-effort cleanup: report real failures (permissions, ...)
            # without aborting the remaining deletions.
            print(e)
print(f"Removed {n_removed} file(s); {n_missing} were already absent.")

['adelaide', 'belfast', 'berlin', 'bordeaux', 'brisbane', 'canberra', 'detroit', 'dublin', 'grenoble', 'helsinki', 'kuopio', 'lisbon', 'luxembourg', 'melbourne', 'nantes', 'palermo', 'paris', 'prague', 'rennes', 'rome', 'sydney', 'toulouse', 'turku', 'venice', 'winnipeg']


In [None]:
# Remove files that are not needed. For every city, delete
# network_temporal_week.csv, network_temporal_day.csv and sections.geojson.
cities = [t[1].split(".zip")[0] for t in urls]
print(cities)

n_removed = 0
n_missing = 0
for city in cities:
    for useless_file in ("network_temporal_week.csv", "network_temporal_day.csv", "sections.geojson"):
        try:
            os.remove("../extracted_files/" + city + "/" + useless_file)
            n_removed += 1
        except FileNotFoundError:
            # Already gone (for instance the cell was run before): count it
            # so the summary below still reveals a wrong path.
            n_missing += 1
        except OSError as e:
            # Best-effort cleanup: report real failures (permissions, ...)
            # without aborting the remaining deletions.
            print(e)
print(f"Removed {n_removed} file(s); {n_missing} were already absent.")

['adelaide', 'belfast', 'berlin', 'bordeaux', 'brisbane', 'canberra', 'detroit', 'dublin', 'grenoble', 'helsinki', 'kuopio', 'lisbon', 'luxembourg', 'melbourne', 'nantes', 'palermo', 'paris', 'prague', 'rennes', 'rome', 'sydney', 'toulouse', 'turku', 'venice', 'winnipeg']


## Prepare datasets

In [None]:
import pandas as pd
import os

# Build one edge list per city from its network_combined.csv.
cities = [t[1].split(".zip")[0] for t in urls]

# Width of the multi-hot label: only route_type values in
# [0, max_nr_transport_mode) get a slot.
# NOTE(review): the original remark says this number came from
# combined_df['route_type'].nunique() (Berlin being the highest). That is a
# count of distinct values, not the largest value, so any route_type >= 5 or
# < 0 is silently left out of the label - confirm this is intended.
max_nr_transport_mode = 5


def convert_route_type(route_list):
    """Turn a collection of route types into a fixed-length 0/1 list.

    Position ``i`` of the result is 1 when ``i`` occurs in ``route_list``.
    Values outside ``[0, max_nr_transport_mode)`` are ignored.

    Args:
        route_list: Iterable of numeric route-type codes.

    Returns:
        A list of ``max_nr_transport_mode`` integers, each 0 or 1.
    """
    new_list = [0] * max_nr_transport_mode
    for val in route_list:
        if 0 <= val < max_nr_transport_mode:
            # int() so that numpy integers/floats are usable as a list index.
            new_list[int(val)] = 1
    return new_list


def build_edge_list(combined_df):
    """Collapse a combined network table to one row per stop pair.

    Rows are grouped by ``(from_stop_I, to_stop_I)``. For every pair the
    result holds the multi-hot ``label`` of the route types seen on that
    pair and the mean of ``d`` and of ``n_vehicles``.

    Args:
        combined_df: DataFrame with the columns ``from_stop_I``,
            ``to_stop_I``, ``route_type``, ``d`` and ``n_vehicles``.

    Returns:
        DataFrame with the columns ``from_stop_I``, ``to_stop_I``,
        ``label``, ``d`` and ``n_vehicles``.
    """
    grouped = combined_df.groupby(['from_stop_I', 'to_stop_I'])
    # All three columns share the group index, so pandas aligns them by
    # stop pair rather than by row position.
    return pd.DataFrame({
        'label': grouped['route_type'].unique().apply(convert_route_type),
        'd': grouped['d'].mean(),
        'n_vehicles': grouped['n_vehicles'].mean(),
    }).reset_index()


for city in cities:
    print(f"Processing city: {city}")
    city_dir = f"../extracted_files/{city}"

    # Read the combined network file
    combined_df = pd.read_csv(f"{city_dir}/network_combined.csv", delimiter=";")

    result = build_edge_list(combined_df)

    # Save the result next to its input. The directory necessarily exists
    # because the input was just read from it, so nothing has to be created
    # (the previous os.makedirs(city) only produced an empty "./<city>"
    # folder in the working directory).
    result.to_csv(f"{city_dir}/edge_list.csv", index=False)

    print(f"Processing for {city} completed.\n")


In [None]:
# Copy every city's edge_list.csv into a shared folder called
# "processed_files", where each file is named "<city>_edge_list.csv".
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before writing into it.
os.makedirs("../processed_files", exist_ok=True)
for city in cities:
    print(f"Processing city: {city}")
    # Read the per-city edge list and write it back under its new name
    combined_df = pd.read_csv(f"../extracted_files/{city}/edge_list.csv")
    combined_df.to_csv(f"../processed_files/{city}_edge_list.csv", index=False)
    print(f"Processing for {city} completed.\n")

In [None]:
# Check, for each city, that from_stop_I and to_stop_I hold only integers.
for city in cities:
    print(f"Processing city: {city}")
    # Read the processed edge list
    combined_df = pd.read_csv(f"../processed_files/{city}_edge_list.csv")
    # read_csv only infers an integer dtype when every value of the column
    # parsed as an integer, so one dtype check covers the whole column and
    # reports a problem once instead of once per offending row.
    for stop_column in ('from_stop_I', 'to_stop_I'):
        if not pd.api.types.is_integer_dtype(combined_df[stop_column]):
            print(f"Error in {city} for {stop_column}")
    print(f"Processing for {city} completed.\n")
    