In [19]:
import pandas as pd
import numpy as np
from datetime import datetime
import json
from tqdm.notebook import tqdm

In [4]:
# One TTC bus-delay workbook per year, 2016 through 2024 inclusive.
bus_prefix = "./data/delays/ttc-bus-delay-data-"
file_names = [f"{bus_prefix}{year}.xlsx" for year in range(2016, 2025)]

In [None]:
def combine_all_sheets_to_csv(file_names, output_csv="combined.csv"):
    """
    Combine every sheet of every Excel workbook in file_names into a single CSV file.

    The first row of each sheet is read as its header and then replaced by a fixed
    list of ten column names, so that all sheets share identical columns. The sheets
    are stacked vertically and written to output_csv with that header row and
    without the DataFrame index.

    Workbooks that cannot be opened, and sheets that cannot be read or whose column
    count does not match the ten expected names, are reported with print() and skipped.

    Parameters:
        file_names (list of str): List of paths to Excel files.
        output_csv (str): The path for the output CSV file.

    Returns:
        None. Nothing is written when no sheet could be read.
    """
    column_data = ['Report Date','Route','Time','Day','Location','Incident', 'Min Delay','Min Gap','Direction','Vehicle']

    all_data = []

    for file in file_names:
        # Open the workbook; a bad path or unreadable file only skips this workbook.
        try:
            excel_file = pd.ExcelFile(file)
        except Exception as e:
            print(f"Error opening file '{file}': {e}")
            continue

        # The context manager closes the workbook handle once all its sheets are read.
        with excel_file:
            for sheet in excel_file.sheet_names:
                try:
                    # The first row is the sheet's own header; overwrite it with the shared names.
                    sheet_df = pd.read_excel(excel_file, sheet_name=sheet, header=0)
                    # Raises (and the sheet is skipped) when the sheet does not have ten columns.
                    sheet_df.columns = column_data
                    all_data.append(sheet_df)
                except Exception as e:
                    print(f"Error reading sheet '{sheet}' in file '{file}': {e}")
                    continue

    if not all_data:
        print("There was no data in any file given or the passed file_name vector was empty")
        return

    # Every frame already carries column_data as its columns, so a plain vertical concat suffices.
    combined_df = pd.concat(all_data, ignore_index=True)

    try:
        # Export to CSV with the shared column names as the header row and without the index.
        combined_df.to_csv(output_csv, header=column_data, index=False)
        print(f"Combined data successfully exported to '{output_csv}'.")
    except Exception as e:
        print(f"Error exporting data to CSV: {e}")

In [None]:
# Merge every sheet of the yearly workbooks into one CSV file.
# NOTE: date/time processing must be done locally in Excel (per the original author,
# it nearly crashed their computer).
combine_all_sheets_to_csv(
    file_names,
    output_csv="./data/delays/bus-delay-data-2016-2024.csv",
)

Combined data successfully exported to './data/delays/bus-delay-data-2016-2024.csv'.


In [None]:
def pre_processing_full_ttc_csv(file_name, dict_file_name = "info.json"): 
    '''
    Normalise a TTC delay CSV file and collect the unique values of its text columns.

    Every string value in the file is lower-cased and the file is then overwritten in
    place (without the DataFrame index, so repeated runs do not add extra columns).
    For each column that contains at least one string, except 'DateTime', 'Report Date',
    'Time' and 'Day', the unique values are gathered into a dictionary and printed so
    they can be inspected manually for further cleaning.

    Input: file name of a TTC Delay CSV File; (optional) file name of a data_dict dump file, if empty it is defaulted to 'info.json'
    Output: returns a dictionary mapping column name -> list of unique values (missing values appear as NaN)

    Note: the Data Dictionary is also stored as a JSON file for later use if needed
    '''
    # Date/time-like columns hold strings but are not categories worth listing.
    skipped_columns = {'DateTime', 'Report Date', 'Time', 'Day'}

    df = pd.read_csv(file_name)
    data_dict = {}
    for col in df.columns:
        df[col] = df[col].apply(lambda x: x.lower() if isinstance(x, str) else x)
        if col in skipped_columns:
            continue
        # Inspect the whole column rather than a single row: one row may be NaN,
        # and a short file may not even have a row labelled 1.
        if df[col].apply(lambda x: isinstance(x, str)).any():
            data_dict[col] = df[col].unique().tolist()
            print(data_dict[col])

    # index expects a bool; writing the RangeIndex would add an unnamed column on every run.
    df.to_csv(file_name, index=False)

    with open(dict_file_name, "w") as outfile:
        json.dump(data_dict, outfile)

    return data_dict



In [49]:
# Pre-process the delay CSV and keep its data dictionary (also dumped to JSON).
data_dict = pre_processing_full_ttc_csv(
    "./data/delays/bus-delay-final copy.csv",
    dict_file_name="./data/delays/bus-delay-data.json",
)

  df = pd.read_csv(file_name)


['longbranch loop' 'kipling stn' 'royal york at romney rd' ...
 '16 lebovic' 'runnymede station (out' 'sloane abd tinder cres']
['mechanical' 'investigation' 'utilized off route' 'late leaving garage'
 'general delay' 'diversion' 'emergency services' 'vision' nan
 'late leaving garage - operator' 'late leaving garage - mechanical'
 'late leaving garage - management' 'late leaving garage - vision'
 'management' 'operations - operator' 'cleaning' 'security'
 'collision - ttc' 'road blocked - non-ttc collision'
 'road block - non-ttc collision' 'roadblock by collision - non-ttc'
 'securitty' 'late entering service - mechanical' 'utilizing off route'
 'held by' 'late leaving garage - operations' 'e' 'late entering service'
 'collision - ttc ' 'operations' 'cleaning - unsanitary'
 'cleaning - disinfection' 'collision - ttc involved' 'late' 'overhead'
 'rail/switches']
['n/b' 'sb' 'nb' 'e/b' 'e' 'w' 'w/b' 's/b' 's' "b/w's" 'n' 'eb' 'wb' 'b/w'
 'bw' 'down' nan 'ob' 'up' 'bothways' 'b' 'r' 'e/

TypeError: Object of type ndarray is not JSON serializable