In [None]:
from datetime import datetime
import pandas as pd
import json
import glob
import os
from tools import save_to_json
pd.set_option('display.max_columns', None)

In [None]:
# Parser for 'YYYY-MM-DDTHH:MM:SS' strings; not referenced by any cell shown in this notebook.
dateparse = lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S')
# Raw daily exports, handled in lexicographic path order.
all_csv = sorted(glob.glob('../data/actual_data/raw/*.csv')) # TODO

li = []
for filename in all_csv:
    print(filename)
    daily = pd.read_csv(
        filename,
        sep=';',
        parse_dates=['Arrival time', 'Arrival forecast', 'Departure time', 'Departure forecast'],
    )
    # The delay computed later is (Arrival forecast - Arrival time), so both must be present.
    daily = daily.dropna(axis=0, subset=["Arrival time", "Arrival forecast"])
    # Stop names are cut to their first 30 characters.
    daily['Stop name'] = daily['Stop name'].str.slice(0,30)
    # The 10 characters in front of the 4-character extension are parsed as the file's date.
    currentDay = datetime.strptime(filename[-14:-4], '%Y-%m-%d')
    daily['currentDay'] = currentDay
    # Within a file, order the stops of each journey chronologically.
    li.append(daily.sort_values(by=["Journey identifier", "Arrival time"]))
# One frame for all files, with a fresh 0..n-1 index.
df = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Inspect the frame obtained by concatenating all daily files.
df

## Train station position


In [None]:
df_all_stops = pd.read_csv('../data/raw/stops.csv', sep=',')
# Restrict the stop list to names that also occur in the actual data (per the original
# note this removes the bus stations), then keep one row per stop name.
# NOTE(review): df['Stop name'] is cut to 30 characters when loaded, whereas stop_name is
# compared untruncated here, so longer names can never match — confirm this is intended.
names_in_actual_data = df['Stop name'].unique()
df_stops = (
    df_all_stops[df_all_stops['stop_name'].isin(names_in_actual_data)]
    .drop_duplicates(subset="stop_name")
)
# Stop name -> coordinates, in the shape written to stops.json.
stops = {
    stop['stop_name']: {'lat': stop['stop_lat'], 'lng': stop['stop_lon']}
    for _, stop in df_stops.iterrows()
}
save_to_json('../web/data/stops.json', stops)

## Retards par ligne

- Pour chaque row
  - Si le journey identifier est le même que le précédent
    - Prendre la gare des deux row, et y associer le retard de la 2ème row


In [None]:
# Put each record side by side with the record directly below it: shift(-1) moves every
# row up by one position and the 'next_' prefix keeps its columns apart from the current
# row's. The last row has no successor, so its next_* values are missing.
following = df.shift(-1).add_prefix('next_')
df_merged = pd.concat([df, following], axis=1)

In [None]:
class DelayForLine:
    """Running delay tally for the section between two stations.

    All state lives in plain instance attributes so that ``__dict__`` can be
    dumped to JSON (see ``__repr__``).
    """
    stationA: str
    stationB: str
    nbTrain: int
    nbDelayed: int
    totalDelay: int
    nbCancelled: int

    def __init__(self, stations):
        """Create an empty tally for ``stations[0]`` and ``stations[1]``."""
        first, second = stations[0], stations[1]
        self.stationA = first
        self.stationB = second
        # All counters start at zero (assigned left to right).
        self.nbTrain = self.nbDelayed = self.totalDelay = self.nbCancelled = 0

    def add(self, is_delayed, delay, cancelled):
        """Register one observation.

        Args:
            is_delayed: Added to ``nbDelayed`` (a boolean counts as 0 or 1).
            delay: Added to ``totalDelay``.
            cancelled: Added to ``nbCancelled`` (a boolean counts as 0 or 1).
        """
        self.nbTrain += 1
        self.nbDelayed += is_delayed
        self.totalDelay += delay # TODO maybe only if is_delayed is True
        self.nbCancelled += cancelled

    def __repr__(self):
        """Return the class label followed by the attributes as indented, key-sorted JSON."""
        def attributes_of(obj):
            return obj.__dict__

        as_json = json.dumps(self, default=attributes_of, sort_keys=True, indent=4)
        return f"DelayForLine: {as_json}"

In [None]:
# Spot check: every paired row of one journey on one day of operation.
on_sample_day = df_merged['Day of operation'] == '2022-10-26'
is_sample_journey = df_merged['Journey identifier'] == '85:11:1077:001'
test = df_merged.loc[on_sample_day & is_sample_journey]
test

In [None]:
def removeLinesWithNoTrains(delays, min_trains=30):
    """Drop the entries that were observed too rarely.

    Args:
        delays: Mapping of key -> object exposing an ``nbTrain`` counter.
        min_trains: An entry is kept only when its ``nbTrain`` is strictly
            greater than this value. Defaults to 30, the threshold that was
            previously hard-coded.

    Returns:
        A new dict holding the surviving entries; ``delays`` is left untouched.
    """
    return {line: info for line, info in delays.items() if info.nbTrain > min_trains}

def removeLinesWithNoTrains2(delays, min_trains=30):
    """Drop rarely observed entries from a two-level mapping.

    Args:
        delays: Mapping of outer key (e.g. weekday or hour) -> mapping of
            inner key -> object exposing an ``nbTrain`` counter.
        min_trains: An inner entry is kept only when its ``nbTrain`` is
            strictly greater than this value. Defaults to 30, the threshold
            that was previously hard-coded.

    Returns:
        A new two-level dict. Every outer key is preserved, even when all of
        its inner entries were dropped; ``delays`` is left untouched.
    """
    return {
        group: {line: info for line, info in lines.items() if info.nbTrain > min_trains}
        for group, lines in delays.items()
    }
    

In [None]:
# Tally of delays per track section, keyed by 'stationA|stationB' (names sorted).
delay_by_line = {}

for _, row in df_merged.iterrows():
    # A row and its successor describe a track section only if both belong to the same
    # run of a train: same journey identifier AND same day of operation (the identifier
    # alone is reused from one day to the next).
    if row['Journey identifier'] != row['next_Journey identifier']:
        continue
    if row['Day of operation'] != row['next_Day of operation']:
        continue
    # The section gets the figures of the 2nd row of the pair (arrival at the far end),
    # as the notes of this section state: "y associer le retard de la 2ème row".
    # Previously the first row's own arrival figures were used.
    is_delayed = row['next_Arrival delay']
    delay = abs(row['next_Arrival forecast'] - row['next_Arrival time']).total_seconds()
    cancelled = row['next_Cancelled TF']
    # Sorting the two names makes A->B and B->A share a single key.
    sorted_stations = sorted([row['Stop name'], row['next_Stop name']])
    line = '|'.join(sorted_stations)
    if line not in delay_by_line:
        delay_by_line[line] = DelayForLine(sorted_stations)
    delay_by_line[line].add(is_delayed, delay, cancelled)


In [None]:
# Drop the rarely observed sections (see removeLinesWithNoTrains) and write the remaining
# tallies to ../web/data/delay_by_line.json.
save_to_json('../web/data/delay_by_line.json', removeLinesWithNoTrains(delay_by_line))

## Delay by line by weekday

In [None]:
# Tally of delays per weekday (int from datetime.weekday()) and per track section.
delay_by_line_by_day = {}

for _, row in df_merged.iterrows():
    # A row and its successor describe a track section only if both belong to the same
    # run of a train: same journey identifier AND same day of operation (the identifier
    # alone is reused from one day to the next).
    if row['Journey identifier'] != row['next_Journey identifier']:
        continue
    if row['Day of operation'] != row['next_Day of operation']:
        continue
    # The section gets the figures of the 2nd row of the pair (arrival at the far end),
    # as the per-line notes of this notebook state: "y associer le retard de la 2ème row".
    # Previously the first row's own arrival figures were used.
    is_delayed = row['next_Arrival delay']
    delay = abs(row['next_Arrival forecast'] - row['next_Arrival time']).total_seconds()
    cancelled = row['next_Cancelled TF']
    # Sorting the two names makes A->B and B->A share a single key.
    sorted_stations = sorted([row['Stop name'], row['next_Stop name']])
    line = '|'.join(sorted_stations)
    # currentDay is the date taken from the name of the file the row was read from.
    weekday = row['currentDay'].weekday()
    lines_of_day = delay_by_line_by_day.setdefault(weekday, {})
    if line not in lines_of_day:
        lines_of_day[line] = DelayForLine(sorted_stations)
    lines_of_day[line].add(is_delayed, delay, cancelled)

In [None]:
# Debug output: dump the unfiltered per-weekday tallies (uses DelayForLine.__repr__).
print(delay_by_line_by_day)

In [None]:
# Filter once, show the result, then write that same filtered mapping to disk.
filtered_by_day = removeLinesWithNoTrains2(delay_by_line_by_day)
print(filtered_by_day)
save_to_json('../web/data/delay_by_line_by_day.json', filtered_by_day)

## Delay by line by hour

In [None]:
# Tally of delays per hour of day and per track section.
delay_by_line_by_hour = {}

for _, row in df_merged.iterrows():
    # A row and its successor describe a track section only if both belong to the same
    # run of a train: same journey identifier AND same day of operation (the identifier
    # alone is reused from one day to the next).
    if row['Journey identifier'] != row['next_Journey identifier']:
        continue
    if row['Day of operation'] != row['next_Day of operation']:
        continue
    # The section gets the figures of the 2nd row of the pair (arrival at the far end),
    # as the per-line notes of this notebook state: "y associer le retard de la 2ème row".
    # Previously the first row's own arrival figures were used.
    is_delayed = row['next_Arrival delay']
    delay = abs(row['next_Arrival forecast'] - row['next_Arrival time']).total_seconds()
    cancelled = row['next_Cancelled TF']
    # Sorting the two names makes A->B and B->A share a single key.
    sorted_stations = sorted([row['Stop name'], row['next_Stop name']])
    line = '|'.join(sorted_stations)
    # Bucket by the hour of the same arrival the delay was measured on.
    hour = row['next_Arrival forecast'].hour
    lines_of_hour = delay_by_line_by_hour.setdefault(hour, {})
    if line not in lines_of_hour:
        lines_of_hour[line] = DelayForLine(sorted_stations)
    lines_of_hour[line].add(is_delayed, delay, cancelled)

In [None]:
# Drop the rarely observed sections within each hour (see removeLinesWithNoTrains2) and
# write the remaining tallies to ../web/data/delay_by_line_by_hour.json.
save_to_json('../web/data/delay_by_line_by_hour.json', removeLinesWithNoTrains2(delay_by_line_by_hour))

## Retards par gare

Pour chaque row, associer le retard à la gare de cette row.


In [None]:
class DelayForStation:
    """Running delay tally for the stops recorded at one station.

    All state lives in plain instance attributes so that ``__dict__`` can be
    dumped to JSON (see ``__repr__``).
    """
    stationName: str
    # TODO: a geo-position attribute was considered; maybe unnecessary since the station
    # infos are already available.
    nbDelayed: int
    nbTrain: int
    totalDelay: int
    nbCancelled: int

    def __init__(self, stationName):
        """Create an empty tally for the station called ``stationName``."""
        self.stationName = stationName
        self.nbTrain = 0
        self.nbDelayed = 0
        self.totalDelay = 0
        self.nbCancelled = 0

    def add(self, is_delayed, delay, cancelled):
        """Register one observation.

        Args:
            is_delayed: Added to ``nbDelayed`` (a boolean counts as 0 or 1).
            delay: Added to ``totalDelay``.
            cancelled: Added to ``nbCancelled`` (a boolean counts as 0 or 1).
        """
        self.nbTrain += 1
        self.nbDelayed += is_delayed
        self.totalDelay += delay # TODO maybe only if is_delayed is True
        self.nbCancelled += cancelled

    def __repr__(self):
        """Return the class label followed by the attributes as indented, key-sorted JSON."""
        # The label names this class; it used to read "DelayForLine: ", a copy-paste slip.
        return "DelayForStation: " + json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)

In [None]:
# Station name -> DelayForStation tally, fed with one observation per stop record.
delay_by_station = {}

for _, record in df_merged.iterrows():
    late_flag = record['Arrival delay']
    # Absolute gap between forecast and timetable arrival, in seconds.
    seconds_off = abs(record['Arrival forecast'] - record['Arrival time']).total_seconds()
    stationName = record['Stop name']
    was_cancelled = record['Cancelled TF']
    tally = delay_by_station.get(stationName)
    if tally is None:
        tally = DelayForStation(stationName)
        delay_by_station[stationName] = tally
    tally.add(late_flag, seconds_off, was_cancelled)

In [None]:
# Write the per-station tallies, unfiltered, to ../web/data/delay_by_station.json.
save_to_json('../web/data/delay_by_station.json', delay_by_station)

## Retards par gare et par jour de la semaine


In [None]:
# Weekday (int from datetime.weekday()) -> station name -> DelayForStation tally.
delay_by_station_by_day = {}

for _, record in df_merged.iterrows():
    late_flag = record['Arrival delay']
    # Absolute gap between forecast and timetable arrival, in seconds.
    seconds_off = abs(record['Arrival forecast'] - record['Arrival time']).total_seconds()
    stationName = record['Stop name']
    was_cancelled = record['Cancelled TF']
    # currentDay is the date taken from the name of the file the record was read from.
    weekday = record['currentDay'].weekday()
    stations_of_day = delay_by_station_by_day.setdefault(weekday, {})
    tally = stations_of_day.get(stationName)
    if tally is None:
        tally = DelayForStation(stationName)
        stations_of_day[stationName] = tally
    tally.add(late_flag, seconds_off, was_cancelled)

In [None]:
# Write the per-weekday, per-station tallies, unfiltered, to
# ../web/data/delay_by_station_by_day.json.
# NOTE(review): the outer keys are ints; how save_to_json serializes them is not visible here.
save_to_json('../web/data/delay_by_station_by_day.json', delay_by_station_by_day)

## Retards par gare et par heure


In [None]:
# Hour of the forecast arrival -> station name -> DelayForStation tally.
delay_by_station_by_hour = {}

for _, record in df_merged.iterrows():
    late_flag = record['Arrival delay']
    # Absolute gap between forecast and timetable arrival, in seconds.
    seconds_off = abs(record['Arrival forecast'] - record['Arrival time']).total_seconds()
    stationName = record['Stop name']
    was_cancelled = record['Cancelled TF']
    hour = record['Arrival forecast'].hour
    stations_of_hour = delay_by_station_by_hour.setdefault(hour, {})
    tally = stations_of_hour.get(stationName)
    if tally is None:
        tally = DelayForStation(stationName)
        stations_of_hour[stationName] = tally
    tally.add(late_flag, seconds_off, was_cancelled)

In [None]:
# Write the per-hour, per-station tallies, unfiltered, to
# ../web/data/delay_by_station_by_hour.json.
# NOTE(review): the outer keys are ints; how save_to_json serializes them is not visible here.
save_to_json('../web/data/delay_by_station_by_hour.json', delay_by_station_by_hour)