# path4-delay-prediction data generation

Download public transport data of Zurich:
https://data.stadt-zuerich.ch/dataset/vbz-fahrzeiten-ogd

In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
col_names = ['line', 'direction', 'station_diva_id', 'date', 'should_arrive_at', 
              'actually_arrived_at', 'next_station_diva_nach']

Only working with single public transport station to predict the behaviour on single station

In [3]:
targetStationId = 2572

Read weekly data, filter out and append to stationDS (i.e. station dataset)

In [4]:
stationDS = pd.DataFrame(columns=col_names)

csv_names = ['fahrzeiten_soll_ist_20190630_20190706.csv',
             'fahrzeiten_soll_ist_20190623_20190629.csv',
             'fahrzeiten_soll_ist_20190616_20190622.csv', 
             'fahrzeiten_soll_ist_20190609_20190615.csv',
             'fahrzeiten_soll_ist_20190602_20190608.csv',
             'fahrzeiten_soll_ist_20190526_20190601.csv',
             'fahrzeiten_soll_ist_20190519_20190525.csv',
             'fahrzeiten_soll_ist_20190512_20190518.csv',
             'fahrzeiten_soll_ist_20190505_20190511.csv',
             'fahrzeiten_soll_ist_20190428_20190504.csv',
             'fahrzeiten_soll_ist_20190421_20190427.csv',
             'fahrzeiten_soll_ist_20190414_20190420.csv']

for name in csv_names:
    dataSet = pd.read_csv('data/' + name)
    # filter required columns
    ds = dataSet.loc[:, ['linie', 'richtung', 'halt_diva_von', 'datum_von', 'soll_an_von', 'ist_an_von', 'halt_diva_nach']]
    # filter data for only the target station
    ds = ds.loc[ds['halt_diva_von'] == targetStationId]
    # rename column name to English
    ds.columns = col_names
    # Calculate delays
    ds['delays'] = ds['actually_arrived_at'] - ds['should_arrive_at']
    # Filter out rows with delay 0 <= dealy < 200
    ds = ds.loc[(ds['delays'] >= 0) & (ds['delays'] < 200)]
    # Append to stationDS
    stationDS = stationDS.append(ds.copy())
    # Release memory
    del ds

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [5]:
print('stationDS shape: ', stationDS.shape)
# sort data by date and expected arrival time
stationDS['date'] = pd.to_datetime(stationDS['date'], format='%d.%m.%y')
stationDS = stationDS.sort_values(by=['date', 'should_arrive_at'], ascending=True)
stationDS.head(10)

stationDS shape:  (82874, 8)


Unnamed: 0,actually_arrived_at,date,delays,direction,line,next_station_diva_nach,should_arrive_at,station_diva_id
231946,16832,2019-04-14,32.0,1,12,3447,16800,2572
105772,18190,2019-04-14,136.0,2,14,3447,18054,2572
1318815,18293,2019-04-14,11.0,1,10,2151,18282,2572
783140,18332,2019-04-14,8.0,1,62,3154,18324,2572
345655,18415,2019-04-14,31.0,1,75,694,18384,2572
951210,18758,2019-04-14,26.0,1,14,2151,18732,2572
467128,18866,2019-04-14,14.0,1,11,3034,18852,2572
118602,18904,2019-04-14,10.0,2,14,3447,18894,2572
486756,18961,2019-04-14,7.0,1,15,3034,18954,2572
402413,19445,2019-04-14,41.0,1,75,694,19404,2572


Save to file for reference

In [6]:
fileToSave = 'data/generated/station_' + str(targetStationId) + '.csv'
stationDS.to_csv(fileToSave, encoding='utf-8', index=False)