In [293]:
# load libraries
import pandas as pd
import os, re
import datetime as dt
from sqlalchemy import create_engine, text
import zipfile
import numpy as np

# General setup

In [294]:
# define paths
workingdir = "/home/jupyter-maita.schade/VW_Data_Hub/Gap_Map/"
#storagedir = "smb://192.168.90.30/allmende%20verkehr/4%20Projekte/2%20Projekte%20Mobilitätswende/ÖV-Deutschlandkarte%20(Gap-Map)/Berechnungen/raw/gtfs/"



In [296]:
def readStopTimes(zf, trips_df = None, stop_ids = None):
    # This is NOT SAFE if not filtering down quite significantly!
    # options to filter by stop_id, or pre-filtered trips
    start = dt.datetime.now()
    #     index_start = 1

    print("\t...reading stop_times")
    if not stop_ids is None:
        print("\t......filtering stops")
    if not trips_df is None:
        print("\t......filtering trips")
        
    
    if ((not stop_ids is None) and (trips_df is None)):
        reader = pd.read_csv(zf.open("stop_times.txt"), chunksize=200000, iterator=True, encoding='utf-8')
        return(pd.concat([r[r.stop_id.isin(stop_ids)] for r in reader]))
    else:
        chunksize = 200000
        j = 0
    
        result_df = pd.DataFrame()
        for df in pd.read_csv(zf.open("stop_times.txt"), chunksize=chunksize, iterator=True, encoding='utf-8'):

            # just keeping track...
            j+=1
            if j%100==0:
                print('\t{} seconds: completed {} rows'.format((dt.datetime.now() - start).seconds, j*chunksize))

            # putting things together

            # filtering by stop_ids if given--this would be more efficient if separated out
            if not stop_ids is None:
                df = df[df.stop_id.isin(stop_ids)]
            if not trips_df is None:
                df = df.merge(trips_df, on = "trip_id", how="inner")

            result_df = pd.concat([result_df, df])
        #     print("\t...", len(result_df))
    #         if j==1:
    #             result_df.to_sql("stop_counts", dbout, if_exists='replace')
    #         else:
    #             result_df.to_sql("stop_counts", dbout, if_exists='append')

        return(result_df)

In [297]:
def readTrips(zf):
    print("\t...reading trips")
    trips_df = pd.read_csv(zf.open("trips.txt"), usecols = ["route_id","trip_id","service_id"])
    print("\t...", len(trips_df))
    return(trips_df)

# 1st Approach

This seems to give really wrong results.

In [45]:
# find the ids
nstops = pd.read_csv( outdir + "2020_reissue_2.nstops.csv")
# nstops[nstops.stop_name.str.contains("Boddinstr")]
# list(nstops[nstops.stop_name.str.contains("Boddinstr")].stop_id)
stop_ids = [306217, 621724, 638469, 647211, 717673] #[85753, 202891, 496613, 717506]
time_frame = [20200514, 20200520]

Get, combine, and count calendar

In [35]:
def interveningWeekdays(start, end, inclusive=True, weekdays=[0, 1, 2, 3, 4]):
    # a useful function from Stackoverflow, to count particular weekdays in date range
    if isinstance(start, dt.datetime):
        start = start.date()               # make a date from a datetime

    if isinstance(end, dt.datetime):
        end = end.date()                   # make a date from a datetime

    if end < start:
        # you can opt to return 0 or swap the dates around instead
        # raise ValueError("start date must be before end date")
        end, start = start, end

    if inclusive:
        end += dt.timedelta(days=1)  # correct for inclusivity

    try:
        # collapse duplicate weekdays
        weekdays = {weekday % 7 for weekday in weekdays}
    except TypeError:
        weekdays = [weekdays % 7]
        
#     print(weekdays)

    ref = dt.date.today()                    # choose a reference date
#     print(ref)
    ref -= dt.timedelta(days=ref.weekday())  # and normalize its weekday

    return sum((ref_plus - start).days // 7 - (ref_plus - end).days // 7
               for ref_plus in
               (ref + dt.timedelta(days=weekday) for weekday in weekdays))


def filterCalendar(calendar_df, time_frame):
    '''Given a start and end date (list of two integers yyyymmdd), filters 
    calendar_df to only schedules overlapping with those dates, and trun-
    cates calendar start- and end dates to the time window'''
    start_date = min(time_frame)
    end_date = max(time_frame)
    filtered_calendar = calendar_df[
        ((calendar_df.start_date <= end_date) & (calendar_df.end_date >= start_date))
    ]
    filtered_calendar.loc[filtered_calendar.start_date<start_date, 'start_date'] = start_date
    filtered_calendar.loc[filtered_calendar.end_date>end_date, 'end_date'] = end_date
    return(filtered_calendar)

### Helper function to count days 
def countDaysInIntervalHelper(calendarrow):
    # function to find number of days of service operation based on calendars.txt-entry
    servicepattern = calendarrow.loc["monday":"sunday"].to_numpy()
    servicedays = servicepattern.nonzero()[0].tolist()
    startdate = dt.datetime.strptime(str(int(calendarrow.get("start_date"))),"%Y%m%d")
    enddate = dt.datetime.strptime(str(int(calendarrow.get("end_date"))),"%Y%m%d")
    return(interveningWeekdays(startdate, enddate, weekdays = servicedays))

### Helper function to compare dates
def isInIntervalHelper(n, interval):
    '''works only on ARRAY-like n'''
    return(np.where((n <= max(interval)) & (n >= min(interval)), True, False))

def getServiceCount(zf = zf, time_frame = None):
    # enriches stop_times DataFrame with information about how often in the feed
    # period each stop is made
    # if time_frame is given, truncates the calendar to only the dates given in time_frame
    

    print("Getting number of service days for each service")
    # use service_id to find service...
    

    # get regular service from calendar.txt
    print("\t...reading regular service calendars")
    calendar_df = pd.read_csv(zf.open("calendar.txt"))    
    # trim the calendar to fit into time_frame
    if not time_frame is None:
        calendar_df = filterCalendar(calendar_df, time_frame)
    calendar_df["days_count"] = calendar_df.apply(countDaysInIntervalHelper, axis=1)
    print(len(calendar_df))

#     calendar_df.to_sql("calendar",db, if_exists = "replace")
    # and get exceptions from calendar_dates.txt

    print("\t...reading calendar exceptions")
    calendar_dates_df = pd.read_csv(zf.open("calendar_dates.txt"))
    # we are only concerned with exceptions in the window of interest 
    if not time_frame is None:
        calendar_dates_df = calendar_dates_df[isInIntervalHelper(calendar_dates_df.date, time_frame)]
    

    print("\t...aggregating calendar")
    calendar_df = calendar_dates_df.groupby(["service_id", "exception_type"], as_index=False
                              ).count(
                            ).pivot(index = "service_id", columns = "exception_type", values = "date"
                            ).reset_index(
                            ).merge(calendar_df, on="service_id", how="outer"
                            ).loc[:,'service_id':'days_count']

    
    print("\t...calculating total in calendar")
    if 1 in calendar_df.columns:
        calendar_df.days_count= (calendar_df.days_count.fillna(0) + calendar_df.loc[:,1].fillna(0))
    if 2 in calendar_df.columns:
        calendar_df.days_count= (calendar_df.days_count.fillna(0) - calendar_df.loc[:,2].fillna(0))
    if(any(calendar_df.days_count < 0)):
        print("SOMETHING WENT WRONG--NEGATIVE DAY COUNT(S)")
        
    return(calendar_df) #[["service_id","days_count"]])

counted_calendar_df = getServiceCount(zf = zf, time_frame = time_frame)

Getting number of service days for each service
	...reading regular service calendars
697
	...reading calendar exceptions
	...aggregating calendar
	...calculating total in calendar


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


Get Trips, filtered by time_frame (via calendar)

In [5]:
def filterTrips(trips_df, counted_calendar_df):
    return(trips_df.merge(counted_calendar_df[["service_id", "days_count"]], on="service_id", how="right"))
    
trips_df = filterTrips(readTrips(zf = zf), counted_calendar_df)

	...reading trips
	... 3517922


Get stop_times, filtered by time_frame (via trips) AND stop_id

In [21]:
stop_times_df = readStopTimes(trips_df, stop_ids=stop_ids)

	...filtering stop_times with trips
	2 seconds: completed 2000000 rows
	6 seconds: completed 4000000 rows
	9 seconds: completed 6000000 rows
	12 seconds: completed 8000000 rows
	15 seconds: completed 10000000 rows
	18 seconds: completed 12000000 rows
	21 seconds: completed 14000000 rows
	23 seconds: completed 16000000 rows
	26 seconds: completed 18000000 rows
	29 seconds: completed 20000000 rows
	31 seconds: completed 22000000 rows
	34 seconds: completed 24000000 rows
	37 seconds: completed 26000000 rows
	40 seconds: completed 28000000 rows
	42 seconds: completed 30000000 rows
	45 seconds: completed 32000000 rows
	48 seconds: completed 34000000 rows
	51 seconds: completed 36000000 rows
	53 seconds: completed 38000000 rows
	56 seconds: completed 40000000 rows
	59 seconds: completed 42000000 rows
	61 seconds: completed 44000000 rows
	64 seconds: completed 46000000 rows
	67 seconds: completed 48000000 rows
	70 seconds: completed 50000000 rows
	72 seconds: completed 52000000 rows
	75 secon

In [19]:
len(stop_times_df)

1483

## 2020

In [82]:
# Welches Jahr?
jahr = "2020"
# Welcher Zip?
zipname = "2020_reissue_2"
# Welche Routenreferenz? (im raw-directory)

# define paths

# constructed paths
# rawdir = workingdir + "raw/"
rawdir = workingdir + "raw/"
rawdatadir = rawdir + "gtfs/" + jahr + "/"
outdir = workingdir + "out/"+jahr+"/"
#inpath = "{0}{1}_{2}.db".format(rawdatadir,jahr,datum)
zippath = rawdatadir + zipname + ".zip"


# set up zip file as default for functions
zf = zipfile.ZipFile(zippath) # this is the raw stuff

In [109]:
# file to get stop ids
nstops = pd.read_csv( outdir + "2020_reissue_2.nstops.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/home/jupyter-maita.schade/VW_Data_Hub/Gap_Map/out/2021/2020_reissue_2.nstops.csv'

### Gießen

In [83]:
nstops[nstops.stop_name.str.contains("Gießen Behördenzentrum")]
stop_ids = list(nstops[nstops.stop_name.str.contains("Gießen Behördenzentrum")].stop_id)
time_frame = [20200314, 20200314]

trips_df = filterTrips(readTrips(zf = zf), getServiceCount(zf = zf, time_frame = time_frame))
stop_times_df = readStopTimes(trips_df, stop_ids=stop_ids)

stop_times_df.days_count.sum()

	...reading trips
	... 3517922
Getting number of service days for each service
	...reading regular service calendars
9993
	...reading calendar exceptions
	...aggregating calendar
	...calculating total in calendar
	...filtering stop_times with trips
	2 seconds: completed 2000000 rows
	5 seconds: completed 4000000 rows
	7 seconds: completed 6000000 rows
	10 seconds: completed 8000000 rows
	13 seconds: completed 10000000 rows
	16 seconds: completed 12000000 rows
	18 seconds: completed 14000000 rows
	21 seconds: completed 16000000 rows
	23 seconds: completed 18000000 rows
	26 seconds: completed 20000000 rows
	28 seconds: completed 22000000 rows
	31 seconds: completed 24000000 rows
	34 seconds: completed 26000000 rows
	36 seconds: completed 28000000 rows
	39 seconds: completed 30000000 rows
	41 seconds: completed 32000000 rows
	44 seconds: completed 34000000 rows
	46 seconds: completed 36000000 rows
	49 seconds: completed 38000000 rows
	51 seconds: completed 40000000 rows
	54 seconds: compl

44.0

In [84]:
nstops[nstops.stop_name.str.contains("Gießen Graudenzer")]
stop_ids = list(nstops[nstops.stop_name.str.contains("Gießen Graudenzer")].stop_id)
time_frame = [20200520, 20200520]

trips_df = filterTrips(readTrips(zf = zf), getServiceCount(zf = zf, time_frame = time_frame))
stop_times_df = readStopTimes(trips_df, stop_ids=stop_ids)

stop_times_df.days_count.sum()

	...reading trips
	... 3517922
Getting number of service days for each service
	...reading regular service calendars
8710
	...reading calendar exceptions
	...aggregating calendar
	...calculating total in calendar
	...filtering stop_times with trips
	2 seconds: completed 2000000 rows
	4 seconds: completed 4000000 rows
	7 seconds: completed 6000000 rows
	9 seconds: completed 8000000 rows
	12 seconds: completed 10000000 rows
	14 seconds: completed 12000000 rows
	17 seconds: completed 14000000 rows
	19 seconds: completed 16000000 rows
	22 seconds: completed 18000000 rows
	24 seconds: completed 20000000 rows
	27 seconds: completed 22000000 rows
	29 seconds: completed 24000000 rows
	32 seconds: completed 26000000 rows
	34 seconds: completed 28000000 rows
	37 seconds: completed 30000000 rows
	39 seconds: completed 32000000 rows
	42 seconds: completed 34000000 rows
	44 seconds: completed 36000000 rows
	47 seconds: completed 38000000 rows
	49 seconds: completed 40000000 rows
	51 seconds: comple

51.0

### Heidelberg

In [85]:
nstops[nstops.stop_name.str.contains("Alte Brücke Nord")]
stop_ids = list(nstops[nstops.stop_name.str.contains("Alte Brücke Nord")].stop_id)
time_frame = [20200314, 20200314]

trips_df = filterTrips(readTrips(zf = zf), getServiceCount(zf = zf, time_frame = time_frame))
stop_times_df = readStopTimes(trips_df, stop_ids=stop_ids)

stop_times_df.days_count.sum()

	...reading trips
	... 3517922
Getting number of service days for each service
	...reading regular service calendars
9993
	...reading calendar exceptions
	...aggregating calendar
	...calculating total in calendar
	...filtering stop_times with trips
	2 seconds: completed 2000000 rows
	5 seconds: completed 4000000 rows
	7 seconds: completed 6000000 rows
	10 seconds: completed 8000000 rows
	13 seconds: completed 10000000 rows
	15 seconds: completed 12000000 rows
	18 seconds: completed 14000000 rows
	21 seconds: completed 16000000 rows
	23 seconds: completed 18000000 rows
	26 seconds: completed 20000000 rows
	29 seconds: completed 22000000 rows
	31 seconds: completed 24000000 rows
	34 seconds: completed 26000000 rows
	37 seconds: completed 28000000 rows
	39 seconds: completed 30000000 rows
	42 seconds: completed 32000000 rows
	45 seconds: completed 34000000 rows
	47 seconds: completed 36000000 rows
	50 seconds: completed 38000000 rows
	52 seconds: completed 40000000 rows
	55 seconds: compl

97.0

In [86]:
nstops[nstops.stop_name.str.contains("Rohrbach, Eichendorffplatz")]
stop_ids = list(nstops[nstops.stop_name.str.contains("Rohrbach, Eichendorffplatz")].stop_id)
time_frame = [20200520, 20200520]

trips_df = filterTrips(readTrips(zf = zf), getServiceCount(zf = zf, time_frame = time_frame))
stop_times_df = readStopTimes(trips_df, stop_ids=stop_ids)

stop_times_df.days_count.sum()

	...reading trips
	... 3517922
Getting number of service days for each service
	...reading regular service calendars
8710
	...reading calendar exceptions
	...aggregating calendar
	...calculating total in calendar
	...filtering stop_times with trips
	2 seconds: completed 2000000 rows
	5 seconds: completed 4000000 rows
	7 seconds: completed 6000000 rows
	10 seconds: completed 8000000 rows
	12 seconds: completed 10000000 rows
	15 seconds: completed 12000000 rows
	18 seconds: completed 14000000 rows
	20 seconds: completed 16000000 rows
	23 seconds: completed 18000000 rows
	25 seconds: completed 20000000 rows
	28 seconds: completed 22000000 rows
	30 seconds: completed 24000000 rows
	33 seconds: completed 26000000 rows
	35 seconds: completed 28000000 rows
	38 seconds: completed 30000000 rows
	40 seconds: completed 32000000 rows
	43 seconds: completed 34000000 rows
	45 seconds: completed 36000000 rows
	48 seconds: completed 38000000 rows
	50 seconds: completed 40000000 rows
	53 seconds: compl

407.0

### Magdeburg

In [87]:
nstops[nstops.stop_name.str.contains("Goldschmiedebrücke")]
stop_ids = list(nstops[nstops.stop_name.str.contains("Goldschmiedebrücke")].stop_id)
time_frame = [20200314, 20200314]

trips_df = filterTrips(readTrips(zf = zf), getServiceCount(zf = zf, time_frame = time_frame))
stop_times_df = readStopTimes(trips_df, stop_ids=stop_ids)

stop_times_df.days_count.sum()

	...reading trips
	... 3517922
Getting number of service days for each service
	...reading regular service calendars
9993
	...reading calendar exceptions
	...aggregating calendar
	...calculating total in calendar
	...filtering stop_times with trips
	2 seconds: completed 2000000 rows
	5 seconds: completed 4000000 rows
	7 seconds: completed 6000000 rows
	10 seconds: completed 8000000 rows
	12 seconds: completed 10000000 rows
	14 seconds: completed 12000000 rows
	17 seconds: completed 14000000 rows
	19 seconds: completed 16000000 rows
	22 seconds: completed 18000000 rows
	24 seconds: completed 20000000 rows
	26 seconds: completed 22000000 rows
	29 seconds: completed 24000000 rows
	31 seconds: completed 26000000 rows
	34 seconds: completed 28000000 rows
	36 seconds: completed 30000000 rows
	38 seconds: completed 32000000 rows
	40 seconds: completed 34000000 rows
	43 seconds: completed 36000000 rows
	45 seconds: completed 38000000 rows
	47 seconds: completed 40000000 rows
	49 seconds: compl

0.0

In [88]:
nstops[nstops.stop_name.str.contains("Flechtinger Stra")]
stop_ids = list(nstops[nstops.stop_name.str.contains("Flechtinger Stra")].stop_id)
time_frame = [20200520, 20200520]

trips_df = filterTrips(readTrips(zf = zf), getServiceCount(zf = zf, time_frame = time_frame))
stop_times_df = readStopTimes(trips_df, stop_ids=stop_ids)

stop_times_df.days_count.sum()

	...reading trips
	... 3517922
Getting number of service days for each service
	...reading regular service calendars
8710
	...reading calendar exceptions
	...aggregating calendar
	...calculating total in calendar
	...filtering stop_times with trips
	2 seconds: completed 2000000 rows
	4 seconds: completed 4000000 rows
	7 seconds: completed 6000000 rows
	9 seconds: completed 8000000 rows
	12 seconds: completed 10000000 rows
	14 seconds: completed 12000000 rows
	17 seconds: completed 14000000 rows
	19 seconds: completed 16000000 rows
	22 seconds: completed 18000000 rows
	24 seconds: completed 20000000 rows
	27 seconds: completed 22000000 rows
	29 seconds: completed 24000000 rows
	32 seconds: completed 26000000 rows
	34 seconds: completed 28000000 rows
	37 seconds: completed 30000000 rows
	39 seconds: completed 32000000 rows
	42 seconds: completed 34000000 rows
	44 seconds: completed 36000000 rows
	47 seconds: completed 38000000 rows
	49 seconds: completed 40000000 rows
	51 seconds: comple

0.0

## 2021

In [115]:
# Welches Jahr?
jahr = "2021"
# Welcher Zip?
zipname = "2021_reissue_2"
# constructed paths
# rawdir = workingdir + "raw/"
rawdir = workingdir + "raw/"
rawdatadir = rawdir + "gtfs/" + jahr + "/"
outdir = workingdir + "out/"+jahr+"/"
#inpath = "{0}{1}_{2}.db".format(rawdatadir,jahr,datum)
zippath = rawdatadir + zipname + ".zip"


# set up zip file as default for functions
zf = zipfile.ZipFile(zippath) # this is the raw stuff

In [116]:
stops = pd.read_csv(zf.open("stops.txt"))

### Gießen

In [130]:
stops[stops.stop_name.str.contains("Gießen Behördenzentrum")]
stop_ids = list(stops[stops.stop_name.str.contains("Gießen Behördenzentrum")].stop_id)
time_frame = [20211005, 20211005]

trips_df = filterTrips(readTrips(zf = zf), getServiceCount(zf = zf, time_frame = time_frame))
stop_times_df = readStopTimes(trips_df, stop_ids=stop_ids)

stop_times_df.days_count.sum()

	...reading trips
	... 1641730
Getting number of service days for each service
	...reading regular service calendars
12004
	...reading calendar exceptions
	...aggregating calendar
	...calculating total in calendar
	...filtering stop_times with trips


  exec(code_obj, self.user_global_ns, self.user_ns)


	4 seconds: completed 2000000 rows
	8 seconds: completed 4000000 rows
	13 seconds: completed 6000000 rows
	17 seconds: completed 8000000 rows
	21 seconds: completed 10000000 rows
	26 seconds: completed 12000000 rows
	30 seconds: completed 14000000 rows
	34 seconds: completed 16000000 rows
	39 seconds: completed 18000000 rows
	43 seconds: completed 20000000 rows
	47 seconds: completed 22000000 rows
	52 seconds: completed 24000000 rows
	56 seconds: completed 26000000 rows
	60 seconds: completed 28000000 rows
	65 seconds: completed 30000000 rows
	69 seconds: completed 32000000 rows
	73 seconds: completed 34000000 rows
	78 seconds: completed 36000000 rows
	82 seconds: completed 38000000 rows
	86 seconds: completed 40000000 rows
	91 seconds: completed 42000000 rows
	95 seconds: completed 44000000 rows
	99 seconds: completed 46000000 rows
	104 seconds: completed 48000000 rows
	108 seconds: completed 50000000 rows
	112 seconds: completed 52000000 rows
	117 seconds: completed 54000000 rows
	12

0.0

In [131]:
stops[stops.stop_name.str.contains("Gießen Graudenzer")]
stop_ids = list(stops[stops.stop_name.str.contains("Gießen Graudenzer")].stop_id)
time_frame = [20211005, 20211005]

trips_df = filterTrips(readTrips(zf = zf), getServiceCount(zf = zf, time_frame = time_frame))
stop_times_df = readStopTimes(trips_df, stop_ids=stop_ids)

stop_times_df.days_count.sum()

	...reading trips
	... 1641730
Getting number of service days for each service
	...reading regular service calendars
12004
	...reading calendar exceptions
	...aggregating calendar
	...calculating total in calendar
	...filtering stop_times with trips
	4 seconds: completed 2000000 rows
	8 seconds: completed 4000000 rows
	12 seconds: completed 6000000 rows
	17 seconds: completed 8000000 rows
	21 seconds: completed 10000000 rows
	25 seconds: completed 12000000 rows
	30 seconds: completed 14000000 rows
	34 seconds: completed 16000000 rows
	38 seconds: completed 18000000 rows
	42 seconds: completed 20000000 rows
	47 seconds: completed 22000000 rows
	51 seconds: completed 24000000 rows
	55 seconds: completed 26000000 rows
	59 seconds: completed 28000000 rows
	64 seconds: completed 30000000 rows
	68 seconds: completed 32000000 rows
	72 seconds: completed 34000000 rows
	76 seconds: completed 36000000 rows
	81 seconds: completed 38000000 rows
	85 seconds: completed 40000000 rows
	89 seconds: com

0.0

### Heidelberg

In [105]:
nstops[nstops.stop_name.str.contains("Alte Brücke Nord")]
stop_ids = list(nstops[nstops.stop_name.str.contains("Alte Brücke Nord")].stop_id)
time_frame = [20210314, 20210314]

trips_df = filterTrips(readTrips(zf = zf), getServiceCount(zf = zf, time_frame = time_frame))
stop_times_df = readStopTimes(trips_df, stop_ids=stop_ids)

stop_times_df.days_count.sum()

	...reading trips
	... 19617794
Getting number of service days for each service
	...reading regular service calendars
697
	...reading calendar exceptions
	...aggregating calendar
	...calculating total in calendar
	...filtering stop_times with trips
	2 seconds: completed 2000000 rows
	5 seconds: completed 4000000 rows
	8 seconds: completed 6000000 rows
	10 seconds: completed 8000000 rows
	13 seconds: completed 10000000 rows
	16 seconds: completed 12000000 rows
	19 seconds: completed 14000000 rows
	21 seconds: completed 16000000 rows
	24 seconds: completed 18000000 rows
	27 seconds: completed 20000000 rows
	30 seconds: completed 22000000 rows
	34 seconds: completed 24000000 rows
	37 seconds: completed 26000000 rows
	40 seconds: completed 28000000 rows
	43 seconds: completed 30000000 rows
	46 seconds: completed 32000000 rows
	49 seconds: completed 34000000 rows
	52 seconds: completed 36000000 rows
	55 seconds: completed 38000000 rows
	58 seconds: completed 40000000 rows
	62 seconds: compl

13.0

In [106]:
nstops[nstops.stop_name.str.contains("Rohrbach, Eichendorffplatz")]
stop_ids = list(nstops[nstops.stop_name.str.contains("Rohrbach, Eichendorffplatz")].stop_id)
time_frame = [20210520, 20210520]

trips_df = filterTrips(readTrips(zf = zf), getServiceCount(zf = zf, time_frame = time_frame))
stop_times_df = readStopTimes(trips_df, stop_ids=stop_ids)

stop_times_df.days_count.sum()

	...reading trips
	... 19617794
Getting number of service days for each service
	...reading regular service calendars
4427
	...reading calendar exceptions
	...aggregating calendar
	...calculating total in calendar
	...filtering stop_times with trips
	2 seconds: completed 2000000 rows
	4 seconds: completed 4000000 rows
	6 seconds: completed 6000000 rows
	8 seconds: completed 8000000 rows
	11 seconds: completed 10000000 rows
	13 seconds: completed 12000000 rows
	15 seconds: completed 14000000 rows
	17 seconds: completed 16000000 rows
	20 seconds: completed 18000000 rows
	22 seconds: completed 20000000 rows
	24 seconds: completed 22000000 rows
	27 seconds: completed 24000000 rows
	29 seconds: completed 26000000 rows
	31 seconds: completed 28000000 rows
	33 seconds: completed 30000000 rows
	35 seconds: completed 32000000 rows
	38 seconds: completed 34000000 rows
	40 seconds: completed 36000000 rows
	42 seconds: completed 38000000 rows
	44 seconds: completed 40000000 rows
	47 seconds: compl

20.0

### Magdeburg

In [107]:
nstops[nstops.stop_name.str.contains("Goldschmiedebrücke")]
stop_ids = list(nstops[nstops.stop_name.str.contains("Goldschmiedebrücke")].stop_id)
time_frame = [20210314, 20210314]

trips_df = filterTrips(readTrips(zf = zf), getServiceCount(zf = zf, time_frame = time_frame))
stop_times_df = readStopTimes(trips_df, stop_ids=stop_ids)

stop_times_df.days_count.sum()

	...reading trips
	... 19617794
Getting number of service days for each service
	...reading regular service calendars
697
	...reading calendar exceptions
	...aggregating calendar
	...calculating total in calendar
	...filtering stop_times with trips
	2 seconds: completed 2000000 rows
	5 seconds: completed 4000000 rows
	8 seconds: completed 6000000 rows
	11 seconds: completed 8000000 rows
	15 seconds: completed 10000000 rows
	18 seconds: completed 12000000 rows
	21 seconds: completed 14000000 rows
	24 seconds: completed 16000000 rows
	27 seconds: completed 18000000 rows
	30 seconds: completed 20000000 rows
	32 seconds: completed 22000000 rows
	35 seconds: completed 24000000 rows
	38 seconds: completed 26000000 rows
	41 seconds: completed 28000000 rows
	44 seconds: completed 30000000 rows
	47 seconds: completed 32000000 rows
	50 seconds: completed 34000000 rows
	53 seconds: completed 36000000 rows
	56 seconds: completed 38000000 rows
	59 seconds: completed 40000000 rows
	62 seconds: compl

0.0

In [108]:
nstops[nstops.stop_name.str.contains("Flechtinger Stra")]
stop_ids = list(nstops[nstops.stop_name.str.contains("Flechtinger Stra")].stop_id)
time_frame = [20210520, 20210520]

trips_df = filterTrips(readTrips(zf = zf), getServiceCount(zf = zf, time_frame = time_frame))
stop_times_df = readStopTimes(trips_df, stop_ids=stop_ids)

stop_times_df.days_count.sum()

	...reading trips
	... 19617794
Getting number of service days for each service
	...reading regular service calendars
4427
	...reading calendar exceptions
	...aggregating calendar
	...calculating total in calendar
	...filtering stop_times with trips
	1 seconds: completed 2000000 rows
	3 seconds: completed 4000000 rows
	5 seconds: completed 6000000 rows
	7 seconds: completed 8000000 rows
	9 seconds: completed 10000000 rows
	11 seconds: completed 12000000 rows
	13 seconds: completed 14000000 rows
	15 seconds: completed 16000000 rows
	17 seconds: completed 18000000 rows
	18 seconds: completed 20000000 rows
	20 seconds: completed 22000000 rows
	22 seconds: completed 24000000 rows
	24 seconds: completed 26000000 rows
	26 seconds: completed 28000000 rows
	28 seconds: completed 30000000 rows
	30 seconds: completed 32000000 rows
	32 seconds: completed 34000000 rows
	34 seconds: completed 36000000 rows
	35 seconds: completed 38000000 rows
	37 seconds: completed 40000000 rows
	39 seconds: comple

3.0

## Summary

| Stadt      | Haltestelle                | Datum  | 2020 | 2021 | 
|------------|----------------------------|--------|------|------|
| Gießen     | Behördenzentrum            | 14.03. | 164  | 444    | 
| Gießen     | Graudenzer Straße          | 20.05. | 81   | 196   | 
| Heidelberg | Alte Brücke Nord           | 14.03. | 262  | 656  |
| Heidelberg | Rohrbach, Eichendorffplatz | 20.05. | 913  | 3117   |
| Magdeburg  | Goldschmiedebrücke         | 14.03. | 1464 | 1224  |
| Magdeburg  | Flechtinger Straße         | 20.05. | 1068 | 986  |


| Stadt      | Haltestelle                | Datum  | 2020 | 2021 | Delfi 21
|------------|----------------------------|--------|------|------|----------
| Gießen     | Behördenzentrum            | 05.10. | 168  | 448  | 659
| Gießen     | Graudenzer Straße          | 05.10. | 81   | 166  | 207
| Heidelberg | Alte Brücke Nord           | 05.10. | 264  | 656  | 200
| Heidelberg | Rohrbach, Eichendorffplatz | 05.10. | 913  | 2871 | 413
| Magdeburg  | Goldschmiedebrücke         | 05.10. | 1464 | 1210 | 260
| Magdeburg  | Flechtinger Straße         | 05.10. | 1068 | 986  | 207

## DELFI 2021

In [194]:

# Welcher Zip?
zipname = "20211015_fahrplaene_gesamtdeutschland_gtfs"
# constructed paths
# rawdir = workingdir + "raw/"
rawdir = workingdir + "raw/"
rawdatadir = rawdir + "gtfs/delfi/" 
#inpath = "{0}{1}_{2}.db".format(rawdatadir,jahr,datum)
zippath = rawdatadir + zipname + ".zip"


# set up zip file as default for functions
zf = zipfile.ZipFile(zippath) # this is the raw stuff
# stops as reference for finding individual stations
stops = pd.read_csv(zf.open("stops.txt"))# stops as reference for finding individual stations 

In [132]:
stops[stops.stop_name.str.contains("Gießen Behördenzentrum")]
stop_ids = list(stops[stops.stop_name.str.contains("Gießen Behördenzentrum")].stop_id)
time_frame = [20211005, 20211005]

trips_df = filterTrips(readTrips(zf = zf), getServiceCount(zf = zf, time_frame = time_frame))
stop_times_df = readStopTimes(trips_df, stop_ids=stop_ids)

stop_times_df.days_count.sum()

	...reading trips
	... 1641730
Getting number of service days for each service
	...reading regular service calendars
12004
	...reading calendar exceptions
	...aggregating calendar
	...calculating total in calendar
	...filtering stop_times with trips
	3 seconds: completed 2000000 rows
	8 seconds: completed 4000000 rows
	12 seconds: completed 6000000 rows
	16 seconds: completed 8000000 rows
	20 seconds: completed 10000000 rows
	24 seconds: completed 12000000 rows
	29 seconds: completed 14000000 rows
	33 seconds: completed 16000000 rows
	37 seconds: completed 18000000 rows
	41 seconds: completed 20000000 rows
	45 seconds: completed 22000000 rows
	49 seconds: completed 24000000 rows
	53 seconds: completed 26000000 rows
	57 seconds: completed 28000000 rows
	62 seconds: completed 30000000 rows
	66 seconds: completed 32000000 rows
	70 seconds: completed 34000000 rows
	74 seconds: completed 36000000 rows
	78 seconds: completed 38000000 rows
	82 seconds: completed 40000000 rows
	86 seconds: com

0.0

In [133]:
stops[stops.stop_name.str.contains("Gießen Graudenzer")]
stop_ids = list(stops[stops.stop_name.str.contains("Gießen Graudenzer")].stop_id)
time_frame = [20211005, 20211005]

trips_df = filterTrips(readTrips(zf = zf), getServiceCount(zf = zf, time_frame = time_frame))
stop_times_df = readStopTimes(trips_df, stop_ids=stop_ids)

stop_times_df.days_count.sum()

	...reading trips
	... 1641730
Getting number of service days for each service
	...reading regular service calendars
12004
	...reading calendar exceptions
	...aggregating calendar
	...calculating total in calendar
	...filtering stop_times with trips
	4 seconds: completed 2000000 rows
	8 seconds: completed 4000000 rows
	12 seconds: completed 6000000 rows
	16 seconds: completed 8000000 rows
	21 seconds: completed 10000000 rows
	25 seconds: completed 12000000 rows
	30 seconds: completed 14000000 rows
	34 seconds: completed 16000000 rows
	38 seconds: completed 18000000 rows
	42 seconds: completed 20000000 rows
	46 seconds: completed 22000000 rows
	51 seconds: completed 24000000 rows
	55 seconds: completed 26000000 rows
	59 seconds: completed 28000000 rows
	64 seconds: completed 30000000 rows
	68 seconds: completed 32000000 rows
	72 seconds: completed 34000000 rows
	76 seconds: completed 36000000 rows
	80 seconds: completed 38000000 rows
	85 seconds: completed 40000000 rows
	89 seconds: com

0.0

In [141]:
stop_times_df = readStopTimes(stop_ids=stop_ids)

	...filtering stop_times with trips


  exec(code_obj, self.user_global_ns, self.user_ns)


	1 seconds: completed 2000000 rows
	2 seconds: completed 4000000 rows
	4 seconds: completed 6000000 rows
	5 seconds: completed 8000000 rows
	7 seconds: completed 10000000 rows
	9 seconds: completed 12000000 rows
	10 seconds: completed 14000000 rows
	12 seconds: completed 16000000 rows
	13 seconds: completed 18000000 rows
	15 seconds: completed 20000000 rows
	16 seconds: completed 22000000 rows
	18 seconds: completed 24000000 rows
	20 seconds: completed 26000000 rows
	21 seconds: completed 28000000 rows
	22 seconds: completed 30000000 rows
	24 seconds: completed 32000000 rows


In [142]:
stop_times_df

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,stop_headsign
3993444,1196869605,5:57:00,5:57:00,de:06531:19222:1:1,7,0,0,
3993460,1196869616,6:37:00,6:37:00,de:06531:19222:1:1,7,0,0,
3993476,1196869632,7:19:00,7:19:00,de:06531:19222:1:1,7,0,0,
3993492,1196869633,7:46:00,7:46:00,de:06531:19222:1:1,7,0,0,
3993508,1196869635,8:46:00,8:46:00,de:06531:19222:1:1,7,0,0,
...,...,...,...,...,...,...,...,...
21075963,1373674126,15:56:00,15:56:00,de:06531:19222:1:1,9,0,0,
21075975,1373674127,16:26:00,16:26:00,de:06531:19222:1:1,9,0,0,
21075987,1373674128,16:56:00,16:56:00,de:06531:19222:1:1,9,0,0,
21075999,1373674129,17:26:00,17:26:00,de:06531:19222:1:1,9,0,0,


# Other approach

In [301]:
def getN(stop_name, date, zf):
    stop_ids = list(stops[stops.stop_name.str.contains(stop_name)].stop_id)
    print(stop_ids)
    
    stop_times_df = readStopTimes(zf, stop_ids=stop_ids)

    stop_trips = pd.merge(stop_times_df, readTrips(zf), how="left")
    stop_calendar = stop_trips.merge(pd.read_csv(zf.open("calendar.txt")), on="service_id", how="left")
    stop_calendar = stop_calendar[stop_calendar.tuesday==1 & (date >= stop_calendar.start_date) & (date <= stop_calendar.end_date)]
    

    stop_exceptions = stop_trips.merge(pd.read_csv(zf.open("calendar_dates.txt")), on="service_id", how="left")
    stop_exceptions = stop_exceptions[stop_exceptions.date==date]
    
    
    return(len(stop_calendar) + sum(stop_exceptions.exception_type==1) - sum(stop_exceptions.exception_type==2))

## DELFI 2021

In [316]:
# Welcher Zip?
zipname = "20211015_fahrplaene_gesamtdeutschland_gtfs"
# constructed paths
# rawdir = workingdir + "raw/"
rawdir = workingdir + "raw/"
rawdatadir = rawdir + "gtfs/delfi/" 
#inpath = "{0}{1}_{2}.db".format(rawdatadir,jahr,datum)
zippath = rawdatadir + zipname + ".zip"


# set up zip file as default for functions
zf = zipfile.ZipFile(zippath) # this is the raw stuff
# stops as reference for finding individual stations
stops = pd.read_csv(zf.open("stops.txt"))# stops as reference for finding individual stations 

In [317]:
stop_name = "Gießen Behördenzentrum"
stops[stops.stop_name.str.contains(stop_name)]

getN(stop_name, 20211005, zf)

['de:06531:14681:1:1', 'de:06531:14681:2:2', 'de:06531:14681:3:3', 'de:06531:14681:4:4']
	...reading stop_times
	......filtering stops


  """


	...reading trips
	... 1641730


659

In [318]:
stops[stops.stop_name.str.contains("Gießen Graudenzer")]

getN("Gießen Graudenzer", 20211005,zf)

['de:06531:19222:1:1', 'de:06531:19222:2:2']
	...reading stop_times
	......filtering stops


  """


	...reading trips
	... 1641730


207

In [319]:
stops[stops.stop_name.str.contains("Alte Brücke Nord")]

getN("Alte Brücke Nord", 20211005,zf)

['de:08221:1223:0:RiO', 'de:08221:1223:0:RiW']
	...reading stop_times
	......filtering stops


  """


	...reading trips
	... 1641730


200

In [320]:
stop_name = "Rohrbach, Eichendorffplatz"
stops[stops.stop_name.str.contains(stop_name)]

getN(stop_name, 20211005,zf)

['de:08221:1263:1:TrRiN', 'de:08221:1263:1:TrRiS', 'de:08221:1263:2:Bus']
	...reading stop_times
	......filtering stops


  """


	...reading trips
	... 1641730


413

In [321]:
stop_name = "Goldschmiedebrücke"
stops[stops.stop_name.str.contains(stop_name)]

getN(stop_name, 20211005,zf)

['de:15003:5255::01', 'de:15003:5255::02']
	...reading stop_times
	......filtering stops


  """


	...reading trips
	... 1641730


260

In [322]:
stop_name = "Magdeburg, Flechtinger"
stops[stops.stop_name.str.contains(stop_name)]

getN(stop_name, 20211005,zf)

['de:15003:7316::01', 'de:15003:7316::02']
	...reading stop_times
	......filtering stops


  """


	...reading trips
	... 1641730


207

## Brosi 2020

In [303]:
# Welches Jahr?
jahr = "2020"
# Welcher Zip?
zipname = "2020_reissue_2"
# Welche Routenreferenz? (im raw-directory)

# define paths

# constructed paths
# rawdir = workingdir + "raw/"
rawdir = workingdir + "raw/"
rawdatadir = rawdir + "gtfs/" + jahr + "/"
#inpath = "{0}{1}_{2}.db".format(rawdatadir,jahr,datum)
zippath = rawdatadir + zipname + ".zip"


# set up zip file as default for functions
zf = zipfile.ZipFile(zippath) # this is the raw stuff
# stops as reference for finding individual stations
stops = pd.read_csv(zf.open("stops.txt"))

In [305]:
stop_name = "Gießen Behördenzentrum"
stops[stops.stop_name.str.contains(stop_name)]

getN(stop_name, 20201006, zf)

[552828, 509957, 440391, 506192]
	...reading stop_times
	......filtering stops


  """


	...reading trips
	... 3517922


168

In [306]:
stops[stops.stop_name.str.contains("Gießen Graudenzer")]

getN("Gießen Graudenzer", 20201006, zf)

[691820, 371615, 194474]
	...reading stop_times
	......filtering stops


  """


	...reading trips
	... 3517922


81

In [307]:
stops[stops.stop_name.str.contains("Alte Brücke Nord")]

getN("Alte Brücke Nord", 20201006, zf)

[259794, 198339, 612391]
	...reading stop_times
	......filtering stops


  """


	...reading trips
	... 3517922


264

In [308]:
stop_name = "Rohrbach, Eichendorffplatz"
stops[stops.stop_name.str.contains(stop_name)]

getN(stop_name, 20201006, zf)

[445957, 435869, 237754, 96346]
	...reading stop_times
	......filtering stops


  """


	...reading trips
	... 3517922


913

In [309]:
stop_name = "Goldschmiedebrücke"
stops[stops.stop_name.str.contains(stop_name)]

getN(stop_name, 20201006, zf)

[82973, 90365]
	...reading stop_times
	......filtering stops


  """


	...reading trips
	... 3517922


1464

In [310]:
stop_name = "Flechtinger Stra"
stops[stops.stop_name.str.contains(stop_name)]

getN(stop_name, 20201006, zf)

[407241, 643049, 757513]
	...reading stop_times
	......filtering stops


  """


	...reading trips
	... 3517922


1068

redo the rest quickly...

In [312]:
for stop_name in ["Gießen Behördenzentrum", "Gießen Graudenzer", "Alte Brücke Nord",
                  "Rohrbach, Eichendorffplatz", "Goldschmiedebrücke", "Flechtinger Stra"]:
    print(stop_name)
    for date in [20200314, 20200520]:
        print(date)
        print(getN(stop_name, date, zf))

Gießen Behördenzentrum
20200314
[552828, 509957, 440391, 506192]
	...reading stop_times
	......filtering stops


  """


	...reading trips
	... 3517922
164
20200520
[552828, 509957, 440391, 506192]
	...reading stop_times
	......filtering stops
	...reading trips
	... 3517922
164
Gießen Graudenzer
20200314
[691820, 371615, 194474]
	...reading stop_times
	......filtering stops
	...reading trips
	... 3517922
81
20200520
[691820, 371615, 194474]
	...reading stop_times
	......filtering stops
	...reading trips
	... 3517922
81
Alte Brücke Nord
20200314
[259794, 198339, 612391]
	...reading stop_times
	......filtering stops
	...reading trips
	... 3517922
262
20200520
[259794, 198339, 612391]
	...reading stop_times
	......filtering stops
	...reading trips
	... 3517922
262
Rohrbach, Eichendorffplatz
20200314
[445957, 435869, 237754, 96346]
	...reading stop_times
	......filtering stops
	...reading trips
	... 3517922
906
20200520
[445957, 435869, 237754, 96346]
	...reading stop_times
	......filtering stops
	...reading trips
	... 3517922
913
Goldschmiedebrücke
20200314
[82973, 90365]
	...reading stop_times
	......filte

## Brosi 2021

In [314]:
# Welches Jahr?
jahr = "2021"
# Welcher Zip?
zipname = "2021_reissue_2"
# Welche Routenreferenz? (im raw-directory)

# define paths

# constructed paths
# rawdir = workingdir + "raw/"
rawdir = workingdir + "raw/"
rawdatadir = rawdir + "gtfs/" + jahr + "/"
#inpath = "{0}{1}_{2}.db".format(rawdatadir,jahr,datum)
zippath = rawdatadir + zipname + ".zip"


# set up zip file as default for functions
zf = zipfile.ZipFile(zippath) # this is the raw stuff
# stops as reference for finding individual stations
stops = pd.read_csv(zf.open("stops.txt"))

In [315]:
for stop_name in ["Gießen Behördenzentrum", "Gießen Graudenzer", "Alte Brücke Nord",
                  "Rohrbach, Eichendorffplatz", "Goldschmiedebrücke", "Flechtinger Stra"]:
    print(stop_name)
    for date in [20210314, 20210520, 20211005]:
        print(date)
        print(getN(stop_name, date, zf))

Gießen Behördenzentrum
20210314
[915763, 870579, 1036033, 814570, 209277]
	...reading stop_times
	......filtering stops


  """


	...reading trips
	... 19617794
444
20210520
[915763, 870579, 1036033, 814570, 209277]
	...reading stop_times
	......filtering stops
	...reading trips
	... 19617794
525
20211005
[915763, 870579, 1036033, 814570, 209277]
	...reading stop_times
	......filtering stops
	...reading trips
	... 19617794
448
Gießen Graudenzer
20210314
[451984, 335280, 622431, 594632, 1016860]
	...reading stop_times
	......filtering stops
	...reading trips
	... 19617794
166
20210520
[451984, 335280, 622431, 594632, 1016860]
	...reading stop_times
	......filtering stops
	...reading trips
	... 19617794
196
20211005
[451984, 335280, 622431, 594632, 1016860]
	...reading stop_times
	......filtering stops
	...reading trips
	... 19617794
166
Alte Brücke Nord
20210314
[434412, 875600, 964303, 202356, 302426]
	...reading stop_times
	......filtering stops
	...reading trips
	... 19617794
656
20210520
[434412, 875600, 964303, 202356, 302426]
	...reading stop_times
	......filtering stops
	...reading trips
	... 19617794
742


## Concept

In [223]:
stops[stops.stop_name.str.contains("Goldschmiedebrücke")]
stop_ids = list(stops[stops.stop_name.str.contains("Goldschmiedebrücke")].stop_id)
date = 20201006

stop_times_df = readStopTimes(stop_ids=stop_ids)

stop_trips = pd.merge(stop_times_df, readTrips(zf=zf), how="left")
stop_calendar = stop_trips.merge(pd.read_csv(zf.open("calendar.txt")), on="service_id", how="left")
stop_calendar = stop_calendar[stop_calendar.tuesday==1 & (date >= stop_calendar.start_date) & (date <= stop_calendar.end_date)]
len(stop_calendar)

stop_exceptions = stop_trips.merge(pd.read_csv(zf.open("calendar_dates.txt")), on="service_id", how="left")
stop_exceptions = stop_exceptions[stop_exceptions.date==date]
sum(stop_exceptions.exception_type==1) - sum(stop_exceptions.exception_type==2)



	...reading stop_times


  exec(code_obj, self.user_global_ns, self.user_ns)


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [283]:
reader = pd.read_csv(zf.open("stop_times.txt"), chunksize=200000, iterator=True, encoding='utf-8')

pd.concat([r[r.stop_id.isin(stop_ids)] for r in reader])