In [1]:
# load libraries
import datetime as dt
import os, re
import zipfile

import numpy as np
import pandas as pd
from dask import dataframe as dd
from sqlalchemy import create_engine, text

## Setup

In [2]:
# Feed year (currently only referenced by the disabled per-year sub-directory below).
jahr = "2021"
# Base name of the GTFS zip archive, without the ".zip" extension.
zipname = "20211015_fahrplaene_gesamtdeutschland_gtfs"
# Route reference inside the raw directory (left empty here).
routescope = ""

# Project root; every other path is derived from it.
workingdir = "/home/jupyter-maita.schade/VW_Data_Hub/Gap_Map/"
#storagedir = "smb://192.168.90.30/allmende%20verkehr/4%20Projekte/2%20Projekte%20Mobilitätswende/ÖV-Deutschlandkarte%20(Gap-Map)/Berechnungen/raw/gtfs/"

# Derived paths (all end with "/"); the per-year sub-directory is switched off.
rawdir = f"{workingdir}raw/"
rawdatadir = f"{rawdir}gtfs/delfi/"
outdir = f"{workingdir}out/delfi/"
zippath = f"{rawdatadir}{zipname}.zip"

# Open the raw GTFS archive so it is available as the default input handle.
zf = zipfile.ZipFile(zippath)

In [3]:
# File-based SQLite database that receives the output tables.
outpath = f'{outdir}{zipname}_test.db'
# SQLAlchemy engine bound to that database file.
dbout = create_engine(f'sqlite:///{outpath}')

# Count service_ids

In [4]:
def interveningWeekdays(start, end, inclusive=True, weekdays=(0, 1, 2, 3, 4)):
    """Count the days in a date range that fall on the given weekdays.

    Parameters
    ----------
    start, end : datetime.date or datetime.datetime
        Range boundaries; datetimes are truncated to their date. If ``end``
        lies before ``start`` the two are swapped.
    inclusive : bool, default True
        If True the later boundary day is part of the range, otherwise the
        range is half-open (later boundary excluded).
    weekdays : iterable of int, or a single int; default Monday-Friday
        Weekday numbers as used by ``date.weekday()`` (0 = Monday ...
        6 = Sunday). Values are reduced modulo 7 and duplicates are ignored.

    Returns
    -------
    int
        Number of days in the range whose weekday is in ``weekdays``.
    """
    # Work on plain dates so that date/datetime inputs can be mixed.
    if isinstance(start, dt.datetime):
        start = start.date()
    if isinstance(end, dt.datetime):
        end = end.date()

    # Reversed boundaries are swapped rather than rejected.
    if end < start:
        start, end = end, start

    # Turn the closed range into a half-open one for the arithmetic below.
    if inclusive:
        end += dt.timedelta(days=1)

    try:
        # Collapse duplicate weekdays.
        weekdays = {weekday % 7 for weekday in weekdays}
    except TypeError:
        # A single weekday number was passed instead of an iterable.
        weekdays = {weekdays % 7}

    # Any Monday works as reference because only differences of whole weeks
    # matter; a fixed one keeps the function independent of the current date.
    reference_monday = dt.date(1970, 1, 5)

    total = 0
    for weekday in weekdays:
        anchor = reference_monday + dt.timedelta(days=weekday)
        # Floor division counts how many dates congruent to `anchor`
        # (modulo 7 days) lie in [start, end).
        total += (anchor - start).days // 7 - (anchor - end).days // 7
    return total

def countDaysInIntervalHelper(calendarrow):
    """Count the service days described by one calendar row.

    Parameters
    ----------
    calendarrow : pandas.Series
        One row of the calendar table. It must provide the weekday flags
        ``monday`` ... ``sunday`` (non-zero = service runs on that weekday)
        and ``start_date`` / ``end_date`` as ``YYYYMMDD`` numbers.

    Returns
    -------
    int
        Number of days between ``start_date`` and ``end_date`` (both
        inclusive) that fall on a weekday flagged in the row.
    """
    weekday_columns = ["monday", "tuesday", "wednesday", "thursday",
                       "friday", "saturday", "sunday"]
    # Select the flags by name: a label slice "monday":"sunday" would depend
    # on the column order of the input file and mis-number the weekdays if
    # the columns are arranged differently.
    servicepattern = calendarrow[weekday_columns].to_numpy()
    # Positions of the non-zero flags are the weekday numbers (0 = Monday).
    servicedays = servicepattern.nonzero()[0].tolist()

    startdate = dt.datetime.strptime(str(int(calendarrow.get("start_date"))), "%Y%m%d")
    enddate = dt.datetime.strptime(str(int(calendarrow.get("end_date"))), "%Y%m%d")
    return interveningWeekdays(startdate, enddate, weekdays=servicedays)

### Helper function to compare dates
def isInIntervalHelper(n, interval):
    """Flag which elements of ``n`` lie inside a closed interval.

    Works only on array-like ``n`` that supports element-wise comparison
    (e.g. a NumPy array or a pandas Series).

    Parameters
    ----------
    n : array-like
        Values to test.
    interval : iterable
        Values whose minimum and maximum are the interval bounds; the order
        of the entries does not matter.

    Returns
    -------
    numpy.ndarray
        Boolean array, True where ``min(interval) <= n <= max(interval)``.
    """
    lower, upper = min(interval), max(interval)
    inside = (n <= upper) & (n >= lower)
    # np.where turns the mask into a plain ndarray whatever type `n` had.
    return np.where(inside, True, False)

In [5]:
# function to add frequencies... let's hope this is right
def addCountToCalendar(calendar_df, calendar_dates_df):
    """Add the number of operating days of every service to the calendar.

    ``days_count`` starts as the number of days matching the weekly pattern
    between ``start_date`` and ``end_date`` (see
    ``countDaysInIntervalHelper``). The number of ``calendar_dates_df`` rows
    with ``exception_type`` 1 for the same ``service_id`` is added and the
    number with ``exception_type`` 2 is subtracted.

    Parameters
    ----------
    calendar_df : pandas.DataFrame
        Needs the columns ``service_id``, ``monday`` ... ``sunday``,
        ``start_date`` and ``end_date``. It is not modified.
    calendar_dates_df : pandas.DataFrame
        Needs the columns ``service_id``, ``exception_type`` and ``date``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``calendar_df`` with the columns ``service_id``,
        ``1``, ``2`` (exception counts per type, NaN where a service has
        none), the weekday flags, ``start_date``, ``end_date`` and
        ``days_count``.
    """
    weekday_columns = ["monday", "tuesday", "wednesday", "thursday",
                       "friday", "saturday", "sunday"]
    base_columns = ["service_id"] + weekday_columns + ["start_date", "end_date"]

    print("Getting number of service days for each service")
    # Work on a copy restricted to the input columns: the caller's frame
    # stays untouched, and feeding an already enriched frame back in cannot
    # collide with existing 1 / 2 / days_count columns.
    services = calendar_df[base_columns].copy()
    services["days_count"] = services.apply(countDaysInIntervalHelper, axis=1)

    print("\t...aggregating calendar")
    exception_counts = (
        calendar_dates_df
        .groupby(["service_id", "exception_type"], as_index=False)
        .count()
        .pivot(index="service_id", columns="exception_type", values="date")
        # Guarantee both count columns exist even if the feed contains only
        # one exception type (the column selection below would fail otherwise).
        .reindex(columns=[1, 2])
        .reset_index()
    )

    # NOTE(review): the right merge keeps exactly the services listed in
    # calendar_df; services that appear only in calendar_dates_df are dropped.
    # Confirm this is intended for the feed at hand.
    enriched = exception_counts.merge(services, on="service_id", how="right")[
        ["service_id", 1, 2] + weekday_columns + ["start_date", "end_date", "days_count"]
    ]

    print("\t...calculating total in calendar")
    return enriched.assign(
        days_count=enriched["days_count"]
        + enriched[1].fillna(0)
        - enriched[2].fillna(0)
    )

In [6]:
def feedDays(calendar_df, calendar_dates_df, inclusive=True):
    """Return the number of calendar days spanned by the feed.

    The span runs from the earliest to the latest date found in
    ``calendar_df.start_date``, ``calendar_df.end_date`` and
    ``calendar_dates_df.date`` (all given as ``YYYYMMDD``).

    Parameters
    ----------
    calendar_df : pandas.DataFrame
        Needs the columns ``start_date`` and ``end_date``.
    calendar_dates_df : pandas.DataFrame
        Needs the column ``date``; may be empty.
    inclusive : bool, default True
        Count both boundary days, which matches the inclusive counting of
        ``interveningWeekdays``. Pass False to get the plain difference
        ``last - first`` in days.

    Returns
    -------
    int
        Length of the feed period in days.
    """
    # Pool every date of the feed; Series.min/max skip missing values and,
    # unlike the builtin min/max, do not fail when one of the inputs is empty.
    feed_dates = pd.concat(
        [
            pd.to_datetime(calendar_df["start_date"], format="%Y%m%d"),
            pd.to_datetime(calendar_df["end_date"], format="%Y%m%d"),
            pd.to_datetime(calendar_dates_df["date"], format="%Y%m%d"),
        ],
        ignore_index=True,
    )
    firstdate = feed_dates.min()
    lastdate = feed_dates.max()

    ndays = (lastdate - firstdate).days
    if inclusive:
        # A feed running from day X to day X covers one day, not zero.
        ndays += 1
    return ndays

In [7]:
# Load the two service-calendar tables from the raw data directory.
calendar_df, calendar_dates_df = (
    pd.read_csv(f"{rawdatadir}{gtfs_table}.txt")
    for gtfs_table in ("calendar", "calendar_dates")
)

In [8]:
# Replace the raw calendar with its enriched version (adds days_count and the
# per-type exception counts).
# NOTE(review): the name calendar_df is reused for the result, so re-running
# this cell feeds the already enriched frame back in; reload the raw table
# first when re-executing.
calendar_df = addCountToCalendar(calendar_df, calendar_dates_df)

Getting number of service days for each service
	...aggregating calendar
	...calculating total in calendar


In [9]:
# Length of the feed period in days, derived from both calendar tables.
ndays = feedDays(calendar_df, calendar_dates_df)

In [10]:
# Preview the first rows of the enriched calendar.
calendar_df.head()

Unnamed: 0,service_id,1,2,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date,days_count
0,1,,7.0,1,1,1,1,1,0,0,20211001,20211211,44.0
1,2,,9.0,1,1,1,1,1,1,1,20211001,20211211,63.0
2,3,6.0,3.0,0,0,0,0,0,1,1,20211001,20211211,24.0
3,4,1.0,1.0,0,0,0,0,0,0,1,20211001,20211211,10.0
4,5,6.0,1.0,0,0,0,0,0,0,1,20211001,20211211,15.0


# Get things into database

## calendar

In [11]:
# Store the enriched calendar as table "calendar", overwriting an existing one.
calendar_df.to_sql(
    name="calendar",
    con='sqlite:///' + outpath,
    if_exists='replace',
)

## trips, stops

In [None]:
# Explicit column dtypes handed to dask's CSV reader for both tables.
gtfs_dtypes = {
    'level_id': 'float64',
    'parent_station': 'object',
    'platform_code': 'object',
    'stop_headsign': 'object',
    'trip_short_name': 'object',
}
sqlite_uri = 'sqlite:///' + outpath

# 'stop_times' and 'calendar' are written by their own cells.
for table_name in ('stops', 'trips'):
    print(table_name)

    # Read the raw text file lazily and write it to the table of the same
    # name, replacing a previous version.
    df = dd.read_csv(rawdatadir + table_name + ".txt", dtype=gtfs_dtypes)
    df.to_sql(table_name, sqlite_uri, if_exists='replace')

## stop_times

In [None]:
# Copy the first 2,000,000 rows of stop_times.txt into the output directory,
# reading and writing in chunks to keep memory use bounded.
start = dt.datetime.now()
chunksize = 200000
j = 0  # number of chunks processed; stays 0 if the reader yields nothing

stop_times_copy = outdir + "stop_times.txt"
chunk_reader = pd.read_csv(rawdatadir + 'stop_times' + '.txt', nrows=2000000,
                           chunksize=chunksize, iterator=True, encoding='utf-8')

for j, df in enumerate(chunk_reader, start=1):
    # Report progress every tenth chunk.
    if j % 10 == 0:
        print('\t{} seconds: completed {} rows'.format((dt.datetime.now() - start).seconds, j*chunksize))

    # The first chunk creates the file and writes the header; later chunks
    # append WITHOUT a header. Writing the header on every append would put
    # header rows in the middle of the data. As before, the frame index is
    # written as an unnamed first column.
    is_first_chunk = (j == 1)
    df.to_csv(stop_times_copy,
              mode='w' if is_first_chunk else 'a',
              header=is_first_chunk)


In [None]:
%%time
# Load the complete stop_times file with dask and write it to SQLite,
# replacing an existing table of that name.
table_name = 'stop_times'
stop_times_dtypes = {
    'Unnamed: 0': 'float64',
    'drop_off_type': 'object',
    'pickup_type': 'object',
    'stop_sequence': 'object',
    'trip_id': 'object',
    'stop_headsign': 'object',
}
df = dd.read_csv(rawdatadir + table_name + ".txt", dtype=stop_times_dtypes)
df.to_sql(table_name, 'sqlite:///' + outpath, if_exists='replace')

In [None]:
%%time
# Write stop_times to SQLite chunk by chunk with pandas. `chunksize` is the
# value set in the CSV-copy cell further up.
table_name = 'stop_times'
j = 0
stop_times_dtypes = {
    'Unnamed: 0': 'float64',
    'drop_off_type': 'object',
    'pickup_type': 'object',
    'stop_sequence': 'object',
    'trip_id': 'object',
    'stop_headsign': 'object',
}
chunk_reader = pd.read_csv(rawdatadir + table_name + ".txt",
                           chunksize=chunksize, iterator=True, encoding='utf-8',
                           dtype=stop_times_dtypes)

for j, df in enumerate(chunk_reader, start=1):
    # The first chunk (re)creates the table, all later chunks are appended.
    df.to_sql(table_name, dbout, if_exists='replace' if j == 1 else 'append')

In [None]:
%%time
# NOTE(review): `calendar_dd` is not defined anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel. Presumably it was a dask
# version of the calendar table. TODO: define it or remove the cell.
calendar_dd.size.compute()

In [None]:
# NOTE(review): `calendar_dd` is not defined anywhere in the visible notebook
# (NameError on a fresh kernel). The table "calendar" is also written from
# calendar_df in an earlier cell, and no if_exists argument is passed here.
# TODO: confirm whether this cell is still needed.
calendar_dd.to_sql("calendar", 'sqlite:///' + outpath)

# calendar -> trips [service_id, count]

# trips -> stop_times [trip_id, count]

In [12]:
# Load the SQL cell-magic extension.
%load_ext sql

# Connect the SQL magic to the output database. The path is written out
# literally and matches what `outpath` is built from in the setup cells.
%sql sqlite:////home/jupyter-maita.schade/VW_Data_Hub/Gap_Map/out/delfi/20211015_fahrplaene_gesamtdeutschland_gtfs_test.db


In [18]:
%%sql 
SELECT 
    name
FROM 
    sqlite_schema  -- SQLite's schema table: one row per table, index, view, trigger
WHERE 
    type ='table' AND  -- tables only, and leave out names matching the sqlite_ pattern
    name NOT LIKE 'sqlite_%';

 * sqlite:////home/jupyter-maita.schade/VW_Data_Hub/Gap_Map/out/delfi/20211015_fahrplaene_gesamtdeutschland_gtfs_test.db
Done.


name
stop_times
stops
trips
calendar


    CREATE TABLE n_stops AS
       ...> SELECT stop_id, SUM(days_count)
       ...> FROM stop_times
       ...> LEFT JOIN trips ON trips.trip_id = stop_times.trip_id
       ...> LEFT JOIN calendar ON trips.service_id = calendar.service_id
       ...> GROUP BY stop_id;


In [None]:
%%sql 
DROP TABLE IF EXISTS n_stops;
CREATE TABLE n_stops AS 
SELECT stop_id, SUM(days_count)  -- per stop: summed days_count of the services of all trips stopping there
FROM stop_times 
LEFT JOIN trips ON trips.trip_id = stop_times.trip_id  -- stop_times row -> its trip
LEFT JOIN calendar 
ON trips.service_id = calendar.service_id  -- trip -> its service and that service's days_count
GROUP BY stop_id;

 * sqlite:////home/jupyter-maita.schade/VW_Data_Hub/Gap_Map/out/delfi/20211015_fahrplaene_gesamtdeutschland_gtfs_test.db


In [None]:
# Attach the stop attributes to the per-stop totals stored in n_stops.
stops_with_counts_query = (
    'SELECT * '
    'FROM n_stops '
    'JOIN stops ON n_stops.stop_id = stops.stop_id'
)
pd.read_sql_query(stops_with_counts_query, dbout)

# Group stop_times [stop_id]

# stop_times -> stops