[Data source VBZ](https://data.stadt-zuerich.ch/dataset/vbz-fahrgastzahlen-ogd)

[Data source ZVV](https://data.stadt-zuerich.ch/dataset/vbz_fahrplandaten_gtfs)




In [20]:
import math
import re

import pandas as pd
import numpy as np

import requests

import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns 
sns.set()

import networkx as nx

# Render floats with thousands separators and up to 8 significant digits.
pd.set_option("display.float_format", "{:,.8}".format)

In [21]:
# VBZ tables: passenger counts, stops and day types, all read with ";" as delimiter.
df_fahrgastzahlen, df_haltestelle, df_tagtyp = (
    pd.read_csv(csv_name, sep=";")
    for csv_name in ("reisende.csv", "haltestellen.csv", "tagtyp.csv")
)

# ZVV stops file: maps stops to lon/lat so that stops are understood the same
# way as in 1a.
df_stops = pd.read_csv("stops.txt")

**Features Tagtyp**

In [22]:
# Hand-maintained number of days per year for each Tagtyp_Id.
# Fractional results (e.g. 307 * 1/7) are rounded down to whole days below.
days_per_year_per_tagtyp_id = {
    3: 65,
    4: 52,
    5: 52,
    6: 251,
    7: 200,
    8: 307 * 1 / 7,
    9: 307 * 1 / 7,
    10: 307 * 1 / 7,
    11: 307 * 5 / 7,
    13: 32 * 5 / 7,
    14: 125 * 5 / 7,
    15: 125 * 4 / 7,
    16: 125 * 1 / 7,
    17: 125 * 1 / 7,
    18: 125 * 1 / 7,
}

# Truncate every entry to an integer day count.
days_per_year_per_tagtyp_id = dict(
    zip(
        days_per_year_per_tagtyp_id.keys(),
        map(math.floor, days_per_year_per_tagtyp_id.values()),
    )
)

# Two-column lookup frame (Tagtyp_Id -> days_per_year) used for merging later on.
df_tagtyp2 = pd.DataFrame(
    {
        "Tagtyp_Id": list(days_per_year_per_tagtyp_id.keys()),
        "days_per_year": list(days_per_year_per_tagtyp_id.values()),
    }
)
df_tagtyp2.head(5)

Unnamed: 0,Tagtyp_Id,days_per_year
0,3,65
1,4,52
2,5,52
3,6,251
4,7,200


**Features Haltestellen**

In [83]:
def rename_stops(input_name):
    """Return the replacement name for a stop, or the name itself if none is listed.

    Only the three stop names in the table below are rewritten; every other
    value is handed back unchanged.
    """
    replacements = {
        "Zürich, Altes Krematorium": "Zürich, Krematorium Sihlfeld",
        "Dübendorf, Innovationspark": "Dübendorf, Eglishölzli",
        "Kilchberg ZH, Altersheim": "Kilchberg ZH, Hallenbad",
    }
    # Falling back to the key itself leaves unknown names untouched.
    return replacements.get(input_name, input_name)
    
    
# Apply the manual renames so that stop names can be matched against
# Haltestellenlangname in the merge below.
df_stops["stop_name_actual"] = df_stops["stop_name"].map(rename_stops)

# One row per (renamed) stop name: average the coordinates of all entries
# that share the name.
df_stops2 = (
    df_stops
    .groupby(["stop_name_actual"])[["stop_lat", "stop_lon"]]
    .mean()
    .reset_index()
)

# Inner join on the long stop name; stops without a name match on either
# side are dropped.
df_stops3 = pd.merge(
    df_stops2,
    df_haltestelle,
    left_on="stop_name_actual",
    right_on="Haltestellenlangname",
)

df_stops_final = df_stops3[
    [
        "Haltestellen_Id",
        "Haltestellenkurzname",
        "Haltestellenlangname",
        "stop_lat",
        "stop_lon",
    ]
]
df_stops_final.head(5)

Unnamed: 0,Haltestellen_Id,Haltestellenkurzname,Haltestellenlangname,stop_lat,stop_lon
0,352,AHOR,"Adliswil, Ahornweg",47.318928,8.5342153
1,656,ABAD01,"Adliswil, Badstrasse",47.313703,8.5337616
2,658,BADL01,"Adliswil, Bahnhof",47.311643,8.5244311
3,655,EWEG,"Adliswil, Eichenweg",47.320052,8.5336628
4,657,AGRU,"Adliswil, Grundstrasse",47.31295,8.5300516


**Final table for aggregation**

In [84]:
# Keep only the columns needed for the yearly aggregation; the line and trip
# identifiers (Linien_Id, Linienname, Plan_Fahrt_Id) are deliberately left out.
df = df_fahrgastzahlen[
    [
        "Tagtyp_Id",
        "Haltestellen_Id",
        "Nach_Hst_Id",
        "Einsteiger",
        "Aussteiger",
        "Besetzung",
    ]
]

# Stop id 349 is missing from the lookup.
# NOTE(review): all three merges are inner joins, so any row whose Tagtyp_Id,
# Haltestellen_Id or Nach_Hst_Id has no match is dropped silently. If rows
# without a following stop exist (e.g. the last stop of a trip), their
# Einsteiger/Aussteiger would be lost here — confirm against the data.
df2 = (
    df
    .merge(df_tagtyp2[["Tagtyp_Id", "days_per_year"]], on=["Tagtyp_Id"])
    # Attributes of the stop the row belongs to ...
    .merge(df_stops_final, on="Haltestellen_Id")
    # ... and of the following stop; overlapping columns get _von / _nach.
    .merge(
        df_stops_final,
        left_on="Nach_Hst_Id",
        right_on="Haltestellen_Id",
        suffixes=("_von", "_nach"),
    )
    .drop(["Tagtyp_Id", "Nach_Hst_Id"], axis=1)
)

# Missing counts are treated as zero.
df3 = df2.fillna({"Einsteiger": 0, "Aussteiger": 0, "Besetzung": 0})

# Scale each count by the number of days per year and round down to whole
# passengers. np.floor works on the whole column at once instead of calling
# math.floor once per row; the result is the same int64 column.
# NOTE(review): flooring happens per row, before the later groupby sums, so
# the yearly totals are biased slightly downwards — confirm this is intended.
for _measure in ("Einsteiger", "Aussteiger", "Besetzung"):
    df3[f"{_measure}_per_year"] = (
        np.floor(df3[_measure] * df3["days_per_year"]).astype("int64")
    )

df_final = df3.drop("days_per_year", axis=1)
df_final.head(5)

Unnamed: 0,Haltestellen_Id_von,Einsteiger,Aussteiger,Besetzung,Haltestellenkurzname_von,Haltestellenlangname_von,stop_lat_von,stop_lon_von,Haltestellen_Id_nach,Haltestellenkurzname_nach,Haltestellenlangname_nach,stop_lat_nach,stop_lon_nach,Einsteiger_per_year,Aussteiger_per_year,Besetzung_per_year
0,84,1.42,0.0,1.42,BALT,"Zürich, Bahnhof Altstetten",47.39073,8.4889507,320,SEID,"Zürich, Seidelhof",47.391931,8.483712,100,0,100
1,84,2.84,0.0,2.84,BALT,"Zürich, Bahnhof Altstetten",47.39073,8.4889507,320,SEID,"Zürich, Seidelhof",47.391931,8.483712,201,0,201
2,84,4.16,0.0,4.16,BALT,"Zürich, Bahnhof Altstetten",47.39073,8.4889507,320,SEID,"Zürich, Seidelhof",47.391931,8.483712,295,0,295
3,84,12.8,0.0,12.8,BALT,"Zürich, Bahnhof Altstetten",47.39073,8.4889507,320,SEID,"Zürich, Seidelhof",47.391931,8.483712,908,0,908
4,84,17.3,0.0,17.3,BALT,"Zürich, Bahnhof Altstetten",47.39073,8.4889507,320,SEID,"Zürich, Seidelhof",47.391931,8.483712,1228,0,1228


In [89]:
def _sum_per_year(frame, group_columns, value_column, new_names):
    """Sum ``value_column`` of ``frame`` per ``group_columns`` and rename columns.

    Returns a flat frame (group keys as ordinary columns) whose columns are
    renamed according to ``new_names``.
    """
    totals = frame.groupby(group_columns)[[value_column]].sum().reset_index()
    return totals.rename(columns=new_names)


# Yearly boardings per stop.
df_menge_einsteiger = _sum_per_year(
    df_final,
    ["Haltestellenlangname_von", "Haltestellenkurzname_von"],
    "Einsteiger_per_year",
    {
        "Haltestellenlangname_von": "stop_name",
        "Haltestellenkurzname_von": "stop_kuerzel",
    },
)

# Yearly alightings per stop (grouped by the same "_von" stop columns).
df_menge_aussteiger = _sum_per_year(
    df_final,
    ["Haltestellenlangname_von", "Haltestellenkurzname_von"],
    "Aussteiger_per_year",
    {
        "Haltestellenlangname_von": "stop_name",
        "Haltestellenkurzname_von": "stop_kuerzel",
    },
)

# Yearly occupancy per (from stop, to stop) pair.
df_menge_besetzung = _sum_per_year(
    df_final,
    [
        "Haltestellenlangname_von",
        "Haltestellenlangname_nach",
        "Haltestellenkurzname_von",
        "Haltestellenkurzname_nach",
    ],
    "Besetzung_per_year",
    {
        "Haltestellenlangname_von": "stop_name_von",
        "Haltestellenkurzname_von": "stop_kuerzel_von",
        "Haltestellenlangname_nach": "stop_name_nach",
        "Haltestellenkurzname_nach": "stop_kuerzel_nach",
    },
)

In [90]:
# Preview the first five rows of the yearly boardings per stop.
df_menge_einsteiger.head(5)

Unnamed: 0,stop_name,stop_kuerzel,Einsteiger_per_year
0,"Adliswil, Ahornweg",AHOR,48019
1,"Adliswil, Badstrasse",ABAD01,44314
2,"Adliswil, Bahnhof",BADL01,176901
3,"Adliswil, Eichenweg",EWEG,40330
4,"Adliswil, Grundstrasse",AGRU,8489


In [91]:
# Five stop pairs with the highest yearly occupancy.
df_menge_besetzung.sort_values("Besetzung_per_year", ascending=False).head(5)

Unnamed: 0,stop_name_von,stop_name_nach,stop_kuerzel_von,stop_kuerzel_nach,Besetzung_per_year
1419,"Zürich, Rennweg","Zürich, Paradeplatz",RAUG,PARA,10620127
771,"Zürich, Bahnhofstrasse/HB","Zürich, Rennweg",BSTR,RAUG,10362311
1382,"Zürich, Paradeplatz","Zürich, Rennweg",PARA,RAUG,10223229
1418,"Zürich, Rennweg","Zürich, Bahnhofstrasse/HB",RAUG,BSTR,10164296
786,"Zürich, Bellevue","Zürich, Bürkliplatz",BELL,BURK,10060137


In [92]:
# Write the three yearly aggregates as CSV files (paths relative to the
# working directory), leaving out the index column.
for _frame, _csv_path in (
    (df_menge_einsteiger, "1b_menge_einsteiger.csv"),
    (df_menge_aussteiger, "1b_menge_aussteiger.csv"),
    (df_menge_besetzung, "1b_menge_besetzung.csv"),
):
    _frame.to_csv(_csv_path, index=False)