# Data Wrangling

## Spark Loading

In [1]:
# Load the sparkmagic extension; the `spark` line/cell magics used by the cells below come from it.
%load_ext sparkmagic.magics

In [2]:
import pickle
# Relative location of the local data directory.
# NOTE(review): neither `pickle` nor `path_data` is used in the visible part of
# this notebook — confirm they are needed further down.
path_data = "../data/"

In [3]:
# Shell escape: download the Git LFS-tracked files of this repository.
! git lfs pull

In [4]:
import os
from IPython import get_ipython

# The session is named after the Renku user; the endpoint is passed to `%spark add` later on.
username = os.environ['RENKU_USERNAME']
server = "http://iccluster029.iccluster.epfl.ch:8998"

# Session configuration as a JSON document; the application name is
# "<your_gaspar_id>-homework3". Doubled braces are literal braces for str.format.
session_config = """{{ "name": "{0}-homework3", "executorMemory": "4G", "executorCores": 4, "numExecutors": 10, "driverMemory": "4G"}}""".format(username)

# Equivalent to running a `%%spark config` cell with the JSON above as its body.
get_ipython().run_cell_magic('spark', line='config', cell=session_config)

In [5]:
# Arguments of `%spark add`: session name, language, endpoint URL (flags reproduced as in the magic call).
add_session_args = "add -s {0}-homework3 -l python -u {1} -k".format(username, server)
get_ipython().run_line_magic("spark", add_session_args)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
6605,application_1652960972356_2148,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


In [6]:
%%spark
# Sanity check run in the remote session: report the Spark version in use.
print('We are using Spark %s' % spark.version)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

We are using Spark 2.3.2.3.1.4.0-315

## Stops within 15 km of Zürich HB (8503000), (lat, lon) = (47.378177, 8.540192)

In [7]:
%%spark
# Read the stop locations CSV. No header option is passed, so the columns get
# the default names _c0 ... _c5 (visible in the output of this cell).
all_stops = spark.read.csv('/data/sbb/csv/allstops/stop_locations.csv')
all_stops.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+--------------------+----------------+----------------+----+----+
|    _c0|                 _c1|             _c2|             _c3| _c4| _c5|
+-------+--------------------+----------------+----------------+----+----+
|1100006|Zell (Wiesental),...|47.7046317515335|7.84777215468897|null|null|
|1100008|Zell (Wiesental),...|47.7100842702352|7.85964788274668|null|null|
|1100009|Zell (Wiesental),...|47.7131911044794|7.86290876722849|null|null|
|1100010|           Atzenbach|47.7146175266411| 7.8723500608659|null|null|
|1100011|     Mambach, Brücke|47.7282088873189| 7.8774704579861|null|null|
|1100012|  Mambach, Mühlschau|47.7340818684375| 7.8813871126254|null|null|
|1100013|  Mambach, Silbersau|47.7395192233867|7.88223152899259|null|null|
|1100014|Fröhnd (Schwarzw)...|47.7543663509316|7.88913059037559|null|null|
|1100015|Fröhnd (Schwarzw)...|47.7605926689054|7.88553732923861|null|null|
|1100016|     Wembach (Baden)|47.7728317637339|7.88772023537933|null|null|
+-------+----------------

In [8]:
%%spark
all_stops = all_stops.withColumnRenamed('_c0','stop_id')
all_stops = all_stops.withColumnRenamed('_c1','stop_name')
all_stops = all_stops.withColumnRenamed('_c2','lat')
all_stops = all_stops.withColumnRenamed('_c3','lon')
all_stops = all_stops.drop('_c4','_c5')
all_stops.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+--------------------+----------------+----------------+
|stop_id|           stop_name|             lat|             lon|
+-------+--------------------+----------------+----------------+
|1100006|Zell (Wiesental),...|47.7046317515335|7.84777215468897|
|1100008|Zell (Wiesental),...|47.7100842702352|7.85964788274668|
|1100009|Zell (Wiesental),...|47.7131911044794|7.86290876722849|
|1100010|           Atzenbach|47.7146175266411| 7.8723500608659|
|1100011|     Mambach, Brücke|47.7282088873189| 7.8774704579861|
|1100012|  Mambach, Mühlschau|47.7340818684375| 7.8813871126254|
|1100013|  Mambach, Silbersau|47.7395192233867|7.88223152899259|
|1100014|Fröhnd (Schwarzw)...|47.7543663509316|7.88913059037559|
|1100015|Fröhnd (Schwarzw)...|47.7605926689054|7.88553732923861|
|1100016|     Wembach (Baden)|47.7728317637339|7.88772023537933|
+-------+--------------------+----------------+----------------+
only showing top 10 rows

In [9]:
%%spark
import pyspark.sql.functions as F
zur_hp_lat = 47.378177
zur_hp_lon = 8.540192

all_stops_distance = all_stops.withColumn("a", (
        F.pow(F.sin(F.radians(zur_hp_lat - F.col("lat")) / 2), 2) +
        F.cos(F.radians(F.col("lat"))) * F.cos(F.radians(F.lit(zur_hp_lat))) *
        F.pow(F.sin(F.radians(zur_hp_lon - F.col("lon")) / 2), 2)
    )).withColumn("distance", F.atan2(F.sqrt(F.col("a")), F.sqrt(-F.col("a") + 1)) * 12742000)

all_stops_distance.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+--------------------+----------------+----------------+--------------------+------------------+
|stop_id|           stop_name|             lat|             lon|                   a|          distance|
+-------+--------------------+----------------+----------------+--------------------+------------------+
|1100006|Zell (Wiesental),...|47.7046317515335|7.84777215468897|2.475398503604094...|  63396.0142787592|
|1100008|Zell (Wiesental),...|47.7100842702352|7.85964788274668|2.445985737384344...| 63018.24892966985|
|1100009|Zell (Wiesental),...|47.7131911044794|7.86290876722849|2.446306465224431...| 63022.38043559904|
|1100010|           Atzenbach|47.7146175266411| 7.8723500608659|2.409494241610368E-5|62546.396144882805|
|1100011|     Mambach, Brücke|47.7282088873189| 7.8774704579861|2.456510835903595E-5| 63153.68854022285|
|1100012|  Mambach, Mühlschau|47.7340818684375| 7.8813871126254|2.469960438154199...| 63326.33982721011|
|1100013|  Mambach, Silbersau|47.7395192233867|7.882231

In [10]:
%%spark
close_stops = all_stops_distance.filter(all_stops_distance['distance'] <= 15000).drop('a','distance')
close_stops.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------+--------------------+----------------+----------------+
|      stop_id|           stop_name|             lat|             lon|
+-------------+--------------------+----------------+----------------+
|          176|Zimmerberg-Basist...|47.3516780901371|8.52195777551452|
|      8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317|
|      8502075|Zürich Flughafen,...|47.4510244676285|8.56372943623189|
|      8502186|Dietikon Stoffelbach|47.3933267759652|8.39896044679575|
|    8502186:0|Dietikon Stoffelbach|47.3933997509195|8.39894248049007|
|8502186:0:1/2|Dietikon Stoffelbach|47.3933997509195|8.39894248049007|
|     8502186P|Dietikon Stoffelbach|47.3933997509195|8.39894248049007|
|      8502187|Rudolfstetten Hof...|47.3646702178563|8.37695172233176|
|    8502187:0|Rudolfstetten Hof...|47.3647371479356|8.37703257070734|
|8502187:0:1/2|Rudolfstetten Hof...|47.3647371479356|8.37703257070734|
+-------------+--------------------+----------------+----------------+
only s

In [11]:
%%spark
# Number of stops kept by the distance filter.
close_stops.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2122

## Weekly timetable

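Timetable snapshot: 2019-05-08. Considered period: services valid from May 13 through May 18, 2019, restricted to those that run on at least one weekday (Monday–Friday); weekend-only services are excluded.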

In [12]:
%%spark
# Load the stop_times file stored under the 2019/05/08 path; the header row supplies the column names.
all_trips1 = spark.read.csv('/data/sbb/csv/stop_times/2019/05/08/stop_times.txt', header=True)
# Loading of a second file (2019/05/15) is currently disabled.
#all_trips2 = spark.read.csv('/data/sbb/csv/stop_times/2019/05/15/stop_times.txt', header=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
%%spark
# Total number of stop_times rows loaded.
all_trips1.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

10862563

In [14]:
%%spark
# Disabled together with the loading of all_trips2; this cell currently runs no code.
#all_trips2.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
%%spark
# Count the stop_times rows whose arrival_time equals their departure_time.
all_trips1.filter(all_trips1.arrival_time==all_trips1.departure_time).count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

10413974

In [16]:
%%spark
# The union with the second file is disabled, so the "total" frame is just all_trips1.
#all_trips_tot = all_trips1.union(all_trips2)
all_trips_tot = all_trips1
all_trips_tot.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+------------+--------------+-------+-------------+-----------+-------------+
|            trip_id|arrival_time|departure_time|stop_id|stop_sequence|pickup_type|drop_off_type|
+-------------------+------------+--------------+-------+-------------+-----------+-------------+
|1.TA.1-84-j19-1.1.H|    06:13:00|      06:13:00|8572249|            1|          0|            0|
|1.TA.1-84-j19-1.1.H|    06:14:00|      06:14:00|8577508|            2|          0|            0|
|1.TA.1-84-j19-1.1.H|    06:15:00|      06:15:00|8581070|            3|          0|            0|
|1.TA.1-84-j19-1.1.H|    06:16:00|      06:16:00|8578360|            4|          0|            0|
|1.TA.1-84-j19-1.1.H|    06:17:00|      06:17:00|8583448|            5|          0|            0|
|1.TA.1-84-j19-1.1.H|    06:18:00|      06:19:00|8578359|            6|          0|            0|
|1.TA.1-84-j19-1.1.H|    06:24:00|      06:24:00|8578358|            7|          0|            0|
|1.TA.1-84-j19-1.1.H

In [17]:
%%spark
# Load the trips file stored under the 2019/05/08 path (maps trip_id to route_id and service_id).
routes1 = spark.read.csv('/data/sbb/csv/trips/2019/05/08/trips.txt', header=True)
# The second file and the union are disabled, so routes_tot is just routes1.
#routes2 = spark.read.csv('/data/sbb/csv/trips/2019/05/15/trips.txt', header=True)
#routes_tot = routes1.union(routes2)
routes_tot = routes1
routes_tot.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+----------+--------------------+-------------------+---------------+------------+
|   route_id|service_id|             trip_id|      trip_headsign|trip_short_name|direction_id|
+-----------+----------+--------------------+-------------------+---------------+------------+
| 1-85-j19-1|  TA+b0001| 2.TA.1-85-j19-1.1.H|Schöftland, Bahnhof|          85003|           0|
|1-1-C-j19-1|  TA+b0001|5.TA.1-1-C-j19-1.3.R| Zofingen, Altachen|            108|           1|
|1-1-C-j19-1|  TA+b0001|7.TA.1-1-C-j19-1.3.R| Zofingen, Altachen|            112|           1|
|1-1-C-j19-1|  TA+b0001|9.TA.1-1-C-j19-1.3.R| Zofingen, Altachen|            116|           1|
|1-1-C-j19-1|  TA+b0001|11.TA.1-1-C-j19-1...| Zofingen, Altachen|            120|           1|
|1-1-C-j19-1|  TA+b0001|13.TA.1-1-C-j19-1...| Zofingen, Altachen|            124|           1|
|1-1-C-j19-1|  TA+b0001|15.TA.1-1-C-j19-1...| Zofingen, Altachen|            128|           1|
|1-1-C-j19-1|  TA+b0001|17.TA.1-1-C-j19-1...| Zofi

In [18]:
%%spark
trips_with_routes = all_trips_tot.join(routes_tot, all_trips_tot.trip_id == routes_tot.trip_id)
trips_with_routes.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+------------+--------------+------------+-------------+-----------+-------------+------------+----------+--------------------+--------------+---------------+------------+
|             trip_id|arrival_time|departure_time|     stop_id|stop_sequence|pickup_type|drop_off_type|    route_id|service_id|             trip_id| trip_headsign|trip_short_name|direction_id|
+--------------------+------------+--------------+------------+-------------+-----------+-------------+------------+----------+--------------------+--------------+---------------+------------+
|1.TA.12-1-A-j19-1...|    01:30:00|      01:30:00|8505000:0:11|            1|          0|            0|12-1-A-j19-1|  TA+b099q|1.TA.12-1-A-j19-1...|        Sursee|          31100|           0|
|1.TA.12-1-A-j19-1...|    01:35:00|      01:35:00| 8502021:0:1|            2|          0|            0|12-1-A-j19-1|  TA+b099q|1.TA.12-1-A-j19-1...|        Sursee|          31100|           0|
|1.TA.12-1-A-j19-1...|    01:36:00|

In [19]:
%%spark
# Reduce the joined frame to the distinct (trip_id, service_id) pairs.
# trip_id is referenced through all_trips_tot because the join above left two
# columns named trip_id in trips_with_routes.
trips_with_routes = trips_with_routes.select(all_trips_tot.trip_id, 'service_id').distinct()
trips_with_routes.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+----------+
|             trip_id|service_id|
+--------------------+----------+
|1.TA.12-1-A-j19-1...|  TA+b099q|
|1.TA.16-440-j19-1...|  TA+b0007|
|1.TA.23-792-j19-1...|  TA+b000o|
|1.TA.26-18-j19-1.1.H|  TA+b0b46|
|1.TA.26-833-j19-1...|  TA+b002d|
|1.TA.26-853-j19-1...|  TA+b001t|
|1.TA.30-32-Y-j19-...|  TA+b09sz|
| 1.TA.5-21-j19-1.1.H|  TA+b08w1|
|1.TA.6-M13-j19-1.1.H|  TA+b0bou|
|1.TA.61-211-Y-j19...|  TA+b0786|
+--------------------+----------+
only showing top 10 rows

In [20]:
%%spark
# Load the calendar file stored under the 2019/05/08 path: one row per
# service_id with weekday flags and a start_date/end_date validity range.
cal1 = spark.read.csv('/data/sbb/csv/calendar/2019/05/08/calendar.txt', header=True)
# The second file and the union are disabled; only duplicates within cal1 are removed.
#cal2 = spark.read.csv('/data/sbb/csv/calendar/2019/05/15/calendar.txt', header=True)
#cal_tot = cal1.union(cal2).distinct()
cal_tot = cal1.distinct()
cal_tot.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+------+-------+---------+--------+------+--------+------+----------+--------+
|service_id|monday|tuesday|wednesday|thursday|friday|saturday|sunday|start_date|end_date|
+----------+------+-------+---------+--------+------+--------+------+----------+--------+
|  TA+b06hv|     0|      0|        0|       0|     0|       0|     1|  20181209|20191214|
|  TA+b0036|     0|      0|        0|       0|     0|       0|     1|  20181209|20191214|
|  TA+b0ha1|     0|      0|        0|       0|     0|       1|     0|  20181209|20191214|
|  TA+b0nha|     1|      1|        1|       1|     0|       0|     1|  20181209|20191214|
|  TA+b022q|     1|      1|        1|       1|     0|       0|     0|  20181209|20191214|
|  TA+b025m|     0|      0|        0|       0|     1|       0|     0|  20181209|20191214|
|  TA+b0ms4|     0|      0|        0|       0|     0|       0|     1|  20181209|20191214|
|  TA+b0nvl|     0|      0|        0|       0|     0|       1|     1|  20181209|20191214|
|  TA+b0nz

In [21]:
%%spark
cal_filtered = cal_tot.filter((cal_tot.monday == 1) | (cal_tot.tuesday == 1) | (cal_tot.wednesday == 1) | (cal_tot.thursday == 1) | (cal_tot.friday == 1))
cal_filtered_dates = cal_filtered.filter((cal_tot.start_date <= '20190513') & (cal_tot.end_date >= '20190518'))
cal_filtered_dates.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+------+-------+---------+--------+------+--------+------+----------+--------+
|service_id|monday|tuesday|wednesday|thursday|friday|saturday|sunday|start_date|end_date|
+----------+------+-------+---------+--------+------+--------+------+----------+--------+
|  TA+b0nha|     1|      1|        1|       1|     0|       0|     1|  20181209|20191214|
|  TA+b022q|     1|      1|        1|       1|     0|       0|     0|  20181209|20191214|
|  TA+b025m|     0|      0|        0|       0|     1|       0|     0|  20181209|20191214|
|  TA+b0nz0|     0|      0|        0|       0|     1|       1|     0|  20181209|20191214|
|  TA+b0o2i|     1|      0|        0|       1|     0|       0|     0|  20181209|20191214|
|  TA+b0obi|     1|      1|        1|       1|     1|       0|     0|  20181209|20191214|
|  TA+b0obp|     0|      1|        0|       0|     1|       1|     1|  20181209|20191214|
|  TA+b0gtx|     1|      1|        1|       1|     1|       0|     1|  20181209|20191214|
|  TA+b0g2

In [22]:
%%spark
# Drop the validity dates and the weekend flags; service_id and the
# Monday-Friday flags remain.
service_ids = cal_filtered_dates.drop('start_date', 'end_date', 'saturday', 'sunday')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [23]:
%%spark
service_ids.select('service_id')
trips_in_period = trips_with_routes.join(service_ids, service_ids.service_id == trips_with_routes.service_id,'inner').drop(trips_with_routes.service_id).distinct()
trips_in_period.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+----------+------+-------+---------+--------+------+
|             trip_id|service_id|monday|tuesday|wednesday|thursday|friday|
+--------------------+----------+------+-------+---------+--------+------+
|1.TA.12-1-A-j19-1...|  TA+b099q|     0|      1|        1|       0|     1|
|1.TA.16-440-j19-1...|  TA+b0007|     1|      1|        1|       1|     1|
|1.TA.23-792-j19-1...|  TA+b000o|     0|      0|        0|       0|     1|
|1.TA.26-18-j19-1.1.H|  TA+b0b46|     1|      1|        1|       1|     1|
|1.TA.26-833-j19-1...|  TA+b002d|     0|      0|        0|       0|     1|
|1.TA.30-32-Y-j19-...|  TA+b09sz|     0|      1|        0|       0|     0|
|1.TA.6-M13-j19-1.1.H|  TA+b0bou|     0|      0|        0|       0|     1|
|1.TA.61-211-Y-j19...|  TA+b0786|     1|      1|        1|       1|     1|
|1.TA.61-40-Y-j19-...|  TA+b0001|     1|      1|        1|       1|     1|
|1.TA.61-479-Y-j19...|  TA+b04d0|     0|      0|        1|       1|     1|
+--------------------+---

In [24]:
%%spark
# Keep only the stop_times rows whose trip is in trips_in_period, attaching
# service_id and the weekday flags. The trip_id column coming from
# all_trips_tot is dropped so that a single trip_id column remains (see the
# column order in the output).
# NOTE(review): trips_in_period is itself derived from all_trips_tot, so the
# two trip_id references in the join condition share a lineage — confirm this
# resolves as intended on the Spark version in use.
final_trips = all_trips_tot.join(trips_in_period, trips_in_period.trip_id == all_trips_tot.trip_id).distinct().drop(all_trips_tot.trip_id)
final_trips.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+--------------+------------+-------------+-----------+-------------+--------------------+----------+------+-------+---------+--------+------+
|arrival_time|departure_time|     stop_id|stop_sequence|pickup_type|drop_off_type|             trip_id|service_id|monday|tuesday|wednesday|thursday|friday|
+------------+--------------+------------+-------------+-----------+-------------+--------------------+----------+------+-------+---------+--------+------+
|    01:30:00|      01:30:00|8505000:0:11|            1|          0|            0|1.TA.12-1-A-j19-1...|  TA+b099q|     0|      1|        1|       0|     1|
|    01:35:00|      01:35:00| 8502021:0:1|            2|          0|            0|1.TA.12-1-A-j19-1...|  TA+b099q|     0|      1|        1|       0|     1|
|    01:36:00|      01:37:00| 8502028:0:1|            3|          0|            0|1.TA.12-1-A-j19-1...|  TA+b099q|     0|      1|        1|       0|     1|
|    01:39:00|      01:39:00| 8502012:0:1|            4|        

In [25]:
%%spark
# Row count before the service/period filter, for comparison with final_trips.
all_trips_tot.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

10862563

In [26]:
%%spark
# Row count after the service/period filter.
final_trips.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

7008267

### Considering working hours from 9:00 to 16:59

In [27]:
%%spark
import pyspark.sql.functions as F

final_trips = final_trips.filter(F.hour(final_trips.arrival_time).between(9,17) & F.hour(final_trips.departure_time).between(9,17))
final_trips.show(10, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+--------------+-----------+-------------+-----------+-------------+--------------------+----------+------+-------+---------+--------+------+
|arrival_time|departure_time|stop_id    |stop_sequence|pickup_type|drop_off_type|trip_id             |service_id|monday|tuesday|wednesday|thursday|friday|
+------------+--------------+-----------+-------------+-----------+-------------+--------------------+----------+------+-------+---------+--------+------+
|10:41:00    |10:41:00      |8503064    |1            |0          |0            |1.TA.26-18-j19-1.1.H|TA+b0b46  |1     |1      |1        |1       |1     |
|10:45:00    |10:45:00      |8503065:0:1|2            |0          |0            |1.TA.26-18-j19-1.1.H|TA+b0b46  |1     |1      |1        |1       |1     |
|10:46:00    |10:46:00      |8503074    |3            |0          |0            |1.TA.26-18-j19-1.1.H|TA+b0b46  |1     |1      |1        |1       |1     |
|10:47:00    |10:47:00      |8503068    |4            |0          |0  

In [28]:
%%spark
# Row count after the working-hours filter.
final_trips.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

3777527

### Keeping only trip_ids that contain a stop in the area of Zurich

In [29]:
%%spark
final_close_trips_ids = final_trips.join(close_stops, final_trips.stop_id == close_stops.stop_id , 'inner').select('trip_id').distinct()
final_close_trips_ids.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+
|             trip_id|
+--------------------+
|1.TA.26-18-j19-1.1.H|
|1.TA.63-138-Y-j19...|
|1001.TA.26-213-j1...|
|1014.TA.26-520-j1...|
|1017.TA.26-151-j1...|
|1023.TA.26-4-j19-...|
|1029.TA.26-LAF-j1...|
|103.TA.26-510-j19...|
|1039.TA.26-200-j1...|
|1082.TA.26-201-j1...|
+--------------------+
only showing top 10 rows

In [30]:
%%spark
# 1) Bring back every stop_times row of the trips selected above; the trip_id
#    column of the id list is dropped so a single trip_id column remains.
final_close_trips1 = final_trips.join(final_close_trips_ids, final_trips.trip_id == final_close_trips_ids.trip_id).drop(final_close_trips_ids.trip_id).distinct()
# 2) Attach the distance of every stop; the stop_id column kept is the one
#    coming from all_stops_distance.
final_close_trips2 = final_close_trips1.join(all_stops_distance, all_stops_distance.stop_id == final_close_trips1.stop_id).drop(final_close_trips1.stop_id).distinct()
# 3) Keep the rows whose stop has distance <= 18000 and drop the helper and
#    location columns.
# NOTE(review): this threshold (18000) is wider than the 15000 used to build
# close_stops — confirm the larger radius is intended.
final_close_trips_filtered = final_close_trips2.filter(all_stops_distance['distance'] <= 18000).drop('a','distance', 'lat', 'lon', 'stop_name')
final_close_trips_filtered.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+--------------+-------------+-----------+-------------+----------+------+-------+---------+--------+------+--------------------+------------+
|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|service_id|monday|tuesday|wednesday|thursday|friday|             trip_id|     stop_id|
+------------+--------------+-------------+-----------+-------------+----------+------+-------+---------+--------+------+--------------------+------------+
|    10:41:00|      10:41:00|            1|          0|            0|  TA+b0b46|     1|      1|        1|       1|     1|1.TA.26-18-j19-1.1.H|     8503064|
|    10:45:00|      10:45:00|            2|          0|            0|  TA+b0b46|     1|      1|        1|       1|     1|1.TA.26-18-j19-1.1.H| 8503065:0:1|
|    10:46:00|      10:46:00|            3|          0|            0|  TA+b0b46|     1|      1|        1|       1|     1|1.TA.26-18-j19-1.1.H|     8503074|
|    10:47:00|      10:47:00|            4|          0|         

In [31]:
%%spark
# Mark the frame for caching since several tables below are derived from it;
# the count() action that follows evaluates it.
final_close_trips_filtered.cache()
final_close_trips_filtered.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

535866

## Create timetable

As requested by the paper.

### stops Table

In [32]:
%%spark
stops_table = final_close_trips_filtered.select(final_close_trips_filtered.stop_id).distinct()
stops_table.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+
|    stop_id|
+-----------+
|    8591190|
|    8506895|
|    8591284|
|    8573729|
|    8503376|
|8503306:0:2|
|    8502508|
|    8589111|
|    8503078|
|    8590819|
+-----------+
only showing top 10 rows

In [33]:
%%spark

all_stops = spark.read.csv('/data/sbb/csv/allstops/stop_locations.csv')

all_stops = all_stops.withColumnRenamed('_c0','stop_id')
all_stops = all_stops.withColumnRenamed('_c1','stop_name')
all_stops = all_stops.withColumnRenamed('_c2','stop_lat')
all_stops = all_stops.withColumnRenamed('_c3','stop_lon')
all_stops = all_stops.withColumnRenamed('_c4','location_type')
all_stops = all_stops.withColumnRenamed('_c5','parent_station')
all_stops = all_stops.drop('location_type')

all_stops_full = all_stops.join(stops_table, "stop_id" ,"outer")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [34]:
%%spark
# Preview the stops table.
all_stops_full.show(10)
#all_stops_full.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+--------------------+----------------+----------------+--------------+
|stop_id|           stop_name|        stop_lat|        stop_lon|parent_station|
+-------+--------------------+----------------+----------------+--------------+
|1100150|Schlächtenhaus, S...|47.6898550981112|7.74233688977719|          null|
|1100376|Brombach (LÖ), Fr...|47.6314693778999|7.68696473565635|          null|
|1100411|Häg-Ehrsberg, Ehr...|  47.74319213737|7.90619858077624|          null|
|1101439|   Gailingen, Schule|47.6948133619046|8.75378600091951|          null|
|1101511|Hoppetenzell, Sar...|47.8931911144564|9.01140485813517|          null|
|1101517|           Igelswies|48.0050619291514|9.13811222897787|          null|
|1101664|  Liptingen, Rathaus|47.9384005993389|8.90860165700622|          null|
|1102913|Lindau (D), Westl...|47.5459998093133|9.67818836101854|          null|
|1103354|Tettnang, Bärenplatz|47.6711364278945|9.59007261478699|          null|
|1103389|Tettnang, Gut Kal...|47.6844488

In [35]:
%%spark
# Export of the stops table as ';'-delimited CSV with header; currently
# disabled, so this cell runs no code.
#all_stops_full.write.mode('overwrite').option("delimiter", ";").option("header","true").format("csv").save("/group/five-guys/stops_table")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### connections Table

In [36]:
%%spark
trip_ids_with_days_table = final_close_trips_filtered.select('trip_id', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday').distinct()
trip_ids_with_days_table.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+------+-------+---------+--------+------+
|             trip_id|monday|tuesday|wednesday|thursday|friday|
+--------------------+------+-------+---------+--------+------+
|1.TA.26-18-j19-1.1.H|     1|      1|        1|       1|     1|
|1.TA.63-138-Y-j19...|     0|      0|        1|       0|     0|
|1001.TA.26-213-j1...|     0|      0|        0|       0|     1|
|1014.TA.26-520-j1...|     1|      1|        1|       1|     0|
|1017.TA.26-151-j1...|     1|      1|        1|       1|     0|
|1023.TA.26-4-j19-...|     1|      1|        1|       1|     1|
|1029.TA.26-LAF-j1...|     1|      1|        1|       1|     0|
|103.TA.26-510-j19...|     0|      0|        0|       0|     1|
|1039.TA.26-200-j1...|     0|      0|        0|       0|     1|
|1082.TA.26-201-j1...|     0|      0|        0|       0|     1|
+--------------------+------+-------+---------+--------+------+
only showing top 10 rows

In [37]:
%%spark
working_table = final_close_trips_filtered.withColumn('stop_sequence',final_close_trips_filtered.stop_sequence.cast('int')).orderBy('trip_id', 'stop_sequence').drop('service_id', 'pickup_type', 'drop_off_type', 'monday',  'tuesday', 'thursday', 'wednesday', 'thursday', 'friday')
working_table = working_table.withColumnRenamed('stop_id','departure_id')
working_table = working_table.withColumnRenamed('arrival_time','old_arrival_time')
working_table.show(30)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+--------------+-------------+--------------------+------------+
|old_arrival_time|departure_time|stop_sequence|             trip_id|departure_id|
+----------------+--------------+-------------+--------------------+------------+
|        15:10:00|      15:10:00|            1|1.TA.1-245-j19-1.1.H|     8573710|
|        15:11:00|      15:11:00|            2|1.TA.1-245-j19-1.1.H|     8591341|
|        15:13:00|      15:13:00|            3|1.TA.1-245-j19-1.1.H|     8502572|
|        15:15:00|      15:15:00|            4|1.TA.1-245-j19-1.1.H|     8580912|
|        15:17:00|      15:17:00|            5|1.TA.1-245-j19-1.1.H|     8503610|
|        15:18:00|      15:18:00|            6|1.TA.1-245-j19-1.1.H|     8573711|
|        15:21:00|      15:21:00|            7|1.TA.1-245-j19-1.1.H|     8503709|
|        15:23:00|      15:23:00|            8|1.TA.1-245-j19-1.1.H|     8503699|
|        15:24:00|      15:24:00|            9|1.TA.1-245-j19-1.1.H|     8503580|
|        15:25:0

In [38]:
%%spark
import pyspark.sql.functions as F
from pyspark.sql.window import Window

w = Window().partitionBy('trip_id').orderBy('stop_sequence')
table_a = working_table.select("*",F.lag("old_arrival_time", -1).over(w).alias("arrival_time"), F.lag("departure_id", -1).over(w).alias("arrival_id")).na.drop()
table_a.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+--------------+-------------+--------------------+------------+------------+-----------+
|old_arrival_time|departure_time|stop_sequence|             trip_id|departure_id|arrival_time| arrival_id|
+----------------+--------------+-------------+--------------------+------------+------------+-----------+
|        10:41:00|      10:41:00|            1|1.TA.26-18-j19-1.1.H|     8503064|    10:45:00|8503065:0:1|
|        10:45:00|      10:45:00|            2|1.TA.26-18-j19-1.1.H| 8503065:0:1|    10:46:00|    8503074|
|        10:46:00|      10:46:00|            3|1.TA.26-18-j19-1.1.H|     8503074|    10:47:00|    8503068|
|        10:47:00|      10:47:00|            4|1.TA.26-18-j19-1.1.H|     8503068|    10:48:00|    8503066|
|        10:48:00|      10:48:00|            5|1.TA.26-18-j19-1.1.H|     8503066|    10:50:00|    8503075|
|        10:50:00|      10:50:00|            6|1.TA.26-18-j19-1.1.H|     8503075|    10:52:00|    8503067|
|        10:52:00|      10:52:00|    

In [39]:
%%spark
import pyspark.sql.functions as F
from pyspark.sql.window import Window

final_table = table_a.join(trip_ids_with_days_table, table_a.trip_id == trip_ids_with_days_table.trip_id)
connections_table = final_table.drop('old_arrival_time', 'stop_sequence').select('departure_id', 'arrival_id', 'departure_time', 'arrival_time', table_a.trip_id, 'monday', 'tuesday', 'wednesday', 'thursday', 'friday')
connections_table.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+-----------+--------------+------------+--------------------+------+-------+---------+--------+------+
|departure_id| arrival_id|departure_time|arrival_time|             trip_id|monday|tuesday|wednesday|thursday|friday|
+------------+-----------+--------------+------------+--------------------+------+-------+---------+--------+------+
|     8591304|    8502495|      13:05:00|    13:06:00|32.TA.26-185-j19-...|     0|      1|        1|       0|     0|
|     8591279|    8591304|      13:04:00|    13:05:00|32.TA.26-185-j19-...|     0|      1|        1|       0|     0|
|     8591111|    8591279|      13:01:00|    13:04:00|32.TA.26-185-j19-...|     0|      1|        1|       0|     0|
|     8590477|    8591111|      12:59:00|    13:01:00|32.TA.26-185-j19-...|     0|      1|        1|       0|     0|
|     8590679|    8590477|      12:56:00|    12:59:00|32.TA.26-185-j19-...|     0|      1|        1|       0|     0|
|     8590476|    8590679|      12:55:00|    12:56:00|32.TA.26-1

In [40]:
%%spark
# Sanity check: count the connections whose arrival_time is smaller than their
# departure_time (the recorded result is 0).
connections_table.filter(connections_table.arrival_time < connections_table.departure_time).count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0

In [41]:
%%spark
# Total number of connections.
# NOTE(review): the recorded run of this cell was interrupted by the user, so
# no count is shown in the output.
connections_table.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Interrupted by user


## Full table

In [42]:
%%spark
from pyspark.sql.functions import desc
connections_table = connections_table.orderBy(desc("departure_time"))
connections_table.show(10)
#ordering of columns
#connections_table = connections_table_monday_full_sorted.select("departure_id", "arrival_id", "departure_time", "arrival_time", "trip_id")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+----------+--------------+------------+--------------------+------+-------+---------+--------+------+
|departure_id|arrival_id|departure_time|arrival_time|             trip_id|monday|tuesday|wednesday|thursday|friday|
+------------+----------+--------------+------------+--------------------+------+-------+---------+--------+------+
|     8502879|   8570706|      17:59:00|    17:59:00|1045.TA.26-215-j1...|     0|      0|        0|       0|     1|
|     8502879|   8570706|      17:59:00|    17:59:00|1415.TA.26-215-j1...|     1|      1|        1|       1|     0|
|     8591175|   8591273|      17:59:00|    17:59:00|174.TA.26-61-j19-...|     1|      1|        1|       1|     1|
|     8503690|   8583053|      17:59:00|    17:59:00|234.TA.26-236-j19...|     0|      0|        0|       0|     1|
|     8502879|   8570706|      17:59:00|    17:59:00|1580.TA.26-215-j1...|     1|      1|        1|       1|     0|
|     8502879|   8570706|      17:59:00|    17:59:00|1198.TA.26-215-j1..

In [None]:
%%spark
# Export of connections_table as CSV (with a header row) to /group/five-guys/conn_table.
# NOTE(review): the write is commented out, so running this cell persists nothing;
# uncomment the line below to (over)write the export.
#connections_table.write.mode('overwrite').option("header","true").format("csv").save("/group/five-guys/conn_table")

### Trips table

In [43]:
%%spark
#final_close_trips_filtered = final_close_trips.filter(final_close_trips.arrival_time>final_close_trips.departure_time)
trips_table = final_close_trips_filtered.select(final_close_trips_filtered.trip_id).distinct()
trips_table.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+
|             trip_id|
+--------------------+
|1.TA.26-18-j19-1.1.H|
|1.TA.63-138-Y-j19...|
|1001.TA.26-213-j1...|
|1014.TA.26-520-j1...|
|1017.TA.26-151-j1...|
|1023.TA.26-4-j19-...|
|1029.TA.26-LAF-j1...|
|103.TA.26-510-j19...|
|1039.TA.26-200-j1...|
|1082.TA.26-201-j1...|
+--------------------+
only showing top 10 rows

In [44]:
%%spark 
#trips as described in the readme, but filtered for time and distance

trips_table_filtered = routes_tot.join(trips_table, "trip_id" ,"inner")
trips_table_filtered.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-----------+----------+--------------------+---------------+------------+
|             trip_id|   route_id|service_id|       trip_headsign|trip_short_name|direction_id|
+--------------------+-----------+----------+--------------------+---------------+------------+
|9.TA.1-217-j19-1.1.H|1-217-j19-1|  TA+b0001|Affoltern a. A., ...|          21719|           0|
|15.TA.1-217-j19-1...|1-217-j19-1|  TA+b0001|Affoltern a. A., ...|          21731|           0|
|17.TA.1-217-j19-1...|1-217-j19-1|  TA+b0001|Affoltern a. A., ...|          21735|           0|
|19.TA.1-217-j19-1...|1-217-j19-1|  TA+b0001|Affoltern a. A., ...|          21739|           0|
|21.TA.1-217-j19-1...|1-217-j19-1|  TA+b0001|Affoltern a. A., ...|          21743|           0|
|23.TA.1-217-j19-1...|1-217-j19-1|  TA+b0001|Affoltern a. A., ...|          21747|           0|
|11.TA.1-217-j19-1...|1-217-j19-1|  TA+b0001|Affoltern a. A., ...|          21723|           0|
|25.TA.1-217-j19-1...|1-217-j19-1|  TA+b

In [45]:
%%spark
trips_table_filtered = trips_table_filtered.select("route_id", "service_id", "trip_id", "trip_headsign", "trip_short_name", "direction_id")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+----------+--------------------+--------------------+---------------+------------+
|   route_id|service_id|             trip_id|       trip_headsign|trip_short_name|direction_id|
+-----------+----------+--------------------+--------------------+---------------+------------+
|1-217-j19-1|  TA+b0001|9.TA.1-217-j19-1.1.H|Affoltern a. A., ...|          21719|           0|
|1-217-j19-1|  TA+b0001|15.TA.1-217-j19-1...|Affoltern a. A., ...|          21731|           0|
|1-217-j19-1|  TA+b0001|17.TA.1-217-j19-1...|Affoltern a. A., ...|          21735|           0|
|1-217-j19-1|  TA+b0001|19.TA.1-217-j19-1...|Affoltern a. A., ...|          21739|           0|
|1-217-j19-1|  TA+b0001|21.TA.1-217-j19-1...|Affoltern a. A., ...|          21743|           0|
+-----------+----------+--------------------+--------------------+---------------+------------+
only showing top 5 rows

In [47]:
%%spark
trips_table_filtered.write.mode('overwrite').option("header","true").format("csv").save("/group/five-guys/trips")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Footpath table

In [49]:
%%spark

#add attributes lat, lon to stops_table
stops_table_expanded = stops_table.join(all_stops, 'stop_id', 'inner')
#create all possible pairs between stops
stops_table_expanded_a = stops_table_expanded.select([F.col(c).alias("a_"+c) for c in stops_table_expanded.columns])
stops_table_expanded_b = stops_table_expanded.select([F.col(c).alias("b_"+c) for c in stops_table_expanded.columns])
cartesian_product = stops_table_expanded_a.crossJoin(stops_table_expanded_b)
#compute distance for all pairs
stops_table_distance = cartesian_product.withColumn("a", (
        F.pow(F.sin(F.radians(F.col("a_stop_lat") - F.col("b_stop_lat")) / 2), 2) +
        F.cos(F.radians(F.col("b_stop_lat"))) * F.cos(F.radians(F.col("a_stop_lat"))) *
        F.pow(F.sin(F.radians(F.col("a_stop_lon") - F.col("b_stop_lon")) / 2), 2)
    )).withColumn("distance", F.atan2(F.sqrt(F.col("a")), F.sqrt(-F.col("a") + 1)) * 12742000)
#filter pair with distance shorter than 500m
stops_table_distance = stops_table_distance.filter(stops_table_distance.distance <= 500)
#compute time considering walking speed of 50m/1min
#time is in seconds
stops_table_time = stops_table_distance.withColumn('time', F.round((stops_table_distance['distance']/F.lit(50))*F.lit(60), 0).cast('int'))
footpath_table = stops_table_time.select(stops_table_time.a_stop_id.alias('departure_id'), stops_table_time.b_stop_id.alias('arrival_id'), stops_table_time.time)
#remove pairs where arrival == departure
footpath_table = footpath_table.filter(footpath_table.departure_id != footpath_table.arrival_id)
footpath_table.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+-----------+----+
|departure_id| arrival_id|time|
+------------+-----------+----+
| 8503306:0:2|    8590541| 528|
|     8506895|    8573228| 402|
|     8591190|    8591170| 311|
|     8591284|    8591116| 479|
|     8591284|    8591274| 327|
|     8503078|    8591903| 471|
|     8589111|    8590655| 583|
|     8589111|    8594182| 293|
| 8503306:0:2|8503306:0:1|   0|
|     8589111|    8502208| 351|
+------------+-----------+----+
only showing top 10 rows

In [50]:
%%spark
# Print the column names and types of footpath_table as a check before it is written out.
footpath_table.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- departure_id: string (nullable = true)
 |-- arrival_id: string (nullable = true)
 |-- time: integer (nullable = true)

In [None]:
%%spark
footpath_table.write.mode('overwrite').option("delimiter", ";").option("header","true").format("csv").save("/group/five-guys/footpath")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
#transform the footpath table into a dictionary of dictionaries: {arrival_id: {departure_id: time}}
#footpath_map = footpath_table.groupby('arrival_id')[['departure_id', 'time']].apply(lambda g: g.values.tolist()).to_dict()
#footpath_map = {k: dict(v) for k, v in footpath_map.items()}
#convert and save the dictionary to pickle
#pickle.dump(footpath_map, open(path_data+"footpath_map.pickle", "wb"))

In [None]:
#!git lfs track "../data/footpath_map.pickle"
#!git add .gitattributes
#!git commit -m "footpath_map.pickle"
#!git push