In [1]:
%matplotlib inline
import matplotlib.pylab as plt
plt.rcParams['figure.figsize'] = (10,6)
plt.rcParams['font.size'] = 18
plt.style.use('fivethirtyeight')

In [42]:
from pyspark.sql.functions import udf
import pyspark.sql.functions as fct
from pyspark.sql import Row
from geopy.distance import distance as geo_dist


In [3]:
import pandas as pd 

In [4]:
import getpass
import pyspark
from pyspark.sql import SparkSession

conf = pyspark.conf.SparkConf()
conf.setMaster('yarn')
conf.setAppName('final_proj-{0}'.format(getpass.getuser()))
conf.set('spark.executor.memory', '4g')
conf.set('spark.executor.instances', '15')
conf.set('spark.port.maxRetries', '100')
sc = pyspark.SparkContext.getOrCreate(conf)
conf = sc.getConf()
sc

In [5]:
spark = SparkSession(sc)

In [6]:
df = spark.read.csv('/datasets/project/istdaten/*/*/*', sep=';', header=True)

In [7]:
columns = 'TripDate string, TripId string, OperatorId string, OperatorAbbrv string, OperatorName string, ProductId string, LineId string, LineType string, UmlaufId string, TransportType string, AdditionalTrip boolean, FailedTrip boolean, BPUIC string, StopName string, ArrivalTimeScheduled string, ArrivalTimeActual string, ArrivalTimeActualStatus string,     DepartureTimeScheduled string, DepartureTimeActual string, DepartureTimeActualStatus string, SkipStation boolean'
columns = list(map(lambda x: x.split()[0],columns.split(',')))

In [8]:
for old, new in zip(df.columns, columns):
    print(old, new)
    df = df.withColumnRenamed(old, new)

BETRIEBSTAG TripDate
FAHRT_BEZEICHNER TripId
BETREIBER_ID OperatorId
BETREIBER_ABK OperatorAbbrv
BETREIBER_NAME OperatorName
PRODUKT_ID ProductId
LINIEN_ID LineId
LINIEN_TEXT LineType
UMLAUF_ID UmlaufId
VERKEHRSMITTEL_TEXT TransportType
ZUSATZFAHRT_TF AdditionalTrip
FAELLT_AUS_TF FailedTrip
BPUIC BPUIC
HALTESTELLEN_NAME StopName
ANKUNFTSZEIT ArrivalTimeScheduled
AN_PROGNOSE ArrivalTimeActual
AN_PROGNOSE_STATUS ArrivalTimeActualStatus
ABFAHRTSZEIT DepartureTimeScheduled
AB_PROGNOSE DepartureTimeActual
AB_PROGNOSE_STATUS DepartureTimeActualStatus
DURCHFAHRT_TF SkipStation


In [9]:
df.head()

Row(TripDate='13.09.2017', TripId='80:06____:17010:000', OperatorId='80:06____', OperatorAbbrv='DB', OperatorName='DB Regio AG', ProductId='Zug', LineId='17010', LineType='RE', UmlaufId=None, TransportType='RE', AdditionalTrip='false', FailedTrip='false', BPUIC='8500090', StopName='Basel Bad Bf', ArrivalTimeScheduled=None, ArrivalTimeActual=None, ArrivalTimeActualStatus='PROGNOSE', DepartureTimeScheduled='13.09.2017 05:45', DepartureTimeActual='13.09.2017 05:45:00', DepartureTimeActualStatus='PROGNOSE', SkipStation='false')

In [10]:
df.count()

196232995

In [11]:
df.select("StopName").distinct().count()

10366

In [145]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf

In [179]:
df_meta = spark.read.csv('/datasets/project/metadata')

In [180]:
type(df_meta)

pyspark.sql.dataframe.DataFrame

In [181]:
Lat_zu = 47.377941
Long_ZU = 8.540141
def dist_to_ZU(lat, long): 
    res = str(geo_dist((lat, long), (Lat_zu, Long_ZU)))
    res = round(float(res.split()[0]),1)
    return res

In [182]:
df_meta = df_meta.select(fct.split(df_meta['_c0'], '  ')[1].alias('Long'), 
                         fct.split(fct.split(df_meta['_c0'], '  ')[2], ' ')[0].alias('Lat'), fct.split(df_meta['_c0'], '% ')[1].alias('StopName') )

In [183]:
df_meta.show()

+---------+---------+-------------------+
|     Long|      Lat|           StopName|
+---------+---------+-------------------+
|26.074412|44.446770|          Bucuresti|
| 1.811446|50.901549|             Calais|
| 1.075329|51.284212|         Canterbury|
|-3.543547|50.729172|             Exeter|
| 9.733756|46.922368|            Fideris|
| 8.571251|50.051219|Frankfurt Flughafen|
|18.643803|54.355520|             Gdansk|
| 7.389462|47.191804|           Grenchen|
|29.019602|40.996348|           Istanbul|
| 9.873959|48.577852|  Amstetten (Württ)|
| 4.786044|43.921937|            Avignon|
| 2.140369|41.378914|          Barcelona|
| 7.589551|47.547405|              Basel|
| 7.395229|46.937482|       Bern Bümpliz|
|-1.899480|52.483627|         Birmingham|
| 6.838953|46.949588|          Boudry TN|
|17.106466|48.158910|         Bratislava|
| 4.335694|50.835376|          Bruxelles|
|-2.979650|53.404289|          Liverpool|
| 8.500049|47.114619|         Lothenbach|
+---------+---------+-------------

In [184]:
udf_dist_to_ZU = udf(dist_to_ZU, DoubleType()) 

In [185]:
df_meta = df_meta.withColumn("Long", df_meta["Long"].cast(DoubleType()))
df_meta = df_meta.withColumn("Lat", df_meta["Lat"].cast(DoubleType()))

In [186]:
df_meta = df_meta.withColumn("Dist in km", udf_dist_to_ZU('Lat','Long'))

In [206]:
df_meta = df_meta.filter(df_meta['Dist in km'] < 10)

In [208]:
StopName = df_meta.select(df_meta.StopName).distinct()

In [209]:
Df_Zurich = StopName.join(df, on=['StopName'], how='outer')

In [210]:
Df_Zurich.show()

+--------+----------+--------------+----------+-------------+--------------+---------+------+--------+--------+-------------+--------------+----------+-------+--------------------+-------------------+-----------------------+----------------------+-------------------+-------------------------+-----------+
|StopName|  TripDate|        TripId|OperatorId|OperatorAbbrv|  OperatorName|ProductId|LineId|LineType|UmlaufId|TransportType|AdditionalTrip|FailedTrip|  BPUIC|ArrivalTimeScheduled|  ArrivalTimeActual|ArrivalTimeActualStatus|DepartureTimeScheduled|DepartureTimeActual|DepartureTimeActualStatus|SkipStation|
+--------+----------+--------------+----------+-------------+--------------+---------+------+--------+--------+-------------+--------------+----------+-------+--------------------+-------------------+-----------------------+----------------------+-------------------+-------------------------+-----------+
|   Bever|04.10.2017|85:72:1110:000|     85:72|          RhB|Rhätische Bahn|      