In [1]:
%matplotlib inline
import matplotlib.pylab as plt
plt.rcParams['figure.figsize'] = (10,6)
plt.rcParams['font.size'] = 18
plt.style.use('fivethirtyeight')

In [15]:
import getpass
import pyspark
from pyspark.sql import SparkSession

conf = pyspark.conf.SparkConf()
conf.setMaster('yarn')
conf.setAppName('final_proj-{0}'.format(getpass.getuser()))
conf.set('spark.executor.memory', '4g')
conf.set('spark.executor.instances', '6')
conf.set('spark.executor.cores', 2)
conf.set('spark.port.maxRetries', '100')
sc = pyspark.SparkContext.getOrCreate(conf)
conf = sc.getConf()
sc

In [16]:
spark = SparkSession(sc)

In [17]:
df = spark.read.csv('/datasets/project/istdaten/*/*/*', sep=';', header=True)

First, we rename the columns in English language:

In [18]:
columns = 'TripDate string, TripId string, OperatorId string, OperatorAbbrv string, OperatorName string, ProductId string, LineId string, LineType string, UmlaufId string, TransportType string, AdditionalTrip boolean, FailedTrip boolean, BPUIC string, StopName string, ArrivalTimeScheduled string, ArrivalTimeActual string, ArrivalTimeActualStatus string,     DepartureTimeScheduled string, DepartureTimeActual string, DepartureTimeActualStatus string, SkipStation boolean'
columns = list(map(lambda x: x.split()[0],columns.split(',')))

for old, new in zip(df.columns, columns):
    df = df.withColumnRenamed(old, new)

# Computing the quality of a transfer

## Assumptions: 

   * everytime when making a transfer in a station, the traveler needs one minute for actually changing transport.
   * even though a train departs late all the time in a specific station, the trip planner will never use the fact that it does so, so we will only take into consideration the early departures and the correct ones. 

## Main idea: 

The idea behind computing the quality of a specific transfer given the *expected arrival hour* in the station and the *expected departure hour* from that same station, and some *extra information* regarding the trip before the transfer and the one after the transfer:

   * First, we compute the **discrete distribution of arrival delays $\mathcal{D}_a$** in that station, given the information of the trip before the transfer.
   * Then, we compute the **discrete distribution of negative departure delays $\mathcal{D}_d$** in that station, given the information of the trip after the transfer.
   * Next, we compute the probability of successfully realizing the transfer, by computing a convolution between the two given distributions. Therefore, assuming that the time of transfer is $k$ minutes, then we would simply compute:
      
      $\sum\limits_{t_a }\Pr[\mathcal{D}_a = t_a] \cdot \Pr[\mathcal{D}_d = k-1+t_a]$,
      
       where we have taken into consideration the minute needed by the traveler for changing the transport. 
       
---
       
Therefore, we first need to decide what are the features which will decide the distributions of the delays. For that, we will use a **Decision Tree Regressor**, selecting several features which might be important from the data, and the target label will be the delay for each datapoint, expressed in seconds. Then, we will train the regressor on both departures and arrivals data, and will look into which are the most important features in each case, for making a good prediction of the delay time. 

We have to emphasize that we considered this method, because of the way that Decision Trees decide which are the most important feature, i.e. the one which have the most variance of delays between the different values for the specific feature. 

After constructing the Decision Tree and deciding which are the most important features, we will construct the distributions of the delays from the **actual data**, by grouping the datapoints with the same value for the decisive features, and making the distribution of delays for each group.

We decided to use the actual data instead of modelling the distribution of delays using a fixed distribution family (e.g. Log-normal or Gamma distributions), because we consider that the actual data is more relevant, then considering just an estimator or to assume that it follows a distribution in a family of distributions.

## Constructing the Decision Tree Regressor

The first step in constructing the Decision Tree Regressor is to construct some potential important features from the given data, and also to compute the delays for each datapoint:

In [None]:
from pyspark.sql.functions import unix_timestamp, to_timestamp

DATE_FORMAT_SCHEDULED = 'dd.MM.yyyy HH:mm' 
DATE_FORMAT_ACTUAL = 'dd.MM.yyyy HH:mm:ss' # both formats are used

df_processed = df.withColumn('ArrivalTimeScheduledDate', to_timestamp(df.ArrivalTimeScheduled, DATE_FORMAT_SCHEDULED))
df_processed = df_processed.withColumn('DepartureTimeScheduledDate', to_timestamp(df_processed.DepartureTimeScheduled, DATE_FORMAT_SCHEDULED))

df_processed = df_processed.withColumn('ArrivalTimeScheduled', unix_timestamp(df_processed.ArrivalTimeScheduled, DATE_FORMAT_SCHEDULED))
df_processed = df_processed.withColumn('ArrivalTimeActual', unix_timestamp(df_processed.ArrivalTimeActual, DATE_FORMAT_ACTUAL))
df_processed = df_processed.withColumn('DepartureTimeScheduled', unix_timestamp(df_processed.DepartureTimeScheduled, DATE_FORMAT_SCHEDULED))
df_processed = df_processed.withColumn('DepartureTimeActual', unix_timestamp(df_processed.DepartureTimeActual, DATE_FORMAT_ACTUAL))

Let's look into how the data looks so far:

In [None]:
df_processed.head()

Next, we also add the hour of departure and of the arrival to the dataset:

In [None]:
from pyspark.sql.types import FloatType, StringType
from pyspark.sql.functions import hour, to_date, date_format, month

df_to_classify = df_processed.select(
    df_processed.LineId.alias('line_id'), 
    df_processed.ProductId.alias('product_id'), 
    df_processed.StopName.alias('stop_name'),
    df_processed.AdditionalTrip.alias('additional_trip'), 
    hour(df_processed.ArrivalTimeScheduledDate).alias("arrival_hour").astype(StringType()),
    hour(df_processed.DepartureTimeScheduledDate).alias("departure_hour").astype(StringType()),
    date_format(to_date(df_processed.TripDate, 'dd.MM.yyyy'), 'u').alias("day_of_week"),
    ((df_processed.ArrivalTimeActual - df_processed.ArrivalTimeScheduled)).alias("delta_arrival").astype(FloatType()),
    ((df_processed.DepartureTimeActual - df_processed.DepartureTimeScheduled)).alias("delta_departure").astype(FloatType()))

df_to_classify.cache()

In [None]:
df_to_classify.head(5)

# we must index?

Next, for using the Decision Tree Regressor, and because each feature is in fact categorial, we must each one of them using a *StringIndexer*:

In [None]:
from pyspark.ml.feature import StringIndexer

def transform_dataset(dataset, departure):
    '''
    Function that transforms a dataset, adding for each categorial feature a column, which represents the output of the 
    StringIndexer applied to that column. 
    
    Parameters:
        - dataset: the dataset to be processed
        - departure: True if the dataset is for departures, False otherwise
    '''
    
    line_id_indexer = StringIndexer(inputCol="line_id", outputCol="line_id_cat", handleInvalid='keep') # keep nulls 
    product_id_indexer = StringIndexer(inputCol="product_id", outputCol="product_id_cat", handleInvalid='skip')
    stop_name_indexer = StringIndexer(inputCol="stop_name", outputCol="stop_name_cat", handleInvalid='skip')
    additional_trip_indexer = StringIndexer(inputCol="additional_trip", outputCol="additional_trip_cat", handleInvalid='skip')
    day_of_week_indexer = StringIndexer(inputCol="day_of_week", outputCol="day_of_week_cat", handleInvalid='skip')
    departure_hour_indexer = StringIndexer(inputCol="departure_hour", outputCol="departure_hour_cat", handleInvalid='skip')
    arrival_hour_indexer = StringIndexer(inputCol="arrival_hour", outputCol="arrival_hour_cat", handleInvalid='skip')

    indexers = [line_id_indexer, product_id_indexer, stop_name_indexer, additional_trip_indexer,day_of_week_indexer]
    
    if departure:
        indexers.append(departure_hour_indexer)
    else:
        indexers.append(arrival_hour_indexer)

    indexed = dataset

    for indexer in indexers:
        indexed = indexer.fit(indexed).transform(indexed) # add columns to dataset
        
    return indexed

Next, we use the *VectorAssembler* to construct the column for features, which will be used by the Decision Tree:

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import VectorIndexer

def compute_features_column(dataset, is_departure):
    '''
    Function that computes the features column for the given dataset.
    
    Parameters:
        - dataset: the dataset to compute the features column for
        - is_departure: True is dataset is used for departures, False otherwise.
    '''
    input_cols = ['line_id_cat', 'product_id_cat', 'stop_name_cat', 'additional_trip_cat', 'day_of_week_cat']
    
    if is_departure:
        input_cols.append('departure_hour_cat') # departure dataset
    else:
        input_cols.append('arrival_hour_cat') # arrival dataset
        
    vector_assembler = VectorAssembler(inputCols = input_cols, outputCol = 'features')
    dataset = transform_dataset(dataset, is_departure) # add categorial features
    
    df_features = vector_assembler.transform(dataset) # add features column
    # Use VectorIndexer to make sure that the added features are recognized as categorical
    
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=100000000).fit(df_features)
    
    df_features = featureIndexer.transform(df_features) # transform features to categorical
    
    if is_departure:
        df_final = df_features.select(df_features.indexedFeatures, df_features.delta_departure.alias("delta"))
    else:
        df_final = df_features.select(df_features.indexedFeatures, df_features.delta_arrival.alias("delta"))
    
    return df_final

Finally, we construct our datasets to input to the Decision Tree:

In [None]:
# Construct departures dataset
df_departure_to_regress = df_to_classify.filter(
    df_to_classify.departure_hour.isNotNull() & # filter only departures
    df_to_classify.delta_departure.isNotNull())

df_departure = compute_features_column(df_departure_to_regress, is_departure=True)

# Construct arrivals dataset
df_arrival_to_regress = df_to_classify.filter(
    df_to_classify.arrival_hour.isNotNull() & # filter only arrivals
    df_to_classify.delta_arrival.isNotNull())

df_arrival = compute_features_column(df_arrival_to_regress, is_departure=False)

Let's check the generated dataframes:

In [None]:
df_departure.head(5)

In [None]:
df_arrival.head(5)

Next, we write the function for training the Decision Tree Regressor:

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor

def train_regressor(dataset):
    dt = DecisionTreeRegressor(featuresCol ='indexedFeatures', labelCol = 'delta', maxBins=100000000, maxDepth=3)
    dt_model = dt.fit(dataset)
    
    return dt_model

Finally, we train the decision trees for both datasets and we extract the most important features:

In [None]:
# Get most important fetrain_regressorpartures dataset
regressor_departures = train_regressor(df_departure)
print("Feature importances departures: {}".format(regressor_departures.featureImportances))

# Get most important features for departures dataset
regressor_arrivals = train_regressor(df_arrival)
print("Feature importances arrivals: {}".format(regressor_arrivals.featureImportances))

So, we can see that the 3 most important features are, in both cases, the *hour*, the *line_id* and the *stop_name*. We can see that everything makes very much sense, because we have big differences of delays between normal hours and rush hours, for example, and also specific stops and routes have usually more delays than the others.

Therefore, we continue by constructing the probability distributions for each possible value of the three most important features.

## Computing the probability distributions 

First, we only consider the three most important features in the two initial datasets. We will consider the unity of time to be the minute from now on, instead of seconds: 

In [None]:
from pyspark.sql.types import IntegerType

df_best_feat_departures = df_departure_to_regress.select(
                df_departure_to_regress.departure_hour,
                df_departure_to_regress.stop_name,
                df_departure_to_regress.line_id,
                (df_departure_to_regress.delta_departure / 60).astype(IntegerType()).alias("delta_minutes"))

df_best_feat_departures = df_best_feat_departures.filter(df_best_feat_departures.delta_minutes <= 0) 
# only keep departures which left on time or earlier, we do not want to base our recommendation on assumption
# that a train or bus leaves with a delay.

df_best_feat_arrival = df_arrival_to_regress.select(
                df_arrival_to_regress.arrival_hour,
                df_arrival_to_regress.stop_name,
                df_arrival_to_regress.line_id,
                (df_arrival_to_regress.delta_arrival / 60).astype(IntegerType()).alias("delta_minutes"))

In [None]:
df_best_feat_departures.head(5)

Finally, we want to make the distribution of delays for each possible value of the features, for both departures and arrivals:

In [None]:
from pyspark.sql.functions import collect_list, struct, count, lit

df_departures_grouped_count = df_best_feat_departures.groupby( 
                df_best_feat_departures.departure_hour,
                df_best_feat_departures.stop_name,
                df_best_feat_departures.line_id,
                df_best_feat_departures.delta_minutes).agg(count(lit(1)).alias("count_min")) # add a count for each possible value
        
df_departures_distribution = df_departures_grouped_count.\
                                    groupby('departure_hour', 'stop_name', 'line_id').\
                                    agg(collect_list(struct('delta_minutes', 'count_min')).alias('counts'))

# for each value of (departure_hour, stop_name, line_id), we have a list of the form [(delay_minutes, count)]

In [None]:
df_departures_grouped_count.head(3)

In [None]:
df_departures_distribution.show(10)

In [None]:
def compute_key_for_feature_values(hour, line_id, stop_name):
    return '{}#{}#{}'.format(hour, line_id, stop_name)

In [None]:
collected = df_departures_distribution.collect()

distribution_departures = {
    compute_key_for_feature_values(x.departure_hour, x.line_id, x.stop_name) : 
    list(sorted(x.counts, key=lambda y: y[0])) for x in collected}

We do the same now for the arrivals: 

In [None]:
df_arrivals_grouped_count = df_best_feat_arrival.groupby( 
                df_best_feat_arrival.arrival_hour,
                df_best_feat_arrival.stop_name,
                df_best_feat_arrival.line_id,
                df_best_feat_arrival.delta_minutes).agg(count(lit(1)).alias("count_min")) # add a count for each possible value
        
df_arrivals_distribution = df_arrivals_grouped_count.\
                                    groupby('arrival_hour', 'stop_name', 'line_id').\
                                    agg(collect_list(struct('delta_minutes', 'count_min')).alias('counts'))
        
collected = df_arrivals_distribution.collect()

distribution_arrivals = {
    compute_key_for_feature_values(x.arrival_hour, x.line_id, x.stop_name) : 
    list(sorted(x.counts, key=lambda y: y[0])) for x in collected}

We also want to include a default distribution, for the case we have new data, which was not encountered anymore. We will compute it as the distribution of all the data:

In [None]:
df_default_distrib_departures = df_best_feat_departures.groupby('delta_minutes').agg(count(lit(1)).alias("count_min")) # add a count for each possible value
collected_default = df_default_distrib_departures.collect()
default_departures = list(sorted(collected_default, key=lambda x: x[0]))

df_default_distrib_arrivals = df_best_feat_arrival.groupby('delta_minutes').agg(count(lit(1)).alias("count_min")) # add a count for each possible value
collected_default = df_default_distrib_arrivals.collect()
default_arrivals = list(sorted(collected_default, key=lambda x: x[0]))

Next, we add the default values to the dictionary of distributions:

In [None]:
distribution_departures['default'] = default_departures
distribution_arrivals['default'] = default_arrivals

Finally, we transform the counts to probabilities, to be able to compute the final quality faster:

In [None]:
def transform_to_proba(counts_list):
    total_sum = 0
    final_proba = []
    
    for row in counts_list:
        total_sum += row.count_min
        
    for row in counts_list:
        final_proba.append((row.delta_minutes, row.count_min / total_sum))
        
    return final_proba

In [None]:
distribution_departures = {k : transform_to_proba(v) for k, v in distribution_departures.items()}
distribution_arrivals = {k : transform_to_proba(v) for k, v in distribution_arrivals.items()}

We finally write the computed dictionaries to file, to be able to load them later:

In [None]:
import pickle
import os

FILE_DISTRIBUTION_DEPARTURES = 'distrib_departures.pic'
FILE_DISTRIBUTION_ARRIVALS = 'distrib_arrivals.pic'

pickle.dump(distribution_departures, open(FILE_DISTRIBUTION_DEPARTURES, 'wb'))
pickle.dump(distribution_arrivals, open(FILE_DISTRIBUTION_ARRIVALS, 'wb'))

## The exposed API for computing distributions

Finally, the last part is to write a function which receives the features of a specific transfer, and it returns the quality of the transfer, by performing the convolution of the corresponding distributions, using the formula:

$\sum\limits_{t_a }\Pr[\mathcal{D}_a = t_a] \cdot \Pr[\mathcal{D}_d = k-1+t_a]$,
      
where we have taken into consideration the minute needed by the traveler for changing the transport. 
       
Here, we considered $\mathcal{D}_a$ to be the distribution of arrivals and $\mathcal{D}_d$ the distribution of departures.


In [19]:
from datetime import datetime
import time
import pickle
DATE_FORMAT = '%b %d %Y %H:%M:%S'

FILE_DISTRIBUTION_DEPARTURES = 'distrib_departures.pic'
FILE_DISTRIBUTION_ARRIVALS = 'distrib_arrivals.pic'

class TransferQualityComputer:
    def __init__(self):
        self.distribution_departures = pickle.load(open(FILE_DISTRIBUTION_DEPARTURES, 'rb'))
        self.distribution_arrivals = pickle.load(open(FILE_DISTRIBUTION_ARRIVALS, 'rb'))
        
    def compute_key_for_feature_values(self, hour, line_id, stop_name): # same as before
        return '{}#{}#{}'.format(hour, line_id, stop_name)
    
    def compute_quality(self, arrival_timestamp, departure_timestamp, departure_stop_name, arrival_stop_name, 
                        departure_line_id, arrival_line_id, walktime=1):
        # timestamp in the format: Dec 31 2017 20:40:49,01

        arrival_time = datetime.strptime(arrival_timestamp[:-3], DATE_FORMAT)
        departure_time = datetime.strptime(departure_timestamp[:-3], DATE_FORMAT)

        arrival_hour = arrival_time.hour
        departure_hour = departure_time.hour
        delta_minutes = int((time.mktime(departure_time.timetuple()) - time.mktime(arrival_time.timetuple())) / 60)

        if delta_minutes < 0:
            return 0 # impossible to complete the transfer

        departure_key = self.compute_key_for_feature_values(departure_hour, departure_line_id, departure_stop_name)
        if departure_key in self.distribution_departures:
            departure_dist = self.distribution_departures[departure_key]
        else: 
            departure_dist = self.distribution_departures['default'] # default distribution

        arrival_key = self.compute_key_for_feature_values(arrival_hour, arrival_line_id, arrival_stop_name)
        if arrival_key in self.distribution_arrivals:
            arrival_dist = self.distribution_arrivals[arrival_key]
        else: 
            arrival_dist = self.distribution_arrivals['default'] # default distribution

        total_proba = 0

        for dep_delay, dep_proba in departure_dist:
            for arr_delay, arr_proba in arrival_dist:

                delta_minutes = ((departure_time - arrival_time).seconds // 60) % 60
                if delta_minutes >= dep_delay + arr_delay + walktime: 
                # consider also walktime between stations, if the same station then we considered the walk time 1min
                    total_proba += (dep_proba * arr_proba)
                else:
                    break

        return total_proba

Testing the code:

In [20]:
computer = TransferQualityComputer()

print(computer.compute_quality('Dec 31 2017 00:40:49,01', 'Dec 31 2017 00:41:58,01', 'Dietikon, Birmensdorferstrasse', 'Dietikon, Birmensdorferstrasse', '85:849:303', '85:849:303', walktime=1))

0.8011350894443582


Rename columns: 

### Metadata

We first start by read metadata in order to select stop station within 10 km from Zürich 

In [27]:
df_meta = spark.read.csv('/datasets/project/metadata')

In [28]:
#Here we can see that we have some duplicated stop name 
#df_meta.filter(df_meta._c0.contains('Zürich')).head(30)

In [29]:
df_meta.printSchema()

root
 |-- _c0: string (nullable = true)



In [30]:
df_meta = df_meta.select(fct.split(df_meta['_c0'], '  ')[1].alias('Long'), 
                         fct.split(fct.split(df_meta['_c0'], '  ')[2], ' ')[0].alias('Lat'), 
                         fct.split(df_meta['_c0'], '% ')[1].alias('StopName_Meta') )

In [31]:
df_meta.show()

+---------+---------+-------------------+
|     Long|      Lat|      StopName_Meta|
+---------+---------+-------------------+
|26.074412|44.446770|          Bucuresti|
| 1.811446|50.901549|             Calais|
| 1.075329|51.284212|         Canterbury|
|-3.543547|50.729172|             Exeter|
| 9.733756|46.922368|            Fideris|
| 8.571251|50.051219|Frankfurt Flughafen|
|18.643803|54.355520|             Gdansk|
| 7.389462|47.191804|           Grenchen|
|29.019602|40.996348|           Istanbul|
| 9.873959|48.577852|  Amstetten (Württ)|
| 4.786044|43.921937|            Avignon|
| 2.140369|41.378914|          Barcelona|
| 7.589551|47.547405|              Basel|
| 7.395229|46.937482|       Bern Bümpliz|
|-1.899480|52.483627|         Birmingham|
| 6.838953|46.949588|          Boudry TN|
|17.106466|48.158910|         Bratislava|
| 4.335694|50.835376|          Bruxelles|
|-2.979650|53.404289|          Liverpool|
| 8.500049|47.114619|         Lothenbach|
+---------+---------+-------------

In [32]:
print(len(df_meta.filter(df_meta['StopName_Meta'].rlike("Lausanne")).collect()))

211


In [33]:
#Again we can see that we have many occurance of Zurich with different coordinate
#df_meta.filter(df_meta.StopName_Meta == 'Zürich').show()

In [34]:
df_meta = df_meta.withColumn("Long", df_meta["Long"].cast(FloatType()))
df_meta = df_meta.withColumn("Lat", df_meta["Lat"].cast(FloatType()))

In [35]:
df_meta.printSchema()

root
 |-- Long: float (nullable = true)
 |-- Lat: float (nullable = true)
 |-- StopName_Meta: string (nullable = true)



We can see that there is many duplicate name with different coordinate. 
For example we find many time Lausanne, after investigatin we understand that all the subway station where simply Lausanne. We decide to fill that problem using another dataset in order to merge them. 

We decide to merge the two dataset using coordinate, in order to do this we round coordinate to match them. A round at 3 decimal change the precission by max 135m. For example Google Maps use 6 decimal

First we only keep point in/near switzerland we decide to do this by draw a square arount the country and keep point inside.  Here we find the extreme points of switzerland: 
https://fr.wikipedia.org/wiki/Liste_de_points_extr%C3%AAmes_de_la_Suisse

In [36]:
df_meta.count()

25935

In [37]:
df_meta = df_meta.filter(df_meta.Lat.between(45.490404, 47.485074))
df_meta = df_meta.filter(df_meta.Long.between(5.572263, 10.2931))                       

In [38]:
df_meta.count()

22723

Then we see the minimum precision we have in our dataset in order to round all coordinate to this precision

In [39]:
slen = udf(lambda s: len(str(s).split('.')[1]), IntegerType())

In [40]:
df_meta = df_meta.withColumn("lat_len", slen(df_meta.Lat))
df_meta = df_meta.withColumn("lon_len", slen(df_meta.Long))
#df_meta = df_meta.withColumn("precision", min(df_meta.lat_len, df_meta.lon_len))
print(df_meta.agg({"lat_len": "min"}).collect())
print(df_meta.agg({"lon_len": "min"}).collect())

[Row(min(lat_len)=6)]
[Row(min(lon_len)=6)]


So we have a precision of 6 digit which is sufficient for our work. See why df_meta.show(5) not always display the same number of digit 

In [41]:
df_meta.show(5)

+--------+---------+-------------+-------+-------+
|    Long|      Lat|StopName_Meta|lat_len|lon_len|
+--------+---------+-------------+-------+-------+
|9.733756|46.922367|      Fideris|     15|     15|
|7.389462|47.191803|     Grenchen|     15|     15|
|7.395229| 46.93748| Bern Bümpliz|     14|     15|
|6.838953| 46.94959|    Boudry TN|     15|     15|
|8.500049| 47.11462|   Lothenbach|     15|     15|
+--------+---------+-------------+-------+-------+
only showing top 5 rows



In [42]:
df_meta = df_meta.select('Long', 'Lat', 'StopName_Meta')

In [43]:
round_6 = udf(lambda s: round(s, 6), DoubleType())

In [44]:
df_meta = df_meta.withColumn("Round_Long", round_6(df_meta.Long))
df_meta = df_meta.withColumn("Round_Lat", round_6(df_meta.Lat))

In [45]:
df_meta.show(5)

+--------+---------+-------------+----------+---------+
|    Long|      Lat|StopName_Meta|Round_Long|Round_Lat|
+--------+---------+-------------+----------+---------+
|9.733756|46.922367|      Fideris|  9.733756|46.922367|
|7.389462|47.191803|     Grenchen|  7.389462|47.191803|
|7.395229| 46.93748| Bern Bümpliz|  7.395229|46.937481|
|6.838953| 46.94959|    Boudry TN|  6.838953|46.949589|
|8.500049| 47.11462|   Lothenbach|  8.500049| 47.11462|
+--------+---------+-------------+----------+---------+
only showing top 5 rows



In [46]:
print(df_meta.distinct().count())
print(df_meta.select('Round_Lat', 'Round_Long').distinct().count())

22696
22671


### Use another dataset to fil missing names

In [47]:
with open('stops.txt', 'r') as file: 
    one_splitted = file.readline().strip().split(",")
    file_lines = [line.strip().split('"') for line in file.readlines()]
    
stop_names = [x[3] for x in file_lines]
Lat = [float(x[5]) for x in file_lines]
Long = [float(x[7]) for x in file_lines]

df_stop = pd.DataFrame({
        "StopName": stop_names, 
        "Lat_stop": Lat, 
        "Long_stop": Long,   
    })
df_stop.head()

Unnamed: 0,Lat_stop,Long_stop,StopName
0,45.989901,8.345062,"Anzola, chiesa"
1,46.167251,8.345807,Altoggio
2,46.060122,8.11362,Antronapiana
3,45.98987,8.345717,Anzola
4,46.261498,8.319253,Baceno


In [48]:
mySchema = StructType([ StructField("Lat_stop", DoubleType(), True)\
                        ,StructField("Long_stop", DoubleType(), True)\
                        ,StructField("StopName", StringType(), True) ])
df_stop = spark.createDataFrame(df_stop, mySchema)
df_stop.show()

+----------------+----------------+--------------------+
|        Lat_stop|       Long_stop|            StopName|
+----------------+----------------+--------------------+
|45.9899010293845|8.34506152974108|      Anzola, chiesa|
|46.1672513851495|  8.345807131427|            Altoggio|
| 46.060121674738|8.11361957990831|        Antronapiana|
|45.9898698225697|8.34571729989858|              Anzola|
|46.2614983591677|8.31925293162473|              Baceno|
|46.0790618438814|8.29927439970313|Beura Cardezza, c...|
|46.1222963432243|8.21077237789936|Bognanco, T. Vill...|
|46.0656504576122|8.26113193273411|           Boschetto|
|46.2978807772998| 8.3626325767009|            Cadarese|
|46.1340194356792|8.28619492916453|               Caddo|
|46.0916476333918|8.28041876188684|              Calice|
|45.9695691829797|8.04585965801774|            Campioli|
|46.4091810825782| 8.4117524564434|    Cascate del Toce|
|46.0205875326422| 8.2148866619012|         Castiglione|
|45.9710364221151|8.06992552448

In [49]:
df_stop = df_stop.withColumn("lat_len", slen(df_stop.Lat_stop))
df_stop = df_stop.withColumn("lon_len", slen(df_stop.Long_stop))
#df_meta = df_meta.withColumn("precision", min(df_meta.lat_len, df_meta.lon_len))
print(df_stop.agg({"lat_len": "min"}).collect())
print(df_stop.agg({"lon_len": "min"}).collect())

df_stop.orderBy('lon_len').show(2)

[Row(min(lat_len)=1)]
[Row(min(lon_len)=1)]
+----------------+-------------+---------------+-------+-------+
|        Lat_stop|    Long_stop|       StopName|lat_len|lon_len|
+----------------+-------------+---------------+-------+-------+
|             0.0|          0.0|Isola Superiore|      1|      1|
|46.7810573848296|10.2605841261|Tarasp, Florins|     13|     10|
+----------------+-------------+---------------+-------+-------+
only showing top 2 rows



In [50]:
print(df_stop.filter(df_stop['StopName'].rlike("Isola Superiore")).collect())

[Row(Lat_stop=0.0, Long_stop=0.0, StopName='Isola Superiore', lat_len=1, lon_len=1)]


Here we can see that this can from an error in the dataset, we use google maps to find the good coordinate of Isola Superiore which is: Isola Superiore: 45.901230 - 8.520450

In [51]:
df_stop = df_stop.withColumn("Lat_stop", \
              when(df_stop["StopName"] == 'Isola Superiore', 45.901230).otherwise(df_stop["Lat_stop"]))
df_stop = df_stop.withColumn("Long_stop", \
              when(df_stop["StopName"] == 'Isola Superiore', 8.520450).otherwise(df_stop["Long_stop"]))

Now we again round all the coordinate by 6 in order to merge both of the dataframe. 

In [52]:
df_stop = df_stop.withColumn("Round_Long", round_6(df_stop.Long_stop))
df_stop = df_stop.withColumn("Round_Lat", round_6(df_stop.Lat_stop))

### Merge Dataframe

In [53]:
Df_meta = df_meta.join(df_stop, on = ['Round_Lat', 'Round_Long'], how='outer') 

In [54]:
print(df_meta.filter(df_meta['StopName_Meta'].like("Lausanne")).count())
print(Df_meta.filter(Df_meta['StopName_Meta'].like("Lausanne") & Df_meta['StopName'].isNull()).count())

188
185


We can see that for the example of Lausanne we just recover 2 name over about a hundred. 
After investigation we find the coordidate for particular station in both dataset: 
<br/>
<br/>Lausanne Malley: 46.524212 - 6.603306 -- 46.524211 - 6.603309
<br/>Lausanne Bourdonette: 46.523466 - 6.589805 -- 46.523465 - 6.589807
<br/>Lausanne Provence: 46.523384 - 6.608102 -- 46.523382 - 6.608106

We can see that each time our merge fail for 1 digit

We try again with a round at 5 digits whith is still a very good precision

In [55]:
round_5 = udf(lambda s: round(s, 5), DoubleType())

In [56]:
df_meta = df_meta.withColumn("Round_Long", round_5(df_meta.Long))
df_meta = df_meta.withColumn("Round_Lat", round_5(df_meta.Lat))

In [57]:
df_stop = df_stop.withColumn("Round_Long", round_5(df_stop.Long_stop))
df_stop = df_stop.withColumn("Round_Lat", round_5(df_stop.Lat_stop))

In [58]:
Df_meta = df_meta.join(df_stop, on = ['Round_Lat', 'Round_Long'], how='outer') 

In [59]:
print(df_meta.filter(df_meta['StopName_Meta'].like("Lausanne")).count())
print(Df_meta.filter(Df_meta['StopName_Meta'].like("Lausanne") & Df_meta['StopName'].isNull()).count())

188
65


We now achieved a satisfactory result

In [60]:
Df_meta.printSchema()

root
 |-- Round_Lat: double (nullable = true)
 |-- Round_Long: double (nullable = true)
 |-- Long: float (nullable = true)
 |-- Lat: float (nullable = true)
 |-- StopName_Meta: string (nullable = true)
 |-- Lat_stop: double (nullable = true)
 |-- Long_stop: double (nullable = true)
 |-- StopName: string (nullable = true)
 |-- lat_len: integer (nullable = true)
 |-- lon_len: integer (nullable = true)



In [61]:
Df_meta = Df_meta.select('Long', 'Lat', 'StopName_Meta', 'StopName')
Df_meta.show()

+--------+---------+----------------+--------------------+
|    Long|      Lat|   StopName_Meta|            StopName|
+--------+---------+----------------+--------------------+
|    null|     null|            null|Macugnaga, Pestarena|
|    null|     null|            null| Lugano, Via Ginevra|
|    null|     null|            null|      Gandria, Paese|
|    null|     null|            null|               Gozzi|
|8.943882|46.034714|        Cureglia|   Cureglia, Rotonda|
|    null|     null|            null|        Bogno, Paese|
|6.090986| 46.15237|           Perly|                null|
|6.044045|46.161507|        Laconnex|Laconnex, Chemin ...|
|8.912559|46.179436|         Agarone|                null|
|8.699336| 46.18245|        Cresmino|      Cresmino, Case|
|6.246757|46.183704|       Annemasse|Annemasse, Généra...|
|7.393176| 46.19771|Les Mayens-de-S.|Les Mayens-de-S.,...|
|6.167676| 46.20001|          Genève|  Genève, Amandolier|
|6.157857|46.203766|          Genève|                nul

In [62]:
Df_meta = Df_meta.na.drop(subset=["Long", 'Lat'])

In [63]:
Df_meta = Df_meta.withColumn("StopName_Meta", \
              when(Df_meta["StopName"].isNotNull(), Df_meta["StopName"]).otherwise(Df_meta["StopName_Meta"]))

In [64]:
Df_meta = Df_meta.select('StopName_Meta', 'Lat', 'Long')

Df_meta = Df_meta.toPandas()

### Request Part

In [66]:
def get_lat_long(name): 
    tmp = Df_meta.loc[Df_meta['StopName_Meta'] == name][['Lat', 'Long']]
    
    assert len(tmp) != 0, "Problement with the location {}".format(name)
    
    tmp = tmp.iloc[0]
    return tmp['Lat'], tmp['Long']

In [67]:
def return_request(fromPlace, toPlace, departure, Months, Days, Hours, AM_PM, Minutes, Seconds, lat_long_from = False, lat_long_to = False):
    if (fromPlace.split(' ')[0] == 'stop'):
        fromPlace = toPlace[5:-1]
        
    if lat_long_from == False:
        lat_from, long_from = get_lat_long(fromPlace)
    else:
        lat_from, long_from = lat_long_from[0], lat_long_from[1]
    
    
    
    if (toPlace.split(' ')[0] == 'stop'):
        toPlace = toPlace[5:-1]
    #print('TOOOOO PLACE ################## {} {}'.format(toPlace, len(toPlace)))    
    if lat_long_to == False:
        lat_to, long_to = get_lat_long(toPlace)
    else:
        lat_to, long_to = lat_long_to[0], lat_long_to[1]
    
    url = 'http://10.90.38.21:8829/otp/routers/default/plan?fromPlace=stop+'
    url += '+'.join(fromPlace.split()) +  '+%3A%3A' + str(lat_from) + '%2C' + str(long_from)
    url += '&toPlace=stop+' +  '+'.join(toPlace.split()) +  '+%3A%3A' + str(lat_to) + '%2C' + str(long_to)
    url += '&time={}%3A{}{}&date={}-{}-2018&mode=TRANSIT%2CWALK&maxWalkDistance=804.672&arriveBy={}&wheelchair=false&locale=en&numItineraries=3'.format(Hours, Minutes, AM_PM, Months, Days, not(departure))
    #url += '&time={}&date={}-{}-2018&mode=TRANSIT%2CWALK&maxWalkDistance=804.672&arriveBy=false&wheelchair=false&locale=en&numItineraries=3&departure=true'.format(Datetime, Months, Days)
    #print(url)
    r = requests.get(url)
    #print(r)
    #print(r.json())
    #read_json(r.json())
    return r.json()

### Create itineraries from JSON

In [256]:
def read_json_extract_itineraries(json_data, df_BT):
    info_list = []
    
    if 'plan' not in json_data:
        return []
    
    for route in json_data['plan']['itineraries']:
        info_list_route = []
        for step in route['legs']: 
            mode = step['mode']
            from_ = step['from']['name']
            lat_from = step['from']['lat']
            lon_from = step['from']['lon']
            to_ = step['to']['name']
            lat_to = step['to']['lat']
            lon_to = step['to']['lon']
            
            start_time = str(step['from']['departure'])
            departure_time = time.strftime("%b %d %Y %H:%M:%S,%M", time.localtime(float(start_time[:len(start_time)-3])))
            end_time = str(step['endTime'])
            arrival_time = time.strftime("%b %d %Y %H:%M:%S,%M", time.localtime(float(end_time[:len(end_time)-3])))
            duration = str(step['duration'])
            
            route_id = 0
            trip_id = 0
            agency_name = 'unknown'
            if ('routeShortName' in step.keys()):
                route_id = step['routeShortName']
            if('tripShortName' in step.keys()):
                trip_id = step['tripShortName']
            if('agencyName' in step.keys()):
                agency_name = step['agencyName']
            line_id = trip_id
            if mode != 'RAIL' and mode!='WALK':
                if mode == 'BUS' or mode == 'Bus':
                    tmp = df_BT.query('ProductId.str.lower() == "bus" & OperatorName == @agency_name & LineType == @route_id')[:1]
                elif mode =='TRAM' or mode == 'Tram':
                    tmp = df_BT.query('ProductId.str.lower() == "tram" & OperatorName == @agency_name & LineType == @route_id')[:1]
                else: 
                    tmp = df_BT.query('ProductId == @mode & OperatorName == @agency_name & LineType == @route_id')[:1]
                if len(tmp) == 0:
                    line_id = 'unknown'
                else:
                    line_id = tmp['LineId'].iloc[0]
        
            info_list_route.append({'product_id': mode, 'from': from_, 'lat_long_from': [lat_from, lon_from] ,'departure_time':departure_time,'to':to_, 'lat_long_to': [lat_to, lon_to], 'arrival_time':arrival_time, 'line_id': line_id})
        info_list.append(info_list_route)
    return info_list 

### Select itineraries respecting the quality

In [290]:
def comp_itinerary_quality(itinerary, quality_computer):
    itinerary_quality = 1
    prev_leg = None
    crt_leg = None
    walking_time = 1
    for crt_leg in itinerary:
        if crt_leg['product_id'] == 'WALK':
            walking_time += (datetime.strptime(crt_leg['arrival_time'][:-3], DATE_FORMAT) - \
                            datetime.strptime(crt_leg['departure_time'][:-3], DATE_FORMAT)).total_seconds()//60
        else:
            if prev_leg is not None:
                transfer_quality = quality_computer.compute_quality(
                    prev_leg['arrival_time'],
                    crt_leg['departure_time'],
                    prev_leg['to'],
                    prev_leg['line_id'],
                    crt_leg['line_id'],
                    walking_time
                )
                itinerary_quality *= transfer_quality
                
            prev_leg = crt_leg
            walking_time = 1
            
    return itinerary_quality

In [230]:
def split_with_quality(itinerary_list, Quality, quality_computer):
    itinerary_quality_ = [0,]*len(itinerary_list)
    print('Quality of itineraries:  ')
    for i_ in range(len(itinerary_list)):
        itinerary_quality_[i_] = comp_itinerary_quality(itinerary_list[i_], quality_computer)
        print('Itinerary number {}, quality: {}'.format(i_,itinerary_quality_[i_]))
    itinerary_list_accepted = np.array(itinerary_list)[[it_>Quality for it_ in itinerary_quality_]].tolist()
    itinerary_list_refused = np.array(itinerary_list)[[not(it_>Quality) for it_ in itinerary_quality_]].tolist()
    return itinerary_list_accepted, itinerary_list_refused

### Explore itineraries "around" a too-low-quality itinerary TODO: make it an actual tree

In [269]:
def date_to_cells(date):
    dt = datetime.strptime(date[:-3], DATE_FORMAT)
    return dt.month,\
          dt.day,\
          (dt.hour if dt.hour <= 12 else dt.hour-12),\
          ('AM' if dt.hour <= 12 else 'PM'), \
          dt.minute,\
          dt.second

In [270]:
def explore_itineraries(itinerary, df_BT, quality, quality_computer):
    itinerary_list = []
    for j_ in range(len(itinerary)-1):
        #leg_1 = itinerary[j_]
        if comp_itinerary_quality(itinerary[0:j_+1], quality_computer) < quality:
            continue
        arr_month, arr_day, arr_hour, arr_AM_PM, arr_minute, arr_second = date_to_cells(itinerary[j_]['arrival_time'])
        #new_semi_its = request_with_quality(fromPlace = leg_1['to'], toPlace = itinerary[-1]['to'], Months = arr_month, Days = arr_day, Hours = arr_hour, Minutes = arr_minute, Seconds = arr_second, AM_PM = arr_AM_PM, departure = True, Quality = quality, lat_long_from = leg_1['lat_long_from'], lat_long_to = itinerary[-1]['lat_long_to'] )
        temp_json = return_request(fromPlace = itinerary[j_]['to'], toPlace = itinerary[-1]['to'], Months = arr_month, Days = arr_day, Hours = arr_hour, Minutes = arr_minute, Seconds = arr_second, AM_PM = arr_AM_PM, departure = True, lat_long_from = itinerary[j_]['lat_long_to'], lat_long_to = itinerary[-1]['lat_long_to'])
        #print(temp_json)
        new_partial_its = read_json_extract_itineraries(temp_json, df_BT)
        new_itineraries = [np.append(itinerary[:j_+1],new_partial_its[k_]).tolist() for k_ in range(len(new_partial_its))]
        
        itinerary_list.extend(new_itineraries)
    return itinerary_list

### Get news from SBB

In [234]:
def display_info(date, stopName):
    url = 'https://data.sbb.ch/api/records/1.0/search/?dataset=rail-traffic-information&lang=en&rows=1000&sort=validityend&facet=validitybegin&facet=validityend&refine.validitybegin={}'.format(date[0])
    tmp = requests.get(url).json()
    infos = []
    for el in tmp['records']: 
        end = str(el['fields']['validityend'].split('T')[0]).split('-')
        if((int(end[0]) == int(date[0]) and int(end[1]) == int(date[1]) and int(end[2]) < int(date[2])) or (int(end[0]) == int(date[0]) and int(end[1]) < int(date[1])) or (int(end[0]) < int(date[0]))):
            break
        #print(end)
        title = el['fields']['title']
        if('End of announcement:' in title): 
            pass
        else:
            if(len(title.split(':')) > 1):
                title = str(title.split(':')[1])
            title = title.replace(' and', '-').replace('engineering work is in progress', '').replace(',','').replace('.', '').replace('Between', '').replace('In', '').replace(' station', '').replace('Work due to a disruption','').strip()
            #print(title.split('- '))
            for el_title in title.split('- '): 
                for el_stop in stopName: 
                    if(el_title.strip() == el_stop.strip()): 
                        print(el_title)
                        infos.append(el['fields']['description'])
    return infos

### Find best itineraries

In [206]:
quality_computer = TransferQualityComputer()

In [282]:
quality = 0.99
fromPlace_ = "Zürich, Zürichbergstrasse"
toPlace_ = "Zürich"
Months_ = 2
Days_ = 4
Hours_ = 6
AM_PM_ = 'PM'
Minutes_ = 20
Seconds_ = 1
departure_ = True


#Get the json of quickest itineraries from local OTP server
test_json = return_request(fromPlace=fromPlace_ , toPlace= toPlace_ ,Months = Months_, Days= Days_, Hours= Hours_, AM_PM = AM_PM_, Minutes = Minutes_, Seconds = Seconds_, departure = departure_)
#Create Dataframe to find LineId from ProductId, LineType and OperatorName. Relevant for Bus and Tram
df_BT = df.where(col('ProductId') != 'Zug').select('ProductId','LineType','OperatorName','LineId').distinct().toPandas()


In [289]:
#Read json and create itinerary list of dicts
itinerary_first_list = read_json_extract_itineraries(test_json, df_BT)
itinerary_acc, itinerary_refu = split_with_quality(itinerary_first_list, quality, quality_computer)
itinerary_searched = []
iter_=0

## sort bad quality itineraries by arrival time
sorter_ids = np.argsort([itinerary_refu[i_][-1]['arrival_time'] for i_ in range(len(itinerary_refu))])
itinerary_refu = np.array(itinerary_refu)[sorter_ids].tolist()
while len(itinerary_refu) != 0:
    if len(itinerary_acc)>=3:
        break
    iter_+=1
    print('itineraries searched: {}'.format(iter_))
    itinerary_searched_ = itinerary_refu.pop(0)
    itinerary_searched.append(itinerary_searched_)
    print(itinerary_searched_[0]['departure_time'], itinerary_searched_[-1]['arrival_time'])
    #print(itinerary_searched_)
    itinerary_test_list_explored = explore_itineraries(itinerary_searched_, df_BT, quality, quality_computer)
    itinerary_acc_explored, itinerary_refu_explored = split_with_quality(itinerary_test_list_explored, quality, quality_computer)
    for iti_refu in itinerary_refu_explored:
        if not(any([(iti_refu[:-1] == iti[:-1]) for iti in itinerary_refu+itinerary_searched])):
            itinerary_refu.append(iti_refu)
    for iti_acc in itinerary_acc_explored:
        if not(any([(iti_acc == iti) for iti in itinerary_acc])):
            itinerary_acc.append(iti_acc)
    
    ## sort by arrival time
    sorter_ids = np.argsort([itinerary_refu[i_][-1]['arrival_time'] for i_ in range(len(itinerary_refu))])
    itinerary_refu = np.array(itinerary_refu)[sorter_ids].tolist()
    

## sort selected itineraries by arrival time
sorter_ids = np.argsort([itinerary_acc[i_][-1]['arrival_time'] for i_ in range(len(itinerary_acc))])
itinerary_acc = np.array(itinerary_acc)[sorter_ids].tolist()

Quality of itineraries:  
Itinerary number 0, quality: 0.8461788021532193
Itinerary number 1, quality: 0.9617261160185028
Itinerary number 2, quality: 0.9175936906284169
itineraries searched: 1
Feb 04 2018 18:21:15,21 Feb 04 2018 18:59:00,59
Quality of itineraries:  
Itinerary number 0, quality: 0.8461788021532193
Itinerary number 1, quality: 0.8868159286064247
Itinerary number 2, quality: 0.8461788021532193
Itinerary number 3, quality: 0.8461788021532193
Itinerary number 4, quality: 0.9874830925855225
Itinerary number 5, quality: 0.9450922110885083
itineraries searched: 2
Feb 04 2018 18:28:00,28 Feb 04 2018 19:14:00,14
Quality of itineraries:  
Itinerary number 0, quality: 0.9617261160185028
Itinerary number 1, quality: 0.9445794875311821
Itinerary number 2, quality: 0.986961428833482
itineraries searched: 3
Feb 04 2018 18:21:15,21 Feb 04 2018 19:14:00,14
Quality of itineraries:  
Itinerary number 0, quality: 0.8461788021532193
Itinerary number 1, quality: 0.8868159286064247
Itinerary

### Results

In [288]:
## Print out the arrival time and quality of the three selected paths
itinerary_selected_quality_ = [0,]*len(itinerary_acc)
for i_ in range(len(itinerary_acc)):
    itinerary_selected_quality_[i_] = comp_itinerary_quality(itinerary_acc[i_], quality_computer)
    print('Itinerary number: {}, quality: {}, arrival time: {}, transfers: {}'.format(i_,itinerary_selected_quality_[i_], itinerary_acc[i_][-1]['arrival_time'],len(itinerary_acc[i_])-1))

Itinerary number: 0, quality: 0.9935776148398892, arrival time: Feb 04 2018 19:29:00,29, transfers: 3
Itinerary number: 1, quality: 0.9944470435872333, arrival time: Feb 04 2018 19:44:00,44, transfers: 3
Itinerary number: 2, quality: 0.9959660865358548, arrival time: Feb 04 2018 19:44:00,44, transfers: 3
Itinerary number: 3, quality: 0.9971711686149598, arrival time: Feb 04 2018 19:57:00,57, transfers: 3


In [275]:
## Compare to initial output from OTP
## Print out the arrival time and quality of the three select paths
itinerary_initial_quality_ = [0,]*len(itinerary_test_list)
for i_ in range(len(itinerary_test_list)):
    itinerary_initial_quality_[i_] = comp_itinerary_quality(itinerary_test_list[i_], quality_computer)
    print('Itinerary number: {}, quality: {}, arrival time: {}, transfers: {}'.format(i_,itinerary_initial_quality_[i_], itinerary_test_list[i_][-1]['arrival_time'],len(itinerary_test_list[i_])-1))

Itinerary number: 0, quality: 0.8461788021532193, arrival time: Feb 04 2018 18:59:00,59, transfers: 3
Itinerary number: 1, quality: 0.9617261160185028, arrival time: Feb 04 2018 19:14:00,14, transfers: 2
Itinerary number: 2, quality: 0.9175936906284169, arrival time: Feb 04 2018 19:29:00,29, transfers: 2


In [None]:
## Print news from SBB regarding the path
date = [2018, Months_, Days_]
test = display_info(date, [fromPlace_, toPlace_])
for el in test: 
    print('\n')
    print(el)

### TODO: Interface to be included above

In [166]:
quality_ = 0.90
fromPlace_ = "Zürich, Zürichbergstrasse"
toPlace_ = 'Zürich Enge, Bahnhof'
Months_ = 2
Days_ = 4
Hours_ = 6
AM_PM_ = 'PM' Minutes_ = 20
Seconds_ = 1
departure_ = True

In [174]:
def find_itinerary_with_quality(fromPlace , toPlace, Months, Days, Hours, AM_PM, Minutes, Seconds, departure, quality):

    #Get the json of quickest itineraries from local OTP server
    test_json = return_request(fromPlace=fromPlace , toPlace= toPlace ,Months = Months, Days= Days, Hours= Hours, AM_PM = AM_PM, Minutes = Minutes, Seconds = Seconds, departure = departure)
    #Create Dataframe to find LineId from ProductId, LineType and OperatorName. Relevant for Bus and Tram
    #
    #Read json and create itinerary list of dicts
    itinerary_first_list = read_json_extract_itineraries(test_json, df_BT)
    itinerary_acc, itinerary_refu = split_with_quality(itinerary_first_list, Quality = quality)
    itinerary_searched = []
    iter_=0

    ## sort bad quality itineraries by arrival time
    sorter_ids = np.argsort([itinerary_refu[i_][-1]['arrival_time'] for i_ in range(len(itinerary_refu))])
    itinerary_refu = np.array(itinerary_refu)[sorter_ids].tolist()
    while len(itinerary_refu) != 0:
        if len(itinerary_acc)>=3:
            break
        iter_+=1
        print('itineraries expanded: {}'.format(iter_))
        itinerary_searched_ = itinerary_refu.pop(0)
        itinerary_searched.append(itinerary_searched_)
        #print(itinerary_searched_[0]['departure_time'], itinerary_searched_[-1]['arrival_time'])
        #print(itinerary_searched_)
        itinerary_test_list_explored = explore_itineraries(itinerary_searched_, df_BT, quality)
        itinerary_acc_explored, itinerary_refu_explored = split_with_quality(itinerary_test_list_explored, quality)
        for iti_refu in itinerary_refu_explored:
            if not(any([(iti_refu == iti) for iti in itinerary_refu+itinerary_searched])):
                itinerary_refu.append(iti_refu)
        for iti_acc in itinerary_acc_explored:
            if not(any([(iti_acc == iti) for iti in itinerary_acc])):
                itinerary_acc.append(iti_acc)

        ## sort by arrival time
        sorter_ids = np.argsort([itinerary_refu[i_][-1]['arrival_time'] for i_ in range(len(itinerary_refu))])
        itinerary_refu = np.array(itinerary_refu)[sorter_ids].tolist()


    ## sort selected itineraries by arrival time
    sorter_ids = np.argsort([itinerary_acc[i_][-1]['arrival_time'] for i_ in range(len(itinerary_acc))])
    itinerary_acc = np.array(itinerary_acc)[sorter_ids].tolist()


    print('\n Fastest itineraries without quality constraint:')
    ## Initial OTP output
    ## Print out the arrival time and quality of the three select paths
    itinerary_initial_quality_ = [0,]*len(itinerary_first_list)
    for i_ in range(len(itinerary_first_list)):
        itinerary_initial_quality_[i_] = comp_itinerary_quality(itinerary_first_list[i_])
        print('Itinerary number: {}, quality: {}, dpt: {}, arr: {}, transfers: {}'.format(i_,itinerary_initial_quality_[i_], itinerary_first_list[i_][0]['departure_time'], itinerary_first_list[i_][-1]['arrival_time'],len(itinerary_first_list[i_])-1))
    
    print('\n Fastest itineraries with quality constraint:')
    ## Print out the arrival time and quality of the three selected paths
    itinerary_selected_quality_ = [0,]*len(itinerary_acc)
    for i_ in range(len(itinerary_acc)):
        itinerary_selected_quality_[i_] = comp_itinerary_quality(itinerary_acc[i_])
        print('Itinerary number: {}, quality: {}, dpt: {}, arr: {}, transfers: {}'.format(i_,itinerary_selected_quality_[i_], itinerary_acc[i_][0]['departure_time'], itinerary_acc[i_][-1]['arrival_time'],len(itinerary_acc[i_])-1))


In [175]:
find_itinerary_with_quality(fromPlace_ , toPlace_, Months_, Days_, Hours_, AM_PM_, Minutes_, Seconds_, departure_, quality_)

itineraries expanded: 1
itineraries expanded: 2

 Fastest itineraries without quality constraint:
Itinerary number: 0, quality: 0.016150519696289896, dpt: Feb 04 2018 18:21:15,21, arr: Feb 04 2018 18:41:00,41, transfers: 1
Itinerary number: 1, quality: 0.016150519696289896, dpt: Feb 04 2018 18:31:15,31, arr: Feb 04 2018 18:51:00,51, transfers: 1
Itinerary number: 2, quality: 0.016150519696289896, dpt: Feb 04 2018 18:41:15,41, arr: Feb 04 2018 19:01:00,01, transfers: 1

 Fastest itineraries with quality constraint:
Itinerary number: 0, quality: 0.9932763142975513, dpt: Feb 04 2018 18:21:15,21, arr: Feb 04 2018 18:51:00,51, transfers: 1
Itinerary number: 1, quality: 0.9975081657292914, dpt: Feb 04 2018 18:21:15,21, arr: Feb 04 2018 19:01:00,01, transfers: 1
Itinerary number: 2, quality: 0.9932763142975513, dpt: Feb 04 2018 18:31:15,31, arr: Feb 04 2018 19:01:00,01, transfers: 1
Itinerary number: 3, quality: 0.9975081657292914, dpt: Feb 04 2018 18:31:15,31, arr: Feb 04 2018 19:11:00,11, t

In [184]:
for leg in itinerary_acc[0]:
    print('Take {} from {} at {} to {} arriving at {}'.format(leg['product_id'],leg['from'], leg['departure_time'], leg['to'], leg['arrival_time']))

Take WALK from stop Zürich, Zürichbergstrasse  at Feb 04 2018 18:21:15,21 to Zürich, Kirche Fluntern arriving at Feb 04 2018 18:26:00,26
Take TRAM from Zürich, Kirche Fluntern at Feb 04 2018 18:36:00,36 to Zürich Enge, Bahnhof arriving at Feb 04 2018 18:51:00,51


In [None]:
## Print out the arrival time and quality of the three selected paths
    itinerary_selected_quality_ = [0,]*len(itinerary_acc)
    for i_ in range(len(itinerary_acc)):
        itinerary_selected_quality_[i_] = comp_itinerary_quality(itinerary_acc[i_])
        print('Itinerary number: {}, quality: {}, dpt: {}, arr: {}, transfers: {}'.format(i_,itinerary_selected_quality_[i_], itinerary_acc[i_][0]['departure_time'], itinerary_acc[i_][-1]['arrival_time'],len(itinerary_acc[i_])-1))


In [185]:
from prettytable import PrettyTable

ModuleNotFoundError: No module named 'prettytable'

In [170]:
['Red','Yellow','Green','Brown','Blue','Pink','Grey']\n",
    "nb_pix_t = PrettyTable()\n",
    "nb_pix_t.field_names = ['Color','Image 1', 'Image 2', 'Image 3', 'Image 4']\n",
    "for i_ in range(nb_of_colors):\n",
    "    nb_pix_t.add_row([color_labels[i_], pixels_per_color[0,i_],pixels_per_color[1,i_],pixels_per_color[2,i_],pixels_per_color[3,i_]])\n",
    "print (nb_pix_t)"


SyntaxError: invalid syntax (<ipython-input-170-34eafd9ad13a>, line 1)

In [None]:
Days = [i for i in range(1, 32)]
Months = [i for i in range(1, 13)]
Hours = [i for i in range(0, 13)]
AM_PM = ['AM', 'PM']
Minutes = [0, 15, 30, 45]
Seconds = [0, 15, 30, 45]

In [None]:
StopName = Df_meta.select('StopName_Meta').distinct().collect()
StopName = [str(x).replace('"', "'") for x in StopName]
StopName = [str(x)[19:] for x in StopName]
StopName = [str(x).split("')")[0] for x in StopName]
StopName = sorted(StopName)

In [None]:
info_list = return_read_json(test_json)

### Test to see what we have in the data

It seems that we don't have data for bus and subway, at least near Lausanne 

After investigation it's seems that we have data for the LEB in Lausanne. 

## Visualizing confidence of trips

One of the validation methods we could use is to visualize an isochronous map showing how far one can hypothetically go in a fix number of minutes. 

On top of this, our visualization also conveys the % of time said travels are successful.

As we are interested in the area surrounding Zurich HB by a radius with 10km and because we did not want to add functionality for this in the core of our route planning algorithm, to compute the data for the map we query the route planner from Zurich HB to every other station within a 10km radius.

For each of the stations, we will plot a circle centered in it with radius directly proportional with the walking time left up until the time limit. We set an average walking speed of 5km/h and using the time left, we compute the distance around the station that can be walked.

For each station we also get the certainty of arriving there in % of times we would be able to actually make the trip there and this value between (0,1) is linearly map to a color scale. Hence, red corresponds to a % value of 100, while blue corresponds to 0.

In [213]:
import branca.colormap as cm
import folium

ZURICH_HB_COORDS = [47.377941, 8.540141]

AVERAGE_WALKING_SPEED_PER_SECOND = 1.38889 # 5kph but in meters per second
MINIMUM_CIRCLE_RADIUS = 30
LINEAR_CM = cm.LinearColormap(
    ['blue', 'red'],
    vmin=0, vmax=1,
)
LINEAR_CM.caption = 'Quality of trip'


def add_circle(m, coords, quality, time_left_in_seconds, popup_data):
    radius = time_left_in_seconds * AVERAGE_WALKING_SPEED_PER_SECOND / 10
    folium.Circle(
        coords,
        radius,
        fill=True,
        fill_color=LINEAR_CM(quality),
        fill_opacity=0.1,
        stroke=False,
        fill_rule='nonzero',
        popup="{}".format(popup_data['arrival_time'])
        #popup="{}<br\>{}<br\>Q : {:.2f}<br\>:Left{} mins"\
        #            .format(popup_data['station_name'],popup_data['arrival_time'], quality, time_left_in_seconds//60)
    ).add_to(m)

    
def create_map_with_quality(source_name, source_coord, stations_data):
    m = folium.Map(source_coord, zoom_start=13, tiles='Stamen toner') 
    m.add_child(LINEAR_CM)
    popup_data = {}
    for data in stations_data:
        popup_data['station_name'] = data[0]
        popup_data['arrival_time'] = data[3]
        add_circle(m, data[1], data[2], data[4]*60, popup_data)
    return m

First thing we do, we select the stations that are at most 10 km from Zurich HB.

For this, we compute the distance from Zurich HB for every stop name in the df_meta. Then we keep those stops that have a distance of less than 10km form Zurich HB

In [98]:
#Coordinate of the main station of Zürich
Lat_zu = 47.377941
Long_ZU = 8.540141
def dist_to_ZU(lat, long): 
    res = str(geo_dist((lat, long), (Lat_zu, Long_ZU)))
    res = round(float(res.split()[0]),1)
    return res

In [105]:
Df_meta['Dist in km'] = Df_meta.apply(lambda x: dist_to_ZU(x['Lat'], x['Long']), axis=1)
stops_zurich = Df_meta[Df_meta['Dist in km'] < 10]

Now that we have in **stops_zurich** the stops that are in a 10km radius from Zurich, we extract the names as we need them for the route query.

In [110]:
zurich_stations = stops_zurich['StopName_Meta'].tolist()

If we take a look at the station names, we can see there are duplicates which we choose to drop as they do not come with information about additional stops.

In [112]:
zurich_stations[:10]

['Zürich',
 'Wettswil a.A., Heidenchilen',
 'Wallisellen, Florastrasse',
 'Weiningen ZH, Aegelsee',
 'Rümlang, Heuelstrasse',
 'Kilchberg',
 'Kilchberg',
 'Neue Forch',
 'Schlieren, Wagonsfabrik',
 'Schlieren, Bahnhof']

In [113]:
zurich_stations = list(set(zurich_stations))

## Computing the travel times to stops close to Zurich

The next step in the visualization process is to compute the **arrival time** and **qualities** to the stops of interest. 

In order to do this, we query our route planner for routes from Zurich HB to every stop within 10km obtaining itineraries of which we are interested only in the arrival time at the final stop and the quality.

There are a two parameters that will shape our visualization:
    1. the start time we set for the trips - parameter required to make the queries
    2. the maximum length in time of the trips - used to filter the destinations to which the travel time takes more than this value

In [116]:
# Parameters of the visualization
ZURICH_HB_NAME = 'Zürich HB'

Months_ = 2
Days_ = 4
Hours_ = 6
AM_PM_ = 'PM'
Minutes_ = 20
Seconds_ = 1

departure_ = True
Max_travel_time_ = 30 # in minutes
Hours_24 = Hours_%12 if AM_PM_ == 'AM' else (Hours_%12)+12
departure_datetime = datetime(2018, Months_, Days_, Hours_24, Minutes_, Seconds_)

In [127]:
import multiprocessing
class NoDaemonProcess(multiprocessing.Process):
    # make 'daemon' attribute always return False
    def _get_daemon(self):
        return False
    def _set_daemon(self, value):
        pass
    daemon = property(_get_daemon, _set_daemon)

# We sub-class multiprocessing.pool.Pool instead of multiprocessing.Pool
# because the latter is only a wrapper function, not a proper class.
class MyPool(multiprocessing.pool.Pool):
    Process = NoDaemonProcess

In [129]:
def parse_datetime_string(dt):
    return datetime.strptime(dt, '%b %d %Y %H:%M:%S,%f')

def compute_remaining_travel_time(itinerary, departure_datetime, max_travel_minutes):
    '''
    Function that computes travel minutes left from the quota specified by max_travel_minutes
    '''
    last_step = itinerary[-1]
    arrival_datetime = parse_datetime_string(last_step['arrival_time'])
    travel_time_minutes = (arrival_datetime - departure_datetime).total_seconds()/60
    return max_travel_minutes - travel_time_minutes
    

def get_stop_plot_data(station, source_station, departure_datetime, max_travel_minutes, quality_computer):
    '''
    Function that computes, for each station from stations_names which is within max_travel_minutes
    of source_station, the coords, quality and time left from max_travel_minutes after arriving there
    '''
    request_json = return_request(source_station,
                                  station,
                                  True,
                                  departure_datetime.month,
                                  departure_datetime.day,
                                  departure_datetime.hour if departure_datetime.hour <= 12 else departure_datetime.hour-12,
                                  'AM' if departure_datetime.hour <= 12 else 'PM',
                                  departure_datetime.minute,
                                  departure_datetime.second,)

    itineraries = read_json_extract_itineraries(request_json, df_BT)

    if len(itineraries) == 0:
         return None

    fastest_itinerary = itineraries[0]
    remaining_travel_minutes = compute_remaining_travel_time(fastest_itinerary, departure_datetime, max_travel_minutes)

    if remaining_travel_minutes > 0:
        quality = comp_itinerary_quality(fastest_itinerary, quality_computer)
        plot_data = (
            station,
            fastest_itinerary[-1]['lat_long_to'],
            quality,
            fastest_itinerary[-1]['arrival_time'],
            remaining_travel_minutes
        )
        print(plot_data)
        return plot_data
    else:
        return None
    


In [209]:
import functools
from multiprocessing import Pool

partial_get_stop_plot_data = functools.partial(get_stop_plot_data, 
                                               source_station=ZURICH_HB_NAME,
                                               departure_datetime=departure_datetime,
                                               max_travel_minutes=60,
                                               quality_computer=quality_computer
                                              )
# we can make the requests to the OTP in parallel
with MyPool(10) as pool:
    plot_data = pool.map(partial_get_stop_plot_data, zurich_stations)
    
plot_data = [data for data in plot_data if data is not None]

('Dübendorf, Giessen', [47.4021499315644, 8.60765705363145], 0.9955913310907536, 'Feb 04 2018 18:51:00,51', 29.008166666666668)
('Pfaffhausen, Müseren', [47.3626987847054, 8.61754750491098], 0.9863427969290984, 'Feb 04 2018 18:47:00,47', 33.00883333333333)
('Zürich Manegg', [47.3383783847121, 8.51967605469254], 1, 'Feb 04 2018 18:47:00,47', 33.00883333333333)
('Zürich, Wartau', [47.402774810791016, 8.491626739501953], 1, 'Feb 04 2018 18:42:45,42', 37.25966666666666)
('Zürich, Im Walder', [47.34811782836914, 8.574365615844727], 0.894097587516201, 'Feb 04 2018 18:45:54,45', 34.10916666666667)
('Rüschlikon, Bahnhof', [47.30818176269531, 8.55359935760498], 1, 'Feb 04 2018 18:52:03,52', 27.958)
('Uitikon Waldegg', [47.3658384397731, 8.46598375015324], 1, 'Feb 04 2018 18:48:00,48', 32.00866666666667)
('Zürich, Mötteliweg', [47.413956419184, 8.52645833508858], 0.9695197523345596, 'Feb 04 2018 18:39:00,39', 41.01016666666666)
('Zürich, Maienweg', [47.4078468010511, 8.52638646986584], 0.9805495

('Volketswil', [47.38835525512695, 8.677387237548828], 0.9695197523345596, 'Feb 04 2018 18:50:08,50', 29.875)
('Urdorf, Sonne', [47.381263732910156, 8.422534942626953], 0.980565903002981, 'Feb 04 2018 18:58:32,58', 21.473666666666666)
('Zürich, Spyriplatz', [47.382057189941406, 8.552218437194824], 1, 'Feb 04 2018 18:35:25,35', 44.594166666666666)
('Urdorf, Uitikonerstrasse', [47.3822577379675, 8.42592787162877], 0.980565903002981, 'Feb 04 2018 18:55:00,55', 25.0075)
('Dübendorf, Kreuzbühl', [47.38814926147461, 8.621512413024902], 0.9465343030699442, 'Feb 04 2018 18:46:09,46', 33.858999999999995)
('Küsnacht ZH, Schübel', [47.3216531309359, 8.58985244469772], 0.9975089255143517, 'Feb 04 2018 19:07:00,07', 13.015499999999996)
('Zürich, Sukkulentensammlung', [47.35492706298828, 8.534296989440918], 1, 'Feb 04 2018 18:39:38,39', 40.37683333333333)
('Aesch ZH, Heligenmattstrasse', [47.3345248005831, 8.43633036262033], 0.9465343030699442, 'Feb 04 2018 19:05:00,05', 15.015833333333333)
('Zürich

('Dübendorf, Neugut Süd', [47.4028434753418, 8.60135555267334], 1, 'Feb 04 2018 18:44:14,44', 35.775999999999996)
('Opfikon, Vreniker', [47.4283188285141, 8.57959368415165], 0.9976501920505986, 'Feb 04 2018 19:01:00,01', 19.016499999999994)
('Zürich, Friedhof Sihlfeld', [47.3759315297134, 8.51089053121262], 0.9465343030699442, 'Feb 04 2018 18:33:00,33', 47.01116666666667)
('Zürich, Neumarkt', [47.3727742245616, 8.54735314860011], 1, 'Feb 04 2018 18:26:00,26', 54.01233333333333)
('Wallisellen, Spitzacker', [47.410980224609375, 8.60558795928955], 0.9950222093028461, 'Feb 04 2018 18:54:22,54', 25.641)
('Zürich, Althoos', [47.410701751708984, 8.524062156677246], 0.894097587516201, 'Feb 04 2018 18:37:53,37', 42.12716666666667)
('Zürich, Neeserweg', [47.3909246273101, 8.47396977303017], 0.4902677916358761, 'Feb 04 2018 18:42:00,42', 38.00966666666666)
('Zürich, Schaufelbergerstrasse', [47.3689048992124, 8.50184449630028], 1, 'Feb 04 2018 18:40:00,40', 40.01)
('Weiningen ZH', [47.419554760298

('Urdorf Weihermatt, Bahnhof', [47.38009262084961, 8.42996883392334], 1, 'Feb 04 2018 18:51:24,51', 28.608166666666666)
('Zürich Storchen', [47.371028900146484, 8.541963577270508], 1, 'Feb 04 2018 18:30:51,30', 49.16166666666667)
('Zürich, Technopark', [47.388789967894, 8.51565160221912], 1, 'Feb 04 2018 18:35:00,35', 45.01083333333333)
('Zürich, Seidelhof', [47.3918003600084, 8.4843453145632], 1, 'Feb 04 2018 18:40:00,40', 40.01)
('Zürich Stadelhofen FB', [47.36635208129883, 8.54813003540039], 1, 'Feb 04 2018 18:27:35,27', 52.42883333333333)
('Wallisellen', [47.409210205078125, 8.595544815063477], 1, 'Feb 04 2018 18:39:05,39', 40.926833333333335)
('Zürich, Segantinistrasse', [47.4074455475966, 8.48996876824257], 1, 'Feb 04 2018 18:40:00,40', 40.01)
('Zürich Balgrist', [47.3545810584948, 8.57502125935484], 0.894097587516201, 'Feb 04 2018 18:38:00,38', 42.010333333333335)
('Zürich, Fröhlichstrasse', [47.3551713700479, 8.55690224007163], 0.48636924488084676, 'Feb 04 2018 18:38:00,38', 42

('Zürich, Sonneggstrasse', [47.3828416573855, 8.54368802224039], 1, 'Feb 04 2018 18:28:00,28', 52.012)
('Dübendorf, Im Grund', [47.4015540654426, 8.61758343752235], 0.9942834536117754, 'Feb 04 2018 18:53:00,53', 27.00783333333333)
('Zürich, Carl-Spitteler-Strasse', [47.3583236436636, 8.58659156021591], 1, 'Feb 04 2018 18:45:00,45', 35.009166666666665)
('Zürich Wipkingen', [47.3934423195962, 8.52952159020786], 1, 'Feb 04 2018 18:30:00,30', 50.01166666666667)
('Opfikon, Bubenholz', [47.43449783325195, 8.571688652038574], 0.48636924488084676, 'Feb 04 2018 18:43:44,43', 36.27616666666667)
('Zürich Wiedikon, Bahnhof', [47.3714297270242, 8.52418559741944], 1, 'Feb 04 2018 18:34:00,34', 46.010999999999996)
('Uitikon, Schlössli', [47.3681078802743, 8.44785574771718], 0.48636924488084676, 'Feb 04 2018 18:54:00,54', 26.007666666666665)
('Zürich, Albisriederdörfli', [47.3753171171281, 8.48521668038892], 0.7684311920815117, 'Feb 04 2018 18:43:00,43', 37.0095)
('Opfikon, Grossacker', [47.4267692565

('Wettswil a.A., Beerimoos', [47.3295689093734, 8.48006933381019], 0.9465343030699442, 'Feb 04 2018 19:09:00,09', 11.015166666666666)
('Zürich, Förrlibuckstrasse', [47.3922119140625, 8.517154693603516], 1, 'Feb 04 2018 18:38:19,38', 41.693666666666665)
('Zürich, Milchbuck', [47.3974922678504, 8.54192732428327], 1, 'Feb 04 2018 18:34:00,34', 46.010999999999996)
('Erlenbach ZH, Erlenhöhe', [47.305999755859375, 8.60239315032959], 0.9971716152661291, 'Feb 04 2018 19:17:38,17', 2.380499999999998)
('Zürich, Eschergutweg', [47.396270751953125, 8.512043952941895], 1, 'Feb 04 2018 18:36:25,36', 43.594)
('Uetliberg', [47.3523658098635, 8.48765111480922], 1, 'Feb 04 2018 18:55:00,55', 25.0075)
('Zürich, Holzerhurd', [47.423803266843, 8.49661630134598], 0.8944331175756747, 'Feb 04 2018 18:46:00,46', 34.009)
('Weiningen ZH, Schulhaus', [47.4183512583635, 8.42866773324572], 0.9465343030699442, 'Feb 04 2018 18:57:00,57', 23.007166666666663)
('Erlenbach ZH, Schulhaus', [47.3023567199707, 8.59789657592

('Thalwil, Seehaldenstrasse', [47.2972955416243, 8.55452170456838], 0.9955913310907536, 'Feb 04 2018 19:03:00,03', 17.016166666666663)
('Zürich, Dorflinde', [47.40743637084961, 8.548800468444824], 1, 'Feb 04 2018 18:42:57,42', 37.059666666666665)
('Ringlikon, Langwis', [47.359619140625, 8.465119361877441], 0.9955913310907536, 'Feb 04 2018 19:19:25,19', 0.5968333333333291)
('Zürich Saalsporthalle', [47.35740661621094, 8.522143363952637], 1, 'Feb 04 2018 18:36:48,36', 43.21066666666667)
('Rüschlikon, Park im Grüene', [47.3003720755237, 8.55037148795517], 0.9955913310907536, 'Feb 04 2018 19:04:00,04', 16.016)
('Zürich, Zypressenstrasse', [47.3766919314159, 8.51376514012221], 0.4883913682085561, 'Feb 04 2018 18:36:00,36', 44.010666666666665)
('Waltikon', [47.3367955879821, 8.61702648204612], 0.9863642425748655, 'Feb 04 2018 19:03:00,03', 17.016166666666663)
('Zürich, Röslistrasse', [47.3886604309082, 8.54050064086914], 1, 'Feb 04 2018 18:29:40,29', 50.34516666666667)
('Zürich, Dunkelhölzli

('Zürich, Stadtgrenze', [47.3350422808333, 8.54146918348831], 0.9465343030699442, 'Feb 04 2018 18:53:00,53', 27.00783333333333)
('Zürich Affoltern', [47.42091369628906, 8.508564949035645], 0.894097587516201, 'Feb 04 2018 18:42:10,42', 37.843)
('Dübendorf, Post', [47.39616775512695, 8.616908073425293], 0.894097587516201, 'Feb 04 2018 18:44:16,44', 35.742666666666665)
('Zürich Enge, Bahnhof', [47.36412811279297, 8.531574249267578], 1, 'Feb 04 2018 18:32:15,32', 47.76133333333333)
('Zürich, Kempfhofsteig', [47.3973037636525, 8.51002814853975], 1, 'Feb 04 2018 18:34:00,34', 46.010999999999996)
('Zürich, Burgwies', [47.35801937836, 8.5718591895543], 0.894097587516201, 'Feb 04 2018 18:36:00,36', 44.010666666666665)
('Zürich, Letzipark', [47.3877743018639, 8.49921243251745], 1, 'Feb 04 2018 18:36:00,36', 44.010666666666665)
('Zürich Triemli', [47.3649987828854, 8.49519696319687], 1, 'Feb 04 2018 18:44:00,44', 36.00933333333333)
('Dübendorf, Schwimmbad', [47.3965132548335, 8.62315299228467], 0

('Maiacher', [47.3281806893937, 8.63106714993886], 0.9863642425748655, 'Feb 04 2018 19:06:00,06', 14.015666666666668)
('Glattpark, Glattpark', [47.4199559214972, 8.55716275150406], 0.980565903002981, 'Feb 04 2018 18:41:00,41', 39.00983333333333)
('Regensdorf, Lettenhau', [47.4257906479022, 8.47195754679347], 0.9694555373902938, 'Feb 04 2018 18:51:00,51', 29.008166666666668)
('Zürich Flughafen, OPC', [47.4526524323306, 8.56566081409302], 0.9960401651355173, 'Feb 04 2018 18:57:00,57', 23.007166666666663)
('Zürich Wiedikon', [47.3714714050293, 8.523462295532227], 1, 'Feb 04 2018 18:34:44,34', 45.27766666666667)
('Fällanden (See)', [47.365760803222656, 8.652711868286133], 0.9465343030699442, 'Feb 04 2018 18:59:09,59', 20.856833333333334)
('Adliswil, Moos', [47.3254528526241, 8.53104872619107], 0.946405419238296, 'Feb 04 2018 19:00:00,00', 20.016666666666666)
('Dübendorf, Sport Heerenschürli', [47.4043813295105, 8.59605082015901], 0.894097587516201, 'Feb 04 2018 18:48:00,48', 32.00866666666

('Zürich', [47.3270847004784, 8.52961142173628], 0.946405419238296, 'Feb 04 2018 18:59:00,59', 21.006833333333333)
('Dübendorf, Breitibach', [47.3957227341549, 8.60587838936865], 0.9465343030699442, 'Feb 04 2018 18:42:00,42', 38.00966666666666)
('Bassersdorf, Bahnhof', [47.438716888427734, 8.62613582611084], 0.8938121736816863, 'Feb 04 2018 18:48:07,48', 31.892)
('Wallisellen, Gemeindehaus', [47.4163696750409, 8.59102025456724], 0.9465343030699442, 'Feb 04 2018 18:39:00,39', 41.01016666666666)
('Zürich, Mittelleimbach', [47.3231389520848, 8.51428616298707], 0.48636924488084676, 'Feb 04 2018 18:54:00,54', 26.007666666666665)
('Kloten, Hohrainli', [47.4596193089001, 8.58166879245825], 0.9967055561310949, 'Feb 04 2018 19:03:00,03', 17.016166666666663)
('Dübendorf, Grüzenstrasse', [47.39128875732422, 8.617157936096191], 0.9465343030699442, 'Feb 04 2018 18:45:27,45', 34.55916666666667)
('Kilchberg ZH, Kreuzstrasse', [47.324562072753906, 8.545394897460938], 1, 'Feb 04 2018 18:49:30,49', 30.5

In [210]:
plot_data = [data for data in plot_data if data is not None]

In [195]:
m = create_map_with_quality("Zurich HB", ZURICH_HB_COORDS, plot_data)

In [196]:
m

In [214]:
m = create_map_with_quality("Zurich HB", ZURICH_HB_COORDS, plot_data)

In [215]:
m