In [1]:
%matplotlib inline
import matplotlib.pylab as plt
plt.rcParams['figure.figsize'] = (10,6)
plt.rcParams['font.size'] = 18
plt.style.use('fivethirtyeight')

In [2]:
import getpass
import pyspark
from pyspark.sql import SparkSession

conf = pyspark.conf.SparkConf()
conf.setMaster('yarn')
conf.setAppName('final_proj-{0}'.format(getpass.getuser()))
conf.set('spark.executor.memory', '4g')
conf.set('spark.executor.instances', '10')
conf.set('spark.port.maxRetries', '100')
sc = pyspark.SparkContext.getOrCreate(conf)
conf = sc.getConf()
sc

In [3]:
spark = SparkSession(sc)

In [4]:
df = spark.read.csv('/datasets/project/istdaten/*/*/*', sep=';', header=True)

First, we rename the columns in English language:

In [5]:
columns = 'TripDate string, TripId string, OperatorId string, OperatorAbbrv string, OperatorName string, ProductId string, LineId string, LineType string, UmlaufId string, TransportType string, AdditionalTrip boolean, FailedTrip boolean, BPUIC string, StopName string, ArrivalTimeScheduled string, ArrivalTimeActual string, ArrivalTimeActualStatus string,     DepartureTimeScheduled string, DepartureTimeActual string, DepartureTimeActualStatus string, SkipStation boolean'
columns = list(map(lambda x: x.split()[0],columns.split(',')))

for old, new in zip(df.columns, columns):
    df = df.withColumnRenamed(old, new)

# Computing the quality of a transfer

## Assumptions: 

   * everytime when making a transfer in a station, the traveler needs one minute for actually changing transport.
   * even though a train departs late all the time in a specific station, the trip planner will never use the fact that it does so, so we will only take into consideration the early departures and the correct ones. 

## Main idea: 

The idea behind computing the quality of a specific transfer given the *expected arrival hour* in the station and the *expected departure hour* from that same station, and some *extra information* regarding the trip before the transfer and the one after the transfer:

   * First, we compute the **discrete distribution of arrival delays $\mathcal{D}_a$** in that station, given the information of the trip before the transfer.
   * Then, we compute the **discrete distribution of negative departure delays $\mathcal{D}_d$** in that station, given the information of the trip after the transfer.
   * Next, we compute the probability of successfully realizing the transfer, by computing a convolution between the two given distributions. Therefore, assuming that the time of transfer is $k$ minutes, then we would simply compute:
      
      $\sum\limits_{t_a }\Pr[\mathcal{D}_a = t_a] \cdot \Pr[\mathcal{D}_d = k-1+t_a]$,
      
       where we have taken into consideration the minute needed by the traveler for changing the transport. 
       
---
       
Therefore, we first need to decide what are the features which will decide the distributions of the delays. For that, we will use a **Decision Tree Regressor**, selecting several features which might be important from the data, and the target label will be the delay for each datapoint, expressed in seconds. Then, we will train the regressor on both departures and arrivals data, and will look into which are the most important features in each case, for making a good prediction of the delay time. 

We have to emphasize that we considered this method, because of the way that Decision Trees decide which are the most important feature, i.e. the one which have the most variance of delays between the different values for the specific feature. 

After constructing the Decision Tree and deciding which are the most important features, we will construct the distributions of the delays from the **actual data**, by grouping the datapoints with the same value for the decisive features, and making the distribution of delays for each group.

We decided to use the actual data instead of modelling the distribution of delays using a fixed distribution family (e.g. Log-normal or Gamma distributions), because we consider that the actual data is more relevant, then considering just an estimator or to assume that it follows a distribution in a family of distributions.

## Constructing the Decision Tree Regressor

The first step in constructing the Decision Tree Regressor is to construct some potential important features from the given data, and also to compute the delays for each datapoint:

In [6]:
from pyspark.sql.functions import unix_timestamp, to_timestamp

DATE_FORMAT_SCHEDULED = 'dd.MM.yyyy HH:mm' 
DATE_FORMAT_ACTUAL = 'dd.MM.yyyy HH:mm:ss' # both formats are used

df_processed = df.withColumn('ArrivalTimeScheduledDate', to_timestamp(df.ArrivalTimeScheduled, DATE_FORMAT_SCHEDULED))
df_processed = df_processed.withColumn('DepartureTimeScheduledDate', to_timestamp(df_processed.DepartureTimeScheduled, DATE_FORMAT_SCHEDULED))

df_processed = df_processed.withColumn('ArrivalTimeScheduled', unix_timestamp(df_processed.ArrivalTimeScheduled, DATE_FORMAT_SCHEDULED))
df_processed = df_processed.withColumn('ArrivalTimeActual', unix_timestamp(df_processed.ArrivalTimeActual, DATE_FORMAT_ACTUAL))
df_processed = df_processed.withColumn('DepartureTimeScheduled', unix_timestamp(df_processed.DepartureTimeScheduled, DATE_FORMAT_SCHEDULED))
df_processed = df_processed.withColumn('DepartureTimeActual', unix_timestamp(df_processed.DepartureTimeActual, DATE_FORMAT_ACTUAL))

Let's look into how the data looks so far:

In [7]:
df_processed.head()

Row(TripDate='13.09.2017', TripId='80:06____:17010:000', OperatorId='80:06____', OperatorAbbrv='DB', OperatorName='DB Regio AG', ProductId='Zug', LineId='17010', LineType='RE', UmlaufId=None, TransportType='RE', AdditionalTrip='false', FailedTrip='false', BPUIC='8500090', StopName='Basel Bad Bf', ArrivalTimeScheduled=None, ArrivalTimeActual=None, ArrivalTimeActualStatus='PROGNOSE', DepartureTimeScheduled=1505274300, DepartureTimeActual=1505274300, DepartureTimeActualStatus='PROGNOSE', SkipStation='false', ArrivalTimeScheduledDate=None, DepartureTimeScheduledDate=datetime.datetime(2017, 9, 13, 5, 45))

Next, we also add the hour of departure and of the arrival to the dataset:

In [8]:
from pyspark.sql.types import FloatType, StringType
from pyspark.sql.functions import hour, to_date, date_format, month

df_to_classify = df_processed.select(
    df_processed.LineId.alias('line_id'), 
    df_processed.ProductId.alias('product_id'), 
    df_processed.StopName.alias('stop_name'),
    df_processed.AdditionalTrip.alias('additional_trip'), 
    hour(df_processed.ArrivalTimeScheduledDate).alias("arrival_hour").astype(StringType()),
    hour(df_processed.DepartureTimeScheduledDate).alias("departure_hour").astype(StringType()),
    date_format(to_date(df_processed.TripDate, 'dd.MM.yyyy'), 'u').alias("day_of_week"),
    ((df_processed.ArrivalTimeActual - df_processed.ArrivalTimeScheduled)).alias("delta_arrival").astype(FloatType()),
    ((df_processed.DepartureTimeActual - df_processed.DepartureTimeScheduled)).alias("delta_departure").astype(FloatType()))

df_to_classify.cache()

DataFrame[line_id: string, product_id: string, stop_name: string, additional_trip: string, arrival_hour: string, departure_hour: string, day_of_week: string, delta_arrival: float, delta_departure: float]

In [9]:
df_to_classify.head(5)

[Row(line_id='17010', product_id='Zug', stop_name='Basel Bad Bf', additional_trip='false', arrival_hour=None, departure_hour='5', day_of_week='3', delta_arrival=None, delta_departure=0.0),
 Row(line_id='17012', product_id='Zug', stop_name='Basel Bad Bf', additional_trip='false', arrival_hour=None, departure_hour='6', day_of_week='3', delta_arrival=None, delta_departure=0.0),
 Row(line_id='17013', product_id='Zug', stop_name='Basel Bad Bf', additional_trip='false', arrival_hour='6', departure_hour=None, day_of_week='3', delta_arrival=180.0, delta_departure=None),
 Row(line_id='17014', product_id='Zug', stop_name='Basel Bad Bf', additional_trip='false', arrival_hour=None, departure_hour='9', day_of_week='3', delta_arrival=None, delta_departure=0.0),
 Row(line_id='17015', product_id='Zug', stop_name='Basel Bad Bf', additional_trip='false', arrival_hour='8', departure_hour=None, day_of_week='3', delta_arrival=300.0, delta_departure=None)]

Next, for using the Decision Tree Regressor, and because each feature is in fact categorial, we must each one of them using a *StringIndexer*:

In [10]:
from pyspark.ml.feature import StringIndexer

def transform_dataset(dataset, departure):
    '''
    Function that transforms a dataset, adding for each categorial feature a column, which represents the output of the 
    StringIndexer applied to that column. 
    
    Parameters:
        - dataset: the dataset to be processed
        - departure: True if the dataset is for departures, False otherwise
    '''
    
    line_id_indexer = StringIndexer(inputCol="line_id", outputCol="line_id_cat", handleInvalid='keep') # keep nulls 
    product_id_indexer = StringIndexer(inputCol="product_id", outputCol="product_id_cat", handleInvalid='skip')
    stop_name_indexer = StringIndexer(inputCol="stop_name", outputCol="stop_name_cat", handleInvalid='skip')
    additional_trip_indexer = StringIndexer(inputCol="additional_trip", outputCol="additional_trip_cat", handleInvalid='skip')
    day_of_week_indexer = StringIndexer(inputCol="day_of_week", outputCol="day_of_week_cat", handleInvalid='skip')
    departure_hour_indexer = StringIndexer(inputCol="departure_hour", outputCol="departure_hour_cat", handleInvalid='skip')
    arrival_hour_indexer = StringIndexer(inputCol="arrival_hour", outputCol="arrival_hour_cat", handleInvalid='skip')

    indexers = [line_id_indexer, product_id_indexer, stop_name_indexer, additional_trip_indexer,day_of_week_indexer]
    
    if departure:
        indexers.append(departure_hour_indexer)
    else:
        indexers.append(arrival_hour_indexer)

    indexed = dataset

    for indexer in indexers:
        indexed = indexer.fit(indexed).transform(indexed) # add columns to dataset
        
    return indexed

Next, we use the *VectorAssembler* to construct the column for features, which will be used by the Decision Tree:

In [11]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import VectorIndexer

def compute_features_column(dataset, is_departure):
    '''
    Function that computes the features column for the given dataset.
    
    Parameters:
        - dataset: the dataset to compute the features column for
        - is_departure: True is dataset is used for departures, False otherwise.
    '''
    input_cols = ['line_id_cat', 'product_id_cat', 'stop_name_cat', 'additional_trip_cat', 'day_of_week_cat']
    
    if is_departure:
        input_cols.append('departure_hour_cat') # departure dataset
    else:
        input_cols.append('arrival_hour_cat') # arrival dataset
        
    vector_assembler = VectorAssembler(inputCols = input_cols, outputCol = 'features')
    dataset = transform_dataset(dataset, is_departure) # add categorial features
    
    df_features = vector_assembler.transform(dataset) # add features column
    # Use VectorIndexer to make sure that the added features are recognized as categorical
    
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=100000000).fit(df_features)
    
    df_features = featureIndexer.transform(df_features) # transform features to categorical
    
    if is_departure:
        df_final = df_features.select(df_features.indexedFeatures, df_features.delta_departure.alias("delta"))
    else:
        df_final = df_features.select(df_features.indexedFeatures, df_features.delta_arrival.alias("delta"))
    
    return df_final

Finally, we construct our datasets to input to the Decision Tree:

In [12]:
# Construct departures dataset
df_departure_to_regress = df_to_classify.filter(
    df_to_classify.departure_hour.isNotNull() & # filter only departures
    df_to_classify.delta_departure.isNotNull())

df_departure = compute_features_column(df_departure_to_regress, is_departure=True)

# Construct arrivals dataset
df_arrival_to_regress = df_to_classify.filter(
    df_to_classify.arrival_hour.isNotNull() & # filter only arrivals
    df_to_classify.delta_arrival.isNotNull())

df_arrival = compute_features_column(df_arrival_to_regress, is_departure=False)

Let's check the generated dataframes:

In [13]:
df_departure.head(5)

[Row(indexedFeatures=DenseVector([14483.0, 2.0, 2333.0, 0.0, 2.0, 18.0]), delta=0.0),
 Row(indexedFeatures=DenseVector([10292.0, 2.0, 2333.0, 0.0, 2.0, 11.0]), delta=0.0),
 Row(indexedFeatures=DenseVector([10275.0, 2.0, 2333.0, 0.0, 2.0, 13.0]), delta=0.0),
 Row(indexedFeatures=DenseVector([13971.0, 2.0, 2333.0, 0.0, 2.0, 12.0]), delta=0.0),
 Row(indexedFeatures=DenseVector([14059.0, 2.0, 2333.0, 0.0, 2.0, 8.0]), delta=780.0)]

In [14]:
df_arrival.head(5)

[Row(indexedFeatures=DenseVector([10223.0, 2.0, 2006.0, 0.0, 2.0, 11.0]), delta=180.0),
 Row(indexedFeatures=DenseVector([9633.0, 2.0, 2006.0, 0.0, 2.0, 4.0]), delta=300.0),
 Row(indexedFeatures=DenseVector([12909.0, 2.0, 2006.0, 0.0, 2.0, 12.0]), delta=60.0),
 Row(indexedFeatures=DenseVector([12924.0, 2.0, 2006.0, 0.0, 2.0, 8.0]), delta=300.0),
 Row(indexedFeatures=DenseVector([10172.0, 2.0, 2006.0, 0.0, 2.0, 6.0]), delta=300.0)]

Next, we write the function for training the Decision Tree Regressor:

In [15]:
from pyspark.ml.regression import DecisionTreeRegressor

def train_regressor(dataset):
    dt = DecisionTreeRegressor(featuresCol ='indexedFeatures', labelCol = 'delta', maxBins=100000000, maxDepth=3)
    dt_model = dt.fit(dataset)
    
    return dt_model

Finally, we train the decision trees for both datasets and we extract the most important features:

In [16]:
# Get most important fetrain_regressorpartures dataset
regressor_departures = train_regressor(df_departure)
print("Feature importances departures: {}".format(regressor_departures.featureImportances))

# Get most important features for departures dataset
regressor_arrivals = train_regressor(df_arrival)
print("Feature importances arrivals: {}".format(regressor_arrivals.featureImportances))

Feature importances departures: (6,[0,2,5],[0.284510290083,0.0974031881251,0.618086521792])
Feature importances arrivals: (6,[0,2,3,5],[0.334593468,0.0606024087098,0.0354578885646,0.569346234726])


So, we can see that the 3 most important features are, in both cases, the *hour*, the *line_id* and the *stop_name*. We can see that everything makes very much sense, because we have big differences of delays between normal hours and rush hours, for example, and also specific stops and routes have usually more delays than the others.

Therefore, we continue by constructing the probability distributions for each possible value of the three most important features.

## Computing the probability distributions 

First, we only consider the three most important features in the two initial datasets. We will consider the unity of time to be the minute from now on, instead of seconds: 

In [17]:
from pyspark.sql.types import IntegerType

df_best_feat_departures = df_departure_to_regress.select(
                df_departure_to_regress.departure_hour,
                df_departure_to_regress.stop_name,
                df_departure_to_regress.line_id,
                (df_departure_to_regress.delta_departure / 60).astype(IntegerType()).alias("delta_minutes"))

df_best_feat_departures = df_best_feat_departures.filter(df_best_feat_departures.delta_minutes <= 0) 
# only keep departures which left on time or earlier, we do not want to base our recommendation on assumption
# that a train or bus leaves with a delay.

df_best_feat_arrival = df_arrival_to_regress.select(
                df_arrival_to_regress.arrival_hour,
                df_arrival_to_regress.stop_name,
                df_arrival_to_regress.line_id,
                (df_arrival_to_regress.delta_arrival / 60).astype(IntegerType()).alias("delta_minutes"))

In [18]:
df_best_feat_departures.head(5)

[Row(departure_hour='5', stop_name='Basel Bad Bf', line_id='17010', delta_minutes=0),
 Row(departure_hour='6', stop_name='Basel Bad Bf', line_id='17012', delta_minutes=0),
 Row(departure_hour='9', stop_name='Basel Bad Bf', line_id='17014', delta_minutes=0),
 Row(departure_hour='10', stop_name='Basel Bad Bf', line_id='17016', delta_minutes=0),
 Row(departure_hour='14', stop_name='Basel Bad Bf', line_id='17024', delta_minutes=0)]

Finally, we want to make the distribution of delays for each possible value of the features, for both departures and arrivals:

In [19]:
from pyspark.sql.functions import collect_list, struct, count, lit

df_departures_grouped_count = df_best_feat_departures.groupby( 
                df_best_feat_departures.departure_hour,
                df_best_feat_departures.stop_name,
                df_best_feat_departures.line_id,
                df_best_feat_departures.delta_minutes).agg(count(lit(1)).alias("count_min")) # add a count for each possible value
        
df_departures_distribution = df_departures_grouped_count.\
                                    groupby('departure_hour', 'stop_name', 'line_id').\
                                    agg(collect_list(struct('delta_minutes', 'count_min')).alias('counts'))

# for each value of (departure_hour, stop_name, line_id), we have a list of the form [(delay_minutes, count)]

In [20]:
df_departures_grouped_count.head(3)

[Row(departure_hour='19', stop_name='Basel Bad Bf', line_id='17385', delta_minutes=0, count_min=4),
 Row(departure_hour='19', stop_name='Ependes', line_id='12172', delta_minutes=0, count_min=118),
 Row(departure_hour='16', stop_name='Vevey', line_id='12257', delta_minutes=0, count_min=148)]

In [21]:
df_departures_distribution.show(2)

+--------------+-----------+-------+--------+
|departure_hour|  stop_name|line_id|  counts|
+--------------+-----------+-------+--------+
|             0|        Bex|   3591|[[0,50]]|
|             0|Biel/Bienne|   7845|[[0,65]]|
+--------------+-----------+-------+--------+
only showing top 2 rows



In [1]:
def compute_key_for_feature_values(hour, line_id, stop_name):
    return '{}#{}#{}'.format(hour, line_id, stop_name)

In [23]:
collected = df_departures_distribution.collect()

distribution_departures = {
    compute_key_for_feature_values(x.departure_hour, x.line_id, x.stop_name) : 
    list(sorted(x.counts, key=lambda y: y[0])) for x in collected}

We do the same now for the arrivals: 

In [24]:
df_arrivals_grouped_count = df_best_feat_arrival.groupby( 
                df_best_feat_arrival.arrival_hour,
                df_best_feat_arrival.stop_name,
                df_best_feat_arrival.line_id,
                df_best_feat_arrival.delta_minutes).agg(count(lit(1)).alias("count_min")) # add a count for each possible value
        
df_arrivals_distribution = df_arrivals_grouped_count.\
                                    groupby('arrival_hour', 'stop_name', 'line_id').\
                                    agg(collect_list(struct('delta_minutes', 'count_min')).alias('counts'))
        
collected = df_arrivals_distribution.collect()

distribution_arrivals = {
    compute_key_for_feature_values(x.arrival_hour, x.line_id, x.stop_name) : 
    list(sorted(x.counts, key=lambda y: y[0])) for x in collected}

We also want to include a default distribution, for the case we have new data, which was not encountered anymore. We will compute it as the distribution of all the data:

In [25]:
df_default_distrib_departures = df_best_feat_departures.groupby('delta_minutes').agg(count(lit(1)).alias("count_min")) # add a count for each possible value
collected_default = df_default_distrib_departures.collect()
default_departures = list(sorted(collected_default, key=lambda x: x[0]))

df_default_distrib_arrivals = df_best_feat_arrival.groupby('delta_minutes').agg(count(lit(1)).alias("count_min")) # add a count for each possible value
collected_default = df_default_distrib_arrivals.collect()
default_arrivals = list(sorted(collected_default, key=lambda x: x[0]))

Next, we add the default values to the dictionary of distributions:

In [26]:
distribution_departures['default'] = default_departures
distribution_arrivals['default'] = default_arrivals

Finally, we transform the counts to probabilities, to be able to compute the final quality faster:

In [27]:
def transform_to_proba(counts_list):
    total_sum = 0
    final_proba = []
    
    for row in counts_list:
        total_sum += row.count_min
        
    for row in counts_list:
        final_proba.append((row.delta_minutes, row.count_min / total_sum))
        
    return final_proba

In [28]:
distribution_departures = {k : transform_to_proba(v) for k, v in distribution_departures.items()}
distribution_arrivals = {k : transform_to_proba(v) for k, v in distribution_arrivals.items()}

We finally write the computed dictionaries to file, to be able to load them later:

In [29]:
import pickle
import os

FILE_DISTRIBUTION_DEPARTURES = 'distrib_departures.pic'
FILE_DISTRIBUTION_ARRIVALS = 'distrib_arrivals.pic'

pickle.dump(distribution_departures, open(FILE_DISTRIBUTION_DEPARTURES, 'wb'))
pickle.dump(distribution_arrivals, open(FILE_DISTRIBUTION_ARRIVALS, 'wb'))

## The exposed API for computing distributions

Finally, the last part is to write a function which receives the features of a specific transfer, and it returns the quality of the transfer, by performing the convolution of the corresponding distributions, using the formula:

$\sum\limits_{t_a }\Pr[\mathcal{D}_a = t_a] \cdot \Pr[\mathcal{D}_d = k-1+t_a]$,
      
where we have taken into consideration the minute needed by the traveler for changing the transport. 
       
Here, we considered $\mathcal{D}_a$ to be the distribution of arrivals and $\mathcal{D}_d$ the distribution of departures.


In [2]:
from datetime import datetime
import time
DATE_FORMAT = '%b %d %Y %H:%M:%S'
    
def compute_quality(arrival_timestamp, departure_timestamp, stop_name, line_id_arr, line_id_dep):
    # Dec 31 2017 20:40:49,01
    global distribution_departures, distribution_arrivals
    
    arrival_time = datetime.strptime(arrival_timestamp[:-3], DATE_FORMAT)
    departure_time = datetime.strptime(departure_timestamp[:-3], DATE_FORMAT)
    
    arrival_hour = arrival_time.hour
    departure_hour = departure_time.hour
    delta_minutes = int((time.mktime(departure_time.timetuple()) - time.mktime(arrival_time.timetuple())) / 60)
    
    if delta_minutes < 0:
        return 0 # impossible to complete the transfer
    
    departure_key = compute_key_for_feature_values(departure_hour, line_id_dep, stop_name)
    if departure_key in distribution_departures:
        departure_dist = distribution_departures[departure_key]
    else: 
        departure_dist = distribution_departures['default'] # default distribution
       
    arrival_key = compute_key_for_feature_values(arrival_hour, line_id_arr, stop_name)
    if arrival_key in distribution_arrivals:
        arrival_dist = distribution_arrivals[arrival_key]
    else: 
        arrival_dist = distribution_arrivals['default'] # default distribution
        
    total_proba = 0
    
    for dep_delay, dep_proba in departure_dist:
        for arr_delay, arr_proba in arrival_dist:
            
            delta_minutes = ((departure_time - arrival_time).seconds // 60) % 60
            if delta_minutes >= dep_delay + arr_delay + 1:
                total_proba += (dep_proba * arr_proba)
            else:
                break
            
    return total_proba

Testing the code:

In [3]:
import pickle
import os
FILE_DISTRIBUTION_DEPARTURES = 'distrib_departures.pic'
FILE_DISTRIBUTION_ARRIVALS = 'distrib_arrivals.pic'
distribution_departures = pickle.load(open(FILE_DISTRIBUTION_DEPARTURES, 'rb'))
distribution_arrivals = pickle.load(open(FILE_DISTRIBUTION_ARRIVALS, 'rb'))

In [4]:
print(compute_quality('Dec 31 2017 00:40:49,01', 'Dec 31 2017 00:41:58,01', 'Dietikon, Birmensdorferstrasse', '85:849:303','85:849:303'))

0.8011350894443582


In [5]:
from pyspark.sql.functions import udf
import pyspark.sql.functions as fct

from pyspark.sql import Row
from geopy.distance import distance as geo_dist

import pandas as pd 

from ipywidgets import interact, interactive, fixed, interact_manual, widgets
import numpy as np
from pyspark.sql.types import *
from pyspark.sql.functions import when, col

import requests
import time

In [6]:
import getpass
import pyspark
from pyspark.sql import SparkSession

conf = pyspark.conf.SparkConf()
conf.setMaster('yarn')
conf.setAppName('final_proj-{0}'.format(getpass.getuser()))
conf.set('spark.executor.memory', '4g')
conf.set('spark.executor.instances', '10')
conf.set('spark.port.maxRetries', '100')
sc = pyspark.SparkContext.getOrCreate(conf)
conf = sc.getConf()
sc


In [7]:
spark = SparkSession(sc)

Load Data 

In [8]:
df = spark.read.csv('/datasets/project/istdaten/*/*/*', sep=';', header=True)

Rename columns: 

In [9]:
columns = 'TripDate string, TripId string, OperatorId string, OperatorAbbrv string, OperatorName string, ProductId string, LineId string, LineType string, UmlaufId string, TransportType string, AdditionalTrip boolean, FailedTrip boolean, BPUIC string, StopName string, ArrivalTimeScheduled string, ArrivalTimeActual string, ArrivalTimeActualStatus string,     DepartureTimeScheduled string, DepartureTimeActual string, DepartureTimeActualStatus string, SkipStation boolean'
columns = list(map(lambda x: x.split()[0],columns.split(',')))

for old, new in zip(df.columns, columns):
    #print(old, new)
    df = df.withColumnRenamed(old, new)

In [10]:
df.printSchema()

root
 |-- TripDate: string (nullable = true)
 |-- TripId: string (nullable = true)
 |-- OperatorId: string (nullable = true)
 |-- OperatorAbbrv: string (nullable = true)
 |-- OperatorName: string (nullable = true)
 |-- ProductId: string (nullable = true)
 |-- LineId: string (nullable = true)
 |-- LineType: string (nullable = true)
 |-- UmlaufId: string (nullable = true)
 |-- TransportType: string (nullable = true)
 |-- AdditionalTrip: string (nullable = true)
 |-- FailedTrip: string (nullable = true)
 |-- BPUIC: string (nullable = true)
 |-- StopName: string (nullable = true)
 |-- ArrivalTimeScheduled: string (nullable = true)
 |-- ArrivalTimeActual: string (nullable = true)
 |-- ArrivalTimeActualStatus: string (nullable = true)
 |-- DepartureTimeScheduled: string (nullable = true)
 |-- DepartureTimeActual: string (nullable = true)
 |-- DepartureTimeActualStatus: string (nullable = true)
 |-- SkipStation: string (nullable = true)



In [11]:
df_tmp = df.select(fct.split(df.TripId, ':')[2].alias('Line_ID'),
              fct.split(df.TripId, ':')[3].alias('Line_ID_spec'),
              'LineType', 'ProductId','LineType', 'TripDate', 
                   'ArrivalTimeScheduled','DepartureTimeScheduled', 'StopName')

### Metadata

We first start by read metadata in order to select stop station within 10 km from Zürich 

In [12]:
df_meta = spark.read.csv('/datasets/project/metadata')

In [13]:
#Here we can see that we have some duplicated stop name 
#df_meta.filter(df_meta._c0.contains('Zürich')).head(30)

In [14]:
df_meta.printSchema()

root
 |-- _c0: string (nullable = true)



In [15]:
test = df_meta.filter(df_meta['_c0'].rlike("Lausanne")).collect()
len(test)

211

In [16]:
df_meta = df_meta.select(fct.split(df_meta['_c0'], '  ')[1].alias('Long'), 
                         fct.split(fct.split(df_meta['_c0'], '  ')[2], ' ')[0].alias('Lat'), 
                         fct.split(df_meta['_c0'], '% ')[1].alias('StopName_Meta') )

In [17]:
df_meta.show()

+---------+---------+-------------------+
|     Long|      Lat|      StopName_Meta|
+---------+---------+-------------------+
|26.074412|44.446770|          Bucuresti|
| 1.811446|50.901549|             Calais|
| 1.075329|51.284212|         Canterbury|
|-3.543547|50.729172|             Exeter|
| 9.733756|46.922368|            Fideris|
| 8.571251|50.051219|Frankfurt Flughafen|
|18.643803|54.355520|             Gdansk|
| 7.389462|47.191804|           Grenchen|
|29.019602|40.996348|           Istanbul|
| 9.873959|48.577852|  Amstetten (Württ)|
| 4.786044|43.921937|            Avignon|
| 2.140369|41.378914|          Barcelona|
| 7.589551|47.547405|              Basel|
| 7.395229|46.937482|       Bern Bümpliz|
|-1.899480|52.483627|         Birmingham|
| 6.838953|46.949588|          Boudry TN|
|17.106466|48.158910|         Bratislava|
| 4.335694|50.835376|          Bruxelles|
|-2.979650|53.404289|          Liverpool|
| 8.500049|47.114619|         Lothenbach|
+---------+---------+-------------

In [18]:
print(len(df_meta.filter(df_meta['StopName_Meta'].rlike("Lausanne")).collect()))

211


In [19]:
#Again we can see that we have many occurance of Zurich with different coordinate
#df_meta.filter(df_meta.StopName_Meta == 'Zürich').show()

In [20]:
df_meta = df_meta.withColumn("Long", df_meta["Long"].cast(FloatType()))
df_meta = df_meta.withColumn("Lat", df_meta["Lat"].cast(FloatType()))

In [21]:
df_meta.printSchema()

root
 |-- Long: float (nullable = true)
 |-- Lat: float (nullable = true)
 |-- StopName_Meta: string (nullable = true)



We can see that there is many duplicate name with different coordinate. 
For example we find many time Lausanne, after investigatin we understand that all the subway station where simply Lausanne. We decide to fill that problem using another dataset in order to merge them. 

We decide to merge the two dataset using coordinate, in order to do this we round coordinate to match them. A round at 3 decimal change the precission by max 135m. For example Google Maps use 6 decimal

First we only keep point in/near switzerland we decide to do this by draw a square arount the country and keep point inside.  Here we find the extreme points of switzerland: 
https://fr.wikipedia.org/wiki/Liste_de_points_extr%C3%AAmes_de_la_Suisse

In [22]:
df_meta.count()

25935

In [23]:
df_meta = df_meta.filter(df_meta.Lat.between(45.490404, 47.485074))
df_meta = df_meta.filter(df_meta.Long.between(5.572263, 10.2931))                       

In [24]:
df_meta.count()

22723

Then we see the minimum precision we have in our dataset in order to round all coordinate to this precision

In [25]:
slen = udf(lambda s: len(str(s).split('.')[1]), IntegerType())

In [26]:
df_meta = df_meta.withColumn("lat_len", slen(df_meta.Lat))
df_meta = df_meta.withColumn("lon_len", slen(df_meta.Long))
#df_meta = df_meta.withColumn("precision", min(df_meta.lat_len, df_meta.lon_len))
print(df_meta.agg({"lat_len": "min"}).collect())
print(df_meta.agg({"lon_len": "min"}).collect())

[Row(min(lat_len)=6)]
[Row(min(lon_len)=6)]


So we have a precision of 6 digit which is sufficient for our work. See why df_meta.show(5) not always display the same number of digit 

In [27]:
df_meta.show(5)

+--------+---------+-------------+-------+-------+
|    Long|      Lat|StopName_Meta|lat_len|lon_len|
+--------+---------+-------------+-------+-------+
|9.733756|46.922367|      Fideris|     15|     15|
|7.389462|47.191803|     Grenchen|     15|     15|
|7.395229| 46.93748| Bern Bümpliz|     14|     15|
|6.838953| 46.94959|    Boudry TN|     15|     15|
|8.500049| 47.11462|   Lothenbach|     15|     15|
+--------+---------+-------------+-------+-------+
only showing top 5 rows



In [28]:
df_meta = df_meta.select('Long', 'Lat', 'StopName_Meta')

In [29]:
round_6 = udf(lambda s: round(s, 6), DoubleType())

In [30]:
df_meta = df_meta.withColumn("Round_Long", round_6(df_meta.Long))
df_meta = df_meta.withColumn("Round_Lat", round_6(df_meta.Lat))

In [31]:
df_meta.show(5)

+--------+---------+-------------+----------+---------+
|    Long|      Lat|StopName_Meta|Round_Long|Round_Lat|
+--------+---------+-------------+----------+---------+
|9.733756|46.922367|      Fideris|  9.733756|46.922367|
|7.389462|47.191803|     Grenchen|  7.389462|47.191803|
|7.395229| 46.93748| Bern Bümpliz|  7.395229|46.937481|
|6.838953| 46.94959|    Boudry TN|  6.838953|46.949589|
|8.500049| 47.11462|   Lothenbach|  8.500049| 47.11462|
+--------+---------+-------------+----------+---------+
only showing top 5 rows



In [32]:
print(df_meta.distinct().count())
print(df_meta.select('Round_Lat', 'Round_Long').distinct().count())

22696
22671


### Use another dataset to fil missing names

In [33]:
with open('stops.txt', 'r') as file: 
    one_splitted = file.readline().strip().split(",")
    file_lines = [line.strip().split('"') for line in file.readlines()]
    
stop_names = [x[3] for x in file_lines]
Lat = [float(x[5]) for x in file_lines]
Long = [float(x[7]) for x in file_lines]

df_stop = pd.DataFrame({
        "StopName": stop_names, 
        "Lat_stop": Lat, 
        "Long_stop": Long,   
    })
df_stop.head()

Unnamed: 0,Lat_stop,Long_stop,StopName
0,45.989901,8.345062,"Anzola, chiesa"
1,46.167251,8.345807,Altoggio
2,46.060122,8.11362,Antronapiana
3,45.98987,8.345717,Anzola
4,46.261498,8.319253,Baceno


In [34]:
mySchema = StructType([ StructField("Lat_stop", DoubleType(), True)\
                        ,StructField("Long_stop", DoubleType(), True)\
                        ,StructField("StopName", StringType(), True) ])
df_stop = spark.createDataFrame(df_stop, mySchema)
df_stop.show()

+----------------+----------------+--------------------+
|        Lat_stop|       Long_stop|            StopName|
+----------------+----------------+--------------------+
|45.9899010293845|8.34506152974108|      Anzola, chiesa|
|46.1672513851495|  8.345807131427|            Altoggio|
| 46.060121674738|8.11361957990831|        Antronapiana|
|45.9898698225697|8.34571729989858|              Anzola|
|46.2614983591677|8.31925293162473|              Baceno|
|46.0790618438814|8.29927439970313|Beura Cardezza, c...|
|46.1222963432243|8.21077237789936|Bognanco, T. Vill...|
|46.0656504576122|8.26113193273411|           Boschetto|
|46.2978807772998| 8.3626325767009|            Cadarese|
|46.1340194356792|8.28619492916453|               Caddo|
|46.0916476333918|8.28041876188684|              Calice|
|45.9695691829797|8.04585965801774|            Campioli|
|46.4091810825782| 8.4117524564434|    Cascate del Toce|
|46.0205875326422| 8.2148866619012|         Castiglione|
|45.9710364221151|8.06992552448

In [35]:
df_stop = df_stop.withColumn("lat_len", slen(df_stop.Lat_stop))
df_stop = df_stop.withColumn("lon_len", slen(df_stop.Long_stop))
#df_meta = df_meta.withColumn("precision", min(df_meta.lat_len, df_meta.lon_len))
print(df_stop.agg({"lat_len": "min"}).collect())
print(df_stop.agg({"lon_len": "min"}).collect())

df_stop.orderBy('lon_len').show(2)

[Row(min(lat_len)=1)]
[Row(min(lon_len)=1)]
+----------------+------------+--------------------+-------+-------+
|        Lat_stop|   Long_stop|            StopName|lat_len|lon_len|
+----------------+------------+--------------------+-------+-------+
|             0.0|         0.0|     Isola Superiore|      1|      1|
|47.3611471419894|7.3110197892|Develier, St-Chri...|     13|     10|
+----------------+------------+--------------------+-------+-------+
only showing top 2 rows



In [36]:
print(df_stop.filter(df_stop['StopName'].rlike("Isola Superiore")).collect())

[Row(Lat_stop=0.0, Long_stop=0.0, StopName='Isola Superiore', lat_len=1, lon_len=1)]


Here we can see that this can from an error in the dataset, we use google maps to find the good coordinate of Isola Superiore which is: Isola Superiore: 45.901230 - 8.520450

In [37]:
df_stop = df_stop.withColumn("Lat_stop", \
              when(df_stop["StopName"] == 'Isola Superiore', 45.901230).otherwise(df_stop["Lat_stop"]))
df_stop = df_stop.withColumn("Long_stop", \
              when(df_stop["StopName"] == 'Isola Superiore', 8.520450).otherwise(df_stop["Long_stop"]))

Now we again round all the coordinate by 6 in order to merge both of the dataframe. 

In [38]:
df_stop = df_stop.withColumn("Round_Long", round_6(df_stop.Long_stop))
df_stop = df_stop.withColumn("Round_Lat", round_6(df_stop.Lat_stop))

### Merge Dataframe

In [39]:
Df_meta = df_meta.join(df_stop, on = ['Round_Lat', 'Round_Long'], how='outer') 

In [40]:
print(df_meta.filter(df_meta['StopName_Meta'].like("Lausanne")).count())
print(Df_meta.filter(Df_meta['StopName_Meta'].like("Lausanne") & Df_meta['StopName'].isNull()).count())

188
185


We can see that for the example of Lausanne we just recover 2 name over about a hundred. 
After investigation we find the coordidate for particular station in both dataset: 
<br/>
<br/>Lausanne Malley: 46.524212 - 6.603306 -- 46.524211 - 6.603309
<br/>Lausanne Bourdonette: 46.523466 - 6.589805 -- 46.523465 - 6.589807
<br/>Lausanne Provence: 46.523384 - 6.608102 -- 46.523382 - 6.608106

We can see that each time our merge fail for 1 digit

We try again with a round at 5 digits whith is still a very good precision

In [41]:
round_5 = udf(lambda s: round(s, 5), DoubleType())

In [42]:
df_meta = df_meta.withColumn("Round_Long", round_5(df_meta.Long))
df_meta = df_meta.withColumn("Round_Lat", round_5(df_meta.Lat))

In [43]:
df_stop = df_stop.withColumn("Round_Long", round_5(df_stop.Long_stop))
df_stop = df_stop.withColumn("Round_Lat", round_5(df_stop.Lat_stop))

In [44]:
Df_meta = df_meta.join(df_stop, on = ['Round_Lat', 'Round_Long'], how='outer') 

In [45]:
print(df_meta.filter(df_meta['StopName_Meta'].like("Lausanne")).count())
print(Df_meta.filter(Df_meta['StopName_Meta'].like("Lausanne") & Df_meta['StopName'].isNull()).count())

188
65


We now achieved a satisfactory result

In [46]:
Df_meta.printSchema()

root
 |-- Round_Lat: double (nullable = true)
 |-- Round_Long: double (nullable = true)
 |-- Long: float (nullable = true)
 |-- Lat: float (nullable = true)
 |-- StopName_Meta: string (nullable = true)
 |-- Lat_stop: double (nullable = true)
 |-- Long_stop: double (nullable = true)
 |-- StopName: string (nullable = true)
 |-- lat_len: integer (nullable = true)
 |-- lon_len: integer (nullable = true)



In [47]:
Df_meta = Df_meta.select('Long', 'Lat', 'StopName_Meta', 'StopName')
Df_meta.show()

+--------+---------+----------------+--------------------+
|    Long|      Lat|   StopName_Meta|            StopName|
+--------+---------+----------------+--------------------+
|    null|     null|            null|Macugnaga, Pestarena|
|    null|     null|            null| Lugano, Via Ginevra|
|    null|     null|            null|      Gandria, Paese|
|    null|     null|            null|               Gozzi|
|8.943882|46.034714|        Cureglia|   Cureglia, Rotonda|
|    null|     null|            null|        Bogno, Paese|
|6.090986| 46.15237|           Perly|                null|
|6.044045|46.161507|        Laconnex|Laconnex, Chemin ...|
|8.912559|46.179436|         Agarone|                null|
|8.699336| 46.18245|        Cresmino|      Cresmino, Case|
|6.246757|46.183704|       Annemasse|Annemasse, Généra...|
|7.393176| 46.19771|Les Mayens-de-S.|Les Mayens-de-S.,...|
|6.167676| 46.20001|          Genève|  Genève, Amandolier|
|6.157857|46.203766|          Genève|                nul

In [48]:
Df_meta = Df_meta.na.drop(subset=["Long", 'Lat'])

In [49]:
Df_meta = Df_meta.withColumn("StopName_Meta", \
              when(Df_meta["StopName"].isNotNull(), Df_meta["StopName"]).otherwise(Df_meta["StopName_Meta"]))

In [50]:
Df_meta = Df_meta.select('StopName_Meta', 'Lat', 'Long')

### Request Part

In [51]:
def get_lat_long(name): 
    tmp = Df_meta.select('Lat', 'Long').filter(Df_meta['StopName_Meta'].like(name)).collect()
    if(len(tmp) == 0): 
        assert "Probleme with the location {}".format(name)
    tmp = tmp[0]
    lat = str(tmp).split('=')[1].split(',')[0]
    long = str(tmp).split('=')[2].split(')')[0]
    return lat, long

In [113]:
def return_request(fromPlace, toPlace, departure, Months, Days, Hours, AM_PM, Minutes, Seconds, lat_long_from = False, lat_long_to = False):

    
    if (fromPlace.split(' ')[0] == 'stop'):
        fromPlace = fromPlace[5:-1]
    if lat_long_from == False:
        lat_from, long_from = get_lat_long(fromPlace)
    else:
        lat_from, long_from = lat_long_from[0], lat_long_from[1]
    
    
    
    if (toPlace.split(' ')[0] == 'stop'):
        toPlace = toPlace[5:-1]
    #print('TOOOOO PLACE ################## {} {}'.format(toPlace, len(toPlace)))    
    if lat_long_to == False:
        lat_to, long_to = get_lat_long(toPlace)
    else:
        lat_to, long_to = lat_long_to[0], lat_long_to[1]
    
    url = 'http://10.90.38.21:8829/otp/routers/default/plan?fromPlace=stop+'
    url += '+'.join(fromPlace.split()) +  '+%3A%3A' + str(lat_from) + '%2C' + str(long_from)
    url += '&toPlace=stop+' +  '+'.join(toPlace.split()) +  '+%3A%3A' + str(lat_to) + '%2C' + str(long_to)
    url += '&time={}%3A{}{}&date={}-{}-2018&mode=TRANSIT%2CWALK&maxWalkDistance=804.672&arriveBy={}&wheelchair=false&locale=en&numItineraries=3'.format(Hours, Minutes, AM_PM, Months, Days, not(departure))
    #url += '&time={}&date={}-{}-2018&mode=TRANSIT%2CWALK&maxWalkDistance=804.672&arriveBy=false&wheelchair=false&locale=en&numItineraries=3&departure=true'.format(Datetime, Months, Days)
    #print(url)
    r = requests.get(url)
    #print(r)
    #print(r.json())
    #read_json(r.json())
    return r.json()

### Create itineraries from JSON

In [53]:
def read_json_extract_itineraries(json_data, df_BT):
    info_list = []
    for route in json_data['plan']['itineraries']:
        #Here we show the 3 different path
       # print('-------- Route---------\n')
        info_list_route = []
        for step in route['legs']: 
            #Here we show all the step of the route   
           # print('---Step---\n')
            #print(step)
            mode = step['mode']
            from_ = step['from']['name']
            lat_from = step['from']['lat']
            lon_from = step['from']['lon']
            to_ = step['to']['name']
            lat_to = step['to']['lat']
            lon_to = step['to']['lon']
            
            start_time = str(step['from']['departure'])
            departure_time = time.strftime("%b %d %Y %H:%M:%S,%M", time.localtime(float(start_time[:len(start_time)-3])))
            end_time = str(step['endTime'])
            arrival_time = time.strftime("%b %d %Y %H:%M:%S,%M", time.localtime(float(end_time[:len(end_time)-3])))
            duration = str(step['duration'])
            
            route_id = 0
            trip_id = 0
            agency_name = 'unknown'
            if ('routeShortName' in step.keys()):
                route_id = step['routeShortName']
            if('tripShortName' in step.keys()):
                trip_id = step['tripShortName']
            if('agencyName' in step.keys()):
                agency_name = step['agencyName']
            line_id = trip_id
            if mode != 'RAIL' and mode!='WALK':
                if mode == 'BUS' or mode == 'Bus':
                    tmp = df_BT.where((col('ProductId') == 'Bus') | (col('ProductId') == 'BUS')).where(col('OperatorName') == agency_name).where(col('LineType')==route_id).head(1)
                elif mode =='TRAM' or mode == 'Tram':
                    tmp = df_BT.where((col('ProductId') == 'Tram')).where(col('OperatorName') == agency_name).where(col('LineType')==route_id).head(1)
                else: 
                    tmp = df_BT.where((col('ProductId') == mode)).where(col('OperatorName') == agency_name).where(col('LineType')==route_id).head(1)
                #print(agency_name)
                #print(mode)
                #print((df_BT.where(col('OperatorName') == agency_name).where((col('ProductId') == 'BUS') | (col('ProductId') == 'BUS')).head(2)))
                if len(tmp) == 0:
                    line_id = 'unknown'
                else:
                    line_id = tmp[0].asDict()['LineId']
                #print(final_id)
            
           # print('The product id is {}:'.format(mode))
           # print('Trip from {} at {} to {} at {} with {}'.format(from_,departure_time,to_,arrival_time, mode))
           # if('tripShortName' in step.keys()):
           #     print('The trip ID is {}'.format(trip_id))
           # if('routeShortName' in step.keys()):
           #     print('The route ID is {}'.format(route_id))
        
            info_list_route.append({'product_id': mode, 'from': from_, 'lat_long_from': [lat_from, lon_from] ,'departure_time':departure_time,'to':to_, 'lat_long_to': [lat_to, lon_to], 'arrival_time':arrival_time, 'line_id': line_id})
        info_list.append(info_list_route)
    return info_list 

### Select itineraries respecting the quality

In [54]:
def comp_itinerary_quality(itinerary):
    itinerary_quality_ = 1
    for j_ in range(len(itinerary)-1):
        leg_1 = itinerary[j_]
        leg_2 = itinerary[j_+1]    
        
        transfer_quality = compute_quality(leg_1['arrival_time'], leg_2['departure_time'], leg_1['to'], leg_1['line_id'], leg_2['line_id'])
        #print(transfer_quality)
        itinerary_quality_ = itinerary_quality_ * transfer_quality
    return itinerary_quality_

In [55]:
def split_with_quality(itinerary_list, Quality):
    itinerary_quality_ = [0,]*len(itinerary_list)
    #print('Quality of itineraries:  ')
    for i_ in range(len(itinerary_list)):
        itinerary_quality_[i_] = comp_itinerary_quality(itinerary_list[i_])
        #print('Itinerary number {}, quality: {}'.format(i_,itinerary_quality_[i_]))
    itinerary_list_accepted = np.array(itinerary_list)[[it_>Quality for it_ in itinerary_quality_]].tolist()
    itinerary_list_refused = np.array(itinerary_list)[[not(it_>Quality) for it_ in itinerary_quality_]].tolist()
    return itinerary_list_accepted, itinerary_list_refused

### Explore itineraries "around" a too-low-quality itinerary TODO: make it an actual tree

In [56]:
def date_to_cells(date):
    Month_dict= {'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}
    Months = Month_dict[date.split(' ')[0]]
    Days = date.split(' ')[1]
    Hours = str(int(date.split(' ')[3].split(':')[0])%12)
    Minutes = date.split(' ')[3].split(':')[1]
    AM_PM = 'AM'
    if int(int(date.split(' ')[3].split(':')[0])/12) == 1:
        AM_PM = 'PM'
    Seconds = date.split(' ')[3].split(':')[2].split(',')[0]
    
    return Months, Days, Hours, Minutes, AM_PM, Seconds

In [94]:
def explore_itineraries(itinerary, df_BT, quality):
    itinerary_list = []
    for j_ in range(len(itinerary)-1):
        #leg_1 = itinerary[j_]
        if comp_itinerary_quality(itinerary[0:j_+1]) < quality:
            continue
        arr_month, arr_day, arr_hour, arr_minute, arr_AM_PM, arr_second = date_to_cells(itinerary[j_]['arrival_time'])
        #new_semi_its = request_with_quality(fromPlace = leg_1['to'], toPlace = itinerary[-1]['to'], Months = arr_month, Days = arr_day, Hours = arr_hour, Minutes = arr_minute, Seconds = arr_second, AM_PM = arr_AM_PM, departure = True, Quality = quality, lat_long_from = leg_1['lat_long_from'], lat_long_to = itinerary[-1]['lat_long_to'] )
        temp_json = return_request(fromPlace = itinerary[j_]['to'], toPlace = itinerary[-1]['to'], Months = arr_month, Days = arr_day, Hours = arr_hour, Minutes = arr_minute, Seconds = arr_second, AM_PM = arr_AM_PM, departure = True, lat_long_from = itinerary[j_]['lat_long_to'], lat_long_to = itinerary[-1]['lat_long_to'])
        #print(temp_json)
        new_partial_its = read_json_extract_itineraries(temp_json, df_BT)
        new_itineraries = [np.append(itinerary[:j_+1],new_partial_its[k_]).tolist() for k_ in range(len(new_partial_its))]
        
        itinerary_list.extend(new_itineraries)
    return itinerary_list

In [62]:
#Get the json of quickest itineraries from local OTP server
test_json = return_request(fromPlace=fromPlace_ , toPlace= toPlace_ ,Months = Months_, Days= Days_, Hours= Hours_, AM_PM = AM_PM_, Minutes = Minutes_, Seconds = Seconds_, departure = departure_)
#Create Dataframe to find LineId from ProductId, LineType and OperatorName. Relevant for Bus and Tram
df_BT = df.where(col('ProductId') != 'Zug').select('ProductId','LineType','OperatorName','LineId').distinct().cache()
#Read json and create itinerary list of dicts
itinerary_test_list = read_json_extract_itineraries(test_json, df_BT)

### Get news from SBB

In [None]:
def display_info(date, stopName):
    url = 'https://data.sbb.ch/api/records/1.0/search/?dataset=rail-traffic-information&lang=en&rows=1000&sort=validityend&facet=validitybegin&facet=validityend&refine.validitybegin={}'.format(date[0])
    tmp = requests.get(url).json()
    infos = []
    for el in tmp['records']: 
        end = str(el['fields']['validityend'].split('T')[0]).split('-')
        if((int(end[0]) == int(date[0]) and int(end[1]) == int(date[1]) and int(end[2]) < int(date[2])) or (int(end[0]) == int(date[0]) and int(end[1]) < int(date[1])) or (int(end[0]) < int(date[0]))):
            break
        #print(end)
        title = el['fields']['title']
        if('End of announcement:' in title): 
            pass
        else:
            if(len(title.split(':')) > 1):
                title = str(title.split(':')[1])
            title = title.replace(' and', '-').replace('engineering work is in progress', '').replace(',','').replace('.', '').replace('Between', '').replace('In', '').replace(' station', '').replace('Work due to a disruption','').strip()
            #print(title.split('- '))
            for el_title in title.split('- '): 
                for el_stop in stopName: 
                    if(el_title.strip() == el_stop.strip()): 
                        print(el_title)
                        infos.append(el['fields']['description'])
    return infos

### Find best itineraries

In [None]:
df_BT = df.where(col('ProductId') != 'Zug').select('ProductId','LineType','OperatorName','LineId').distinct().cache()

In [158]:
quality = 0.90
fromPlace_ = "Zürich, Zürichbergstrasse"
toPlace_ = 'Zürich Enge, Bahnhof'
Months_ = 2
Days_ = 4
Hours_ = 6
AM_PM_ = 'PM'
Minutes_ = 20
Seconds_ = 1
departure_ = True


#Get the json of quickest itineraries from local OTP server
test_json = return_request(fromPlace=fromPlace_ , toPlace= toPlace_ ,Months = Months_, Days= Days_, Hours= Hours_, AM_PM = AM_PM_, Minutes = Minutes_, Seconds = Seconds_, departure = departure_)
#Create Dataframe to find LineId from ProductId, LineType and OperatorName. Relevant for Bus and Tram
#
#Read json and create itinerary list of dicts
itinerary_first_list = read_json_extract_itineraries(test_json, df_BT)
itinerary_acc, itinerary_refu = split_with_quality(itinerary_first_list, Quality = quality)
itinerary_searched = []
iter_=0

## sort bad quality itineraries by arrival time
sorter_ids = np.argsort([itinerary_refu[i_][-1]['arrival_time'] for i_ in range(len(itinerary_refu))])
itinerary_refu = np.array(itinerary_refu)[sorter_ids].tolist()
while len(itinerary_refu) != 0:
    if len(itinerary_acc)>=3:
        break
    iter_+=1
    print('itineraries searched: {}'.format(iter_))
    itinerary_searched_ = itinerary_refu.pop(0)
    itinerary_searched.append(itinerary_searched_)
    print(itinerary_searched_[0]['departure_time'], itinerary_searched_[-1]['arrival_time'])
    #print(itinerary_searched_)
    itinerary_test_list_explored = explore_itineraries(itinerary_searched_, df_BT, quality)
    itinerary_acc_explored, itinerary_refu_explored = split_with_quality(itinerary_test_list_explored, quality)
    for iti_refu in itinerary_refu_explored:
        if not(any([(iti_refu == iti) for iti in itinerary_refu+itinerary_searched])):
            itinerary_refu.append(iti_refu)
    for iti_acc in itinerary_acc_explored:
        if not(any([(iti_acc == iti) for iti in itinerary_acc])):
            itinerary_acc.append(iti_acc)
    
    ## sort by arrival time
    sorter_ids = np.argsort([itinerary_refu[i_][-1]['arrival_time'] for i_ in range(len(itinerary_refu))])
    itinerary_refu = np.array(itinerary_refu)[sorter_ids].tolist()
    

## sort selected itineraries by arrival time
sorter_ids = np.argsort([itinerary_acc[i_][-1]['arrival_time'] for i_ in range(len(itinerary_acc))])
itinerary_acc = np.array(itinerary_acc)[sorter_ids].tolist()

itineraries searched: 1
Feb 04 2018 18:21:15,21 Feb 04 2018 18:41:00,41
itineraries searched: 2
Feb 04 2018 18:31:15,31 Feb 04 2018 18:51:00,51


### Results

In [160]:
## Compare to initial output from OTP
## Print out the arrival time and quality of the three select paths
itinerary_initial_quality_ = [0,]*len(itinerary_first_list)
for i_ in range(len(itinerary_first_list)):
    itinerary_initial_quality_[i_] = comp_itinerary_quality(itinerary_first_list[i_])
    print('Fastest itineraries without quality constraint:')
    print('Itinerary number: {}, quality: {}, dpt: {}, arr: {}, transfers: {}'.format(i_,itinerary_initial_quality_[i_], itinerary_first_list[i_][0]['departure_time'], itinerary_first_list[i_][-1]['arrival_time'],len(itinerary_first_list[i_])-1))

Itinerary number: 0, quality: 0.016150519696289896, dpt: Feb 04 2018 18:21:15,21, arr: Feb 04 2018 18:41:00,41, transfers: 1
Itinerary number: 1, quality: 0.016150519696289896, dpt: Feb 04 2018 18:31:15,31, arr: Feb 04 2018 18:51:00,51, transfers: 1
Itinerary number: 2, quality: 0.016150519696289896, dpt: Feb 04 2018 18:41:15,41, arr: Feb 04 2018 19:01:00,01, transfers: 1


In [159]:
## Print out the arrival time and quality of the three selected paths
itinerary_selected_quality_ = [0,]*len(itinerary_acc)
for i_ in range(len(itinerary_acc)):
    itinerary_selected_quality_[i_] = comp_itinerary_quality(itinerary_acc[i_])
    print('Fastest itineraries with quality constraint:')
    print('Itinerary number: {}, quality: {}, dpt: {}, arr: {}, transfers: {}'.format(i_,itinerary_selected_quality_[i_], itinerary_acc[i_][0]['departure_time'], itinerary_acc[i_][-1]['arrival_time'],len(itinerary_acc[i_])-1))

Itinerary number: 0, quality: 0.9932763142975513, dpt: Feb 04 2018 18:21:15,21, arr: Feb 04 2018 18:51:00,51, transfers: 1
Itinerary number: 1, quality: 0.9975081657292914, dpt: Feb 04 2018 18:21:15,21, arr: Feb 04 2018 19:01:00,01, transfers: 1
Itinerary number: 2, quality: 0.9932763142975513, dpt: Feb 04 2018 18:31:15,31, arr: Feb 04 2018 19:01:00,01, transfers: 1
Itinerary number: 3, quality: 0.9975081657292914, dpt: Feb 04 2018 18:31:15,31, arr: Feb 04 2018 19:11:00,11, transfers: 1


In [62]:
## Print news from SBB regarding the path
date = [2018, Months_, Days_]
test = display_info(date, [fromPlace_, toPlace_])
for el in test: 
    print('\n')
    print(el)

### TODO: Interface to be included above

In [166]:
quality_ = 0.90
fromPlace_ = "Zürich, Zürichbergstrasse"
toPlace_ = 'Zürich Enge, Bahnhof'
Months_ = 2
Days_ = 4
Hours_ = 6
AM_PM_ = 'PM'
Minutes_ = 20
Seconds_ = 1
departure_ = True

In [174]:
def find_itinerary_with_quality(fromPlace , toPlace, Months, Days, Hours, AM_PM, Minutes, Seconds, departure, quality):

    #Get the json of quickest itineraries from local OTP server
    test_json = return_request(fromPlace=fromPlace , toPlace= toPlace ,Months = Months, Days= Days, Hours= Hours, AM_PM = AM_PM, Minutes = Minutes, Seconds = Seconds, departure = departure)
    #Create Dataframe to find LineId from ProductId, LineType and OperatorName. Relevant for Bus and Tram
    #
    #Read json and create itinerary list of dicts
    itinerary_first_list = read_json_extract_itineraries(test_json, df_BT)
    itinerary_acc, itinerary_refu = split_with_quality(itinerary_first_list, Quality = quality)
    itinerary_searched = []
    iter_=0

    ## sort bad quality itineraries by arrival time
    sorter_ids = np.argsort([itinerary_refu[i_][-1]['arrival_time'] for i_ in range(len(itinerary_refu))])
    itinerary_refu = np.array(itinerary_refu)[sorter_ids].tolist()
    while len(itinerary_refu) != 0:
        if len(itinerary_acc)>=3:
            break
        iter_+=1
        print('itineraries expanded: {}'.format(iter_))
        itinerary_searched_ = itinerary_refu.pop(0)
        itinerary_searched.append(itinerary_searched_)
        #print(itinerary_searched_[0]['departure_time'], itinerary_searched_[-1]['arrival_time'])
        #print(itinerary_searched_)
        itinerary_test_list_explored = explore_itineraries(itinerary_searched_, df_BT, quality)
        itinerary_acc_explored, itinerary_refu_explored = split_with_quality(itinerary_test_list_explored, quality)
        for iti_refu in itinerary_refu_explored:
            if not(any([(iti_refu == iti) for iti in itinerary_refu+itinerary_searched])):
                itinerary_refu.append(iti_refu)
        for iti_acc in itinerary_acc_explored:
            if not(any([(iti_acc == iti) for iti in itinerary_acc])):
                itinerary_acc.append(iti_acc)

        ## sort by arrival time
        sorter_ids = np.argsort([itinerary_refu[i_][-1]['arrival_time'] for i_ in range(len(itinerary_refu))])
        itinerary_refu = np.array(itinerary_refu)[sorter_ids].tolist()


    ## sort selected itineraries by arrival time
    sorter_ids = np.argsort([itinerary_acc[i_][-1]['arrival_time'] for i_ in range(len(itinerary_acc))])
    itinerary_acc = np.array(itinerary_acc)[sorter_ids].tolist()


    print('\n Fastest itineraries without quality constraint:')
    ## Initial OTP output
    ## Print out the arrival time and quality of the three select paths
    itinerary_initial_quality_ = [0,]*len(itinerary_first_list)
    for i_ in range(len(itinerary_first_list)):
        itinerary_initial_quality_[i_] = comp_itinerary_quality(itinerary_first_list[i_])
        print('Itinerary number: {}, quality: {}, dpt: {}, arr: {}, transfers: {}'.format(i_,itinerary_initial_quality_[i_], itinerary_first_list[i_][0]['departure_time'], itinerary_first_list[i_][-1]['arrival_time'],len(itinerary_first_list[i_])-1))
    
    print('\n Fastest itineraries with quality constraint:')
    ## Print out the arrival time and quality of the three selected paths
    itinerary_selected_quality_ = [0,]*len(itinerary_acc)
    for i_ in range(len(itinerary_acc)):
        itinerary_selected_quality_[i_] = comp_itinerary_quality(itinerary_acc[i_])
        print('Itinerary number: {}, quality: {}, dpt: {}, arr: {}, transfers: {}'.format(i_,itinerary_selected_quality_[i_], itinerary_acc[i_][0]['departure_time'], itinerary_acc[i_][-1]['arrival_time'],len(itinerary_acc[i_])-1))


In [188]:
itinerary_test_list[0][1]

{'arrival_time': 'Feb 04 2018 18:41:00,41',
 'departure_time': 'Feb 04 2018 18:26:00,26',
 'from': 'Zürich, Kirche Fluntern',
 'lat_long_from': [47.3766006837906, 8.56023498977618],
 'lat_long_to': [47.3641286895461, 8.53156974905593],
 'line_id': '85:3849:005',
 'product_id': 'TRAM',
 'to': 'Zürich Enge, Bahnhof'}

In [175]:
find_itinerary_with_quality(fromPlace_ , toPlace_, Months_, Days_, Hours_, AM_PM_, Minutes_, Seconds_, departure_, quality_)

itineraries expanded: 1
itineraries expanded: 2

 Fastest itineraries without quality constraint:
Itinerary number: 0, quality: 0.016150519696289896, dpt: Feb 04 2018 18:21:15,21, arr: Feb 04 2018 18:41:00,41, transfers: 1
Itinerary number: 1, quality: 0.016150519696289896, dpt: Feb 04 2018 18:31:15,31, arr: Feb 04 2018 18:51:00,51, transfers: 1
Itinerary number: 2, quality: 0.016150519696289896, dpt: Feb 04 2018 18:41:15,41, arr: Feb 04 2018 19:01:00,01, transfers: 1

 Fastest itineraries with quality constraint:
Itinerary number: 0, quality: 0.9932763142975513, dpt: Feb 04 2018 18:21:15,21, arr: Feb 04 2018 18:51:00,51, transfers: 1
Itinerary number: 1, quality: 0.9975081657292914, dpt: Feb 04 2018 18:21:15,21, arr: Feb 04 2018 19:01:00,01, transfers: 1
Itinerary number: 2, quality: 0.9932763142975513, dpt: Feb 04 2018 18:31:15,31, arr: Feb 04 2018 19:01:00,01, transfers: 1
Itinerary number: 3, quality: 0.9975081657292914, dpt: Feb 04 2018 18:31:15,31, arr: Feb 04 2018 19:11:00,11, t

In [184]:
for leg in itinerary_acc[0]:
    print('Take {} from {} at {} to {} arriving at {}'.format(leg['product_id'],leg['from'], leg['departure_time'], leg['to'], leg['arrival_time']))

Take WALK from stop Zürich, Zürichbergstrasse  at Feb 04 2018 18:21:15,21 to Zürich, Kirche Fluntern arriving at Feb 04 2018 18:26:00,26
Take TRAM from Zürich, Kirche Fluntern at Feb 04 2018 18:36:00,36 to Zürich Enge, Bahnhof arriving at Feb 04 2018 18:51:00,51


In [None]:
## Print out the arrival time and quality of the three selected paths
    itinerary_selected_quality_ = [0,]*len(itinerary_acc)
    for i_ in range(len(itinerary_acc)):
        itinerary_selected_quality_[i_] = comp_itinerary_quality(itinerary_acc[i_])
        print('Itinerary number: {}, quality: {}, dpt: {}, arr: {}, transfers: {}'.format(i_,itinerary_selected_quality_[i_], itinerary_acc[i_][0]['departure_time'], itinerary_acc[i_][-1]['arrival_time'],len(itinerary_acc[i_])-1))


In [185]:
from prettytable import PrettyTable

ModuleNotFoundError: No module named 'prettytable'

In [170]:
['Red','Yellow','Green','Brown','Blue','Pink','Grey']\n",
    "nb_pix_t = PrettyTable()\n",
    "nb_pix_t.field_names = ['Color','Image 1', 'Image 2', 'Image 3', 'Image 4']\n",
    "for i_ in range(nb_of_colors):\n",
    "    nb_pix_t.add_row([color_labels[i_], pixels_per_color[0,i_],pixels_per_color[1,i_],pixels_per_color[2,i_],pixels_per_color[3,i_]])\n",
    "print (nb_pix_t)"


SyntaxError: invalid syntax (<ipython-input-170-34eafd9ad13a>, line 1)

In [None]:
Days = [i for i in range(1, 32)]
Months = [i for i in range(1, 13)]
Hours = [i for i in range(0, 13)]
AM_PM = ['AM', 'PM']
Minutes = [0, 15, 30, 45]
Seconds = [0, 15, 30, 45]

In [None]:
StopName = Df_meta.select('StopName_Meta').distinct().collect()
StopName = [str(x).replace('"', "'") for x in StopName]
StopName = [str(x)[19:] for x in StopName]
StopName = [str(x).split("')")[0] for x in StopName]
StopName = sorted(StopName)

In [None]:
interact_manual(return_request, fromPlace=StopName, toPlace= StopName,Months = Months, Days= Days, Hours= Hours, AM_PM = AM_PM,Minutes=Minutes, Seconds = Seconds ,departure=True)

In [None]:
info_list = return_read_json(test_json)

### Test to see what we have in the data

It seems that we don't have data for bus and subway, at least near Lausanne 

After investigation it's seems that we have data for the LEB in Lausanne. 

In [None]:
df_tmp.printSchema()

In [None]:
df_tmp.select('ProductId').distinct().show()

In [None]:
df_tmp.filter(df_tmp['StopName'].rlike("Lausanne") & (col('ProductId') != 'Zug')).show(1)

In [None]:
df_tmp.where((col('ProductId') == 'Zug') & (col('Line_ID') == '108')).show(10)

In [None]:
df.where((col('ProductId') == 'BUS')).select('BPUIC').distinct().count()

In [None]:
LineType_temp = itinerary_test_list[0][1]['f_id']
StopName_temp = itinerary_test_list[0][1]['to']
df.where((col('LineType') == LineType_temp)).where(col('StopName') == StopName_temp).select('LineId').head(2)[0].asDict()['LineId']

In [None]:
list(df.where((col('LineType') == LineType_temp)).where(col('StopName') == StopName_temp).select('LineId').head(2)[1].asDict())

In [None]:
df.head(1)

In [None]:
df.where((col('ProductId') == 'BUS')).select('LineId','LineType','StopName').show(20)

In [None]:
list((df.where((col('ProductId') == 'Zug')).select('TripId','LineId','LineType').where(df['LineId']==2538).head(1)[0]).asDict())

In [119]:
df_tmp.iloc[0]

AttributeError: 'DataFrame' object has no attribute 'iloc'

In [None]:
df_tmp.where(df_tmp['StopName'].rlike("Lausanne") &(col('ProductId') == 'Bus')).show(10)

In [None]:
def read_json(json_data):
    for route in json_data['plan']['itineraries']:
        #Here we show the 3 different path
        print('-------- Route---------\n')
        #print(route)
        for step in route['legs']: 
            #Here we show all the step of the route   
            print('---Step---\n')
            #print(step)
            from_ = step['from']['name']
            to_ = step['to']['name']
            mode = step['mode']
            if('tripShortName' in step.keys()):
                route_id = step['tripShortName']
                #print(step)
            end_time = str(step['endTime'])
            duration = str(step['duration'])
            print('For the travel from {} to {} in {} \n'.format(from_, to_, mode))
            print('The duration is {} and the arrival time is {}\n'.format(time.strftime("%H:%M:%S", time.gmtime(float(duration))), time.strftime("%b %d %Y %H:%M:%S,%M", time.gmtime(float(end_time[:len(end_time)-3])))))
            if('tripShortName' in step.keys()):
                print('The route ID is {}'.format(route_id))
    

In [None]:
def request(fromPlace, toPlace, departure,Hours, AM_PM,Minutes, Months, Days):
    lat_from, long_from = get_lat_long(fromPlace)
    lat_to, long_to = get_lat_long(toPlace)
    url = 'http://10.90.38.21:8829/otp/routers/default/plan?fromPlace=stop+'
    url += '+'.join(fromPlace.split()) +  '+%3A%3A' + str(lat_from) + '%2C' + str(long_from)
    url += '&toPlace=stop+' +  '+'.join(toPlace.split()) +  '+%3A%3A' + str(lat_to) + '%2C' + str(long_to)
    url += '&time={}%3A{}{}&date={}-{}-2018&mode=TRANSIT%2CWALK&maxWalkDistance=804.672&arriveBy=true&wheelchair=false&locale=en&numItineraries=6'.format(Hours, Minutes,AM_PM, Months, Days)
    print(url)
    r = requests.get(url)
    print(r)
    #print(r.json())
    read_json(r.json())
    #return r.json()

In [58]:
def explore_itineraries_tree(itinerary, df_BT):
    itinerary_list = []
    for j_ in range(len(itinerary)-1):
        leg_1 = itinerary[j_]
        arr_month, arr_day, arr_hour, arr_minute, arr_AM_PM, arr_second = date_to_cells(leg_1['arrival_time'])
        #new_semi_its = request_with_quality(fromPlace = leg_1['to'], toPlace = itinerary[-1]['to'], Months = arr_month, Days = arr_day, Hours = arr_hour, Minutes = arr_minute, Seconds = arr_second, AM_PM = arr_AM_PM, departure = True, Quality = quality, lat_long_from = leg_1['lat_long_from'], lat_long_to = itinerary[-1]['lat_long_to'] )
        temp_json = return_request(fromPlace = leg_1['to'], toPlace = itinerary[-1]['to'], Months = arr_month, Days = arr_day, Hours = arr_hour, Minutes = arr_minute, Seconds = arr_second, AM_PM = arr_AM_PM, departure = True, lat_long_from = leg_1['lat_long_to'], lat_long_to = itinerary[-1]['lat_long_to'])
        #print(temp_json)
        new_partial_its = read_json_extract_itineraries(temp_json, df_BT)
        it_list = []
        for k_ in range(len(new_partial_its)):
            if len(new_partial_its[k_]) == 1:
                it_list.extend(new_partial_its[k_])
            if len(new_partial_its[k_]) != 1:
                it_list.extend(explore_itineraries(new_partial_its[k_], df_BT))
        
        new_itineraries = [np.append(itinerary[:j_+1],it_list[m_]).tolist() for m_ in range(len(it_list))]
        
        itinerary_list.extend(new_itineraries)
    return itinerary_list