In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import cross_validate
from tqdm import tnrange, tqdm_notebook, tqdm
from datetime import timedelta
from datetime import datetime
from sklearn import metrics

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import scipy.stats as stats
import xgboost as xgb
import requests as r
import pandas as pd
import seaborn as s
import numpy as np
import googlemaps
import postgres
import holidays
import config
import json
import math
import gc

# enable automatic garbage collection
gc.enable()

import warnings
warnings.filterwarnings("ignore")

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Importing Data

### Stop Distances

In [4]:
stop_distances = pd.read_csv("stored_queries/distancedata.csv", header=None)
stop_distances.columns = ['stopid','previous_stopid','distance']
stop_distances.head()

Unnamed: 0,stopid,previous_stopid,distance
0,1636,1635,397
1,1338,1337,151
2,837,836,137
3,2740,7177,481
4,1201,4385,324


### Stops

In [5]:
stops = pd.read_csv("stop_information.csv")
cols = list(stops.columns)
cols[0] = 'ix'
stops.columns = cols
stops.drop(columns=cols[0], inplace=True)

### weather Data Preprocessing

In [6]:
weather = pd.read_csv("stored_queries/weather.csv")
weather.icon = weather.icon.astype('category')
weather.dayofservice = pd.to_datetime(weather.dayofservice)

### Leavetimes Data Preprocessing

In [7]:
data = pd.read_csv("stored_queries/combined.csv")
data.columns = ['dayofservice','tripid','lineid','direction','progrnumber','stopid','plannedDEP','plannedARR','actualDEP','actualARR','routeid']
gc.collect()

12

In [8]:
data.drop(columns=['routeid','plannedDEP','plannedARR','actualDEP'], inplace=True)
data.drop_duplicates()
gc.collect()

Unnamed: 0,dayofservice,tripid,lineid,direction,progrnumber,stopid,actualARR
0,2018-01-01,5966674,1,1,12,119,54023
1,2018-01-01,5959105,1,1,12,119,59955
2,2018-01-01,5966888,1,1,12,119,58771
3,2018-01-01,5965960,1,1,12,119,56309
4,2018-01-01,5965964,1,1,12,119,70663
5,2018-01-01,5958117,1,1,12,119,72539
6,2018-01-01,5959109,1,1,12,119,74173
7,2018-01-01,5972114,1,1,12,119,40096
8,2018-01-01,5959099,1,1,12,119,38271
9,2018-01-01,5965956,1,1,12,119,42017


7

In [9]:
data.dayofservice = pd.to_datetime(data.dayofservice.loc[:])
gc.collect()

0

In [10]:
data.sort_values(by=['dayofservice','lineid','tripid','direction','progrnumber'], inplace=True)
gc.collect()

36

## Combine in a loop and process per stops set to save memory

### Stop Pairs

In [12]:
stop_pairs = stop_distances[['stopid','previous_stopid']].drop_duplicates()

print("There are %d unique pairs of stops in stop distances" % (stop_pairs.count()[0]))

There are 119817 unique pairs of stops in stop distances


In [13]:
gc.collect()

14

### Loop thorugh stop pairs

In [14]:
data.actualARR = data.dayofservice + pd.to_timedelta(data.actualARR, unit = 'seconds') # in nanoseconds

# new columns for combining
data['weather_merge_time'] = data.actualARR.dt.round('H') #  .dt useful

# weather data
weather.dayofservice = weather.dayofservice + pd.to_timedelta(weather.hour, unit='hour')

# new column for combining
weather['rkey'] = weather.dayofservice

### Combining Data

In [15]:
combinedata = data.merge(weather[['icon','temperature','humidity','windSpeed','rain','rkey','hour']], 
                         left_on='weather_merge_time', 
                         right_on='rkey', 
                         how='left')

In [18]:
gc.enable()

In [19]:
gc.get_stats()

[{'collections': 479, 'collected': 3366, 'uncollectable': 0},
 {'collections': 44, 'collected': 421, 'uncollectable': 0},
 {'collections': 10, 'collected': 150, 'uncollectable': 0}]

In [20]:
combinedata.drop(columns=['rkey','lineid','weather_merge_time'], inplace=True)
gc.collect()

7

In [37]:
combinedata.count()

dayofservice    115580037
tripid          115580037
direction       115580037
progrnumber     115580037
stopid          115580037
actualARR       115580037
icon            115433892
temperature     115433892
humidity        115433892
windSpeed       115335338
rain            115433892
hour            115433892
dtype: int64

In [36]:
combinedata.head()

Unnamed: 0,dayofservice,tripid,direction,progrnumber,stopid,actualARR,icon,temperature,humidity,windSpeed,rain,hour
0,2018-01-01,5956265,1,1,226,2018-01-01 10:00:48,partly-cloudy-day,43.08,0.79,15.69,0.0,10.0
1,2018-01-01,5956265,1,2,228,2018-01-01 10:01:15,partly-cloudy-day,43.08,0.79,15.69,0.0,10.0
2,2018-01-01,5956265,1,3,229,2018-01-01 10:01:31,partly-cloudy-day,43.08,0.79,15.69,0.0,10.0
3,2018-01-01,5956265,1,4,227,2018-01-01 10:02:05,partly-cloudy-day,43.08,0.79,15.69,0.0,10.0
4,2018-01-01,5956265,1,5,230,2018-01-01 10:03:03,partly-cloudy-day,43.08,0.79,15.69,0.0,10.0


In [24]:
# ==================== REMOVE INACTIVE STOPS ====================== #
active_stopids = stops.stopid.values

# remove all inactive stops from the dataset. -> additional models that arent needed. 
combinedata = combinedata[combinedata.stopid.isin(active_stopids)]

In [45]:
gc.collect()

0

In [None]:
# ==================== #
# previous stopid
previousstops =  list(combinedata.stopid)
previousstops = np.array(previousstops[:-1]).astype(int)

# progrnumber of previous stopid
previousstops_progrnumber = list(combinedata.progrnumber)
previousstops_progrnumber = np.array(previousstops_progrnumber[:-1]).astype(int)

# Actual arrival time of previous stopid
previousstops_actualARR = list(combinedata.actualARR)
previousstops_actualARR = np.array(previousstops_actualARR[:-1])

gc.collect()

In [None]:
# ==================== #
# previous stopid
previousstops =  list(combinedata.stopid)
previousstops = np.array(previousstops[:-1]).astype(int)

# progrnumber of previous stopid
previousstops_progrnumber = list(combinedata.progrnumber)
previousstops_progrnumber = np.array(previousstops_progrnumber[:-1]).astype(int)

# Actual arrival time of previous stopid
previousstops_actualARR = list(combinedata.actualARR)
previousstops_actualARR = np.array(previousstops_actualARR[:-1])

# Delete the first row of the dataframe to shift the progrnumbers by one. 
combinedata = combinedata.iloc[1:]

# garbage collection to free memory
gc.collect()

combinedata['previous_stopid'] = previousstops
combinedata['previous_stopARR'] = previousstops_actualARR
combinedata['previous_progrnumber'] = previousstops_progrnumber

#=======================#

combinedata = combinedata[combinedata.progrnumber != 1]
combinedata.dropna(inplace=True);

# recast type of integer cols from float to int. 
combinedata.previous_stopid = combinedata.previous_stopid.astype(int)
combinedata.previous_progrnumber = combinedata.previous_progrnumber.astype(int)

# make progrnumber difference column and then drop anything thats not exactly 1, removes data which skips stops. 
combinedata['progrnumber_difference'] = combinedata.progrnumber - combinedata.previous_progrnumber

#======================#
# remove non-consecutive stop pairs.
combinedata = combinedata[combinedata.progrnumber_difference==1]

# Remove additional columns added for this operantion
# combinedata.drop(columns=['progrnumber','previous_progrnumber','progrnumber_difference'], inplace=True);

# ordering rows [and dropping irrelevant ones: direction, route_id]
combinedata = combinedata[['dayofservice', 'tripid','stopid', 'previous_stopid', 'actualARR', 'previous_stopARR',
                           'icon', 'temperature', 'humidity', 'windSpeed', 'rain', 'hour', 'weekend', 'holiday']]

#=========================#
# convert to seconds
combinedata['travel_time'] = (combinedata.actualARR - combinedata.previous_stopARR).astype(int)/10**9

# drop any values less than 5 seconds [assumed erroneous]
combinedata = combinedata[combinedata.travel_time > 5]

#========================#

combinedata['month'] = combinedata.dayofservice.dt.month

def set_season(x):
    winter = [11,12,1]
    autumn = [10,9,8]
    spring = [4,3,2]

    if x in winter:
        return 'Winter'
    elif x in autumn:
        return 'Autumn'
    elif x in spring:
        return 'Spring'
    else:
        return 'Summer'
    
combinedata['season'] = combinedata.dayofservice.dt.month.apply(set_season)

#=======================#
combinedata.season = combinedata.season.astype('category', categories=['Summer','Spring','Autumn','Winter'])

combinedata = pd.concat([combinedata, pd.get_dummies(combinedata.season, prefix='season')], axis=1)
combinedata.drop(columns=['season'], inplace=True)

#=======================#
combinedata.icon = combinedata.icon.astype('category', categories=['partly-cloudy-day', 'partly-cloudy-night', 'clear-day', 'clear-night', 'rain', 'fog', 'cloudy', 'wind'])

combinedata = pd.concat([combinedata, pd.get_dummies(combinedata.icon, prefix='icon')], axis=1)
combinedata.drop(columns=['icon'], inplace=True)


#=======================#
gc.collect()

combinedata = combinedata.dropna() # drop na values. 
combinedata.dtypes

print("There are %d valid pairs" % combinedata.count()[0])


#=======================#
stop_pairs = combinedata[['stopid','previous_stopid']].drop_duplicates()

## Separating stop_pairs

## Loop thorugh and train each model individually. 

In [None]:
def Train_model(df, A, B):
    
    modeldata = combinedata[['travel_time','stopid','previous_stopid','distance',
                         'temperature','humidity', 'windSpeed', 'rain', 'hour', 'holiday', 'weekend',
                         'month','season_Winter','season_Autumn','season_Summer','season_Spring',
                         'icon_clear-day', 'icon_clear-night', 'icon_cloudy', 'icon_fog',
                         'icon_partly-cloudy-day', 'icon_partly-cloudy-night', 'icon_rain','icon_wind']]
    
    target     = ['travel_time']
    predictors = ['temperature','humidity', 'windSpeed', 'rain', 'hour', 'holiday', 'weekend',
                  'month','season_Winter','season_Autumn','season_Summer','season_Spring',
                  'icon_clear-day', 'icon_clear-night', 'icon_cloudy', 'icon_fog',
                  'icon_partly-cloudy-day', 'icon_partly-cloudy-night', 'icon_rain','icon_wind']
    

In [None]:
# Training Model for all pairs of stops in the dataset

models = dict()
emptys = []
metric = dict()

no_stops = stop_pairs.count()[0]
print("There are %d models to train." % no_stops)

for pair in tqdm_notebook(stop_pairs.iterrows(), desc="Progress: ", total=no_stops):

    # Travelling From A -> B
    A = pair[1][1]
    B = pair[1][0]
    
    # ========================= Rows from A -> B =========================== #
    data = modeldata[(modeldata.stopid==B)&(modeldata.previous_stopid==A)]
    
    if data.count()[0] > 10:
    
        # ========================= Removing Outliers ========================== #
        travel_sigma = data.travel_time.std()

        # Only allow travel times greater than zero 
        data = data[data.travel_time >= 0]

        # Filter outliers from the dataset [ 2σ as the cutoff ~95% of data ]
        data = data[abs(data.travel_time - data.travel_time.mean()) < 2*travel_sigma]

        # ========================= Remove Null Data =========================== #
        data.dropna(inplace=True)

        # ========================= Test/Train Splits ========================== #
        X_train, X_test, y_train, y_test = train_test_split(data[predictors],data[target].values.ravel(), test_size=0.3, shuffle=True)

        # ========================== Making DMatrices ========================== #
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)

        # =========================== Training Model =========================== #

        param = {
            'eta': 0.15,
            'max_depth': 6
        }

        num_rounds = 10000

        try:
            
            # ============ Train ============= #
            model = xgb.train(param, dtrain, num_rounds, evals=[(dtest, 'Test')], verbose_eval=False, early_stopping_rounds=100)
            models[f'{A}_{B}'] = model
            
            # ============ Testing Accuracy ========== #
            preds = model.predict(dtest)
            metric[f'{A}_{B}'] = dict()
            metric[f'{A}_{B}']['rmse'] = np.sqrt(metrics.mean_squared_error(preds, y_test))
            metric[f'{A}_{B}']['preds']= preds
            metric[f'{A}_{B}']['ytest']= y_test
            
            
        except Exception as e:
            print(f"Error with route: {A} -> {B}")
            data.head()
            y_train, y_test
            print(repr(e), end='\n================================================\n')
    else:
        print(f"Empty Set Error: {A} -> {B}")
        emptys.append((A,B))