## Setup and importing modules

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import cross_validate
from tqdm import tnrange, tqdm_notebook, tqdm
from datetime import timedelta
from datetime import datetime
from sklearn import metrics

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import scipy.stats as stats
import xgboost as xgb
import requests as r
import pandas as pd
import seaborn as s
import numpy as np
import googlemaps
import postgres
import holidays
import config
import json
import math

import warnings
warnings.filterwarnings("ignore")

In [None]:
# parallelisation with Dask for handling large dataframe
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
PB = ProgressBar()
PB.register()

In [2]:
import importlib
importlib.reload(postgres.config)

<module 'config' from '/media/storage/College/S3/Github/DublinBus/Analytics/config.py'>

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Import Data

### Bus Data

In [4]:
# data = postgres.query("SELECT * FROM combined;", tunnel=True)
# data = pd.DataFrame(data)

# data = pd.read_csv("stored_queries/combined145.csv")

# Dask Version
# data = dd.read_csv("stored_queries/combined.csv")

# Drop duplicates 
data.drop_duplicates(inplace=True)

NameError: name 'data' is not defined

In [None]:
data.columns = ['dayofservice','tripid','lineid','direction','progrnumber','stopid','plannedDEP','plannedARR','actualDEP','actualARR','routeid']

In [None]:
data.head()

In [None]:
data.drop(columns=['routeid','plannedDEP','plannedARR','actualDEP'], inplace=True)

In [None]:
# Dask Version
# data.dayofservice = dd.to_datetime(data.dayofservice.loc[:])


data.dayofservice = pd.to_datetime(data.dayofservice.loc[:])
# data.lineid = data.lineid.astype('category')
# data.routeid= data.routeid.astype('category')

In [None]:
data.sort_values(by=['dayofservice','lineid','tripid','direction','progrnumber'],inplace=True)
# data.to_csv("stored_queries/combined145.csv", index=False, chunksize=500000)

### Trips information [for full route prediction]

In [None]:
tripsdata = pd.read_csv("stored_queries/trips_df.csv")
tripsdata.head()

In [None]:
tripsdata = tripsdata[['dayofservice', 'tripid', 'lineid', 'routeid', 'direction', 'actual_arr', 'actual_dep']]
tripsdata.dayofservice = pd.to_datetime(tripsdata.dayofservice)
tripsdata.dropna(inplace=True)

### Stop Information

In [None]:
stops = pd.read_csv("stop_information.csv")

In [None]:
cols = list(stops.columns)
cols[0] = 'ix'
stops.columns = cols
stops.drop(columns=cols[0], inplace=True)

stops.head()

### Weather Data

In [None]:
weather = pd.read_csv("stored_queries/weather.csv")

weather.head()

In [None]:
weather.count()

In [None]:
weather.icon = weather.icon.astype('category')
weather.dayofservice = pd.to_datetime(weather.dayofservice)

### Export/ Import the number of stops on each lineid for basic model.

In [None]:
# # Read in all lineids from teh database and store in a text file.


# lineids = postgres.query("Select distinct(lineid) from combined;", tunnel=True)

# q = dict()
# for lidx in tnrange(len(lineids)):
    
#     lid = lineids[lidx]
#     q[lid[0]] = postgres.query("SELECT MAX(progrnumber) FROM combined WHERE lineid='%s';" % str(lid[0]), tunnel=True)
    
# with open("stops_per_line.txt",'w') as f:
#     f.write(json.dumps(q))
# f.closed


In [None]:
with open("stops_per_line.txt",'r') as g:
    max_stops_per_line = json.loads(g.readlines()[0])

### Import distances between stops data

In [None]:
stop_distances = pd.read_csv("stored_queries/distancedata.csv", header=None)
stop_distances.columns = ['stopid','previous_stopid','distance']
stop_distances.head()

## Prepairing Data for Combining

#### Weather and leavetimes

In [None]:
# leavetimes data
# data.plannedARR = data.dayofservice + pd.to_timedelta(data.plannedARR, unit = 'seconds') # in nanoseconds
# data.plannedDEP = data.dayofservice + pd.to_timedelta(data.plannedDEP, unit = 'seconds') # in nanoseconds
data.actualARR = data.dayofservice + pd.to_timedelta(data.actualARR, unit = 'seconds') # in nanoseconds
# data.actualDEP = data.dayofservice + pd.to_timedelta(data.actualDEP, unit = 'seconds') # in nanoseconds

# new columns for combining
# data['time_at_stop'] = data.actualDEP - data.actualARR
data['weather_merge_time'] = data.actualARR.dt.round('H') #  .dt useful


# weather data
weather.dayofservice = weather.dayofservice + pd.to_timedelta(weather.hour, unit='hour')

# new column for combining
weather['rkey'] = weather.dayofservice

#### Trips data preparation

In [None]:
tripsdata.actual_arr = tripsdata.dayofservice + pd.to_timedelta(tripsdata.actual_arr, unit='seconds')
tripsdata.actual_dep = tripsdata.dayofservice + pd.to_timedelta(tripsdata.actual_dep, unit='seconds')
tripsdata['triplength'] = tripsdata.actual_arr - tripsdata.actual_dep
tripsdata['leavehour'] = tripsdata.actual_dep.dt.hour

In [None]:
tripsdata.head()

In [None]:
weather.head()

In [None]:
data.head()

## Combining Data

### Combining weather and leavetimes

In [None]:
gc.enable()
gc.get_stats()
gc.collect()

In [None]:
data.head()

In [None]:
combinedata = data.merge(weather[['icon','temperature','humidity','windSpeed','rain','rkey','hour']], 
                         left_on='weather_merge_time', 
                         right_on='rkey', 
                         how='left')

In [None]:
# Free up memory after the data table is deleted. 
del data

import gc
gc.collect()

In [None]:
# drop lineid as all are 145
# combinedata.drop(columns=['rkey','lineid','weather_merge_time','plannedDEP','plannedARR','time_at_stop','actualDEP'], inplace=True)
combinedata.drop(columns=['rkey','lineid','weather_merge_time'], inplace=True)
gc.collect()

### Combining trips and weather

In [None]:
tripsdata['weather_merge_time'] = tripsdata.actual_dep.dt.round('H')

In [None]:
combinedtrip = tripsdata.merge(weather[['icon','temperature','humidity','windSpeed','rain','rkey','hour']], 
                               left_on='weather_merge_time', 
                               right_on='rkey', 
                               how='left')

## Cleaning / Adding Additional features

### Remove inactive stops from data

In [None]:
active_stopids = stops.stopid.values

# remove all inactive stops from the dataset. -> additional models that arent needed. 
combinedata = combinedata[combinedata.stopid.isin(active_stopids)]

### weekday vs weekend

In [None]:
combinedata['weekend'] = combinedata.dayofservice.dt.weekday.isin([5,6])

### holidays

In [None]:
combinedata.count()[0]

In [None]:
ie_holidays = holidays.Ireland()
combinedata['holiday'] = combinedata.dayofservice.apply(lambda x: x in ie_holidays)

### Pair Consecutive Stop IDs 

#### Matching progrnumbers to previous stop

In [None]:
# previous stopid
previousstops =  list(combinedata.stopid)
previousstops = np.array(previousstops[:-1]).astype(int)

# progrnumber of previous stopid
previousstops_progrnumber = list(combinedata.progrnumber)
previousstops_progrnumber = np.array(previousstops_progrnumber[:-1]).astype(int)

# Actual arrival time of previous stopid
previousstops_actualARR = list(combinedata.actualARR)
previousstops_actualARR = np.array(previousstops_actualARR[:-1])

# Delete the first row of the dataframe to shift the progrnumbers by one. 
combinedata = combinedata.iloc[1:]

# garbage collection to free memory
gc.collect()

In [None]:
combinedata['previous_stopid'] = previousstops
combinedata['previous_stopARR'] = previousstops_actualARR
combinedata['previous_progrnumber'] = previousstops_progrnumber

In [None]:
combinedata.head()

#### Dropping mis-matched progrnumbers

In [None]:
# Dropping rows where progrnumber==1 as the first row is currently aligned with the last row of the previous tripid.
combinedata = combinedata[combinedata.progrnumber != 1]
combinedata.dropna(inplace=True);

#### Dropping non-consecutive stop combinations

In [None]:
# recast type of integer cols from float to int. 
combinedata.previous_stopid = combinedata.previous_stopid.astype(int)
combinedata.previous_progrnumber = combinedata.previous_progrnumber.astype(int)

# make progrnumber difference column and then drop anything thats not exactly 1, removes data which skips stops. 
combinedata['progrnumber_difference'] = combinedata.progrnumber - combinedata.previous_progrnumber

# checking how many rows will be left. 
# combinedata.progrnumber_difference.value_counts()

In [None]:
# remove non-consecutive stop pairs.
combinedata = combinedata[combinedata.progrnumber_difference==1]

# Remove additional columns added for this operantion
# combinedata.drop(columns=['progrnumber','previous_progrnumber','progrnumber_difference'], inplace=True);

# ordering rows [and dropping irrelevant ones: direction, route_id]
combinedata = combinedata[['dayofservice', 'tripid','stopid', 'previous_stopid', 'actualARR', 'previous_stopARR',
                           'icon', 'temperature', 'humidity', 'windSpeed', 'rain', 'hour', 'weekend', 'holiday']]

#### Unique Stopid combinations

In [None]:
# all unique stop combinations for a given lineid.
stop_pairs = combinedata[['stopid','previous_stopid']].drop_duplicates()

# print("There are %d unique pairs of stops on line: %s" % (stop_pairs.count()[0], data.lineid.unique()[0]))

### Travel Time

In [None]:
# convert to seconds
combinedata['travel_time'] = (combinedata.actualARR - combinedata.previous_stopARR).astype(int)/10**9

# drop any values less than 5 seconds [assumed erroneous]
combinedata = combinedata[combinedata.travel_time > 5]

In [None]:
print("There are %d valid pairs" % combinedata.count()[0])

### Distance between stops [ === Don't run again === ]

In [34]:
# Function to get the distance between two stops. 
# def get_distance(start, finish):
#     """
#     Distance between two (lat,lng) pairs
    
#     Inputs:
#     ================================
#     (int) start: stopid of first stop
#     (int) finish: stopid of last stop
    
#     Outputs:
#     ===============================
#     (int) the distance in metres between the stops. 
    
#     Notes:
#     ===============================
#     If there is an error, or the api fails to find the distance a value of None will be returned. 
#     """
#     try:
#         begin = (stops[stops.stopid==start ]['lat'].values[0], stops[stops.stopid==start ]['lng'].values[0])
#         end   = (stops[stops.stopid==finish]['lat'].values[0], stops[stops.stopid==finish]['lng'].values[0])

#     except Exception as e:

#         print(start, finish)
#         print(repr(e)) 
#         return None
        
#     API_key = config.dmatrix_key #enter Google Maps API key
#     gmaps = googlemaps.Client(key=API_key)
    
#     try:
#         call = gmaps.distance_matrix(begin, end, mode='walking')
    
#     except Exception as eL:
        
#         print(repr(eL))
#         return None
    
#     status = call['status']
    
#     if status=='OK':
#         return call["rows"][0]["elements"][0]['distance']['value']
    
#     else:
#         print(status)
#         return None

# distances_list = []

# for index, pair in tqdm_notebook(stop_pairs.iterrows(), total=stop_pairs.shape[0]):

# # for pair in stop_pairs.iterrows():
# #     start_stopid, finish_stopid = pair[1]

#     start_stopid, finish_stopid = pair[0], pair[1]
#     distances_list.append(get_distance(start_stopid, finish_stopid))
    
# distance_array = np.array(distances_list)

# stop_pairs['distance'] = distance_array

In [None]:
combinedata = combinedata.merge(stop_distances, how='left', left_on=['stopid','previous_stopid'], right_on=['stopid','previous_stopid'])
combinedata.distance = combinedata.distance.astype(int)

### Average Speed

In [None]:
# Taking average speed as distance / time (km/h)
combinedata['avgvel'] = (combinedata.distance / combinedata.travel_time) * (3600/1000)

# Note need to drop all data over 120 km/h -> erroneous data
combinedata = combinedata[combinedata.avgvel <= 120]

### Adding Month/Season

In [None]:
combinedata['month'] = combinedata.dayofservice.dt.month

def set_season(x):
    winter = [11,12,1]
    autumn = [10,9,8]
    spring = [4,3,2]

    if x in winter:
        return 'Winter'
    elif x in autumn:
        return 'Autumn'
    elif x in spring:
        return 'Spring'
    else:
        return 'Summer'
    
combinedata['season'] = combinedata.dayofservice.dt.month.apply(set_season)

## Correlation / boxplots

In [None]:
# Note: travel time is continuous but is needed for comparison
categorical_features = ['hour','holiday', 'weekend','month','season','icon']

continuous_features  = ['travel_time','distance','temperature','windSpeed','rain','humidity']

In [None]:
corr = combinedata[continuous_features].corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

In [None]:
for cat_ft in categorical_features:
    combinedata.boxplot(column=['travel_time'], by=cat_ft, grid=False, figsize=(15,6), showfliers=False)
    plt.show()

## Encoding categorical data

#### Season Categories

In [None]:
combinedata.season = combinedata.season.astype('category', categories=['Summer','Spring','Autumn','Winter'])

combinedata = pd.concat([combinedata, pd.get_dummies(combinedata.season, prefix='season')], axis=1)
combinedata.drop(columns=['season'], inplace=True)

#### Icon Categories

In [None]:
combinedata.icon = combinedata.icon.astype('category', categories=['partly-cloudy-day', 'partly-cloudy-night', 'clear-day', 'clear-night', 'rain', 'fog', 'cloudy', 'wind'])

combinedata = pd.concat([combinedata, pd.get_dummies(combinedata.icon, prefix='icon')], axis=1)
combinedata.drop(columns=['icon'], inplace=True)

### Drop all N/A values

In [None]:
gc.collect()

combinedata = combinedata.dropna() # drop na values. 
combinedata.dtypes

print("There are %d valid pairs" % combinedata.count()[0])

In [None]:
combinedata.head().T

In [None]:
# Updating stop pairs
# all unique stop combinations for a given lineid.
stop_pairs = combinedata[['stopid','previous_stopid']].drop_duplicates()

print("There are %d unique pairs of stops" % (stop_pairs.count()[0]))

## Train Models

### Setting Predictor / Target variables

In [None]:
modeldata = combinedata[['travel_time','stopid','previous_stopid','distance',
                         'temperature','humidity', 'windSpeed', 'rain', 'hour', 'holiday', 'weekend',
                         'month','season_Winter','season_Autumn','season_Summer','season_Spring',
                         'icon_clear-day', 'icon_clear-night', 'icon_cloudy', 'icon_fog',
                         'icon_partly-cloudy-day', 'icon_partly-cloudy-night', 'icon_rain','icon_wind']]
modeldata.dtypes

In [None]:
# need to put this in a loop over the pairs of stops. (unique)
target     = ['travel_time']
predictors = ['temperature','humidity', 'windSpeed', 'rain', 'hour', 'holiday', 'weekend',
              'month','season_Winter','season_Autumn','season_Summer','season_Spring',
              'icon_clear-day', 'icon_clear-night', 'icon_cloudy', 'icon_fog',
              'icon_partly-cloudy-day', 'icon_partly-cloudy-night', 'icon_rain','icon_wind']

General_predictors = ['temperature','humidity', 'windSpeed', 'rain', 'hour', 
                      'holiday', 'weekend','month','distance',
                      'season_Winter','season_Autumn','season_Summer','season_Spring',
                      'icon_clear-day', 'icon_clear-night', 'icon_cloudy', 'icon_fog',
                      'icon_partly-cloudy-day', 'icon_partly-cloudy-night', 
                      'icon_rain','icon_wind']

In [None]:
print("There are %d rows in model data" % (modeldata.count()[0]))
print("Average Travel Time between stops: {}".format(modeldata.travel_time.mean()))

### General Model for stops not in the data [ will take ages to train ]

#### Filtering outliers

In [None]:
travel_time_deviation = modeldata.travel_time.std()

# Only allow travel times greater than zero 
General_modeldata = modeldata[modeldata.travel_time >= 0]

# Filter outliers from the dataset
General_modeldata = General_modeldata[abs(General_modeldata.travel_time-General_modeldata.travel_time.mean()) < 3*travel_time_deviation]

# Show spread of data
General_modeldata.travel_time.hist(bins=150)

#### Test train split

In [None]:
# ========================= Test/Train Splits ========================== #
General_X_train, General_X_test, General_y_train, General_y_test = train_test_split(General_modeldata[General_predictors],
                                                                                     General_modeldata[target].values.ravel(), 
                                                                                     test_size=0.2, 
                                                                                     shuffle=True)
General_X_train.head()

#### Training Model

In [51]:
# ========================== Making DMatrices ========================== #
General_dtrain = xgb.DMatrix(General_X_train, label=General_y_train)
General_dtest = xgb.DMatrix(General_X_test, label=General_y_test)

# =========================== Training Model =========================== #

param = {
    'eta':0.1,
    'max_depth':6
}
num_rounds = 10000

General_model = xgb.train(param, General_dtrain, num_rounds, evals=[(General_dtest, 'Test')], verbose_eval=50, early_stopping_rounds=100)

# ============================  Predictions ============================ #
General_xgbpreds = General_model.predict(General_dtest)

# ====================== Feature Importance graph ====================== #
# xgb.plot_importance(General_model)

[0]	Test-rmse:77.3825
Will train until Test-rmse hasn't improved in 100 rounds.
[50]	Test-rmse:32.4332
[100]	Test-rmse:31.119


KeyboardInterrupt: 

### Stop Wise Models

In [None]:
# Training Model for all pairs of stops in the dataset

models = dict()
emptys = []
metric = dict()

no_stops = stop_pairs.count()[0]
print("There are %d models to train." % no_stops)

for pair in tqdm_notebook(stop_pairs.iterrows(), desc="Progress: ", total=no_stops):

    # Travelling From A -> B
    A = pair[1][1]
    B = pair[1][0]
    
    # ========================= Rows from A -> B =========================== #
    data = modeldata[(modeldata.stopid==B)&(modeldata.previous_stopid==A)]
    
    if data.count()[0] > 10:
    
        # ========================= Removing Outliers ========================== #
        travel_sigma = data.travel_time.std()

        # Only allow travel times greater than zero 
        data = data[data.travel_time >= 0]

        # Filter outliers from the dataset [ 2σ as the cutoff ~95% of data ]
        data = data[abs(data.travel_time - data.travel_time.mean()) < 2*travel_sigma]

        # ========================= Remove Null Data =========================== #
        data.dropna(inplace=True)

        # ========================= Test/Train Splits ========================== #
        X_train, X_test, y_train, y_test = train_test_split(data[predictors],data[target].values.ravel(), test_size=0.3, shuffle=True)

        # ========================== Making DMatrices ========================== #
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)

        # =========================== Training Model =========================== #

        param = {
            'eta': 0.15,
            'max_depth': 6
        }

        num_rounds = 10000

        try:
            
            # ============ Train ============= #
            model = xgb.train(param, dtrain, num_rounds, evals=[(dtest, 'Test')], verbose_eval=False, early_stopping_rounds=100)
            models[f'{A}_{B}'] = model
            
            # ============ Testing Accuracy ========== #
            preds = model.predict(dtest)
            metric[f'{A}_{B}'] = dict()
            metric[f'{A}_{B}']['rmse'] = np.sqrt(metrics.mean_squared_error(preds, y_test))
            metric[f'{A}_{B}']['preds']= preds
            metric[f'{A}_{B}']['ytest']= y_test
            
            
        except Exception as e:
            print(f"Error with route: {A} -> {B}")
            data.head()
            y_train, y_test
            print(repr(e), end='\n================================================\n')
    else:
        print(f"Empty Set Error: {A} -> {B}")
        emptys.append((A,B))

## Evaluate Models

### General Model

#### Plotting Errors

In [None]:
# ========================= Visualising Errors ========================= #
General_xgbresiduals = General_xgbpreds - General_y_test

# best fit of data
(xmu, xsigma) = stats.norm.fit(General_xgbresiduals)

# # The histogram of the data
xn, xbins, xpatches = plt.hist(General_xgbresiduals, 100, density=True, facecolor='blue', alpha=0.6)

# add a 'best fit' line
xy = mlab.normpdf(xbins, xmu, xsigma)
xl = plt.plot(xbins, xy, 'r--', linewidth=2)
plt.show()

print(xsigma,xmu)

#### Cross Validation

In [None]:
General_rmse = np.sqrt(metrics.mean_squared_error(General_xgbpreds, General_y_test))

print(f"""
General Model RMSE: {General_rmse}
""")


### Individual Models

In [None]:
scores_sample = []
for key in metric.keys():
        
    scores_sample.append(metric[key]['rmse'])
    
    if metric[key]['rmse'] > 50:
        
        plt.figure()
        plt.title("{} - {}".format(key, metric[key]['rmse']))
        plt.plot(metric[key]['preds'],'r')
        plt.plot(metric[key]['ytest'],'b', alpha=0.5)

plt.figure(figsize=(20,5))
plt.plot(scores_sample, 'b.')
plt.axhline(y=30)
plt.show()

## Exporting Model

In [None]:
# Exporting General Model

General_model.save_model("ModelFiles/StopModels/General.model")


# Exporting all models 
    
mkeys = list(models.keys())

for k in tnrange(len(mkeys)):
    
    key = mkeys[k]
    
    models[key].save_model(f"ModelFiles/StopModels/{key}.model")