Other analyses of the same data:

https://github.com/charlie1347/TfL_bikes

https://medium.com/@AJOhrn/data-footprint-of-bike-sharing-in-london-be9e11425248

In [655]:
import os
import pandas as pd
import numpy as np
from sklearn import svm
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns
from pathlib import Path
from IPython.display import set_matplotlib_formats

In [2]:
# For pretty and exportable matplotlib plots.
# If you are running this yourself and want interactivity,
# try `%matplotlib widget` instead.
# set_matplotlib_formats("svg")
# %matplotlib inline
%matplotlib widget
# Set a consistent plotting style across the notebook using Seaborn.
sns.set_style("darkgrid")
sns.set_context("notebook")

In [3]:
bikefolder = "./data/bikes"

In [4]:
def add_station_names(station_names, df, namecolumn, idcolumn):
    namemaps = (
        df[[idcolumn, namecolumn]]
        .groupby(idcolumn)
        .aggregate(lambda x: x.unique())
    )
    for number, names in namemaps.iterrows():
        current_names = station_names.get(number, set())
        # The following two lines are a stupid dance around the annoying fact that pd.unique sometimes returns a single value,
        # sometimes a numpy array of values, but since the single value is a string, it too is an iterable.
        vals = names[0]
        new_names = set([vals]) if type(vals) == str else set(vals)
        current_names.update(new_names)
        station_names[number] = current_names


def clean_datetime_column(df, colname, roundto="H"):
    # A bit of a hacky way to use the first entry to figure out which date format this file uses.
    # Not super robust, but works. TODO Improve this.
    if len(df[colname].iloc[0]) > 16:
        format = "%d/%m/%Y %H:%M:%S"
    else:
        format = "%d/%m/%Y %H:%M"
    df[colname] = pd.to_datetime(df[colname], format=format)
    df[colname] = df[colname].dt.round(roundto)
    early_cutoff = pd.datetime(2010, 7, 30)  # When the program started.
    late_cutoff = pd.datetime(2020, 1, 1)  # Approximately now.
    df = df[(late_cutoff > df[colname]) & (df[colname] >= early_cutoff)]
    return df


def compute_single_events(df, which):
    stationcol = "{}Station Id".format(which)
    datecol = "{} Date".format(which)
    events = (
        df.rename(columns={stationcol: "Station", datecol: "Date"})
        .groupby(["Date", "Station"])
        .size()
        .unstack("Station")
    )
    return events


def compute_both_events(df):
    ends = compute_single_events(df, "End")
    starts = compute_single_events(df, "Start")
    both = (
        pd.concat([ends, starts], keys=["End", "Start"], axis=1)
        .reorder_levels([1, 0], axis=1)
        .fillna(0.0)
    )
    return both


def castable_to_int(obj):
    try:
        int(obj)
        return True
    except ValueError:
        return False


def cast_to_int(df, colname):
    try:
        df = df.astype({colname: np.int_}, copy=False)
    except ValueError:
        castable_rows = df[colname].apply(castable_to_int)
        df = df[castable_rows]
        df = df.astype({colname: np.int_}, copy=False)
    return df


events_by_station_path = Path("./events_by_station.p")
if events_by_station_path.exists():
    events_by_station = pd.read_pickle(events_by_station_path)
else:
    datafiles = sorted(os.listdir(bikefolder))
    folderpath = Path(bikefolder)
    datapaths = [folderpath / Path(file) for file in datafiles]
    datapaths = [p for p in datapaths if p.suffix == ".csv"]

    station_allnames = {}

    pieces = []
    #     datapaths = [
    #         folderpath / Path(file)
    #         for file in [
    #             "21JourneyDataExtract31Aug2016-06Sep2016.csv",
    #             "15JourneyDataExtract20Jul2016-26Jul2016.csv",
    #             "13b. Journey Data Extract 22Dec14-03Jan15.csv",
    #             "16JourneyDataExtract27Jul2016-02Aug2016.csv",
    #             "10b. Journey Data Extract 28Sep14-11Oct14.csv",
    #             "6. Journey Data Extract_27May-23Jun12.csv",
    #             "6. Journey Data Extract_27May-23Jun12.csv",
    #         ]
    #     ]
    cols = [
        "Duration",
        "End Date",
        "EndStation Id",
        "EndStation Name",
        "Start Date",
        "StartStation Id",
        "StartStation Name",
    ]
    problem_paths = []
    for path in datapaths:
        print(path)
        try:
            df = pd.read_csv(path, usecols=cols, encoding="ISO-8859-2")
        except ValueError as e:
            # Some files have missing or abnormaly named columns. We'll deal with them later.
            problem_paths.append(path)
            continue
        # Drop any rows that have missing values.
        df = df[~df.isna().any(axis=1)]
        # Drop any anomalously short trips. Probably somebody just taking a bike and putting
        # it right back in.
        df = df[df["Duration"] > 60]
        # Cast the columns to the right types. This is easier ones NAs have been dropped.
        df = cast_to_int(df, "EndStation Id")
        df = cast_to_int(df, "StartStation Id")
        # Turn the date columns from strings into datetime objects rounded to the hour.
        df = clean_datetime_column(df, "End Date")
        df = clean_datetime_column(df, "Start Date")
        events = compute_both_events(df)
        pieces.append(events)

        add_station_names(
            station_allnames, df, "EndStation Name", "EndStation Id"
        )
        add_station_names(
            station_allnames, df, "StartStation Name", "StartStation Id"
        )

    station_ids = {}
    station_names = {}
    for k, v in station_allnames.items():
        v = sorted(v)
        station_names[k] = v[0]
        for name in v:
            station_ids[name] = k

    def get_station_id(name):
        try:
            return station_ids[name]
        except KeyError:
            return np.nan

    print("Doing the problem cases ({} of them).".format(len(problem_paths)))
    safe_cols = [
        "Duration",
        "End Date",
        "EndStation Name",
        "Start Date",
        "StartStation Name",
    ]
    for path in problem_paths:
        print(path)
        df = pd.read_csv(path, usecols=safe_cols, encoding="ISO-8859-2")
        # Drop any rows that have missing values.
        df = df[~df.isna().any(axis=1)]
        # Drop any anomalously short trips. Probably somebody just taking a bike and putting
        # it right back in.
        df = df[df["Duration"] > 60]
        # Add a column of station ids, based on names.
        df["EndStation Id"] = df["EndStation Name"].apply(get_station_id)
        df["StartStation Id"] = df["StartStation Name"].apply(get_station_id)
        # Turn the date columns from strings into datetime objects rounded to the hour.
        clean_datetime_column(df, "End Date")
        clean_datetime_column(df, "Start Date")
        events = compute_both_events(df)
        pieces.append(events)

    events_by_station = pd.concat(pieces).fillna(0.0).sort_index()
    events_by_station = events_by_station.rename(
        mapper=station_names, axis=1, level=0
    )

    events_by_station.to_pickle(events_by_station_path)

In [5]:
a = events_by_station.sum(axis=1, level=0)


def mean_within_window(s):
    starttime = s.ne(0).idxmax()
    endtime = s[::-1].ne(0).idxmax()
    return s[starttime:endtime].mean()


(a.max() / a.aggregate(mean_within_window)).sort_values(ascending=False)[:30]

Station
Electrical Workshop PS                                      1946.693548
PENTON STREET COMMS TEST TERMINAL _ CONTACT MATT McNULTY    1735.230769
tabletop1                                                   1123.777778
Contact Centre, Southbury House                             1027.705263
Pop Up Dock 1                                                824.012017
6                                                            582.662026
Mechanical Workshop Penton                                   248.971754
South Quay East, Canary Wharf                                150.185113
Westfield Eastern Access Road, Shepherd's Bush               100.810728
Thornfield House, Poplar                                      93.689453
Upper Grosvenor Street, Mayfair                               79.666708
Courland Grove , Wandsworth Road                              64.008406
Fore Street Avenue: Guildhall                                 63.179106
Aberfeldy Street, Poplar                                

In [6]:
improper_stations = [
    "Electrical Workshop PS",
    "PENTON STREET COMMS TEST TERMINAL _ CONTACT MATT McNULTY",
    "tabletop1",
    "Pop Up Dock 1",
    "6",
    "Mechanical Workshop Penton",
]
events_by_station = events_by_station.drop(improper_stations, axis=1, level=0)

In [741]:
# TODO What is up with this?
events_by_station["Exhibition Road Museums, Knightsbridge"]

Unnamed: 0_level_0,End,Start,End,Start
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-01-04 00:00:00,0.0,0.0,0.0,0.0
2012-01-04 01:00:00,0.0,0.0,0.0,0.0
2012-01-04 02:00:00,2.0,0.0,0.0,0.0
2012-01-04 03:00:00,0.0,0.0,0.0,0.0
2012-01-04 04:00:00,0.0,0.0,0.0,0.0
...,...,...,...,...
2017-05-16 22:00:00,1.0,1.0,0.0,0.0
2017-05-16 23:00:00,0.0,0.0,0.0,0.0
2017-05-17 00:00:00,1.0,0.0,0.0,0.0
2017-05-17 01:00:00,0.0,0.0,0.0,0.0


In [7]:
stations = events_by_station.columns.get_level_values(0).unique()

In [173]:
events_by_time = events_by_station.sum(axis=1, level=1)
totals_by_station = events_by_station.sum(axis=0)

In [9]:
times = events_by_station.index.to_series()
predictors = pd.DataFrame(
    {
        "Week": times.dt.week,
        "Weekday": times.dt.weekday,
        "Hour": times.dt.hour,
    },
    index=times,
)

In [346]:
class RollingValidator:
    def __init__(self, data, predictors, min_training_time, prediction_time):
        cv_batches = []
        first_time = data.index.min()
        last_time = data.index.max()
        test_end_time = last_time
        cutoff = test_end_time - prediction_time
        while cutoff > first_time + min_training_time:
            training_data = data[:cutoff]
            training_predictors = predictors[:cutoff]
            test_data = data[cutoff:test_end_time]
            test_predictors = predictors[cutoff:test_end_time]
            cv_batches.append(
                (
                    training_data,
                    training_predictors,
                    test_data,
                    test_predictors,
                )
            )
            test_end_time = cutoff
            cutoff = test_end_time - prediction_time
        self.cv_batches = cv_batches
        self.models = {}
        print(
            "Created a RollingValidator with {} cross-validation batches.".format(
                len(cv_batches)
            )
        )

    def test_modelclass(self, modelclass):
        test_errors = []
        training_errors = []
        test_predictions = []
        training_predictions = []
        for (i, (
            training_data,
            training_predictors,
            test_data,
            test_predictors,)
        ) in enumerate(self.cv_batches):
            print("Training for batch {}.".format(i))
            model = modelclass()
            model.train(training_data, training_predictors)
            print("Predicting for batch {}.".format(i))
            test_prediction = model.predict(test_predictors)
            training_prediction = model.predict(training_predictors)
            test_error = test_data - test_prediction
            training_error = training_data - training_prediction
            test_errors.append(test_error)
            training_errors.append(training_error)
            test_predictions.append(test_prediction)
            training_predictions.append(training_predictions)
        test_mse = (pd.concat(test_errors)**2).mean()
        training_mse = (pd.concat(training_errors)**2).mean()
        self.models[modelclass] = {
            "test_mse": test_mse,
            "training_mse": training_mse,
            "test_errors": test_errors,
            "training_errors": training_errors,
            "test_predictions": test_predictions,
            "training_predictions": training_predictions,
        }
        return test_mse, training_mse

In [746]:
test_stations = [
    "Waterloo Station 3, Waterloo",
    "Hyde Park Corner, Hyde Park",
    "Wenlock Road , Hoxton",
    "Stonecutter Street, Holborn",
]
test_columns = sum(
    [[(s, es) for s in test_stations] for es in ("End", "Start")], []
)
rv = RollingValidator(
    events_by_station[test_columns],
    predictors,
    2 * pd.Timedelta("365d"),
    0.5 * pd.Timedelta("365d"),
)

Created a RollingValidator with 6 cross-validation batches.


In [747]:
example_means_over_week = (
    events_by_station[test_stations]
    .groupby([times.dt.weekday, times.dt.hour])
    .sum()
)
example_means_over_week.index.rename(["Day", "Hour"], inplace=True)
example_means_over_week = (
    example_means_over_week.stack(level=[0, 1])
    .reset_index()
    .rename(columns={"level_3": "End/Start", 0: "Count"})
)
example_means_over_week["Weekday"] = example_means_over_week.apply(
    lambda x: x["Day"] + x["Hour"] / 24, axis=1,
)
g = sns.FacetGrid(
    example_means_over_week,
    col_wrap=2,
    col="Station",
    hue="End/Start",
    sharey=False,
    sharex=True,
)
g.map(plt.plot, "Weekday", "Count").set_titles("{col_name}")
g.add_legend();

  fig = plt.figure(figsize=figsize)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [748]:
example_means_over_week = (
    events_by_station[test_stations].groupby(times.dt.week).sum()
)
example_means_over_week.index.rename("Week", inplace=True)
example_means_over_week = (
    example_means_over_week.stack(level=[0, 1])
    .reset_index()
    .rename(columns={"level_2": "End/Start", 0: "Count"})
)
g = sns.FacetGrid(
    example_means_over_week,
    col_wrap=2,
    col="Station",
    hue="End/Start",
    sharey=False,
    sharex=True,
)
g.map(plt.plot, "Week", "Count").set_titles("{col_name}")
g.add_legend();

  fig = plt.figure(figsize=figsize)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [448]:
class MeanByTime:
    def __init__(self):
        self.model = None

    def train(self, data, predictors):
        df = pd.concat((data, predictors), axis=1)
        means = df.groupby(["Week", "Weekday", "Hour"]).mean()
        means.reset_index()
        self.model = means

    def predict(self, predictors):
        means = self.model
        predictions = predictors.join(means, on=["Week", "Weekday", "Hour"])
        predictions = predictions.drop(["Week", "Weekday", "Hour"], axis=1)
        return predictions

In [449]:
rv.test_modelclass(MeanByTime)

Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.


(Station                                   
 Waterloo Station 3, Waterloo         End      257.274328
 Hyde Park Corner, Hyde Park          End      150.216634
 Brushfield Street, Liverpool Street  Start     21.285615
 dtype: float64, Station                                   
 Waterloo Station 3, Waterloo         End      114.131614
 Hyde Park Corner, Hyde Park          End       87.161287
 Brushfield Street, Liverpool Street  Start     12.919588
 dtype: float64)

In [450]:
class SimpleMean:
    def __init__(self):
        self.model = None

    def train(self, data, predictors):
        df = pd.concat((data, predictors), axis=1)
        mean = pd.DataFrame(data.mean())
        m = mean.T
        self.model = mean

    def predict(self, predictors):
        mean = self.model
        predictions = mean.T.apply(
            lambda x: [x[0]] * len(predictors.index)
        ).set_index(predictors.index)
        return predictions

In [451]:
rv.test_modelclass(SimpleMean)

Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.


(Station                                   
 Waterloo Station 3, Waterloo         End      450.098345
 Hyde Park Corner, Hyde Park          End      256.653515
 Brushfield Street, Liverpool Street  Start     32.816423
 dtype: float64, Station                                   
 Waterloo Station 3, Waterloo         End      428.016644
 Hyde Park Corner, Hyde Park          End      213.831273
 Brushfield Street, Liverpool Street  Start     37.222466
 dtype: float64)

The first question is, can we do linear models. The answer is most probably not. This is because our predictors need to be thought of as either categorical or as cyclic, and this doesn't work well with linear models.

TODO Read up on how people do linear models on categorical data.

Well then, can we use the kernel trick? Yes! But we need to think carefully about the kernel. Inner-product based kernels don't make sense, since things like labeling weekdays as integers 0-6 doesn't really come with a sensible notion of magnitude for the elements. But differences of the elements do make sense, _as long as we take them with a modulus_. This means that we can work for instance with the RBF kernel, as long as we take differences with a modulus. Like this:

In [560]:
sigma = 1/4
def modrbf(a, b):
    print(a.shape)
    print(b.shape)
    result = np.zeros((a.shape[0], b.shape[0]))
    for i in range(a.shape[1]):
        ai = a[:, i]
        bi = b[:, i]
        x, y = np.meshgrid(ai, bi,indexing="ij")
        diff = ((x - y + 0.5) % 1) - 0.5
        expdiff = np.exp(-(diff ** 2)/(2*sigma**2))
        result += expdiff
    return result

One thing we could try to use this kernel with is support vector machines. scikit-learn as an implementation of support vector machine based regression, but unfortunately it is too heavy to run in our case. The memory cost of SVMs scales as number of samples squared. We have about ~50,000 samples in each training set, which is too much to store as a full matrix (would take of the order of 20 gigs of RAM). sklearn's training algorithm, based on a C++ library called libsvm, is smart enough to avoid constructing this matrix in full, but only if it gets to use one if it's built-in kernels. For custom kernels, it's either 20 gigs of RAM or go home. Too bad.

We can run the SVR with the usual, built-in RBF kernel, without the modulus stuff. It doesn't make too much sense, but let's do it anyway.

In [555]:
class SVR:
    def __init__(self):
        self.model = {}

    @staticmethod
    def normalize_predictors(p):
        return p.apply(lambda x: x / x.max())
        
    def train(self, data, predictors):
        predictors = self.normalize_predictors(predictors)
        for c in data.columns:
            model = svm.SVR(kernel="rbf", gamma=0.1, cache_size=2000)
            model.fit(predictors, data[c])
            self.model[c] = model

    def predict(self, predictors):
        predictors = self.normalize_predictors(predictors)
        model = self.model
        predictions = pd.DataFrame({"Time": predictors.index}).set_index("Time")
        for k, v in model.items():
            predictions[k] = v.predict(predictors)
        return predictions

In [556]:
rv.test_modelclass(SVR)

Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.


(Station                                   
 Waterloo Station 3, Waterloo         End      427.068123
 Hyde Park Corner, Hyde Park          End      173.079288
 Brushfield Street, Liverpool Street  Start     22.921515
 dtype: float64, Station                                   
 Waterloo Station 3, Waterloo         End      401.596921
 Hyde Park Corner, Hyde Park          End      127.162489
 Brushfield Street, Liverpool Street  Start     25.727630
 dtype: float64)

As expected, the results aren't great. They are signifcantly worse than our basic averaging model (though still better than the really simple-minded average), so no cookie for SVM. Sure, we didn't do any hyperparameter tuning, but still.

KernelRidge also can't handle this amount of training data with chocking on RAM. LinearSVR is as good as SimpleMean, i.e. really bad.

In [585]:
events_by_station.loc[:, ].rolling("180d").mean().plot()

  fig = self.plt.figure(figsize=self.figsize)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7f7f0be3af10>

('Windsor Terrace, Hoxton', 'End')
('Windsor Terrace, Hoxton', 'Start')
('Fanshaw Street, Hoxton', 'End')
('Fanshaw Street, Hoxton', 'Start')
('Wenlock Road , Hoxton', 'End')
('Wenlock Road , Hoxton', 'Start')
('East Road, Hoxton', 'End')
('East Road, Hoxton', 'Start')
('Murray Grove , Hoxton', 'End')
('Murray Grove , Hoxton', 'Start')
('Falkirk Street, Hoxton', 'End')
('Falkirk Street, Hoxton', 'Start')
('Shoreditch Park, Hoxton', 'End')
('Shoreditch Park, Hoxton', 'Start')
('New North Road 1, Hoxton', 'End')
('New North Road 1, Hoxton', 'Start')
('New North Road 2, Hoxton', 'End')
('New North Road 2, Hoxton', 'Start')
('Pitfield Street (North),Hoxton', 'End')
('Pitfield Street (North),Hoxton', 'Start')
('Hoxton Station, Haggerston', 'End')
('Hoxton Station, Haggerston', 'Start')
('Pitfield Street Central, Hoxton', 'End')
('Pitfield Street Central, Hoxton', 'Start')
('Eagle Wharf Road, Hoxton', 'End')
('Eagle Wharf Road, Hoxton', 'Start')
('Hoxton Street, Hoxton', 'End')
('Hoxton Stre

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [578]:
events_by_station.columns

MultiIndex([(             'River Street , Clerkenwell',   'End'),
            (             'River Street , Clerkenwell', 'Start'),
            (         'Phillimore Gardens, Kensington',   'End'),
            (         'Phillimore Gardens, Kensington', 'Start'),
            (   'Christopher Street, Liverpool Street',   'End'),
            (   'Christopher Street, Liverpool Street', 'Start'),
            (        'St. Chad's Street, King's Cross',   'End'),
            (        'St. Chad's Street, King's Cross', 'Start'),
            (          'Sedding Street, Sloane Square',   'End'),
            (          'Sedding Street, Sloane Square', 'Start'),
            ...
            (            'Riverlight South, Nine Elms',   'End'),
            (            'Riverlight South, Nine Elms', 'Start'),
            (           'One Tower Bridge, Bermondsey',   'End'),
            (           'One Tower Bridge, Bermondsey', 'Start'),
            (           'Belvedere Road 2, South Bank',   'E