Other analyses of the same data:

https://github.com/charlie1347/TfL_bikes

https://medium.com/@AJOhrn/data-footprint-of-bike-sharing-in-london-be9e11425248

In [1]:
import os
import pandas as pd
import numpy as np
import scipy as sp
import statsmodels.api as sm
from sklearn import linear_model, svm, neighbors, naive_bayes, tree
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns
from pathlib import Path
from IPython.display import set_matplotlib_formats

In [2]:
# For pretty and exportable matplotlib plots.
# If you are running this yourself and want interactivity,
# try `%matplotlib widget` instead.
# set_matplotlib_formats("svg")
# %matplotlib inline
%matplotlib widget
# Set a consistent plotting style across the notebook using Seaborn.
sns.set_style("darkgrid")
sns.set_context("notebook")

# Processing and cleaning the data

In [3]:
bikefolder = "./data/bikes"

In [4]:
def add_station_names(station_names, df, namecolumn, idcolumn):
    """Record every name under which each station id appears in ``df``.

    Parameters
    ----------
    station_names : dict
        Maps station id -> set of names seen so far. Updated in place.
    df : pandas.DataFrame
        Frame containing both ``idcolumn`` and ``namecolumn``.
    namecolumn : str
        Column holding the station names.
    idcolumn : str
        Column holding the station ids.

    Returns
    -------
    None
    """
    # SeriesGroupBy.unique() always yields an array of names per id, so there is
    # no need to special-case ids that only ever appear under a single name.
    names_by_id = df.groupby(idcolumn)[namecolumn].unique()
    for number, names in names_by_id.items():
        station_names.setdefault(number, set()).update(names)


def clean_datetime_column(df, colname, roundto="H"):
    """Parse a string date column, round it, and drop rows outside the valid range.

    The column ``colname`` of ``df`` is overwritten in place with rounded
    datetime values. The returned frame additionally has every row removed
    whose date falls outside ``[2010-07-30, 2020-01-01)``, so callers should
    use the return value rather than rely on the in-place conversion alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``colname`` column holds dates as ``dd/mm/YYYY HH:MM`` or
        ``dd/mm/YYYY HH:MM:SS`` strings.
    colname : str
        Name of the date column.
    roundto : str
        Frequency string passed to ``Series.dt.round``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose date lies within the valid range.
    """
    # A bit of a hacky way to use the first entry to figure out which date format this file uses.
    # Not super robust, but works. TODO Improve this.
    # An empty frame has no first entry; either format parses an empty column.
    first_entry = df[colname].iloc[0] if len(df) > 0 else ""
    if len(first_entry) > 16:
        date_format = "%d/%m/%Y %H:%M:%S"
    else:
        date_format = "%d/%m/%Y %H:%M"
    df[colname] = pd.to_datetime(df[colname], format=date_format)
    df[colname] = df[colname].dt.round(roundto)
    # pd.Timestamp replaces the pd.datetime alias, which no longer exists in current pandas.
    early_cutoff = pd.Timestamp(2010, 7, 30)  # When the program started.
    late_cutoff = pd.Timestamp(2020, 1, 1)  # Approximately now.
    in_range = (df[colname] >= early_cutoff) & (df[colname] < late_cutoff)
    return df[in_range]


def compute_single_events(df, which):
    """Count journeys per (date, station) for one kind of event.

    ``which`` is the column prefix ("End" or "Start" in this notebook); the
    columns ``"<which>Station Id"`` and ``"<which> Date"`` are used. The result
    has one row per date and one column per station id, with NaN where a
    station had no events on that date.
    """
    renames = {
        "{}Station Id".format(which): "Station",
        "{} Date".format(which): "Date",
    }
    renamed = df.rename(columns=renames)
    counts = renamed.groupby(["Date", "Station"]).size()
    return counts.unstack("Station")


def compute_both_events(df):
    """Build a table of arrival and departure counts per date and station.

    The result has two column levels, station id on top and "End"/"Start"
    below it; combinations without any events are filled with 0.0.
    """
    kinds = ["End", "Start"]
    per_kind = [compute_single_events(df, kind) for kind in kinds]
    combined = pd.concat(per_kind, keys=kinds, axis=1)
    # concat puts the "End"/"Start" key on the outer level; move the station level on top.
    combined = combined.reorder_levels([1, 0], axis=1)
    return combined.fillna(0.0)


def castable_to_int(obj):
    """Return True if ``int(obj)`` succeeds, False otherwise.

    Besides unparsable strings (ValueError), this also treats objects of an
    unsupported type such as None (TypeError) and infinite floats
    (OverflowError) as not castable instead of letting the exception escape.
    """
    try:
        int(obj)
    except (ValueError, TypeError, OverflowError):
        return False
    return True


def cast_to_int(df, colname):
    """Return ``df`` with column ``colname`` converted to integers.

    If the whole column cannot be converted, the rows whose value in
    ``colname`` is not castable to int are dropped before converting.
    """
    target_dtypes = {colname: np.int_}
    try:
        return df.astype(target_dtypes, copy=False)
    except ValueError:
        # At least one entry is not a valid integer: keep only the rows that are.
        keep = df[colname].apply(castable_to_int)
        return df[keep].astype(target_dtypes, copy=False)


# Build (or load from cache) `events_by_station`: an hourly table with one row per
# timestamp and two column levels, (station name, "End"/"Start"), holding the number
# of journeys ending/starting at that station during that hour.
events_by_station_path = Path("./events_by_station.p")
if events_by_station_path.exists():
    # Reuse the result of a previous run. Delete the pickle to force reprocessing.
    events_by_station = pd.read_pickle(events_by_station_path)
else:
    datafiles = sorted(os.listdir(bikefolder))
    folderpath = Path(bikefolder)
    datapaths = [folderpath / Path(file) for file in datafiles]
    datapaths = [p for p in datapaths if p.suffix == ".csv"]

    # Maps station id -> set of all names that id has appeared under.
    station_allnames = {}

    pieces = []
    # Debugging aid: uncomment to process only a small subset of files.
    #     datapaths = [
    #         folderpath / Path(file)
    #         for file in [
    #             "21JourneyDataExtract31Aug2016-06Sep2016.csv",
    #             "15JourneyDataExtract20Jul2016-26Jul2016.csv",
    #             "13b. Journey Data Extract 22Dec14-03Jan15.csv",
    #             "16JourneyDataExtract27Jul2016-02Aug2016.csv",
    #             "10b. Journey Data Extract 28Sep14-11Oct14.csv",
    #             "6. Journey Data Extract_27May-23Jun12.csv",
    #             "6. Journey Data Extract_27May-23Jun12.csv",
    #         ]
    #     ]
    cols = [
        "Duration",
        "End Date",
        "EndStation Id",
        "EndStation Name",
        "Start Date",
        "StartStation Id",
        "StartStation Name",
    ]
    problem_paths = []
    for path in datapaths:
        print(path)
        try:
            df = pd.read_csv(path, usecols=cols, encoding="ISO-8859-2")
        except ValueError:
            # Some files have missing or abnormally named columns. We'll deal with them later.
            problem_paths.append(path)
            continue
        # Drop any rows that have missing values.
        df = df[~df.isna().any(axis=1)]
        # Drop any anomalously short trips. Probably somebody just taking a bike and putting
        # it right back in.
        df = df[df["Duration"] > 60]
        # Cast the columns to the right types. This is easier once NAs have been dropped.
        df = cast_to_int(df, "EndStation Id")
        df = cast_to_int(df, "StartStation Id")
        # Turn the date columns from strings into datetime objects rounded to the hour.
        df = clean_datetime_column(df, "End Date")
        df = clean_datetime_column(df, "Start Date")
        events = compute_both_events(df)
        pieces.append(events)

        add_station_names(
            station_allnames, df, "EndStation Name", "EndStation Id"
        )
        add_station_names(
            station_allnames, df, "StartStation Name", "StartStation Id"
        )

    # station_names: id -> canonical name (alphabetically first of the names seen).
    # station_ids: every known name -> id, used to recover ids for files that lack them.
    station_ids = {}
    station_names = {}
    for k, v in station_allnames.items():
        v = sorted(v)
        station_names[k] = v[0]
        for name in v:
            station_ids[name] = k

    def get_station_id(name):
        """Return the id recorded for a station name, or NaN if the name is unknown."""
        return station_ids.get(name, np.nan)

    print("Doing the problem cases ({} of them).".format(len(problem_paths)))
    safe_cols = [
        "Duration",
        "End Date",
        "EndStation Name",
        "Start Date",
        "StartStation Name",
    ]
    for path in problem_paths:
        print(path)
        df = pd.read_csv(path, usecols=safe_cols, encoding="ISO-8859-2")
        # Drop any rows that have missing values.
        df = df[~df.isna().any(axis=1)]
        # Drop any anomalously short trips. Probably somebody just taking a bike and putting
        # it right back in. Copy so that the new columns below are added to an independent frame.
        df = df[df["Duration"] > 60].copy()
        # Add a column of station ids, based on names.
        df["EndStation Id"] = df["EndStation Name"].apply(get_station_id)
        df["StartStation Id"] = df["StartStation Name"].apply(get_station_id)
        # Turn the date columns from strings into datetime objects rounded to the hour.
        # The return value must be kept: it is the frame with out-of-range dates removed.
        df = clean_datetime_column(df, "End Date")
        df = clean_datetime_column(df, "Start Date")
        events = compute_both_events(df)
        pieces.append(events)

    events_by_station = pd.concat(pieces).fillna(0.0)
    # Several files may have contained entries for the same hour, which means that
    # events_by_station has duplicate entries in the index. Get rid of them by summing.
    events_by_station = events_by_station.groupby("Date").sum().sort_index()
    # Finally rename the columns according to the chosen names for stations.
    events_by_station = events_by_station.rename(
        mapper=station_names, axis=1, level=0
    )

    events_by_station.to_pickle(events_by_station_path)

In [5]:
def mean_within_window(s):
    """Mean of ``s`` between its first and last non-zero entries (inclusive).

    Leading and trailing runs of zeros are ignored; zeros in between still
    count towards the mean.
    """
    is_nonzero = s.ne(0)
    first_active = is_nonzero.idxmax()
    # idxmax on the reversed mask gives the label of the last non-zero entry.
    last_active = is_nonzero[::-1].idxmax()
    return s[first_active:last_active].mean()


# Total events (arrivals + departures) per station and hour. Grouping the transposed
# frame replaces DataFrame.sum(axis=1, level=0), which current pandas no longer supports.
a = events_by_station.T.groupby(level=0).sum().T
# Ratio of the busiest hour to the mean over the station's active period: very large
# values point at stations that were only used in a few isolated bursts.
(a.max() / a.aggregate(mean_within_window)).sort_values(ascending=False).head(30)

Station
Electrical Workshop PS                                      1757.661290
PENTON STREET COMMS TEST TERMINAL _ CONTACT MATT McNULTY    1615.675676
tabletop1                                                   1066.166667
Contact Centre, Southbury House                              780.694737
6                                                            567.974684
Pop Up Dock 1                                                567.806009
Mechanical Workshop Penton                                   223.818810
South Quay East, Canary Wharf                                133.732653
Westfield Eastern Access Road, Shepherd's Bush                88.865062
Thornfield House, Poplar                                      83.572294
Fore Street Avenue: Guildhall                                 72.241404
Upper Grosvenor Street, Mayfair                               71.496163
Courland Grove , Wandsworth Road                              57.658476
Manfred Road, East Putney                               

In [6]:
improper_stations = [
    "Electrical Workshop PS",
    "PENTON STREET COMMS TEST TERMINAL _ CONTACT MATT McNULTY",
    "tabletop1",
    "Pop Up Dock 1",
    "6",
    "Mechanical Workshop Penton",
]
events_by_station = events_by_station.drop(improper_stations, axis=1, level=0)

In [7]:
# TODO What is up with this?
events_by_station["Exhibition Road Museums, Knightsbridge"]

Unnamed: 0_level_0,End,Start,End,Start
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-01-04 00:00:00,0.0,0.0,0.0,0.0
2012-01-04 01:00:00,0.0,0.0,0.0,0.0
2012-01-04 02:00:00,2.0,0.0,0.0,0.0
2012-01-04 03:00:00,0.0,0.0,0.0,0.0
2012-01-04 04:00:00,0.0,0.0,0.0,0.0
...,...,...,...,...
2017-05-16 22:00:00,1.0,1.0,0.0,0.0
2017-05-16 23:00:00,0.0,0.0,0.0,0.0
2017-05-17 00:00:00,1.0,0.0,0.0,0.0
2017-05-17 01:00:00,0.0,0.0,0.0,0.0


# Exploring

In [8]:
# Names of all stations (top column level).
stations = events_by_station.columns.get_level_values(0).unique()
# Network-wide "End"/"Start" totals per hour. Grouping the transposed frame replaces
# DataFrame.sum(axis=1, level=1), which current pandas no longer supports.
events_by_time = events_by_station.T.groupby(level=1).sum().T
# Total count over the whole period for each (station, "End"/"Start") column.
totals_by_station = events_by_station.sum(axis=0)
# The timestamps as a Series, so that the .dt accessor can be used on them.
times = events_by_station.index.to_series()

In [9]:
test_stations = [
    "Waterloo Station 3, Waterloo",
    "Hyde Park Corner, Hyde Park",
    "Wenlock Road , Hoxton",
    "Stonecutter Street, Holborn",
]

In [10]:
# TODO Give the plots widths from variance or something like [25%, 75%] limits (what are these called again?).
# Sum the events of the example stations per (weekday, hour) slot and reshape the
# result into long form: one row per slot, station and "End"/"Start".
example_means_over_week = (
    events_by_station[test_stations]
    .groupby([times.dt.weekday, times.dt.hour])
    .sum()
    .rename_axis(["Day", "Hour"])
    .stack(level=[0, 1])
    .reset_index()
    .rename(columns={"level_3": "End/Start", 0: "Count"})
    # Fractional position within the week, e.g. 1.5 is weekday 1 at 12:00.
    .assign(Weekday=lambda frame: frame["Day"] + frame["Hour"] / 24)
)
g = sns.FacetGrid(
    example_means_over_week,
    col="Station",
    col_wrap=2,
    hue="End/Start",
    sharex=True,
    sharey=False,
)
g.map(plt.plot, "Weekday", "Count")
g.set_titles("{col_name}")
g.add_legend();

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [11]:
# TODO Give the plots widths from variance or something like [25%, 75%] limits (what are these called again?).
# Sum the events of the example stations per ISO week number. The week number is taken
# from each timestamp's isocalendar() because Series.dt.week no longer exists in
# current pandas; this form works on old and new versions alike.
example_means_over_year = (
    events_by_station[test_stations]
    .groupby(times.map(lambda t: t.isocalendar()[1]))
    .sum()
)
# Leave out the first and last weeks, since they are usually shorter and thus the data isn't comparable.
# NOTE(review): positions 1..50 are kept, i.e. weeks 2-51 if all of weeks 1-53 occur — confirm
# that dropping week 52 as well is intended.
example_means_over_year = example_means_over_year.iloc[1:51]
example_means_over_year = (
    example_means_over_year.rename_axis("Week")
    .stack(level=[0, 1])
    .reset_index()
    .rename(columns={"level_2": "End/Start", 0: "Count"})
)
g = sns.FacetGrid(
    example_means_over_year,
    col_wrap=2,
    col="Station",
    hue="End/Start",
    sharey=False,
    sharex=True,
)
g.map(plt.plot, "Week", "Count").set_titles("{col_name}")
g.add_legend();

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [12]:
# Rolling mean over a 365-day window for the example stations, reshaped into long form.
# No value is produced until the window holds at least 24 * 90 hourly rows.
yearly_rolling = (
    events_by_station.loc[:, test_stations]
    .rolling("365d", min_periods=24 * 90)
    .mean()
    .stack(level=[0, 1])
    .reset_index()
    .rename(columns={"level_2": "End/Start", 0: "Count"})
)
g = sns.FacetGrid(
    yearly_rolling,
    col="Station",
    col_wrap=2,
    hue="End/Start",
    sharex=True,
    sharey=False,
)
g.map(plt.plot, "Date", "Count")
g.set_titles("{col_name}")
g.add_legend();

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …


To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


# Feature extraction

Given the extremely strong weekly pattern every station seems to have, one thing we do straight away is, instead of trying to predict the data itself, try to predict the change compared to a week ago. This makes fitting models much easier since they don't have to concentrate on trying to get the shape of the weekly trend right. The data also naturally becomes roughly mean-zero, which is useful for some models. Moreover, the long-term trends, which may be entirely unpredictable given the data we have, don't ruin our results once we try to predict week-on-week changes. The only downside is that we have to discard the first week of our data.

In [13]:
class RollingValidator:
    """Rolling-origin cross-validation for time-indexed data.

    The time range of ``data`` is cut, starting from the end, into consecutive
    test windows of length ``prediction_time``. Each batch pairs one test
    window with all the data before it as training set; batches are created
    for as long as the training set spans more than ``min_training_time``.

    Attributes
    ----------
    cv_batches : list of tuple
        Each tuple is ``(training_data, training_common_predictors,
        training_specific_predictors, test_data, test_common_predictors,
        test_specific_predictors)``. The first batch is the latest in time.
    models : dict
        Results of ``test_modelclass``, keyed by the model's ``classname``.
    """

    def __init__(
        self,
        data,
        common_predictors,
        specific_predictors,
        min_training_time,
        prediction_time,
    ):
        """Split the three time-indexed inputs into cross-validation batches.

        ``data``, ``common_predictors`` and ``specific_predictors`` are sliced
        with the same time labels; ``min_training_time`` and
        ``prediction_time`` are durations that can be added to and subtracted
        from the index values of ``data``.
        """
        cv_batches = []
        first_time = data.index.min()
        last_time = data.index.max()
        test_end_time = last_time
        cutoff = test_end_time - prediction_time
        while cutoff > first_time + min_training_time:
            # NOTE(review): label slices include both end points, so the row at
            # `cutoff` is part of the training set and of the test set, and
            # neighbouring test windows share their boundary row.
            cv_batches.append(
                (
                    data.loc[:cutoff],
                    common_predictors.loc[:cutoff],
                    specific_predictors.loc[:cutoff],
                    data.loc[cutoff:test_end_time],
                    common_predictors.loc[cutoff:test_end_time],
                    specific_predictors.loc[cutoff:test_end_time],
                )
            )
            # Step one window back in time.
            test_end_time = cutoff
            cutoff = test_end_time - prediction_time
        self.cv_batches = cv_batches
        self.models = {}
        print(
            "Created a RollingValidator with {} cross-validation batches.".format(
                len(cv_batches)
            )
        )

    def test_modelclass(self, modelclass, print_progress=False):
        """Train and evaluate a fresh ``modelclass`` instance on every batch.

        ``modelclass`` must be constructible without arguments, have a
        ``classname`` attribute and provide ``train(data, common_predictors,
        specific_predictors)`` and ``predict(common_predictors,
        specific_predictors)``.

        The per-batch errors and predictions, and the per-column mean absolute
        errors, are stored in ``self.models[modelclass.classname]``.

        Returns
        -------
        The test mean absolute error summed over all columns.
        """
        test_errors = []
        training_errors = []
        test_predictions = []
        training_predictions = []
        for i, cv_batch in enumerate(self.cv_batches):
            (
                training_data,
                training_common_predictors,
                training_specific_predictors,
                test_data,
                test_common_predictors,
                test_specific_predictors,
            ) = cv_batch
            if print_progress:
                print("Training for batch {}.".format(i))
            model = modelclass()
            model.train(
                training_data,
                training_common_predictors,
                training_specific_predictors,
            )
            if print_progress:
                print("Predicting for batch {}.".format(i))
            test_prediction = model.predict(
                test_common_predictors, test_specific_predictors
            )
            training_prediction = model.predict(
                training_common_predictors, training_specific_predictors
            )
            test_errors.append(test_prediction - test_data)
            training_errors.append(training_prediction - training_data)
            test_predictions.append(test_prediction)
            # Store this batch's prediction (not the accumulating list itself).
            training_predictions.append(training_prediction)
        test_mae = pd.concat(test_errors).abs().mean()
        training_mae = pd.concat(training_errors).abs().mean()
        self.models[modelclass.classname] = {
            "test_mae": test_mae,
            "training_mae": training_mae,
            "test_errors": test_errors,
            "training_errors": training_errors,
            "test_predictions": test_predictions,
            "training_predictions": training_predictions,
        }
        return test_mae.sum()

In [14]:
# Calendar features shared by all stations, in two alternative encodings:
# `predictors_dum` uses one-hot dummies for week, hour and weekday, while
# `predictors_trig` encodes week and hour as sine/cosine pairs plus weekday dummies.

# ISO week number of every timestamp. Taken from isocalendar() because
# Series.dt.week no longer exists in current pandas.
week_of_year = times.map(lambda t: t.isocalendar()[1])

# dt.day_name() replaces the removed dt.weekday_name attribute.
weekday_dummies = pd.get_dummies(times.dt.day_name())
week_dummies = pd.get_dummies(week_of_year)
hour_dummies = pd.get_dummies(times.dt.hour)
hour_dummies = hour_dummies.rename(
    columns={c: "Hour {}".format(c) for c in hour_dummies.columns}
)
week_dummies = week_dummies.rename(
    columns={c: "Week {}".format(c) for c in week_dummies.columns}
)
predictors_dum = pd.concat(
    [week_dummies, hour_dummies, weekday_dummies,], axis=1,
)

day_angles = 2 * np.pi * times.dt.hour / 24
year_angles = (
    2 * np.pi * week_of_year / 52
)  # TODO Should we do this by day or week or month?
predictors_trig = pd.concat(
    [
        pd.DataFrame(
            {
                "Year sine": np.sin(year_angles),
                "Year cosine": np.cos(year_angles),
            }
        ),
        pd.DataFrame(
            {"Day sine": np.sin(day_angles), "Day cosine": np.cos(day_angles)}
        ),
        weekday_dummies,
    ],
    axis=1,
)

In [15]:
test_columns = [
    ("Waterloo Station 3, Waterloo", "Start"),
    ("Hyde Park Corner, Hyde Park", "End"),
    ("Wenlock Road , Hoxton", "End"),
]
# test_columns = sum(
#     [[(s, es) for s in test_stations] for es in ("End", "Start")], []
# )

In [16]:
# weekly_rolling = changes.rolling("7d").mean()
# weekly_rolling = weekly_rolling - weekly_rolling.mean()
# weekly_rolling = weekly_rolling / weekly_rolling.std()

In [17]:
weekoffset = events_by_station.shift(freq=pd.Timedelta("7d"))

In [18]:
# Restrict the targets and all predictors to the common time range in which a
# week-old observation is available.
# NOTE: this cell overwrites weekoffset, predictors_trig and predictors_dum with
# their restricted versions, so it operates on its own previous output when re-run.
# Earliest timestamp of the shifted frame.
first_time = weekoffset.index.min()
last_time = events_by_station.index.max()
# The prediction targets: only the test columns, within the common range.
data = events_by_station.loc[first_time:last_time, test_columns]
# Put the shifted observations onto exactly the same timestamps as `data`.
weekoffset = weekoffset.reindex(
    data.index, method="nearest"
)  # TODO Is "nearest" good?
# weekly_rolling = weekly_rolling[first_time:last_time]
predictors_trig = predictors_trig.loc[first_time:last_time, :]
predictors_dum = predictors_dum.loc[first_time:last_time, :]

In [19]:
specific_predictors = weekoffset[test_columns]
min_training_time = 2 * pd.Timedelta("365d")
prediction_time = 0.5 * pd.Timedelta("365d")
rv_trig = RollingValidator(
    data,
    predictors_trig,
    specific_predictors,
    min_training_time,
    prediction_time,
)
rv_dum = RollingValidator(
    data,
    predictors_dum,
    specific_predictors,
    min_training_time,
    prediction_time,
)

Created a RollingValidator with 6 cross-validation batches.
Created a RollingValidator with 6 cross-validation batches.


In [20]:
# For debugging
# dbg_timerange = slice("2017-01-01", None)
# min_training_time = 2 * pd.Timedelta("30d")
# prediction_time = 0.5 * pd.Timedelta("30d")
# rv_trig = RollingValidator(
#     data[dbg_timerange],
#     predictors_trig[dbg_timerange],
#     specific_predictors[dbg_timerange],
#     min_training_time,
#     prediction_time,
# )
# rv_dum = RollingValidator(
#     data[dbg_timerange],
#     predictors_dum[dbg_timerange],
#     specific_predictors[dbg_timerange],
#     min_training_time,
#     prediction_time,
# )

# Models

In [21]:
class SimpleMean:
    """Baseline model that always predicts the training mean of each column."""

    classname = "SimpleMean"

    def __init__(self):
        # One-column DataFrame of per-column means, set by train().
        self.model = None

    def train(self, data, common_predictors, specific_predictors):
        """Store the mean of every column of ``data``; the predictors are ignored."""
        mean = pd.DataFrame(data.mean())
        self.model = mean

    def predict(self, common_predictors, specific_predictors):
        """Return a frame indexed like ``common_predictors`` with the stored means.

        The result has one column per column of the training data, each filled
        with that column's training mean. ``specific_predictors`` is ignored.
        """
        means = self.model.iloc[:, 0]
        index = common_predictors.index
        # Repeat the row of means once per requested timestamp. Building the
        # frame explicitly avoids depending on how DataFrame.apply expands
        # list-valued results.
        values = np.tile(means.values, (len(index), 1))
        predictions = pd.DataFrame(values, index=index, columns=means.index)
        return predictions

In [22]:
class LastWeek:
    """Baseline model whose prediction is the specific predictors, unchanged.

    In this notebook the specific predictors are the observations shifted by
    one week, so the model predicts "same as a week ago".
    """

    classname = "LastWeek"

    def train(self, data, common_predictors, specific_predictors):
        """Nothing to fit."""
        return None

    def predict(self, common_predictors, specific_predictors):
        """Return ``specific_predictors`` as the prediction."""
        return specific_predictors

Results for Linear are bad because all time variation has to be essentially sinusoidal.

In [23]:
class GenericModel:
    """Base class that fits one independent regressor per data column.

    Subclasses set ``regressor`` to a callable returning a fresh object with
    ``fit(X, y)`` and ``predict(X)`` methods, and ``classname`` to a label.
    The features for a column are the common predictors plus, if available,
    that column's own specific predictor scaled to unit standard deviation.
    """

    # Place-holders for subclasses to replace.
    regressor = None
    classname = None

    def __init__(self):
        # Maps data column -> fitted regressor.
        self.model = {}
        # Standard deviations of the specific predictors seen in train().
        self.std = None

    @classmethod
    def normalize_predictor(cls, predictor):
        """Scale ``predictor`` to unit standard deviation.

        Returns the scaled predictor and the standard deviation(s) that were
        divided out, so the same scaling can be applied to new data.
        """
        std = predictor.std()
        return predictor / std, std

    @staticmethod
    def _design_matrix(column, common_predictors, specific_predictors):
        """Return the feature frame used for ``column``."""
        if specific_predictors is not None and column in specific_predictors:
            return pd.concat(
                [common_predictors, specific_predictors[column]], axis=1
            )
        return common_predictors

    def train(self, data, common_predictors, specific_predictors):
        """Fit one regressor for every column of ``data``.

        ``specific_predictors`` may be None, in which case only the common
        predictors are used.
        """
        if specific_predictors is not None:
            specific_predictors, self.std = self.normalize_predictor(
                specific_predictors
            )

        for c in data.columns:
            predictors = self._design_matrix(
                c, common_predictors, specific_predictors
            )
            model = self.regressor()
            model.fit(predictors, data[c])
            self.model[c] = model

    def predict(self, common_predictors, specific_predictors):
        """Predict every trained column; returns a frame indexed by time.

        ``specific_predictors`` are scaled with the standard deviations stored
        during training, so they should be None exactly when they were None in
        ``train``.
        """
        if specific_predictors is not None:
            specific_predictors = specific_predictors / self.std
        index = common_predictors.index
        predictions = pd.DataFrame({"Time": index}).set_index("Time")
        for c, m in self.model.items():
            predictors = self._design_matrix(
                c, common_predictors, specific_predictors
            )
            predictions[c] = m.predict(predictors)
        return predictions

In [24]:
# Concrete models: each subclass only selects the regressor that GenericModel
# instantiates once per data column via `self.regressor()`.
#
# Where constructor arguments are needed, the regressor is a lambda. Because it
# is a function stored on the class, calling it through `self` passes the model
# instance as the first argument; that is the unused `x` parameter.
class Linear(GenericModel):
    classname = "Linear"
    regressor = linear_model.Ridge


class LinearSVR(GenericModel):
    classname = "LinearSVR"
    regressor = lambda x: svm.LinearSVR(max_iter=5000)


class KNeighbors(GenericModel):
    classname = "KNeighbors"
    regressor = lambda x: neighbors.KNeighborsRegressor(
        n_neighbors=5, weights="distance"
    )


class SVRrbf(GenericModel):
    classname = "SVRrbf"
    regressor = lambda x: svm.SVR(kernel="rbf", cache_size=500)


class SVRpoly(GenericModel):
    classname = "SVRpoly"
    regressor = lambda x: svm.SVR(kernel="poly", cache_size=500)


class SVRsigmoid(GenericModel):
    classname = "SVRsigmoid"
    regressor = lambda x: svm.SVR(kernel="sigmoid", cache_size=500)


# class GaussianNB(GenericModel):
#     classname = "GaussianNB"
#     regressor = lambda x: naive_bayes.GaussianNB()


class DecisionTree(GenericModel):
    classname = "DecisionTree"
    # NOTE(review): newer scikit-learn releases appear to spell this criterion
    # "absolute_error" — confirm against the installed version.
    regressor = lambda x: tree.DecisionTreeRegressor(
        criterion="mae", max_depth=3
    )

Note that linear models probably fit to squared error, we measure absolute error.

KernelRidge also can't handle this amount of training data without choking on RAM.

In [25]:
def make_diffmodelclass(modelclass):
    """Wrap ``modelclass`` so that it models the difference to the specific predictors.

    The returned subclass trains the parent model on ``data`` minus the
    specific predictors (passing no specific predictors to the parent) and
    adds the specific predictors back onto the parent's predictions.
    """

    class DiffModel(modelclass):
        parent = modelclass
        classname = "DiffModel({})".format(modelclass.classname)

        def train(self, data, common_predictors, specific_predictors):
            # Work on a copy so that the caller's frame is left untouched.
            residuals = data.copy()
            for column in residuals.columns:
                residuals[column] = (
                    residuals[column] - specific_predictors[column]
                )
            super(DiffModel, self).train(residuals, common_predictors, None)

        def predict(self, common_predictors, specific_predictors):
            result = super(DiffModel, self).predict(common_predictors, None)
            for column in result.columns:
                result[column] = result[column] + specific_predictors[column]
            return result

    return DiffModel

In [26]:
DiffSimpleMean = make_diffmodelclass(SimpleMean)
DiffLinear = make_diffmodelclass(Linear)
DiffKNeighbors = make_diffmodelclass(KNeighbors)
DiffLinearSVR = make_diffmodelclass(LinearSVR)
# DiffGaussianNB = make_diffmodelclass(GaussianNB)
DiffDecisionTree = make_diffmodelclass(DecisionTree)
DiffSVRrbf = make_diffmodelclass(SVRrbf)
DiffSVRpoly = make_diffmodelclass(SVRpoly)
DiffSVRsigmoid = make_diffmodelclass(SVRsigmoid)

In [27]:
# print(rv_dum.test_modelclass(DiffLinear, print_progress=True))
# print(rv_trig.test_modelclass(DiffKNeighbors, print_progress=True))
# print(rv_trig.test_modelclass(DiffDecisionTree, print_progress=True))
print(rv_trig.test_modelclass(Linear, print_progress=True))

Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
12.536609855143324


In [26]:
import pickle
from timeit import default_timer as timer

# Evaluate every model class with both predictor encodings and save the validators
# after each successful run, so that partial results survive an interruption.
classes = [
    SimpleMean,
    LastWeek,
    Linear,
    KNeighbors,
    LinearSVR,
    #     GaussianNB,
    DecisionTree,
    SVRrbf,
    SVRpoly,
    #     SVRsigmoid,
    DiffSimpleMean,
    DiffLinear,
    DiffKNeighbors,
    DiffLinearSVR,
    #     DiffGaussianNB,
    DiffDecisionTree,
    DiffSVRrbf,
    DiffSVRpoly,
    #     DiffSVRsigmoid,
]
for modelclass in classes:
    print(modelclass.classname)
    for rv, rv_name in ((rv_trig, "trig"), (rv_dum, "dumm")):
        start = timer()
        try:
            err = rv.test_modelclass(modelclass)
        except Exception as e:
            # Keep going with the remaining models, but do not report a MAE:
            # there is none for this run (any `err` is left over from an earlier one).
            print("{} failed: {!r}".format(rv_name, e))
            continue
        elapsed_mins = (timer() - start) / 60
        print(
            "{} mae: {:.3f}   (took {:.1f} mins)".format(
                rv_name, err, elapsed_mins
            )
        )
        with open("latest_rv_dum.p", "wb") as f:
            pickle.dump(rv_dum, f)
        with open("latest_rv_trig.p", "wb") as f:
            pickle.dump(rv_trig, f)
    print()

SimpleMean
trig mae: 25.587   (took 0.0 mins)
dumm mae: 25.587   (took 0.0 mins)

LastWeek
trig mae: 12.285   (took 0.0 mins)
dumm mae: 12.285   (took 0.0 mins)

Linear
trig mae: 12.537   (took 0.0 mins)
dumm mae: 12.615   (took 0.0 mins)

KNeighbors
trig mae: 11.346   (took 0.2 mins)
dumm mae: 11.491   (took 30.2 mins)

LinearSVR
trig mae: 11.496   (took 0.1 mins)
dumm mae: 11.361   (took 0.1 mins)

DecisionTree
trig mae: 11.586   (took 5.6 mins)
dumm mae: 11.724   (took 21.7 mins)

SVRrbf
trig mae: 10.800   (took 16.9 mins)
dumm mae: 10.915   (took 69.0 mins)

SVRpoly
trig mae: 10.553   (took 16.9 mins)
dumm mae: 10.686   (took 72.9 mins)

DiffModel(SimpleMean)
trig mae: 12.300   (took 0.0 mins)
dumm mae: 12.300   (took 0.0 mins)

DiffModel(Linear)
trig mae: 12.436   (took 0.0 mins)
dumm mae: 13.029   (took 0.0 mins)

DiffModel(KNeighbors)
trig mae: 14.328   (took 0.2 mins)
dumm mae: 14.316   (took 31.5 mins)

DiffModel(LinearSVR)
trig mae: 12.285   (took 0.0 mins)
dumm mae: 12.296  

In [192]:
# Reload the validators (including the stored results of all evaluated models)
# from the checkpoint files written by the benchmarking loop, replacing the
# rv_dum / rv_trig objects currently in memory.
# SECURITY: unpickling executes arbitrary code contained in the file. Only load
# checkpoint files that were produced by this notebook.
import pickle

with open("latest_rv_dum.p", "rb") as f:
    rv_dum = pickle.load(f)
with open("latest_rv_trig.p", "rb") as f:
    rv_trig = pickle.load(f)

In [193]:
for k, v in rv_dum.models.items():
    test_mae = v["test_mae"].sum()
    training_mae = v["training_mae"].sum()
    print("{:25}: {:.3f}   {:.3f}".format(k, test_mae, training_mae))

SimpleMean               : 25.587   24.211
LastWeek                 : 12.285   11.369
Linear                   : 12.615   11.821
KNeighbors               : 11.491   1.104
LinearSVR                : 11.361   10.479
DecisionTree             : 11.724   10.689
SVRrbf                   : 10.915   9.282
SVRpoly                  : 10.686   8.827
DiffModel(SimpleMean)    : 12.300   11.384
DiffModel(Linear)        : 13.029   11.942
DiffModel(KNeighbors)    : 14.316   10.089
DiffModel(LinearSVR)     : 12.296   11.367
DiffModel(DecisionTree)  : 12.284   11.335
DiffModel(SVRrbf)        : 12.452   11.083
DiffModel(SVRpoly)       : 12.513   10.898


In [28]:
rv = rv_trig
plot_column = ("Hyde Park Corner, Hyde Park", "End")
# plot_columns = [("Wenlock Road , Hoxton", "End")]
# plot_columns = [("Waterloo Station 3, Waterloo",  "Start")]
modelclass = Linear
err = pd.concat(rv.models[modelclass.classname]["test_errors"]).sort_index()
pred = pd.concat(rv.models[modelclass.classname]["test_predictions"]).sort_index()
truth = pd.concat([l[3] for l in rv.cv_batches]).sort_index()
plt.figure()
plt.plot(truth[plot_column], pred[plot_column], ls="", marker="*", ms=1)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x7fe7ce6ffed0>]

In [30]:
rv = rv_dum
plot_columns = [("Hyde Park Corner, Hyde Park", "End")]
# plot_columns = [("Wenlock Road , Hoxton", "End")]
# plot_columns = [("Waterloo Station 3, Waterloo",  "Start")]
modelclass = SVRrbf
err = pd.concat(rv.models[modelclass.classname]["test_errors"]).sort_index()
pred = pd.concat(rv.models[modelclass.classname]["test_predictions"]).sort_index()
truth = pd.concat([l[3] for l in rv.cv_batches]).sort_index()
plt.figure()
plt.plot(truth[plot_columns])
plt.plot(pred[plot_columns])

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x7fe7c5725510>]

In [79]:
# Compare the distribution of counts in one fixed weekly slot (Tuesdays, 07:00-08:00;
# pandas numbers weekdays from Monday = 0) with a Poisson distribution of the same mean.
plt.figure()
# Build the mask from the index of `data` itself: `times` covers a different time
# range, so a mask derived from it would not line up with `data`.
example_times = data.index.to_series()
example_morning = data.loc[
    (example_times.dt.weekday == 1) & (example_times.dt.hour == 7),
    ("Waterloo Station 3, Waterloo", "Start"),
]
m = example_morning.mean()
example_morning.hist(bins=50)
# Evaluate the Poisson pmf at the observed counts, scaled to the number of samples.
xaxis = np.sort(example_morning.unique())
yaxis = sp.stats.poisson.pmf(xaxis, m) * len(example_morning)
plt.plot(xaxis, yaxis);

  """Entry point for launching an IPython kernel.


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x7fe7c375ce90>]

In [189]:
class ZeroPredictor:
    """Stand-in model that predicts 0.0 for every row of its input."""

    def predict(self, predictors):
        """Return a Series of zeros sharing the index of ``predictors``."""
        return pd.Series(0.0, index=predictors.index)


class PoissonGLM:
    """Poisson regression fitted separately for every (weekday, hour) slot.

    For each column of the training data and each (weekday, hour) group of its
    index, a statsmodels Poisson GLM is fitted on the common predictors plus,
    when one exists, that column's own specific predictor. A group whose fit
    raises a ValueError gets a ZeroPredictor instead.
    """

    classname = "PoissonGLM"

    def __init__(self):
        # models[column][(weekday, hour)] -> object with a .predict(predictors) method.
        self.models = {}
        # Per-column standard deviation of the specific predictors seen by
        # train(); predict() divides by it to apply the same scaling.
        self.std = None

    @staticmethod
    def _select_predictors(
        column, group_label, common_group, specific_predictors, specific_groups
    ):
        """Return the predictor frame of one group for one data column.

        This is `common_group`, with the matching rows of the column's specific
        predictor appended as an extra column when `specific_predictors` is
        given and contains `column`.
        """
        if specific_predictors is None or column not in specific_predictors:
            return common_group
        specific_group = specific_groups.get_group(group_label)
        return pd.concat([common_group, specific_group[column]], axis=1)

    def train(self, data, common_predictors, specific_predictors):
        """Fit one model per data column and (weekday, hour) group.

        Parameters
        ----------
        data : DataFrame whose index supports the `.dt.weekday` / `.dt.hour`
            accessors; each column is modelled separately.
        common_predictors : DataFrame of predictors used for every column.
        specific_predictors : DataFrame or None. If given, it is divided by its
            column-wise standard deviation, and the column with the same label
            as a data column is added to that column's predictors.

        Any previously fitted models are discarded.
        """
        self.models = {}
        if specific_predictors is not None:
            std = specific_predictors.std()
            specific_predictors = specific_predictors / std
            self.std = std

        times = data.index.to_series()
        groupers = [times.dt.weekday, times.dt.hour]
        data_groups = data.groupby(groupers)
        common_predictors_groups = common_predictors.groupby(groupers)
        specific_predictors_groups = (
            specific_predictors.groupby(groupers)
            if specific_predictors is not None
            else None
        )
        for c in data.columns:
            self.models[c] = {}
            for group_label, data_group in data_groups:
                predictors_group = self._select_predictors(
                    c,
                    group_label,
                    common_predictors_groups.get_group(group_label),
                    specific_predictors,
                    specific_predictors_groups,
                )
                glm_poisson = sm.GLM(
                    data_group[c],
                    predictors_group,
                    family=sm.families.Poisson(),
                )
                try:
                    model = glm_poisson.fit()
                except ValueError:
                    # The GLM can't handle cases like data that is all zeros.
                    # In those cases, just make a predictor that always predicts zero.
                    model = ZeroPredictor()
                    if data_group[c].max() > 0.0:
                        # The usual reason (all-zero data) is not why we errored this time.
                        # Print some diagnostics to figure out what went wrong.
                        print("glm_poisson.fit() raise a ValueError.")
                        print(data_group[c].describe())
                        print(predictors_group.describe())
                self.models[c][group_label] = model

    def predict(self, common_predictors, specific_predictors):
        """Predict every trained column for the rows of `common_predictors`.

        Parameters
        ----------
        common_predictors : DataFrame with the same kind of index and columns
            as the one given to train().
        specific_predictors : DataFrame or None; scaled by the standard
            deviations stored during training before use.

        Returns
        -------
        DataFrame indexed by the times of `common_predictors` (index name
        "Time") with one column per trained data column.

        Raises
        ------
        KeyError if a (weekday, hour) group occurs here that had no model
        fitted during training.
        """
        if specific_predictors is not None:
            specific_predictors = specific_predictors / self.std
        times = common_predictors.index.to_series()
        groupers = [times.dt.weekday, times.dt.hour]
        common_predictors_groups = common_predictors.groupby(groupers)
        predictions = pd.DataFrame({"Time": times}).set_index("Time")
        specific_predictors_groups = (
            specific_predictors.groupby(groupers)
            if specific_predictors is not None
            else None
        )
        for c, models_c in self.models.items():
            predictions_c = []
            for group_label, common_predictors_group in common_predictors_groups:
                predictors_group = self._select_predictors(
                    c,
                    group_label,
                    common_predictors_group,
                    specific_predictors,
                    specific_predictors_groups,
                )
                predictions_c.append(
                    models_c[group_label].predict(predictors_group)
                )
            # Assignment aligns on the index, which restores the original row
            # order after the group-by-group concatenation.
            predictions[c] = pd.concat(predictions_c, axis=0)
        return predictions

In [190]:
# Two and one half 365-day periods respectively. How RollingValidator uses them
# is defined with that class (presumably the minimum training span and the
# length of each prediction window; confirm there).
min_training_time = 2.0 * pd.Timedelta("365d")
prediction_time = 0.5 * pd.Timedelta("365d")
# Validator for the Poisson model, built on the trigonometric common
# predictors and the per-column specific predictors.
rv_poiss = RollingValidator(
    data,
    predictors_trig,
    specific_predictors,
    min_training_time,
    prediction_time,
)

Created a RollingValidator with 6 cross-validation batches.


In [191]:
# Run PoissonGLM through every cross-validation batch, printing progress as it goes.
rv_poiss.test_modelclass(PoissonGLM, print_progress=True)

Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
glm_poisson.fit() raise a ValueError.
count    123.000000
mean       0.040650
std        0.198287
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
Name: (Wenlock Road , Hoxton, End), dtype: float64
        Year sine   Year cosine      Day sine  Day cosine  Friday  Monday  \
count  123.000000  1.230000e+02  1.230000e+02       123.0   123.0   123.0   
mean     0.119221  2.938535e-02  8.660254e-01         0.5     0.0     0.0   
std      0.715754  6.933202e-01  1.114764e-16         0.0     0.0     0.0   
min     -1.000000 -1.000000e+00  8.660254e-01         0.5     0.0     0.0   
25%     -0.568065 -6.631227e-01  8.660254e-01         0.5     0.0     0.0   
50%      0.239316 -1.608123e-16  8.66

10.812167837366967

In [None]:
# specific_predictors.loc["2016-08-15":"2016-08-16", :]

In [146]:
# Mean absolute test error per column for the Poisson model. Look the results
# up by class name, as the other cells do, rather than relying on it being the
# first entry in the dict.
rv_poiss.models[PoissonGLM.classname]["test_mae"]

(Waterloo Station 3, Waterloo, Start)    3.699950
(Hyde Park Corner, Hyde Park, End)       4.303425
(Wenlock Road , Hoxton, End)             0.954040
dtype: float64

In [147]:
# Concatenate the per-batch test predictions of the Poisson model, put them in
# index order and plot everything from 2016-11-01 onwards. The results are
# looked up by class name, as in the other cells, instead of by dict position.
(
    pd.concat(rv_poiss.models[PoissonGLM.classname]["test_predictions"], axis=0)
    .sort_index()
    .loc["2016-11-01":, :]
    .plot()
)

  fig = self.plt.figure(figsize=self.figsize)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7f16adbe8610>

In [59]:
# Stand-alone experiment: fit one Poisson GLM per (weekday, hour) group for a
# single column and compare its in-sample predictions with the data.
test_column = ("Waterloo Station 3, Waterloo", "Start")
# NOTE(review): `times` is a notebook global; presumably data.index.to_series() — confirm.
data_poissgroups = data.groupby([times.dt.weekday, times.dt.hour])
predictions = []
for group, data_poiss in data_poissgroups:
    times_poiss = data_poiss.index.to_series()
    # Disabled alternative: yearly sine/cosine predictors instead of month dummies.
    #     year_angles = 2 * np.pi * times_poiss.dt.week / 52
    # predictors_poiss = pd.DataFrame(
    #     {"Year sine": np.sin(year_angles), "Year cosine": np.cos(year_angles),}
    # )
    # One indicator column per calendar month present in this group ...
    predictors_poiss = pd.get_dummies(times_poiss.dt.month)
    # ... plus this column's specific predictor for the same timestamps.
    predictors_poiss = pd.concat(
        [predictors_poiss, specific_predictors.loc[times_poiss, test_column]],
        axis=1,
    )
    glm_poisson = sm.GLM(
        data_poiss[test_column], predictors_poiss, family=sm.families.Poisson()
    )
    res = glm_poisson.fit()
    # Predict on the same rows the model was fitted on (in-sample).
    prediction = res.predict(predictors_poiss)
    predictions.append(prediction)
# Merge the per-group predictions back into one series in index order.
prediction_all = pd.concat(predictions, axis=0).sort_index()
plt.figure()
plt.plot(data[test_column])
plt.plot(prediction_all)

Date
2012-01-11 00:00:00    5.008715e-01
2012-01-11 01:00:00    7.849588e-02
2012-01-11 02:00:00    4.000000e-02
2012-01-11 03:00:00    6.794409e-12
2012-01-11 04:00:00    8.000000e-02
                           ...     
2017-05-16 22:00:00    1.140790e+00
2017-05-16 23:00:00    7.271702e-01
2017-05-17 00:00:00    3.090881e-01
2017-05-17 01:00:00    7.704727e-02
2017-05-17 02:00:00    4.166667e-02
Length: 46711, dtype: float64




Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x7f16c1866990>]

In [111]:
# Load only the DATE, PRCP and TAVG columns of the weather file and index the
# frame by the parsed date.
wdf = pd.read_csv("./data/weather/london_weather_data.csv", usecols=["DATE", "PRCP", "TAVG"], encoding="ISO-8859-2")
wdf["DATE"] = pd.to_datetime(wdf["DATE"])
wdf = wdf.set_index("DATE")

In [222]:
# Restrict the weather data to the first_time..last_time range. Take an explicit
# copy so that later column assignments on `wdf` modify an independent frame
# rather than a slice of the loaded data (avoids pandas' SettingWithCopyWarning).
wdf = wdf[first_time:last_time].copy()

In [228]:
# There are only 6 NaNs in this time range, so how we fill them doesn't matter
# much; missing PRCP values are set to zero.
wdf["PRCP"] = wdf["PRCP"].fillna(0.0)

In [229]:
# Plot both remaining weather columns (PRCP and TAVG) against the date index.
wdf.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [235]:
# Normalised copy of the weather data; `wdf` itself is left unchanged.
wdfn = wdf.copy()
# PRCP is only scaled to unit standard deviation (not centred), so zero stays zero.
wdfn["PRCP"] /= wdfn["PRCP"].std()
# TAVG is standardised: mean subtracted, then divided by the standard deviation.
wdfn["TAVG"] = (wdfn["TAVG"] - wdfn["TAVG"].mean())/wdfn["TAVG"].std()

In [238]:
# Histogram (20 bins) of each normalised weather column.
wdfn.hist(bins=20)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f57d2c80790>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f57d2f19b50>]],
      dtype=object)

In [None]:
# specific_predictors = weekoffset[test_columns]
# min_training_time = 2 * pd.Timedelta("365d")
# prediction_time = 0.5 * pd.Timedelta("365d")
# rv_weather = RollingValidator(
#     data,
#     predictors_trig,
#     specific_predictors,
#     min_training_time,
#     prediction_time,
# )