Other analyses of the same data:

https://github.com/charlie1347/TfL_bikes

https://medium.com/@AJOhrn/data-footprint-of-bike-sharing-in-london-be9e11425248

In [188]:
import os
import pandas as pd
import numpy as np
from sklearn import linear_model, svm, neighbors, naive_bayes
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns
from pathlib import Path
from IPython.display import set_matplotlib_formats

In [134]:
# For pretty and exportable matplotlib plots.
# If you are running this yourself and want interactivity,
# try `%matplotlib widget` instead.
# set_matplotlib_formats("svg")
# %matplotlib inline
%matplotlib widget
# Set a consistent plotting style across the notebook using Seaborn.
sns.set_style("darkgrid")
sns.set_context("notebook")

# Processing and cleaning the data

In [3]:
bikefolder = "./data/bikes"

In [4]:
def add_station_names(station_names, df, namecolumn, idcolumn):
    namemaps = (
        df[[idcolumn, namecolumn]]
        .groupby(idcolumn)
        .aggregate(lambda x: x.unique())
    )
    for number, names in namemaps.iterrows():
        current_names = station_names.get(number, set())
        # The following two lines are a stupid dance around the annoying fact that pd.unique sometimes returns a single value,
        # sometimes a numpy array of values, but since the single value is a string, it too is an iterable.
        vals = names[0]
        new_names = set([vals]) if type(vals) == str else set(vals)
        current_names.update(new_names)
        station_names[number] = current_names


def clean_datetime_column(df, colname, roundto="H"):
    # A bit of a hacky way to use the first entry to figure out which date format this file uses.
    # Not super robust, but works. TODO Improve this.
    if len(df[colname].iloc[0]) > 16:
        format = "%d/%m/%Y %H:%M:%S"
    else:
        format = "%d/%m/%Y %H:%M"
    df[colname] = pd.to_datetime(df[colname], format=format)
    df[colname] = df[colname].dt.round(roundto)
    early_cutoff = pd.datetime(2010, 7, 30)  # When the program started.
    late_cutoff = pd.datetime(2020, 1, 1)  # Approximately now.
    df = df[(late_cutoff > df[colname]) & (df[colname] >= early_cutoff)]
    return df


def compute_single_events(df, which):
    stationcol = "{}Station Id".format(which)
    datecol = "{} Date".format(which)
    events = (
        df.rename(columns={stationcol: "Station", datecol: "Date"})
        .groupby(["Date", "Station"])
        .size()
        .unstack("Station")
    )
    return events


def compute_both_events(df):
    ends = compute_single_events(df, "End")
    starts = compute_single_events(df, "Start")
    both = (
        pd.concat([ends, starts], keys=["End", "Start"], axis=1)
        .reorder_levels([1, 0], axis=1)
        .fillna(0.0)
    )
    return both


def castable_to_int(obj):
    try:
        int(obj)
        return True
    except ValueError:
        return False


def cast_to_int(df, colname):
    try:
        df = df.astype({colname: np.int_}, copy=False)
    except ValueError:
        castable_rows = df[colname].apply(castable_to_int)
        df = df[castable_rows]
        df = df.astype({colname: np.int_}, copy=False)
    return df


events_by_station_path = Path("./events_by_station.p")
if events_by_station_path.exists():
    events_by_station = pd.read_pickle(events_by_station_path)
else:
    datafiles = sorted(os.listdir(bikefolder))
    folderpath = Path(bikefolder)
    datapaths = [folderpath / Path(file) for file in datafiles]
    datapaths = [p for p in datapaths if p.suffix == ".csv"]

    station_allnames = {}

    pieces = []
    #     datapaths = [
    #         folderpath / Path(file)
    #         for file in [
    #             "21JourneyDataExtract31Aug2016-06Sep2016.csv",
    #             "15JourneyDataExtract20Jul2016-26Jul2016.csv",
    #             "13b. Journey Data Extract 22Dec14-03Jan15.csv",
    #             "16JourneyDataExtract27Jul2016-02Aug2016.csv",
    #             "10b. Journey Data Extract 28Sep14-11Oct14.csv",
    #             "6. Journey Data Extract_27May-23Jun12.csv",
    #             "6. Journey Data Extract_27May-23Jun12.csv",
    #         ]
    #     ]
    cols = [
        "Duration",
        "End Date",
        "EndStation Id",
        "EndStation Name",
        "Start Date",
        "StartStation Id",
        "StartStation Name",
    ]
    problem_paths = []
    for path in datapaths:
        print(path)
        try:
            df = pd.read_csv(path, usecols=cols, encoding="ISO-8859-2")
        except ValueError as e:
            # Some files have missing or abnormaly named columns. We'll deal with them later.
            problem_paths.append(path)
            continue
        # Drop any rows that have missing values.
        df = df[~df.isna().any(axis=1)]
        # Drop any anomalously short trips. Probably somebody just taking a bike and putting
        # it right back in.
        df = df[df["Duration"] > 60]
        # Cast the columns to the right types. This is easier ones NAs have been dropped.
        df = cast_to_int(df, "EndStation Id")
        df = cast_to_int(df, "StartStation Id")
        # Turn the date columns from strings into datetime objects rounded to the hour.
        df = clean_datetime_column(df, "End Date")
        df = clean_datetime_column(df, "Start Date")
        events = compute_both_events(df)
        pieces.append(events)

        add_station_names(
            station_allnames, df, "EndStation Name", "EndStation Id"
        )
        add_station_names(
            station_allnames, df, "StartStation Name", "StartStation Id"
        )

    station_ids = {}
    station_names = {}
    for k, v in station_allnames.items():
        v = sorted(v)
        station_names[k] = v[0]
        for name in v:
            station_ids[name] = k

    def get_station_id(name):
        try:
            return station_ids[name]
        except KeyError:
            return np.nan

    print("Doing the problem cases ({} of them).".format(len(problem_paths)))
    safe_cols = [
        "Duration",
        "End Date",
        "EndStation Name",
        "Start Date",
        "StartStation Name",
    ]
    for path in problem_paths:
        print(path)
        df = pd.read_csv(path, usecols=safe_cols, encoding="ISO-8859-2")
        # Drop any rows that have missing values.
        df = df[~df.isna().any(axis=1)]
        # Drop any anomalously short trips. Probably somebody just taking a bike and putting
        # it right back in.
        df = df[df["Duration"] > 60]
        # Add a column of station ids, based on names.
        df["EndStation Id"] = df["EndStation Name"].apply(get_station_id)
        df["StartStation Id"] = df["StartStation Name"].apply(get_station_id)
        # Turn the date columns from strings into datetime objects rounded to the hour.
        clean_datetime_column(df, "End Date")
        clean_datetime_column(df, "Start Date")
        events = compute_both_events(df)
        pieces.append(events)

    events_by_station = pd.concat(pieces).fillna(0.0).sort_index()
    events_by_station = events_by_station.rename(
        mapper=station_names, axis=1, level=0
    )

    events_by_station.to_pickle(events_by_station_path)

In [5]:
a = events_by_station.sum(axis=1, level=0)


def mean_within_window(s):
    starttime = s.ne(0).idxmax()
    endtime = s[::-1].ne(0).idxmax()
    return s[starttime:endtime].mean()


(a.max() / a.aggregate(mean_within_window)).sort_values(ascending=False)[:30]

Station
Electrical Workshop PS                                      1946.693548
PENTON STREET COMMS TEST TERMINAL _ CONTACT MATT McNULTY    1735.230769
tabletop1                                                   1123.777778
Contact Centre, Southbury House                             1027.705263
Pop Up Dock 1                                                824.012017
6                                                            582.662026
Mechanical Workshop Penton                                   248.971754
South Quay East, Canary Wharf                                150.185113
Westfield Eastern Access Road, Shepherd's Bush               100.810728
Thornfield House, Poplar                                      93.689453
Upper Grosvenor Street, Mayfair                               79.666708
Courland Grove , Wandsworth Road                              64.008406
Fore Street Avenue: Guildhall                                 63.179106
Aberfeldy Street, Poplar                                

In [6]:
improper_stations = [
    "Electrical Workshop PS",
    "PENTON STREET COMMS TEST TERMINAL _ CONTACT MATT McNULTY",
    "tabletop1",
    "Pop Up Dock 1",
    "6",
    "Mechanical Workshop Penton",
]
events_by_station = events_by_station.drop(improper_stations, axis=1, level=0)

In [7]:
# TODO What is up with this?
events_by_station["Exhibition Road Museums, Knightsbridge"]

Unnamed: 0_level_0,End,Start,End,Start
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-01-04 00:00:00,0.0,0.0,0.0,0.0
2012-01-04 01:00:00,0.0,0.0,0.0,0.0
2012-01-04 02:00:00,2.0,0.0,0.0,0.0
2012-01-04 03:00:00,0.0,0.0,0.0,0.0
2012-01-04 04:00:00,0.0,0.0,0.0,0.0
...,...,...,...,...
2017-05-16 22:00:00,1.0,1.0,0.0,0.0
2017-05-16 23:00:00,0.0,0.0,0.0,0.0
2017-05-17 00:00:00,1.0,0.0,0.0,0.0
2017-05-17 01:00:00,0.0,0.0,0.0,0.0


# Exploring

In [8]:
stations = events_by_station.columns.get_level_values(0).unique()
events_by_time = events_by_station.sum(axis=1, level=1)
totals_by_station = events_by_station.sum(axis=0)
times = events_by_station.index.to_series()

In [9]:
test_stations = [
    "Waterloo Station 3, Waterloo",
    "Hyde Park Corner, Hyde Park",
    "Wenlock Road , Hoxton",
    "Stonecutter Street, Holborn",
]

In [10]:
# TODO Give the plots widths from variance or something like [25%, 75%] limits (what are these called again?).
example_means_over_week = (
    events_by_station[test_stations]
    .groupby([times.dt.weekday, times.dt.hour])
    .sum()
)
example_means_over_week.index.rename(["Day", "Hour"], inplace=True)
example_means_over_week = (
    example_means_over_week.stack(level=[0, 1])
    .reset_index()
    .rename(columns={"level_3": "End/Start", 0: "Count"})
)
example_means_over_week["Weekday"] = example_means_over_week.apply(
    lambda x: x["Day"] + x["Hour"] / 24, axis=1,
)
g = sns.FacetGrid(
    example_means_over_week,
    col_wrap=2,
    col="Station",
    hue="End/Start",
    sharey=False,
    sharex=True,
)
g.map(plt.plot, "Weekday", "Count").set_titles("{col_name}")
g.add_legend();

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [11]:
# TODO Give the plots widths from variance or something like [25%, 75%] limits (what are these called again?).
example_means_over_year = (
    events_by_station[test_stations].groupby(times.dt.week).sum()
)
example_means_over_year.index.rename("Week", inplace=True)
example_means_over_year = (
    example_means_over_year.stack(level=[0, 1])
    .reset_index()
    .rename(columns={"level_2": "End/Start", 0: "Count"})
)
g = sns.FacetGrid(
    example_means_over_year,
    col_wrap=2,
    col="Station",
    hue="End/Start",
    sharey=False,
    sharex=True,
)
g.map(plt.plot, "Week", "Count").set_titles("{col_name}")
g.add_legend();

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [12]:
yearly_rolling = events_by_station.loc[:, test_stations].rolling("365d", min_periods=24*90).mean()
yearly_rolling = (
    yearly_rolling.stack(level=[0, 1])
    .reset_index()
    .rename(columns={"level_2": "End/Start", 0: "Count"})
)
g = sns.FacetGrid(
    yearly_rolling,
    col_wrap=2,
    col="Station",
    hue="End/Start",
    sharey=False,
    sharex=True,
)
g.map(plt.plot, "Date", "Count").set_titles("{col_name}")
g.add_legend();

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …


To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


# Feature extraction

In [13]:
weekday_dummies = pd.get_dummies(times.dt.weekday_name)
week_dummies = pd.get_dummies(times.dt.week)
hour_dummies = pd.get_dummies(times.dt.hour)
hour_dummies = hour_dummies.rename(columns={c: "Hour {}".format(c) for c in hour_dummies.columns})
week_dummies = week_dummies.rename(columns={c: "Week {}".format(c) for c in week_dummies.columns})
predictors_cat = pd.concat(
    [week_dummies, hour_dummies, weekday_dummies,], axis=1,
)

day_angles = 2 * np.pi * times.dt.hour / 24
year_angles = (
    2 * np.pi * times.dt.week / 52
)  # TODO Should we do this by day or week or month?
predictors_trig = pd.concat(
    [
        pd.DataFrame(
            {
                "Year sine": np.sin(year_angles),
                "Year cosine": np.cos(year_angles),
            }
        ),
        pd.DataFrame(
            {"Day sine": np.sin(day_angles), "Day cosine": np.cos(day_angles),}
        ),
        weekday_dummies,
    ],
    axis=1,
)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7fbe1b551dd0>

In [152]:
class RollingValidator:
    def __init__(
        self,
        data,
        common_predictors,
        specific_predictors,
        min_training_time,
        prediction_time,
    ):
        cv_batches = []
        first_time = data.index.min()
        last_time = data.index.max()
        test_end_time = last_time
        cutoff = test_end_time - prediction_time
        while cutoff > first_time + min_training_time:
            training_data = data[:cutoff]
            training_common_predictors = common_predictors[:cutoff]
            training_specific_predictors = specific_predictors[:cutoff]
            test_data = data[cutoff:test_end_time]
            test_common_predictors = common_predictors[cutoff:test_end_time]
            test_specific_predictors = specific_predictors[
                cutoff:test_end_time
            ]
            cv_batches.append(
                (
                    training_data,
                    training_common_predictors,
                    training_specific_predictors,
                    test_data,
                    test_common_predictors,
                    test_specific_predictors,
                )
            )
            test_end_time = cutoff
            cutoff = test_end_time - prediction_time
        self.cv_batches = cv_batches
        self.models = {}
        print(
            "Created a RollingValidator with {} cross-validation batches.".format(
                len(cv_batches)
            )
        )

    def test_modelclass(self, modelclass):
        test_errors = []
        training_errors = []
        test_predictions = []
        training_predictions = []
        for (i, cv_batch,) in enumerate(self.cv_batches):
            (
                training_data,
                training_common_predictors,
                training_specific_predictors,
                test_data,
                test_common_predictors,
                test_specific_predictors,
            ) = cv_batch
            print("Training for batch {}.".format(i))
            model = modelclass()
            model.train(
                training_data,
                training_common_predictors,
                training_specific_predictors,
            )
            print("Predicting for batch {}.".format(i))
            test_prediction = model.predict(
                test_common_predictors, test_specific_predictors
            )
            training_prediction = model.predict(
                training_common_predictors, training_specific_predictors
            )
            test_error = test_prediction - test_data
            training_error = training_prediction - training_data
            test_errors.append(test_error)
            training_errors.append(training_error)
            test_predictions.append(test_prediction)
            training_predictions.append(training_predictions)
        test_mae = pd.concat(test_errors).abs().mean()
        training_mae = pd.concat(training_errors).abs().mean()
        self.models[modelclass] = {
            "test_mae": test_mae,
            "training_mae": training_mae,
            "test_errors": test_errors,
            "training_errors": training_errors,
            "test_predictions": test_predictions,
            "training_predictions": training_predictions,
        }
        return test_mae

In [153]:
weekly_rolling = events_by_station.rolling("7d").mean()
weekly_rolling = weekly_rolling - weekly_rolling.mean()
weekly_rolling = weekly_rolling / weekly_rolling.std()

In [154]:
test_columns = [
    ("Waterloo Station 3, Waterloo", "Start"),
    ("Hyde Park Corner, Hyde Park", "End"),
    ("Wenlock Road , Hoxton", "End"),
]
# test_columns = sum(
#     [[(s, es) for s in test_stations] for es in ("End", "Start")], []
# )
data = events_by_station[test_columns]
specific_predictors = weekly_rolling[test_columns]
min_training_time = 2 * pd.Timedelta("365d")
prediction_time = 0.5 * pd.Timedelta("365d")
rv_trig = RollingValidator(
    data, predictors_trig, specific_predictors, min_training_time, prediction_time,
)
rv_cat = RollingValidator(
    data, predictors_cat, specific_predictors, min_training_time, prediction_time,
)

Created a RollingValidator with 6 cross-validation batches.
Created a RollingValidator with 6 cross-validation batches.


# Models

In [155]:
class SimpleMean:
    def __init__(self):
        self.model = None

    def train(self, data, common_predictors, specific_predictors):
        mean = pd.DataFrame(data.mean())
        m = mean.T
        self.model = mean

    def predict(self, common_predictors, specific_predictors):
        mean = self.model
        index = common_predictors.index
        predictions = mean.T.apply(lambda x: [x[0]] * len(index)).set_index(
            index
        )
        return predictions

In [156]:
print(rv_cat.test_modelclass(SimpleMean))

Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
Station                            
Waterloo Station 3, Waterloo  Start    11.757080
Hyde Park Corner, Hyde Park   End      10.506151
Wenlock Road , Hoxton         End       1.352579
dtype: float64


Results for Linear are bad because all time variation has to be essentially sinusoidal.

In [157]:
class GenericModel:
    regressor = None
    
    def __init__(self):
        self.model = {}

    def train(self, data, common_predictors, specific_predictors):
        for c in data.columns:
            predictors = pd.concat([common_predictors, specific_predictors[c]], axis=1)
            model = self.regressor()
            model.fit(predictors, data[c])
            self.model[c] = model

    def predict(self, common_predictors, specific_predictors):
        model = self.model
        index = common_predictors.index
        predictions = pd.DataFrame({"Time": index}).set_index("Time")
        for c, m in model.items():
            predictors = pd.concat([common_predictors, specific_predictors[c]], axis=1)
            predictions[c] = m.predict(predictors)
        return predictions

In [158]:
class Linear(GenericModel):
    regressor = linear_model.Ridge

In [159]:
print(rv_cat.test_modelclass(Linear))
print(rv_trig.test_modelclass(Linear))

Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
(Waterloo Station 3, Waterloo, Start)    9.036405
(Hyde Park Corner, Hyde Park, End)       8.232443
(Wenlock Road , Hoxton, End)             1.101858
dtype: float64
Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
(Waterloo Station 3, Waterloo, Start)    12.851157
(Hyde Park Corner, Hyde Park, End)        8.414188
(Wenlock Road , Hoxton, End)              1.192353
dtype: float64


In [160]:
class KNeighbors(GenericModel):
    regressor = lambda x: neighbors.KNeighborsRegressor(n_neighbors=5, weights="distance")

In [161]:
#print(rv_cat.test_modelclass(KNeighbors))
print(rv_trig.test_modelclass(KNeighbors))

Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
(Waterloo Station 3, Waterloo, Start)    6.140720
(Hyde Park Corner, Hyde Park, End)       6.343389
(Wenlock Road , Hoxton, End)             1.064122
dtype: float64


In [None]:
class SVR(GenericModel):
    regressor = lambda x: svm.SVR(kernel="rbf", cache_size=500)

In [55]:
#print(rv_cat.test_modelclass(SVR))
print(rv_trig.test_modelclass(SVR))

Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
(Station                            
Waterloo Station 3, Waterloo  Start    7.285227
Hyde Park Corner, Hyde Park   End      6.160579
Wenlock Road , Hoxton         End      1.041755
dtype: float64, Station                            
Waterloo Station 3, Waterloo  Start    7.053634
Hyde Park Corner, Hyde Park   End      5.614971
Wenlock Road , Hoxton         End      0.889218
dtype: float64)


KernelRidge also can't handle this amount of training data with chocking on RAM.

In [162]:
class LinearSVR(GenericModel):
    regressor = lambda x: svm.LinearSVR()

In [163]:
print(rv_cat.test_modelclass(LinearSVR))
print(rv_trig.test_modelclass(LinearSVR))

Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
(Waterloo Station 3, Waterloo, Start)    7.052375
(Hyde Park Corner, Hyde Park, End)       7.213297
(Wenlock Road , Hoxton, End)             1.042861
dtype: float64
Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
(Waterloo Station 3, Waterloo, Start)    7.784467
(Hyde Park Corner, Hyde Park, End)       7.432490
(Wenlock Road , Hoxton, End)             1.112732
dtype: float64


In [192]:
rv = rv_trig
plot_column = ("Hyde Park Corner, Hyde Park", "End")
# plot_columns = [("Wenlock Road , Hoxton", "End")]
# plot_columns = [("Waterloo Station 3, Waterloo",  "Start")]
modelclass = GaussianNB
err = pd.concat(rv.models[modelclass]["test_errors"]).sort_index()
pred = pd.concat(rv.models[modelclass]["test_predictions"]).sort_index()
truth = pd.concat([l[3] for l in rv.cv_batches]).sort_index()
plt.figure()
plt.plot(truth[plot_column], err[plot_column], ls="", marker="*", ms=1)

  if __name__ == '__main__':
__init__() missing 1 required positional argument: 'figure'
This is deprecated in traitlets 4.2.This error will be raised in a future release of traitlets.
  super(Widget, self).__init__(**kwargs)
__init__() missing 1 required positional argument: 'canvas'
This is deprecated in traitlets 4.2.This error will be raised in a future release of traitlets.
  super(Widget, self).__init__(**kwargs)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x7fbe0e31eed0>]

In [187]:
plt.figure()
predictors_trig.loc[10 == data[plot_column], "Day cosine"].hist()

__init__() missing 1 required positional argument: 'figure'
This is deprecated in traitlets 4.2.This error will be raised in a future release of traitlets.
  super(Widget, self).__init__(**kwargs)
__init__() missing 1 required positional argument: 'canvas'
This is deprecated in traitlets 4.2.This error will be raised in a future release of traitlets.
  super(Widget, self).__init__(**kwargs)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7fbe0e46ee10>

In [196]:
# TODO Try this without the rolling average data. Also try Multinomial.
class CategoricalNB(GenericModel):
    regressor = lambda x: naive_bayes.CategoricalNB()

In [197]:
print(rv_cat.test_modelclass(CategoricalNB))
print(rv_trig.test_modelclass(CategoricalNB))

Training for batch 0.


ValueError: X must not contain negative values.

In [90]:
(events_by_station[test_columns] - events_by_station[test_columns].shift(freq=-pd.Timedelta("7d"))).plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7f47890e6d90>

In [191]:
rv = rv_cat
plot_columns = [("Hyde Park Corner, Hyde Park", "End")]
# plot_columns = [("Wenlock Road , Hoxton", "End")]
# plot_columns = [("Waterloo Station 3, Waterloo",  "Start")]
modelclass = GaussianNB
err = pd.concat(rv.models[modelclass]["test_errors"]).sort_index()
pred = pd.concat(rv.models[modelclass]["test_predictions"]).sort_index()
truth = pd.concat([l[3] for l in rv.cv_batches]).sort_index()
plt.figure()
plt.plot(truth[plot_columns])
plt.plot(pred[plot_columns])

__init__() missing 1 required positional argument: 'figure'
This is deprecated in traitlets 4.2.This error will be raised in a future release of traitlets.
  super(Widget, self).__init__(**kwargs)
__init__() missing 1 required positional argument: 'canvas'
This is deprecated in traitlets 4.2.This error will be raised in a future release of traitlets.
  super(Widget, self).__init__(**kwargs)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x7fbe0e736290>]

In [205]:
for c in test_columns:
    data = events_by_station[c]
    data = data / data.mean()
    # data = data / data.rolling("365d").mean()
    data.rolling("14d").mean().plot()

  raw_cell, store_history, silent, shell_futures)
  fig = self.plt.figure(figsize=self.figsize)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  raw_cell, store_history, silent, shell_futures)
  fig = self.plt.figure(figsize=self.figsize)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  raw_cell, store_history, silent, shell_futures)
  fig = self.plt.figure(figsize=self.figsize)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …