Other analyses of the same data:

https://github.com/charlie1347/TfL_bikes

https://medium.com/@AJOhrn/data-footprint-of-bike-sharing-in-london-be9e11425248

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn import linear_model, svm, neighbors, naive_bayes, tree
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns
from pathlib import Path
from IPython.display import set_matplotlib_formats

In [2]:
# For pretty and exportable matplotlib plots.
# If you are running this yourself and want interactivity,
# try `%matplotlib widget` instead.
# set_matplotlib_formats("svg")
# %matplotlib inline
%matplotlib widget
# Set a consistent plotting style across the notebook using Seaborn.
sns.set_style("darkgrid")
sns.set_context("notebook")

# Processing and cleaning the data

In [3]:
bikefolder = "./data/bikes"

In [4]:
def add_station_names(station_names, df, namecolumn, idcolumn):
    """Record every station name seen for each station id.

    ``station_names`` is updated in place: for each distinct value of
    ``df[idcolumn]``, the set stored under that id gains all distinct values
    of ``df[namecolumn]`` found on rows with that id. Nothing is returned.

    Parameters
    ----------
    station_names : dict
        Maps station id -> set of names. Mutated in place.
    df : pandas.DataFrame
        Frame containing both ``idcolumn`` and ``namecolumn``.
    namecolumn : str
        Label of the column holding station names.
    idcolumn : str
        Label of the column holding station ids.
    """
    # SeriesGroupBy.unique yields one array of names per id, so there is no
    # need to special-case "a single string" versus "an array of strings".
    names_by_id = df.groupby(idcolumn)[namecolumn].unique()
    for number, names in names_by_id.items():
        station_names.setdefault(number, set()).update(names)


def clean_datetime_column(df, colname, roundto="H"):
    """Parse a date-string column, round it, and drop out-of-range rows.

    The column ``df[colname]`` is converted to datetimes *in place* on ``df``
    (callers rely on this), rounded to ``roundto``, and a filtered frame is
    returned that keeps only rows dated from 2010-07-30 (inclusive) to
    2020-01-01 (exclusive).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``colname`` column holds ``dd/mm/YYYY HH:MM`` or
        ``dd/mm/YYYY HH:MM:SS`` strings.
    colname : str
        Label of the column to convert.
    roundto : str
        Frequency alias passed to ``Series.dt.round``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose converted date lies within the cutoffs.
    """
    # Sniff the format from the first entry: strings longer than 16 characters
    # carry seconds. Not robust against files mixing both formats.
    # The emptiness guard avoids an IndexError on frames with no rows.
    if len(df) > 0 and len(df[colname].iloc[0]) > 16:
        date_format = "%d/%m/%Y %H:%M:%S"
    else:
        date_format = "%d/%m/%Y %H:%M"
    df[colname] = pd.to_datetime(df[colname], format=date_format).dt.round(
        roundto
    )
    # pd.Timestamp replaces the removed pd.datetime alias.
    early_cutoff = pd.Timestamp(2010, 7, 30)  # When the program started.
    late_cutoff = pd.Timestamp(2020, 1, 1)  # Approximately now.
    in_range = (df[colname] >= early_cutoff) & (df[colname] < late_cutoff)
    return df[in_range]


def compute_single_events(df, which):
    """Count rows per (date, station) for one end of the journey.

    ``which`` selects the column pair: ``"<which>Station Id"`` and
    ``"<which> Date"``. The result has one row per distinct date and one
    column per distinct station id; combinations that never occur are NaN.
    """
    renames = {
        "{}Station Id".format(which): "Station",
        "{} Date".format(which): "Date",
    }
    renamed = df.rename(columns=renames)
    counts = renamed.groupby(["Date", "Station"]).size()
    return counts.unstack("Station")


def compute_both_events(df):
    """Combine end and start counts into one frame.

    Columns form a two-level index ordered (station, "End"/"Start"); missing
    combinations are filled with 0.0.
    """
    kinds = ["End", "Start"]
    per_kind = [compute_single_events(df, kind) for kind in kinds]
    combined = pd.concat(per_kind, keys=kinds, axis=1)
    # concat puts the End/Start key outermost; swap so the station comes first.
    combined = combined.reorder_levels([1, 0], axis=1)
    return combined.fillna(0.0)


def castable_to_int(obj):
    """Return True if ``int(obj)`` succeeds, False otherwise.

    Besides malformed strings (``ValueError``), values such as ``None``
    (``TypeError``) and infinities (``OverflowError``) are also reported as
    not castable instead of raising.
    """
    try:
        int(obj)
    except (TypeError, ValueError, OverflowError):
        return False
    return True


def cast_to_int(df, colname):
    """Return ``df`` with ``colname`` cast to integer dtype.

    If the cast fails with ``ValueError``, rows whose value in ``colname``
    cannot be converted by ``int`` are dropped and the cast is retried.
    """
    target_dtype = {colname: np.int_}
    try:
        return df.astype(target_dtype, copy=False)
    except ValueError:
        keep = df[colname].apply(castable_to_int)
        return df[keep].astype(target_dtype, copy=False)


# Build (or load from cache) `events_by_station`: a frame indexed by date with
# two-level columns (station, "End"/"Start") holding journey counts.
events_by_station_path = Path("./events_by_station.p")
if events_by_station_path.exists():
    # NOTE: unpickling executes code from the file; only load a cache that
    # this notebook wrote itself.
    events_by_station = pd.read_pickle(events_by_station_path)
else:
    datafiles = sorted(os.listdir(bikefolder))
    folderpath = Path(bikefolder)
    datapaths = [folderpath / Path(file) for file in datafiles]
    datapaths = [p for p in datapaths if p.suffix == ".csv"]

    # Station id -> set of every name seen for that id.
    station_allnames = {}

    # One (date x (station, End/Start)) count frame per input file.
    pieces = []
    cols = [
        "Duration",
        "End Date",
        "EndStation Id",
        "EndStation Name",
        "Start Date",
        "StartStation Id",
        "StartStation Name",
    ]
    problem_paths = []
    for path in datapaths:
        print(path)
        try:
            df = pd.read_csv(path, usecols=cols, encoding="ISO-8859-2")
        except ValueError:
            # Some files have missing or abnormally named columns. We'll deal
            # with them later, once the name -> id mapping is known.
            problem_paths.append(path)
            continue
        # Drop any rows that have missing values.
        df = df[~df.isna().any(axis=1)]
        # Drop any anomalously short trips. Probably somebody just taking a bike and putting
        # it right back in.
        df = df[df["Duration"] > 60]
        # Cast the columns to the right types. This is easier once NAs have been dropped.
        df = cast_to_int(df, "EndStation Id")
        df = cast_to_int(df, "StartStation Id")
        # Turn the date columns from strings into datetime objects rounded to the hour.
        df = clean_datetime_column(df, "End Date")
        df = clean_datetime_column(df, "Start Date")
        events = compute_both_events(df)
        pieces.append(events)

        add_station_names(
            station_allnames, df, "EndStation Name", "EndStation Id"
        )
        add_station_names(
            station_allnames, df, "StartStation Name", "StartStation Id"
        )

    # station_names: id -> one canonical name (alphabetically first).
    # station_ids: every known name -> id.
    station_ids = {}
    station_names = {}
    for k, v in station_allnames.items():
        v = sorted(v)
        station_names[k] = v[0]
        for name in v:
            station_ids[name] = k

    def get_station_id(name):
        """Look up the id for a station name; NaN if the name is unknown."""
        try:
            return station_ids[name]
        except KeyError:
            return np.nan

    print("Doing the problem cases ({} of them).".format(len(problem_paths)))
    safe_cols = [
        "Duration",
        "End Date",
        "EndStation Name",
        "Start Date",
        "StartStation Name",
    ]
    for path in problem_paths:
        print(path)
        df = pd.read_csv(path, usecols=safe_cols, encoding="ISO-8859-2")
        # Drop any rows that have missing values.
        df = df[~df.isna().any(axis=1)]
        # Drop any anomalously short trips. Probably somebody just taking a bike and putting
        # it right back in.
        df = df[df["Duration"] > 60]
        # Add a column of station ids, based on names.
        df["EndStation Id"] = df["EndStation Name"].apply(get_station_id)
        df["StartStation Id"] = df["StartStation Name"].apply(get_station_id)
        # Turn the date columns from strings into datetime objects rounded to the hour.
        # The return value must be kept: it is what applies the date-range
        # filter (previously it was discarded, so out-of-range rows survived).
        df = clean_datetime_column(df, "End Date")
        df = clean_datetime_column(df, "Start Date")
        events = compute_both_events(df)
        pieces.append(events)

    events_by_station = pd.concat(pieces).fillna(0.0)
    # Several files may have contained entries for the same hour, which means that
    # events_by_station has duplicate entries in the index. Get rid of them by summing.
    events_by_station = events_by_station.groupby("Date").sum().sort_index()
    # Finally rename the columns according to the chosen names for stations.
    events_by_station = events_by_station.rename(
        mapper=station_names, axis=1, level=0
    )

    events_by_station.to_pickle(events_by_station_path)

data/bikes/01aJourneyDataExtract10Jan16-23Jan16.csv
data/bikes/01bJourneyDataExtract24Jan16-06Feb16.csv
data/bikes/02aJourneyDataExtract07Feb16-20Feb2016.csv
data/bikes/02bJourneyDataExtract21Feb16-05Mar2016.csv
data/bikes/03JourneyDataExtract06Mar2016-31Mar2016.csv
data/bikes/04JourneyDataExtract01Apr2016-30Apr2016.csv
data/bikes/05JourneyDataExtract01May2016-17May2016.csv
data/bikes/06JourneyDataExtract18May2016-24May2016.csv
data/bikes/07JourneyDataExtract25May2016-31May2016.csv
data/bikes/08JourneyDataExtract01Jun2016-07Jun2016.csv
data/bikes/09JourneyDataExtract08Jun2016-14Jun2016.csv
data/bikes/1. Journey Data Extract 01Jan-05Jan13.csv
data/bikes/1. Journey Data Extract 04Jan-31Jan 12.csv
data/bikes/1. Journey Data Extract 05Jan14-02Feb14.csv
data/bikes/10. Journey Data Extract 18Aug-13Sep13.csv
data/bikes/10. Journey Data Extract 21Aug-22 Aug12.csv
data/bikes/10JourneyDataExtract15Jun2016-21Jun2016.csv
data/bikes/10a Journey Data Extract 20Sep15-03Oct15.csv
data/bikes/10a. Journ

  interactivity=interactivity, compiler=compiler, result=result)


data/bikes/11. Journey Data Extract 14Sep13-12Oct13.csv
data/bikes/11. Journey Data Extract 23Aug-25 Aug12.csv
data/bikes/11JourneyDataExtract22Jun2016-28Jun2016.csv
data/bikes/11a Journey Data Extract 18Oct15-31Oct15.csv
data/bikes/11a. Journey Data Extract 12Oct14-08Nov14.csv
data/bikes/11b Journey Data Extract 01Nov15-14Nov15.csv
data/bikes/11b. Journey Data Extract 12Oct14-08Nov14.csv
data/bikes/12. Journey Data Extract 13Oct13-09Nov13.csv
data/bikes/12. Journey Data Extract 26Aug-27 Aug12.csv
data/bikes/12JourneyDataExtract29Jun2016-05Jul2016.csv
data/bikes/12a Journey Data Extract 15Nov15-27Nov15.csv
data/bikes/12a. Journey Data Extract 09Nov14-06Dec14.csv
data/bikes/12b Journey Data Extract 28Nov15-12Dec15.csv
data/bikes/12b. Journey Data Extract 09Nov14-06Dec14.csv
data/bikes/13. Journey Data Extract 10Nov13-07Dec13.csv
data/bikes/13. Journey Data Extract 28Aug-29 Aug12.csv
data/bikes/13JourneyDataExtract06Jul2016-12Jul2016.csv
data/bikes/13a Journey Data Extract 13Dec15-24Dec1

  interactivity=interactivity, compiler=compiler, result=result)


data/bikes/6aJourneyDataExtract31May15-12Jun15.csv
data/bikes/6bJourneyDataExtract13Jun15-27Jun15.csv
data/bikes/7. Journey Data Extract 22Jun14-19Jul14.csv
data/bikes/7. Journey Data Extract 26May-22Jun13.csv
data/bikes/7. Journey Data Extract_24Jun-21Jul12.csv
data/bikes/7a.JourneyDataExtract28Jun15-11Jul15.csv
data/bikes/7b.JourneyDataExtract12Jul15-25Jul15.csv
data/bikes/8. Journey Data Extract 22Jul-18Aug12.csv
data/bikes/8. Journey Data Extract 23Jun-20Jul13.csv
data/bikes/8a Journey Data Extract 20Jul14-31Jul14.csv
data/bikes/8aJourneyDataExtract26Jul15-07Aug15.csv
data/bikes/8b Journey Data Extract 01Aug14-16Aug14.csv
data/bikes/8bJourneyData Extract 08Aug15-22Aug15.csv
data/bikes/9. Journey Data Extract 19Aug-20 Aug12.csv
data/bikes/9. Journey Data Extract 21Jul-17Aug13.csv
data/bikes/9a Journey Data Extract 17Aug14-31Aug14.csv
data/bikes/9a-Journey-Data-Extract-23Aug15-05Sep15.csv
data/bikes/9b Journey Data Extract 01Sep14-13Sep14.csv
data/bikes/9b-Journey-Data-Extract-06Sep1

In [5]:
def mean_within_window(s):
    """Mean of ``s`` between its first and last non-zero entries.

    The window is found via ``idxmax`` on the non-zero mask, so leading and
    trailing runs of zeros do not dilute the mean.
    """
    is_nonzero = s.ne(0)
    first_active = is_nonzero.idxmax()
    last_active = is_nonzero[::-1].idxmax()
    return s[first_active:last_active].mean()


# Total events (starts + ends) per station: sum over the End/Start column level.
# `DataFrame.sum(level=...)` no longer exists in current pandas; grouping the
# transposed frame by level is the portable equivalent (sort=False keeps the
# original column order).
a = events_by_station.T.groupby(level=0, sort=False).sum().T
# Ratio of each station's busiest hour to its mean while active; the largest
# ratios are listed first.
(a.max() / a.aggregate(mean_within_window)).sort_values(ascending=False)[:30]

Station
Electrical Workshop PS                                      1757.661290
PENTON STREET COMMS TEST TERMINAL _ CONTACT MATT McNULTY    1615.675676
tabletop1                                                   1066.166667
Contact Centre, Southbury House                              780.694737
6                                                            567.974684
Pop Up Dock 1                                                567.806009
Mechanical Workshop Penton                                   223.818810
South Quay East, Canary Wharf                                133.732653
Westfield Eastern Access Road, Shepherd's Bush                88.865062
Thornfield House, Poplar                                      83.572294
Fore Street Avenue: Guildhall                                 72.241404
Upper Grosvenor Street, Mayfair                               71.496163
Courland Grove , Wandsworth Road                              57.658476
Manfred Road, East Putney                               

In [6]:
# Column labels (station level) to remove from the data. Presumably these are
# workshops and test terminals rather than public docking stations; the names
# are taken from the top of the max/mean ratio listing above.
improper_stations = [
    "Electrical Workshop PS",
    "PENTON STREET COMMS TEST TERMINAL _ CONTACT MATT McNULTY",
    "tabletop1",
    "Pop Up Dock 1",
    "6",
    "Mechanical Workshop Penton",
]
# NOTE: this rebinds `events_by_station`, so re-running the cell without
# reloading the data will fail because the labels are already gone.
events_by_station = events_by_station.drop(improper_stations, axis=1, level=0)

In [7]:
# TODO What is up with this?
events_by_station["Exhibition Road Museums, Knightsbridge"]

Unnamed: 0_level_0,End,Start,End,Start
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-01-04 00:00:00,0.0,0.0,0.0,0.0
2012-01-04 01:00:00,0.0,0.0,0.0,0.0
2012-01-04 02:00:00,2.0,0.0,0.0,0.0
2012-01-04 03:00:00,0.0,0.0,0.0,0.0
2012-01-04 04:00:00,0.0,0.0,0.0,0.0
...,...,...,...,...
2017-05-16 22:00:00,1.0,1.0,0.0,0.0
2017-05-16 23:00:00,0.0,0.0,0.0,0.0
2017-05-17 00:00:00,1.0,0.0,0.0,0.0
2017-05-17 01:00:00,0.0,0.0,0.0,0.0


# Exploring

In [8]:
# Distinct station labels (outer column level).
stations = events_by_station.columns.get_level_values(0).unique()
# System-wide End and Start counts per timestamp: sum over stations.
# `DataFrame.sum(level=...)` no longer exists in current pandas; grouping the
# transposed frame by level is the portable equivalent.
events_by_time = events_by_station.T.groupby(level=1, sort=False).sum().T
# Total count per (station, End/Start) column over the whole period.
totals_by_station = events_by_station.sum(axis=0)
# The timestamps as a Series, so the `.dt` accessor can be used on them.
times = events_by_station.index.to_series()

In [9]:
test_stations = [
    "Waterloo Station 3, Waterloo",
    "Hyde Park Corner, Hyde Park",
    "Wenlock Road , Hoxton",
    "Stonecutter Street, Holborn",
]

In [10]:
# Weekly usage profile of the example stations, one facet per station.
# NOTE: despite the variable name, the values are sums (not means) of events
# over all observations sharing a (weekday, hour) pair.
# TODO Give the plots widths from variance or something like [25%, 75%] limits (what are these called again?).
example_means_over_week = (
    events_by_station[test_stations]
    .groupby([times.dt.weekday, times.dt.hour])
    .sum()
)
example_means_over_week.index.rename(["Day", "Hour"], inplace=True)
# Move both column levels into the index, giving one row per
# (Day, Hour, Station, End/Start); the unnamed End/Start level surfaces as
# "level_3" after reset_index.
example_means_over_week = (
    example_means_over_week.stack(level=[0, 1])
    .reset_index()
    .rename(columns={"level_3": "End/Start", 0: "Count"})
)
# Fractional day of the week, used as the x coordinate.
example_means_over_week["Weekday"] = example_means_over_week.apply(
    lambda x: x["Day"] + x["Hour"] / 24, axis=1,
)
g = sns.FacetGrid(
    example_means_over_week,
    col_wrap=2,
    col="Station",
    hue="End/Start",
    sharey=False,
    sharex=True,
)
g.map(plt.plot, "Weekday", "Count").set_titles("{col_name}")
g.add_legend();

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [45]:
# Yearly usage profile of the example stations, one facet per station.
# NOTE: despite the variable name, the values are sums (not means) of events
# over all observations falling in the same ISO week number.
# TODO Give the plots widths from variance or something like [25%, 75%] limits (what are these called again?).
# `Series.dt.week` no longer exists in current pandas; `dt.isocalendar().week`
# gives the same ISO week number (requires pandas >= 1.1).
week_of_year = times.dt.isocalendar().week.astype(int)
example_means_over_year = (
    events_by_station[test_stations].groupby(week_of_year).sum()
)
# Leave out the first and last weeks, since they are usually shorter and thus the data isn't comparable.
example_means_over_year = example_means_over_year.iloc[1:51]
example_means_over_year.index.rename("Week", inplace=True)
# One row per (Week, Station, End/Start); the unnamed End/Start level surfaces
# as "level_2" after reset_index.
example_means_over_year = (
    example_means_over_year.stack(level=[0, 1])
    .reset_index()
    .rename(columns={"level_2": "End/Start", 0: "Count"})
)
g = sns.FacetGrid(
    example_means_over_year,
    col_wrap=2,
    col="Station",
    hue="End/Start",
    sharey=False,
    sharex=True,
)
g.map(plt.plot, "Week", "Count").set_titles("{col_name}")
g.add_legend();

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [46]:
# Long-term trend of the example stations: rolling mean over a 365-day window.
# min_periods=24*90 suppresses values until the window holds at least 2160
# observations (90 days' worth, assuming one row per hour).
yearly_rolling = events_by_station.loc[:, test_stations].rolling("365d", min_periods=24*90).mean()
# One row per (Date, Station, End/Start); the unnamed End/Start level surfaces
# as "level_2" after reset_index.
yearly_rolling = (
    yearly_rolling.stack(level=[0, 1])
    .reset_index()
    .rename(columns={"level_2": "End/Start", 0: "Count"})
)
g = sns.FacetGrid(
    yearly_rolling,
    col_wrap=2,
    col="Station",
    hue="End/Start",
    sharey=False,
    sharex=True,
)
g.map(plt.plot, "Date", "Count").set_titles("{col_name}")
g.add_legend();

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Feature extraction

In [13]:
# Two alternative encodings of the timestamp, both indexed like `times`:
#   predictors_cat  - one-hot week of year, hour of day and weekday;
#   predictors_trig - sine/cosine of the yearly and daily phase, plus one-hot weekday.
# `dt.weekday_name` and `dt.week` no longer exist in current pandas;
# `dt.day_name()` and `dt.isocalendar().week` (pandas >= 1.1) replace them.
# The dummy dtype is pinned so the encoding is the same across pandas versions.
week_of_year = times.dt.isocalendar().week.astype(int)
weekday_dummies = pd.get_dummies(times.dt.day_name(), dtype=np.uint8)
week_dummies = pd.get_dummies(week_of_year, dtype=np.uint8)
hour_dummies = pd.get_dummies(times.dt.hour, dtype=np.uint8)
hour_dummies = hour_dummies.rename(
    columns={c: "Hour {}".format(c) for c in hour_dummies.columns}
)
week_dummies = week_dummies.rename(
    columns={c: "Week {}".format(c) for c in week_dummies.columns}
)
predictors_cat = pd.concat(
    [week_dummies, hour_dummies, weekday_dummies,], axis=1,
)

day_angles = 2 * np.pi * times.dt.hour / 24
year_angles = (
    2 * np.pi * week_of_year / 52
)  # TODO Should we do this by day or week or month?
predictors_trig = pd.concat(
    [
        pd.DataFrame(
            {
                "Year sine": np.sin(year_angles),
                "Year cosine": np.cos(year_angles),
            }
        ),
        pd.DataFrame(
            {"Day sine": np.sin(day_angles), "Day cosine": np.cos(day_angles)}
        ),
        weekday_dummies,
    ],
    axis=1,
)

In [14]:
class RollingValidator:
    """Backwards-rolling cross-validation for time-indexed data.

    Starting from the last timestamp, the data are cut into consecutive test
    windows of length ``prediction_time``. For each window, everything up to
    the window's start (``cutoff``) is the training set. Windows are created
    while the training set spans more than ``min_training_time``.

    Each entry of ``cv_batches`` is the tuple
    ``(training_data, training_common_predictors, training_specific_predictors,
    test_data, test_common_predictors, test_specific_predictors)``.

    ``models`` maps a model class to the results of ``test_modelclass``. The
    key is the class object itself, so a class that is re-defined (e.g. by
    re-running its notebook cell) is a different key from the one stored.

    NOTE(review): the slices are label based; if a row sits exactly at
    ``cutoff`` it appears to land in both the training and the test slice.
    """

    def __init__(
        self,
        data,
        common_predictors,
        specific_predictors,
        min_training_time,
        prediction_time,
    ):
        """Pre-compute the train/test splits.

        Parameters
        ----------
        data : pandas.DataFrame
            Target values, indexed by time.
        common_predictors : pandas.DataFrame
            Predictors shared by all target columns, indexed by time.
        specific_predictors : pandas.DataFrame
            Predictors sliced the same way; models look them up per target
            column.
        min_training_time : pandas.Timedelta
            Minimum span the training set must exceed.
        prediction_time : pandas.Timedelta
            Length of each test window.
        """
        cv_batches = []
        first_time = data.index.min()
        last_time = data.index.max()
        test_end_time = last_time
        cutoff = test_end_time - prediction_time
        while cutoff > first_time + min_training_time:
            cv_batches.append(
                (
                    data[:cutoff],
                    common_predictors[:cutoff],
                    specific_predictors[:cutoff],
                    data[cutoff:test_end_time],
                    common_predictors[cutoff:test_end_time],
                    specific_predictors[cutoff:test_end_time],
                )
            )
            # Step one window further back in time.
            test_end_time = cutoff
            cutoff = test_end_time - prediction_time
        self.cv_batches = cv_batches
        self.models = {}
        print(
            "Created a RollingValidator with {} cross-validation batches.".format(
                len(cv_batches)
            )
        )

    def test_modelclass(self, modelclass, print_progress=False):
        """Train and evaluate a fresh ``modelclass()`` on every batch.

        ``modelclass`` must be constructible without arguments and provide
        ``train(data, common_predictors, specific_predictors)`` and
        ``predict(common_predictors, specific_predictors)``.

        Per-batch errors and predictions, plus the per-column mean absolute
        errors, are stored in ``self.models[modelclass]``.

        Returns
        -------
        The test mean absolute error summed over all target columns.
        """
        test_errors = []
        training_errors = []
        test_predictions = []
        training_predictions = []
        for i, cv_batch in enumerate(self.cv_batches):
            (
                training_data,
                training_common_predictors,
                training_specific_predictors,
                test_data,
                test_common_predictors,
                test_specific_predictors,
            ) = cv_batch
            if print_progress:
                print("Training for batch {}.".format(i))
            model = modelclass()
            model.train(
                training_data,
                training_common_predictors,
                training_specific_predictors,
            )
            if print_progress:
                print("Predicting for batch {}.".format(i))
            test_prediction = model.predict(
                test_common_predictors, test_specific_predictors
            )
            training_prediction = model.predict(
                training_common_predictors, training_specific_predictors
            )
            test_errors.append(test_prediction - test_data)
            training_errors.append(training_prediction - training_data)
            test_predictions.append(test_prediction)
            # Store this batch's prediction (previously the list was appended
            # to itself, leaving no predictions and a self-referencing list).
            training_predictions.append(training_prediction)
        test_mae = pd.concat(test_errors).abs().mean()
        training_mae = pd.concat(training_errors).abs().mean()
        self.models[modelclass] = {
            "test_mae": test_mae,
            "training_mae": training_mae,
            "test_errors": test_errors,
            "training_errors": training_errors,
            "test_predictions": test_predictions,
            "training_predictions": training_predictions,
        }
        return test_mae.sum()

In [15]:
# Per-column 7-day rolling mean of the event counts, standardised to zero mean
# and unit standard deviation over the whole series.
# NOTE(review): the rolling window appears to include the current row, and the
# mean/std are computed over the full period (including later test windows) —
# confirm this is acceptable when the result is used as a predictor.
weekly_rolling = events_by_station.rolling("7d").mean()
weekly_rolling = weekly_rolling - weekly_rolling.mean()
weekly_rolling = weekly_rolling / weekly_rolling.std()

In [16]:
# (station, End/Start) columns used as prediction targets.
test_columns = [
    ("Waterloo Station 3, Waterloo", "Start"),
    ("Hyde Park Corner, Hyde Park", "End"),
    ("Wenlock Road , Hoxton", "End"),
]
# Alternative: both End and Start for every station in test_stations.
# test_columns = sum(
#     [[(s, es) for s in test_stations] for es in ("End", "Start")], []
# )
data = events_by_station[test_columns]
# Per-target predictors: the standardised weekly rolling mean of the same column.
specific_predictors = weekly_rolling[test_columns]
# Train on more than two years; evaluate on half-year windows.
min_training_time = 2 * pd.Timedelta("365d")
prediction_time = 0.5 * pd.Timedelta("365d")
# One validator per encoding of time (trigonometric vs. one-hot dummies).
rv_trig = RollingValidator(
    data, predictors_trig, specific_predictors, min_training_time, prediction_time,
)
rv_dum = RollingValidator(
    data, predictors_cat, specific_predictors, min_training_time, prediction_time,
)

Created a RollingValidator with 6 cross-validation batches.
Created a RollingValidator with 6 cross-validation batches.


# Models

In [17]:
class SimpleMean:
    """Baseline model: predict each column's training mean at every time."""

    def __init__(self):
        self.model = None

    def train(self, data, common_predictors, specific_predictors):
        """Store the per-column mean of ``data``; the predictors are ignored."""
        self.model = pd.DataFrame(data.mean())

    def predict(self, common_predictors, specific_predictors):
        """Return a frame indexed like ``common_predictors`` in which every
        row holds the stored means."""
        target_index = common_predictors.index
        n_rows = len(target_index)

        def repeat_mean(column):
            # `column` has the single label 0, which holds that column's mean.
            return [column[0]] * n_rows

        repeated = self.model.T.apply(repeat_mean)
        return repeated.set_index(target_index)

In [18]:
print(rv_dum.test_modelclass(SimpleMean, print_progress=True))

Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
25.571555752475582


Results for Linear with the trigonometric predictors are bad because a linear model can then only represent time variation that is essentially sinusoidal.

In [19]:
class GenericModel:
    """Fit one independent regressor per target column.

    Subclasses set ``regressor`` to something that, when called as
    ``self.regressor()``, returns an object with ``fit(X, y)`` and
    ``predict(X)``. The features for a column are the common predictors plus
    that column's entry in the specific predictors.
    """

    # Factory for the per-column regressor; must be overridden by subclasses.
    regressor = None

    def __init__(self):
        # Target column -> fitted regressor.
        self.model = {}

    @staticmethod
    def _features_for(column, common_predictors, specific_predictors):
        # Common predictors side by side with this column's own predictor.
        return pd.concat(
            [common_predictors, specific_predictors[column]], axis=1
        )

    def train(self, data, common_predictors, specific_predictors):
        """Fit a fresh regressor for every column of ``data``."""
        for column in data.columns:
            features = self._features_for(
                column, common_predictors, specific_predictors
            )
            fitted = self.regressor()
            fitted.fit(features, data[column])
            self.model[column] = fitted

    def predict(self, common_predictors, specific_predictors):
        """Return predictions for every trained column, indexed by the
        predictors' index (named "Time")."""
        predictions = pd.DataFrame(
            {"Time": common_predictors.index}
        ).set_index("Time")
        for column, fitted in self.model.items():
            features = self._features_for(
                column, common_predictors, specific_predictors
            )
            predictions[column] = fitted.predict(features)
        return predictions

In [20]:
class Linear(GenericModel):
    """Per-column ridge regression with the regressor's default settings."""

    regressor = linear_model.Ridge

In [21]:
print(rv_dum.test_modelclass(Linear, print_progress=True))
print(rv_trig.test_modelclass(Linear, print_progress=True))

Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
18.76985646243159
Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
23.737427990201304


In [22]:
class LinearSVR(GenericModel):
    """Per-column linear support vector regression with default settings."""

    # GenericModel.train calls `self.regressor()`, which binds this lambda as
    # a method: `x` receives the model instance and is ignored.
    regressor = lambda x: svm.LinearSVR()

In [23]:
print(rv_dum.test_modelclass(LinearSVR, print_progress=True))
print(rv_trig.test_modelclass(LinearSVR, print_progress=True))

Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
15.62731098509729
Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
17.455293449130554


In [24]:
class KNeighbors(GenericModel):
    """Per-column k-nearest-neighbours regression (5 neighbours, distance-weighted)."""

    # GenericModel.train calls `self.regressor()`, which binds this lambda as
    # a method: `x` receives the model instance and is ignored.
    regressor = lambda x: neighbors.KNeighborsRegressor(n_neighbors=5, weights="distance")

In [26]:
print(rv_dum.test_modelclass(KNeighbors, print_progress=True))
print(rv_trig.test_modelclass(KNeighbors, print_progress=True))

Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
13.07254488735968


In [26]:
class SVR(GenericModel):
    """Per-column support vector regression with an RBF kernel."""

    # GenericModel.train calls `self.regressor()`, which binds this lambda as
    # a method: `x` receives the model instance and is ignored.
    regressor = lambda x: svm.SVR(kernel="rbf", cache_size=500)

In [27]:
print(rv_dum.test_modelclass(SVR, print_progress=True))
print(rv_trig.test_modelclass(SVR, print_progress=True))

Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
13.333613444960976
Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
14.32308049764043


KernelRidge also can't handle this amount of training data without choking on RAM.

In [28]:
class DecisionTree(GenericModel):
    """Per-column decision-tree regression, limited to depth 3."""

    # GenericModel.train calls `self.regressor()`, which binds this lambda as
    # a method: `x` receives the model instance and is ignored.
    # NOTE(review): newer scikit-learn releases appear to spell this criterion
    # "absolute_error" — confirm against the installed version.
    regressor = lambda x: tree.DecisionTreeRegressor(criterion="mae", max_depth=3)

In [29]:
print(rv_dum.test_modelclass(DecisionTree, print_progress=True))
print(rv_trig.test_modelclass(DecisionTree, print_progress=True))

Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
15.272308189212202
Training for batch 0.
Predicting for batch 0.
Training for batch 1.
Predicting for batch 1.
Training for batch 2.
Predicting for batch 2.
Training for batch 3.
Predicting for batch 3.
Training for batch 4.
Predicting for batch 4.
Training for batch 5.
Predicting for batch 5.
14.943021992541915


In [32]:
import pickle
with open("latest_rv_dum.p", "wb") as f:
    pickle.dump(rv_dum, f)
with open("latest_rv_trig.p", "wb") as f:
    pickle.dump(rv_trig, f)

In [37]:
rv = rv_trig
plot_column = ("Hyde Park Corner, Hyde Park", "End")
# plot_columns = [("Wenlock Road , Hoxton", "End")]
# plot_columns = [("Waterloo Station 3, Waterloo",  "Start")]
modelclass = KNeighbors
err = pd.concat(rv.models[modelclass]["test_errors"]).sort_index()
pred = pd.concat(rv.models[modelclass]["test_predictions"]).sort_index()
truth = pd.concat([l[3] for l in rv.cv_batches]).sort_index()
plt.figure()
plt.plot(truth[plot_column], err[plot_column], ls="", marker="*", ms=1)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x7ff5727b0a90>]

In [64]:
weekoffset = (events_by_station[test_columns] - events_by_station[test_columns].shift(freq=-pd.Timedelta("7d")))
print(weekoffset.abs().mean())
# weekoffset.plot()#.rolling("30d").sum().plot()
weekoffset.hist(bins=30)

Station                            
Waterloo Station 3, Waterloo  Start    4.063871
Hyde Park Corner, Hyde Park   End      6.460352
Wenlock Road , Hoxton         End      1.059639
dtype: float64


  fig = plt.figure(**fig_kw)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7ff5600f4790>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7ff55fb82d90>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7ff55fbb6c10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7ff55fb6d950>]],
      dtype=object)

In [55]:
rv = rv_dum
plot_columns = [("Hyde Park Corner, Hyde Park", "End")]
# plot_columns = [("Wenlock Road , Hoxton", "End")]
# plot_columns = [("Waterloo Station 3, Waterloo",  "Start")]
modelclass = KNeighbors
err = pd.concat(rv.models[modelclass]["test_errors"]).sort_index()
pred = pd.concat(rv.models[modelclass]["test_predictions"]).sort_index()
truth = pd.concat([l[3] for l in rv.cv_batches]).sort_index()
plt.figure()
plt.plot(truth[plot_columns])
plt.plot(pred[plot_columns])

KeyError: <class '__main__.KNeighbors'>

In [205]:
# Plot the 14-day rolling mean of each test column, relative to that column's
# overall mean. Local names are used so the global `data` frame (the
# validators' targets) is not overwritten with a Series.
for c in test_columns:
    station_events = events_by_station[c]
    relative_events = station_events / station_events.mean()
    # relative_events = station_events / station_events.rolling("365d").mean()
    relative_events.rolling("14d").mean().plot()

  raw_cell, store_history, silent, shell_futures)
  fig = self.plt.figure(figsize=self.figsize)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  raw_cell, store_history, silent, shell_futures)
  fig = self.plt.figure(figsize=self.figsize)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  raw_cell, store_history, silent, shell_futures)
  fig = self.plt.figure(figsize=self.figsize)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [35]:
events_by_time.groupby(times.dt.date).sum().hist(bins=20)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7ff572a456d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7ff57266ab90>]],
      dtype=object)