In [1]:
import os
import pandas as pd

def load_predictions_data(predictions_path=None):
    """Load the predictions CSV into a DataFrame.

    Parameters
    ----------
    predictions_path : str or path-like, optional
        Location of the CSV file. When omitted, the file
        ``datasets/predictions_prod_v2.csv`` (relative to the current working
        directory) is read, which is what this function always did before the
        parameter existed.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV. ``predictions_route_id`` and ``predictions_stop_id``
        are read as strings, so identifiers that look numeric are not
        converted to numbers (and keep any leading zeros).
    """
    if predictions_path is None:
        predictions_path = os.path.join("datasets", "predictions_prod_v2.csv")
    id_column_dtypes = {"predictions_route_id": str, "predictions_stop_id": str}
    return pd.read_csv(predictions_path, dtype=id_column_dtypes)

In [2]:
# Read the predictions table from disk; the train/test sets are derived from `data`.
data = load_predictions_data()

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

# Keep only rows where both arrival-time columns are present; the target below
# cannot be computed otherwise. Dropping NaNs is idempotent, so re-running this
# cell leaves `data` unchanged.
data = data.dropna(subset=["predictions_arrival_time", "ve_arrival_time"])

# Regression target: ve_arrival_time minus predictions_arrival_time.
# It is kept in its own Series instead of being written back into
# data["ve_arrival_time"]; overwriting the column made the cell non-idempotent
# (a second execution subtracted the prediction a second time).
arrival_time_error = (
    data["ve_arrival_time"] - data["predictions_arrival_time"]
).rename("ve_arrival_time")

# One shuffled 80/20 split, stratified on the route id so both sets contain the
# routes in the same proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(data, data["predictions_route_id"]))

train_labels = arrival_time_error.iloc[train_index].copy()
test_labels = arrival_time_error.iloc[test_index].copy()

# The feature frames must not carry ve_arrival_time, which the target is built from.
strat_train_set = data.iloc[train_index].drop("ve_arrival_time", axis=1)
strat_test_set = data.iloc[test_index].drop("ve_arrival_time", axis=1)

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime
import numpy as np

class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed set of columns.

    The two constructor flags are stored on the instance but are not consulted
    by ``transform``: the same columns are dropped whatever their values.
    """

    def __init__(self, include_stops_away=False, include_route_id=False):
        self.include_stops_away = include_stops_away
        self.include_route_id = include_route_id

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a new frame equal to ``X`` minus the columns listed below.

        pandas raises ``KeyError`` if any of the listed columns is missing.
        """
        unwanted_columns = [
            "predictions_id",
            "predictions_trip_id",
            "predictions_stop_sequence",
            "predictions_vehicle_event_id",
            "predictions_vehicle_id",
            "predictions_file_timestamp",
        ]
        return X.drop(columns=unwanted_columns)

class IsStoppedTransformer(BaseEstimator, TransformerMixin):
    """Replace ``predictions_boarding_status`` with a boolean ``is_stopped`` column.

    ``is_stopped`` is True exactly where a boarding status is present (not
    null). The input frame is not modified.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Stateless: nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``is_stopped`` added and the status column removed."""
        has_boarding_status = X['predictions_boarding_status'].notnull()
        with_flag = X.assign(is_stopped=has_boarding_status)
        return with_flag.drop(columns='predictions_boarding_status')

class TimestampTransformer(BaseEstimator, TransformerMixin):
    """Derive calendar features from ``predictions_file_timestamp``.

    Two columns are added to a copy of the input frame:

    * ``predictions_day_of_week`` - ``datetime.weekday()`` (Monday is 0).
    * ``predictions_time_bin`` - index of the 15-minute slot within the day
      (``hour * 4 + minute // 15``, i.e. 0..95).

    Parameters
    ----------
    tz : datetime.tzinfo, optional
        Time zone in which the timestamps are interpreted, e.g.
        ``datetime.timezone.utc``. The default ``None`` keeps the previous
        behaviour of using the local time zone of the machine running the
        code; note that this makes both features depend on where the notebook
        is executed, so pass an explicit zone for reproducible results.
    """

    def __init__(self, tz=None):
        self.tz = tz

    def fit(self, X, y=None):
        """Stateless: nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the two derived columns appended.

        Values of ``predictions_file_timestamp`` are passed to
        ``datetime.fromtimestamp`` and are therefore treated as POSIX
        timestamps.
        """
        new_data = X.copy()
        datetimes = new_data['predictions_file_timestamp'].apply(
            lambda ts: datetime.fromtimestamp(ts, self.tz),
        )
        new_data['predictions_day_of_week'] = datetimes.apply(
            lambda moment: moment.weekday()
        )
        new_data['predictions_time_bin'] = datetimes.apply(self._timestamp_bin)
        return new_data

    def _timestamp_bin(self, timestamp):
        """Map a datetime to its quarter-hour slot of the day (0 = 00:00-00:14)."""
        return timestamp.hour * 4 + timestamp.minute // 15

class ColumnOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode a single DataFrame column.

    The categories are learned once in ``fit`` and reused by every
    ``transform`` call, so frames transformed by the same fitted instance
    (e.g. train and test) always get the same indicator columns in the same
    order. Previously a fresh encoder was fitted inside ``transform``, which
    produced a different column layout for every frame, and its sparse output
    could not be combined with the remaining columns by ``np.concatenate``.

    Parameters
    ----------
    column : str
        Name of the column to encode.
    """

    def __init__(self, column):
        self.column = column

    def _column_as_2d(self, X):
        """Return the target column as an (n_rows, 1) array, the layout OneHotEncoder expects."""
        return X[self.column].values.reshape(-1, 1)

    def fit(self, X, y=None):
        """Learn the categories of ``self.column`` from ``X``."""
        # Categories unseen during fit are encoded as all zeros instead of
        # raising; the old refit-on-every-call code never raised on them either.
        self.encoder_ = OneHotEncoder(handle_unknown="ignore")
        self.encoder_.fit(self._column_as_2d(X))
        return self

    def transform(self, X, y=None):
        """Return a 2-D NumPy array: the other columns of ``X`` followed by the indicators.

        Raises scikit-learn's ``NotFittedError`` if ``fit`` has not been called.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "encoder_")
        one_hotted = self.encoder_.transform(self._column_as_2d(X))
        # OneHotEncoder returns a SciPy sparse matrix by default, which
        # np.concatenate cannot combine with a dense array.
        if hasattr(one_hotted, "toarray"):
            one_hotted = one_hotted.toarray()
        remaining = X.drop(self.column, axis=1).values
        return np.concatenate((remaining, one_hotted), axis=1)
    
class DebugTransformer(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints a summary of the data it receives.

    Insert it between two pipeline steps to inspect the intermediate data; the
    data itself is returned unchanged.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Stateless: nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Print a summary of ``X`` and return ``X`` unchanged."""
        if hasattr(X, "info"):
            # DataFrame.info() writes its report to stdout itself and returns
            # None, so wrapping it in print() only appended a stray "None".
            X.info()
        else:
            # Arrays / sparse matrices (e.g. the output of a ColumnTransformer)
            # have no .info(); report their type and shape instead of failing.
            print(type(X).__name__, getattr(X, "shape", None))
        return X


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures

# Columns handed to the model unchanged. 'is_stopped' is created by the
# IsStoppedTransformer step of the pipeline below.
# Currently left out: 'predictions_arrival_time', 'predictions_file_timestamp',
# 'predictions_direction_id'.
passthrough_features = ['is_stopped', 'predictions_stops_away']

# Categorical columns expanded into indicator columns. The day-of-week and
# time-bin columns are created by the TimestampTransformer step.
# Currently left out: 'predictions_route_id'.
one_hot_features = [
    'predictions_stop_id',
    'predictions_day_of_week',
    'predictions_time_bin',
]

# Columns not named in either list are discarded (ColumnTransformer's default).
column_transformer = ColumnTransformer(
    transformers=[
        ('pass', 'passthrough', passthrough_features),
        ('1hot', OneHotEncoder(), one_hot_features),
    ],
)

# Feature engineering first, column selection / encoding last.
predictions_pipeline = Pipeline(
    steps=[
        ('timestamp', TimestampTransformer()),
        ('is_stopped', IsStoppedTransformer()),
        ('col_transformer', column_transformer),
    ],
)

In [6]:
# Fit every pipeline step on the training features and keep the transformed
# training matrix; the models below are trained on it.
fitted = predictions_pipeline.fit_transform(strat_train_set)

In [7]:
from sklearn.linear_model import LinearRegression

# Linear-regression baseline trained on the transformed training features.
lin_reg = LinearRegression()
lin_reg.fit(fitted, train_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [8]:
# Predict on the same rows the model was fitted on (in-sample predictions).
predictions = lin_reg.predict(fitted)
predictions

array([-23.21740261,  46.2108025 , -65.98415271, ..., 109.45812668,
        -6.25224233, 215.24304989])

In [9]:
from sklearn.metrics import mean_squared_error
# Root-mean-squared error of the linear model. It is computed on the training
# rows themselves, so it measures fit quality, not out-of-sample accuracy.
lin_mse = mean_squared_error(train_labels, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

353.1801577903579

In [10]:
# Recover one name per column of the transformed matrix, in ColumnTransformer
# output order: passthrough columns first, then the one-hot indicator columns.
ct = predictions_pipeline.named_steps['col_transformer']
passthrough_cols = ct.transformers[0][2]
onehot_encoder = ct.named_transformers_['1hot']

# OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
# (The newer method may label columns with the input column name instead of
# the positional "x0_", "x1_", ... prefixes.)
if hasattr(onehot_encoder, 'get_feature_names_out'):
    onehot_cols = onehot_encoder.get_feature_names_out()
else:
    onehot_cols = onehot_encoder.get_feature_names()

features = list(passthrough_cols) + onehot_cols.tolist()

# zip() silently truncates to the shorter input, so make sure no column is
# left without a name before pairing names with coefficients.
assert len(features) == fitted.shape[1], "feature names do not match the transformed columns"

In [11]:
# Pair each learned coefficient with its feature name and list the pairs in
# ascending order of coefficient (ties are ordered by feature name).
sorted(zip(lin_reg.coef_, features))

[(-1154.6505037311113, 'x0_70105'),
 (-932.0391250557224, 'x0_70036'),
 (-872.3213991797542, 'x0_70061'),
 (-829.4215087435948, 'x0_70001'),
 (-629.1264937522444, 'x2_16'),
 (-540.3536680158431, 'x2_17'),
 (-240.71651717522363, 'x0_70260'),
 (-236.71556215313544, 'x2_7'),
 (-193.21185143022277, 'x0_70162'),
 (-175.9216083134484, 'x0_70164'),
 (-172.08170171916086, 'x0_Government Center-Brattle'),
 (-168.35008335376776, 'x0_70274'),
 (-160.4471881132956, 'x0_70166'),
 (-155.26259552475204, 'x0_70196'),
 (-145.00706589686635, 'x0_70168'),
 (-130.4156842093666, 'x2_6'),
 (-129.02987368091655, 'x0_70170'),
 (-114.12175803004381, 'x2_18'),
 (-95.2855166200204, 'x0_70096'),
 (-91.18282035186287, 'x0_70198'),
 (-87.24500335846471, 'x0_70101'),
 (-83.76047623448623, 'x0_70236'),
 (-81.53269446815621, 'x0_70103'),
 (-79.08898348995837, 'x0_70234'),
 (-74.14503819464028, 'x0_70232'),
 (-70.56668091713885, 'x0_70098'),
 (-67.9221687586028, 'x0_70230'),
 (-66.85048323257884, 'x0_70097'),
 (-57.104

In [12]:
# Intercept term of the fitted linear model.
lin_reg.intercept_

45.62165167194758

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Small random forest trained on the same transformed training features.
# random_state is pinned (same seed as the train/test split) so repeated runs
# grow the same trees and therefore produce the same predictions.
rand_forest = RandomForestRegressor(n_estimators=10, random_state=42)
rand_forest.fit(fitted, train_labels)



In [None]:
# In-sample predictions of the random forest.
# NOTE: this rebinds `predictions`, which earlier held the linear-regression
# output; re-run the linear-model cells before re-using their metrics.
predictions = rand_forest.predict(fitted)
predictions