In [1]:
import os
import pandas as pd

def load_predictions_data(predictions_path=None):
    """Load the predictions CSV into a DataFrame.

    Parameters
    ----------
    predictions_path : str or path-like, optional
        CSV file to read. When omitted, ``datasets/predictions_prod_v2.csv``
        (relative to the current working directory) is used.

    Returns
    -------
    pandas.DataFrame
        The parsed file, with ``predictions_route_id`` and
        ``predictions_stop_id`` read as strings.
    """
    if predictions_path is None:
        predictions_path = os.path.join("datasets", "predictions_prod_v2.csv")
    # Force the id columns to str so pandas does not infer a numeric dtype
    # for them.
    return pd.read_csv(
        predictions_path,
        dtype={"predictions_route_id": str, "predictions_stop_id": str},
    )

In [2]:
# Read the predictions CSV into memory via load_predictions_data().
data = load_predictions_data()

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

# Keep only rows that carry both a predicted and a recorded arrival time.
data = data.dropna(subset=["predictions_arrival_time", "ve_arrival_time"])

# One 80/20 shuffle split, stratified on the route id, with a fixed seed.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(data, data["predictions_route_id"]))
strat_train_set, strat_test_set = data.iloc[train_index], data.iloc[test_index]

# Separate the target column (ve_arrival_time) from the features of each split.
train_labels, test_labels = (
    subset["ve_arrival_time"].copy() for subset in (strat_train_set, strat_test_set)
)
strat_train_set = strat_train_set.drop(columns=["ve_arrival_time"])
strat_test_set = strat_test_set.drop(columns=["ve_arrival_time"])

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime
import numpy as np

class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    """Drop a fixed list of columns from a DataFrame.

    ``include_stops_away`` and ``include_route_id`` are stored on the
    instance but are not consulted by ``transform``: the same columns are
    removed whatever their values.
    """

    def __init__(self, include_stops_away=False, include_route_id=False):
        self.include_stops_away = include_stops_away
        self.include_route_id = include_route_id

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` without the columns listed below.

        Raises ``KeyError`` (from ``DataFrame.drop``) when any of the listed
        columns is absent from ``X``.
        """
        unwanted_columns = [
            "predictions_id",
            "predictions_trip_id",
            "predictions_stop_sequence",
            "predictions_vehicle_event_id",
            "predictions_vehicle_id",
            "predictions_file_timestamp",
        ]
        return X.drop(columns=unwanted_columns)

class IsStoppedTransformer(BaseEstimator, TransformerMixin):
    """Swap ``predictions_boarding_status`` for a boolean ``is_stopped`` column.

    ``is_stopped`` is True on every row where the boarding status is not null.
    """

    def __init__(self):
        """No configuration is required."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with ``is_stopped`` added and the status column removed."""
        has_status = X['predictions_boarding_status'].notnull()
        flagged = X.assign(is_stopped=has_status)
        return flagged.drop(columns='predictions_boarding_status')

class TimestampTransformer(BaseEstimator, TransformerMixin):
    """Derive calendar features from ``predictions_file_timestamp``.

    Two columns are added to a copy of the input frame:

    * ``predictions_day_of_week`` - ``datetime.weekday()`` of the timestamp
      (0 = Monday ... 6 = Sunday).
    * ``predictions_time_bin`` - quarter-hour slot of the day, 0 through 95.

    Parameters
    ----------
    tz : datetime.tzinfo, optional
        Time zone in which the epoch timestamps are interpreted. With the
        default ``None`` the conversion uses the local time zone of the
        machine running the code (the original behaviour), so the derived
        features depend on where the notebook is executed. Pass an explicit
        tzinfo (for example ``datetime.timezone.utc``) for reproducible
        results.
    """

    def __init__(self, tz=None):
        self.tz = tz

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the two derived columns added."""
        new_data = X.copy()
        # fromtimestamp(epoch, None) is identical to fromtimestamp(epoch),
        # so the default keeps the historical local-time interpretation.
        datetimes = new_data['predictions_file_timestamp'].apply(
            lambda epoch: datetime.fromtimestamp(epoch, self.tz),
        )
        new_data['predictions_day_of_week'] = datetimes.apply(
            lambda moment: moment.weekday()
        )
        new_data['predictions_time_bin'] = datetimes.apply(self._timestamp_bin)
        return new_data

    def _timestamp_bin(self, timestamp):
        """Map a datetime to its quarter-hour slot: four per hour, 0..95."""
        return timestamp.hour * 4 + timestamp.minute // 15

class ColumnOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode one DataFrame column and append it to the remaining columns.

    The categories are learned in ``fit`` and reused by every later
    ``transform`` call, so different data sets (e.g. train and test) are
    encoded into the same indicator columns.

    Parameters
    ----------
    column : str
        Name of the column to encode.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Learn the categories of ``self.column`` from ``X``."""
        # Categories unseen during fit encode as all zeros instead of raising,
        # so transforming new data cannot fail on a novel value.
        self.encoder_ = OneHotEncoder(handle_unknown='ignore')
        self.encoder_.fit(self._column_values(X))
        return self

    def transform(self, X, y=None):
        """Return a 2-D array: the other columns of ``X`` followed by the indicators.

        Raises ``sklearn.exceptions.NotFittedError`` when ``fit`` has not
        been called.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'encoder_')
        one_hotted = self.encoder_.transform(self._column_values(X))
        # OneHotEncoder returns a scipy sparse matrix by default, which
        # np.concatenate cannot combine with a dense array.
        if hasattr(one_hotted, 'toarray'):
            one_hotted = one_hotted.toarray()
        remaining = X.drop(self.column, axis=1).values
        return np.concatenate((remaining, one_hotted), axis=1)

    def _column_values(self, X):
        """Return the configured column as an (n_rows, 1) array for the encoder."""
        return X[self.column].values.reshape(-1, 1)
    
class DebugTransformer(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints a summary of the data it sees."""

    def __init__(self):
        """No configuration is required."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Print a summary of ``X`` and return ``X`` unchanged."""
        if hasattr(X, 'info'):
            # DataFrame.info() prints the summary itself and returns None,
            # so wrapping it in print() would emit a stray "None" line.
            X.info()
        else:
            # Arrays and sparse matrices have no .info(); report what is known.
            print(type(X).__name__, getattr(X, 'shape', None))
        return X

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Columns handed to the model exactly as they are.
passthrough_features = ['predictions_arrival_time',
                        'predictions_stops_away',
                        'predictions_file_timestamp',
                        'predictions_direction_id']

column_transformer = ColumnTransformer([
    ('pass', 'passthrough', passthrough_features),
    # handle_unknown='ignore': a category that was not present when the
    # pipeline was fitted (e.g. a stop that only occurs in the test split)
    # is encoded as all zeros instead of raising at transform time.
    ('1hot', OneHotEncoder(handle_unknown='ignore'), ['predictions_stop_id',
                                                      'predictions_route_id',
                                                      'predictions_day_of_week',
                                                      'predictions_time_bin'])
])

# NOTE(review): ColumnTransformer keeps only the columns listed above
# (remainder defaults to 'drop'), so the is_stopped column produced by the
# 'is_stopped' step never reaches the model - confirm whether it should be
# added to passthrough_features.
predictions_pipeline = Pipeline([
    ('timestamp', TimestampTransformer()),
    ('is_stopped', IsStoppedTransformer()),
    ('col_transformer', column_transformer)
])

In [6]:
# Fit every pipeline step on the training split and build the training
# feature matrix in the same pass.
fitted = predictions_pipeline.fit_transform(strat_train_set)

In [7]:
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression of the ve_arrival_time labels on the
# transformed training features.
lin_reg = LinearRegression()
lin_reg.fit(fitted, train_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [8]:
# Predict on the same feature matrix the model was fitted on, so these are
# in-sample predictions; the test split is not used here.
predictions = lin_reg.predict(fitted)
predictions

array([1.55358106e+09, 1.55358106e+09, 1.55358106e+09, ...,
       1.55358106e+09, 1.55358106e+09, 1.55358106e+09])

In [9]:
from sklearn.metrics import mean_squared_error
# Root-mean-squared error of the in-sample predictions against the training
# labels; it is expressed in the same unit as ve_arrival_time.
lin_mse = mean_squared_error(train_labels, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

189681.1243973759

In [10]:
# Rebuild the feature names in the order ColumnTransformer emits them:
# passthrough columns first, then the one-hot indicator columns.
ct = predictions_pipeline.named_steps['col_transformer']
# Index the spec tuple directly instead of unpacking into `_`, which would
# shadow IPython's last-output variable.
passthrough_cols = ct.transformers[0][2]

onehot_encoder = ct.named_transformers_['1hot']
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever the installed version provides.
if hasattr(onehot_encoder, 'get_feature_names_out'):
    onehot_cols = onehot_encoder.get_feature_names_out()
else:
    onehot_cols = onehot_encoder.get_feature_names()

features = list(passthrough_cols) + onehot_cols.tolist()

In [11]:
# Pair each fitted coefficient with its feature name and sort ascending by
# coefficient value (ties fall back to comparing the names).
# NOTE(review): every coefficient in the recorded output has a magnitude of
# roughly 1e-15 or smaller, which suggests the fit is barely using the
# features - TODO confirm and consider rescaling the inputs/target.
sorted(zip(lin_reg.coef_, features))

[(-4.227945575351628e-15, 'x2_5'),
 (-4.179160425752195e-15, 'x2_4'),
 (-2.3345364449402975e-15, 'predictions_stops_away'),
 (-2.189480245436825e-15, 'x2_6'),
 (-1.2928734622705066e-15, 'x2_0'),
 (-4.3432233973492424e-16, 'x1_Green-C'),
 (-3.197380299394269e-16, 'x1_Green-B'),
 (-2.5153532380242735e-16, 'x1_Green-E'),
 (-2.1257764574738795e-16, 'x1_Orange'),
 (-1.160905103875426e-16, 'predictions_direction_id'),
 (-6.035630467713558e-17, 'x3_60'),
 (-5.894383316214146e-17, 'x3_62'),
 (-5.595892689939408e-17, 'x3_61'),
 (-5.128589178920496e-17, 'x3_63'),
 (-4.554898320739818e-17, 'x3_64'),
 (-4.409297950585262e-17, 'x3_59'),
 (-4.3479789204486687e-17, 'x3_66'),
 (-4.222711485620935e-17, 'x3_65'),
 (-4.0062386396173245e-17, 'x1_Mattapan'),
 (-3.448607192530684e-17, 'x3_68'),
 (-3.3008293583813956e-17, 'x3_77'),
 (-3.148597553931152e-17, 'x3_69'),
 (-3.089023050366017e-17, 'x3_76'),
 (-3.062967420884836e-17, 'x3_75'),
 (-2.932655188859369e-17, 'x3_67'),
 (-2.618979517235128e-17, 'x3_73'),

In [12]:
# Intercept (constant term) of the fitted linear model.
lin_reg.intercept_

1553581036.213455