In [1]:
import os
import pandas as pd

def load_predictions_data():
    predictions_path=os.path.join("datasets", "predictions_prod_v2.csv")
    return pd.read_csv(predictions_path, dtype={"predictions_route_id": str, "predictions_stop_id": str})

In [2]:
data = load_predictions_data()

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

data = data.dropna(subset=["predictions_arrival_time", "ve_arrival_time"])
data = data[data["predictions_route_id"] == 'Blue']

data["ve_arrival_time"] = data["ve_arrival_time"] - data["predictions_arrival_time"]

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["predictions_route_id"]):
    strat_train_set = data.iloc[train_index]
    strat_test_set = data.iloc[test_index]
    
train_labels = strat_train_set["ve_arrival_time"].copy()
test_labels = strat_test_set["ve_arrival_time"].copy()
strat_train_set = strat_train_set.drop("ve_arrival_time", axis=1)
strat_test_set = strat_test_set.drop("ve_arrival_time", axis=1)

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime
import numpy as np

class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, include_stops_away=False, include_route_id=False):
        self.include_stops_away = include_stops_away
        self.include_route_id = include_route_id
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.drop(["predictions_id", 
                       "predictions_trip_id", 
                       "predictions_stop_sequence",
                       "predictions_vehicle_event_id",
                       "predictions_vehicle_id",
                       "predictions_file_timestamp",],
                      axis=1)

class IsStoppedTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        result = X.copy()
        result['is_stopped'] = X['predictions_boarding_status'].notnull()
        return result.drop('predictions_boarding_status', axis=1)

class TimestampTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        new_data = X.copy()
        datetimes = new_data['predictions_file_timestamp'].apply(
            lambda x: datetime.fromtimestamp(x),
        )
        predictions_day_of_week = datetimes.apply(lambda x: x.weekday())
        predictions_time_bin = datetimes.apply(self._timestamp_bin)
        new_data['predictions_day_of_week'] = predictions_day_of_week
        new_data['predictions_time_bin'] = predictions_time_bin
        return new_data
    
    def _timestamp_bin(self, timestamp):
        return timestamp.hour * 4 + timestamp.minute // 15

class ColumnOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        enc = OneHotEncoder()
        one_hotted = enc.fit_transform(X[self.column].values.reshape(-1,1))
        return np.concatenate((X.drop(self.column, axis=1), one_hotted), axis=1)
    
class DebugTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        print(X.info())
        return X


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures

passthrough_features = [ #'predictions_arrival_time',
                        'is_stopped',
                        'predictions_stops_away',
                        #'predictions_file_timestamp',
                        #'predictions_direction_id'
]

column_transformer = ColumnTransformer([
    ('pass', 'passthrough', passthrough_features),
    ('1hot', OneHotEncoder(), ['predictions_stop_id', 
                               #'predictions_route_id', 
                               'predictions_day_of_week', 
                               'predictions_time_bin'])
])

predictions_pipeline = Pipeline([
    ('timestamp', TimestampTransformer()),
    ('is_stopped', IsStoppedTransformer()),
    ('col_transformer', column_transformer)
])

In [6]:
fitted = predictions_pipeline.fit_transform(strat_train_set)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [7]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(fitted, train_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [8]:
predictions = lin_reg.predict(fitted)
predictions

array([ 53.7376658 ,  59.07844532,  25.28950811, ...,  69.18892805,
       106.45328422, 124.12608197])

In [9]:
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(train_labels, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

103.72944126856714

In [10]:
ct = predictions_pipeline.named_steps['col_transformer']
(_, _, passthrough_cols) = ct.transformers[0]
onehot_cols = ct.named_transformers_['1hot'].get_feature_names()

features = passthrough_cols + onehot_cols.tolist()

In [11]:
sorted(zip(lin_reg.coef_, features))

[(-60.84130981961257, 'x2_75.0'),
 (-55.78499741786421, 'x1_6.0'),
 (-49.41835501336302, 'x2_74.0'),
 (-43.49513946908326, 'x2_64.0'),
 (-41.887845892555546, 'x2_79.0'),
 (-41.455360791296684, 'x2_77.0'),
 (-41.14132384705608, 'x2_49.0'),
 (-40.568632196696, 'x2_70.0'),
 (-39.524120020497115, 'x2_76.0'),
 (-38.47647284312168, 'x2_63.0'),
 (-37.76955616392796, 'x2_65.0'),
 (-37.03260600269956, 'x2_83.0'),
 (-35.200928739077824, 'x2_78.0'),
 (-34.9671553594785, 'x2_91.0'),
 (-34.84070191226147, 'x2_48.0'),
 (-34.50592163455838, 'x2_86.0'),
 (-34.332658748529774, 'x2_69.0'),
 (-33.18040925739967, 'x2_62.0'),
 (-31.866982443417672, 'x1_2.0'),
 (-31.785122986445316, 'x2_90.0'),
 (-31.747239748487473, 'x2_61.0'),
 (-30.814750554236692, 'x2_84.0'),
 (-30.08546886332113, 'x1_4.0'),
 (-29.60545236255103, 'x2_92.0'),
 (-29.060001516022933, 'x2_93.0'),
 (-28.532782357756417, 'x2_80.0'),
 (-28.45220044179322, 'x1_0.0'),
 (-27.035199474657542, 'x2_94.0'),
 (-26.8885953672814, 'x2_82.0'),
 (-26.7610

In [12]:
lin_reg.intercept_

61.92990513049044

In [13]:
from sklearn.ensemble import RandomForestRegressor

rand_forest = RandomForestRegressor(n_estimators=10, n_jobs=8)
rand_forest.fit(fitted, train_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=8,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [14]:
predictions = rand_forest.predict(fitted)
predictions

array([-14.        , 170.47638889,   0.        , ...,  59.91932296,
       146.17      ,  82.28761905])

In [15]:
from sklearn.metrics import mean_squared_error
forest_mse = mean_squared_error(train_labels, predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

56.13411556611708

In [17]:
ct = predictions_pipeline.named_steps['col_transformer']
(_, _, passthrough_cols) = ct.transformers[0]
onehot_cols = ct.named_transformers_['1hot'].get_feature_names()

features = passthrough_cols + onehot_cols.tolist()

sorted(zip(rand_forest.feature_importances_, features))

[(4.374540640177548e-06, 'x2_6.0'),
 (0.0004506835002199488, 'x2_5.0'),
 (0.0009195628389400507, 'x2_92.0'),
 (0.000977626860868343, 'x2_91.0'),
 (0.0009987488653745262, 'x2_76.0'),
 (0.0010043416517008712, 'x2_69.0'),
 (0.0010725923153559575, 'x2_83.0'),
 (0.001098223376568222, 'x2_78.0'),
 (0.0011066164669554768, 'x2_74.0'),
 (0.0011860171783752297, 'x2_80.0'),
 (0.0012040334437912032, 'x2_81.0'),
 (0.0012393381166190495, 'x2_93.0'),
 (0.0012573447300286808, 'x2_84.0'),
 (0.0012931494979423596, 'x2_90.0'),
 (0.0013454490993841089, 'x2_86.0'),
 (0.0013529215662366134, 'x2_77.0'),
 (0.0013530292892650502, 'x2_94.0'),
 (0.0014855592981022896, 'x2_82.0'),
 (0.001575455216036246, 'x2_89.0'),
 (0.0016895934492813882, 'x2_63.0'),
 (0.0017329944985007002, 'x2_73.0'),
 (0.0017648844828928507, 'x2_85.0'),
 (0.0017683325179257022, 'x2_64.0'),
 (0.0019142790714289746, 'x2_62.0'),
 (0.002166789371489942, 'x2_79.0'),
 (0.0021903130954196797, 'x2_68.0'),
 (0.0023489944662857193, 'x2_38.0'),
 (0.002