In [1]:
import os
import pandas as pd

def load_predictions_data():
    predictions_path=os.path.join("..", "datasets", "predictions_prod_smallest.csv")
    return pd.read_csv(predictions_path, dtype={"predictions_route_id": str, "predictions_stop_id": str})

In [2]:
data = load_predictions_data()

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["predictions_route_id"]):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [4]:
data.head()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7670846 entries, 0 to 7670845
Data columns (total 13 columns):
predictions_id                  int64
predictions_trip_id             object
predictions_arrival_time        float64
predictions_boarding_status     object
predictions_departure_time      float64
predictions_stop_id             object
predictions_stop_sequence       int64
predictions_stops_away          float64
predictions_vehicle_event_id    int64
predictions_file_timestamp      int64
predictions_route_id            object
predictions_vehicle_id          object
predictions_direction_id        int64
dtypes: float64(3), int64(5), object(5)
memory usage: 760.8+ MB


In [5]:
# Keep: predictions_arrival_time, predictions_departure_time, predictions_stop_id (1-hot), predictions_direction_id
# Lose: predictions_id, predictions_trip_id, predictions_stop_sequence, predictions_vehicle_event_id, predictions_vehicle_id
# Maybe someday: predictions_stops_away, predictions_route_id (1-hot)
# Transform: is_stopped (from boarding status), predictions_file_timestamp (to get time of day / day of week)

In [45]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime
import numpy as np

class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, include_stops_away=False, include_route_id=False):
        self.include_stops_away = include_stops_away
        self.include_route_id = include_route_id
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.drop(["predictions_id", 
                       "predictions_trip_id", 
                       "predictions_stop_sequence",
                       "predictions_vehicle_event_id",
                       "predictions_vehicle_id",
                       "predictions_file_timestamp",],
                      axis=1)

class IsStoppedTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        result = X.copy()
        result['is_stopped'] = X['predictions_boarding_status'].notnull()
        return result.drop('predictions_boarding_status', axis=1)

class TimestampTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        new_data = X.copy()
        datetimes = new_data['predictions_file_timestamp'].apply(
            lambda x: datetime.fromtimestamp(x),
        )
        predictions_day_of_week = datetimes.apply(lambda x: x.weekday())
        predictions_time_bin = datetimes.apply(self._timestamp_bin)
        new_data['predictions_day_of_week'] = predictions_day_of_week
        new_data['predictions_time_bin'] = predictions_time_bin
        return new_data
    
    def _timestamp_bin(self, timestamp):
        return timestamp.hour * 4 + timestamp.minute // 15
    
class DebugTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        print(X.head())
        return X

In [46]:
from sklearn.pipeline import Pipeline

predictions_pipeline = Pipeline([
    ('timestamp', TimestampTransformer()),
    ('is_stopped', IsStoppedTransformer()),
    ('drop', DropColumnsTransformer()),
    ('debug', DebugTransformer())

])

onehot_pipeline = ColumnTransformer([
    ("cat", OneHotEncoder(), cat_list)
])

In [47]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

cat_list = ['predictions_stop_id', 'predictions_route_id']
noncat_list = list(data)
for x in cat_list:
    if x in noncat_list:
        noncat_list.remove(x)

full_pipeline = ColumnTransformer([
    ("overall", predictions_pipeline, noncat_list),
    ("cat", OneHotEncoder(), cat_list)
])

In [53]:
from sklearn.preprocessing import OneHotEncoder

partially_transformed = predictions_pipeline.fit_transform(strat_train_set)
enc = OneHotEncoder()
fully_transformed = enc.fit_transform(partially_transformed.dropna())

         predictions_arrival_time  predictions_departure_time  \
213414               1.553748e+09                1.553748e+09   
2301512              1.553614e+09                1.553614e+09   
6484008                       NaN                1.553263e+09   
6973123              1.553343e+09                1.553343e+09   
6049928              1.553511e+09                1.553511e+09   

        predictions_stop_id  predictions_stops_away predictions_route_id  \
213414                70079                     4.0                  Red   
2301512               70087                     0.0                  Red   
6484008               70061                     2.0                  Red   
6973123               70200                     2.0              Green-E   
6049928               70021                    12.0               Orange   

         predictions_direction_id  predictions_day_of_week  \
213414                          0                        3   
2301512                     

In [54]:
fully_transformed

<5442506x1089617 sparse matrix of type '<class 'numpy.float64'>'
	with 48982554 stored elements in Compressed Sparse Row format>