In [1]:
import os
import pandas as pd

def load_predictions_data():
    predictions_path=os.path.join("..", "datasets", "predictions_prod_smallest.csv")
    return pd.read_csv(predictions_path)

In [2]:
data = load_predictions_data()

  if (yield from self.run_code(code, result)):


In [3]:
data.head()

Unnamed: 0,predictions_id,predictions_trip_id,predictions_arrival_time,predictions_boarding_status,predictions_departure_time,predictions_stop_id,predictions_stop_sequence,predictions_stops_away,predictions_vehicle_event_id,predictions_file_timestamp,predictions_route_id,predictions_vehicle_id,predictions_direction_id
0,648187443,39783376,1553863000.0,,1553863000.0,70040,10,1.0,10212508,1553862833,Blue,B-545C365A,1
1,648187455,39988585-20:30-FKenmoreStMaryC,1553863000.0,,1553863000.0,70203,620,1.0,10212521,1553862833,Green-C,G-10142,1
2,648187514,40033949,1553864000.0,,1553864000.0,70007,30,8.0,10214673,1553862833,Orange,O-545C36A0,1
3,648187604,ADDED-1553782585,1553864000.0,,1553864000.0,70156,580,9.0,10214713,1553862833,Green-D,G-10152,1
4,648187732,40033948,1553863000.0,,1553863000.0,70005,20,2.0,10213047,1553862833,Orange,O-545C364E,1


In [4]:
# Keep: predictions_arrival_time, predictions_departure_time, predictions_stop_id (1-hot), predictions_direction_id
# Lose: predictions_id, predictions_trip_id, predictions_stop_sequence, predictions_vehicle_event_id, predictions_vehicle_id
# Maybe someday: predictions_stops_away, predictions_route_id (1-hot)
# Transform: is_stopped (from boarding status), predictions_file_timestamp (to get time of day / day of week)

In [24]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime
import numpy as np

class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, include_stops_away=False, include_route_id=False):
        self.include_stops_away = include_stops_away
        self.include_route_id = include_route_id
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.drop(["predictions_id", 
                       "predictions_trip_id", 
                       "predictions_stop_sequence",
                       "predictions_vehicle_event_id",
                       "predictions_vehicle_id",
                       "predictions_file_timestamp",],
                      axis=1)

class IsStoppedTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        result = X.copy()
        result['is_stopped'] = X['predictions_boarding_status'].notnull()
        return result.drop('predictions_boarding_status', axis=1)

class TimestampTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        new_data = X.copy()
        datetimes = new_data['predictions_file_timestamp'].apply(
            lambda x: datetime.fromtimestamp(x),
        )
        predictions_day_of_week = datetimes.apply(lambda x: x.weekday())
        predictions_time_bin = datetimes.apply(self._timestamp_bin)
        new_data['predictions_day_of_week'] = predictions_day_of_week
        new_data['predictions_time_bin'] = predictions_time_bin
        return new_data
    
    def _timestamp_bin(self, timestamp):
        return timestamp.hour * 4 + timestamp.minute // 15
    
class StringStopIDEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        new_data = X.copy()
        new_data['predictions_stop_id'] = new_data["predictions_stop_id"].apply(lambda x: str(x))
        return new_data

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

predictions_pipeline = Pipeline([
    {'stop_id', StringStopIDEncoder()},
    ('timestamp', TimestampTransformer()),
    ('is_stopped', IsStoppedTransformer()),
    ('drop', DropColumnsTransformer())
])

In [27]:
prepared_data = predictions_pipeline.fit_transform(data)

In [34]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

cat_list = ['predictions_stop_id', 'predictions_route_id']

full_pipeline = ColumnTransformer([
    ("overall", predictions_pipeline, list(data)),
    {"ordinal", OrdinalEncoder(), data['predictions_stop_id']}
    ("cat", OneHotEncoder(), cat_list)
])

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [31]:
fully_transformed = full_pipeline.fit_transform(data)

TypeError: '<' not supported between instances of 'int' and 'str'

In [22]:
list(fully_transformed)

'13'