In [18]:
import numpy as np
import pandas as pd
import category_encoders as ce

In [19]:
# Load the `train` frame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
train = pd.read_pickle("data/train.pickle")

In [20]:
# Preview the first rows to check which columns were loaded.
train.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
906970,2,2022-03-17 16:02:41,2022-03-17 16:08:50,1,0.839844,1,236,237,1,6.0,1.0,0.5,1.25,0.0,0.300049,11.546875
906971,2,2022-03-27 13:18:54,2022-03-27 13:36:53,1,1.530273,1,162,48,1,12.0,0.0,0.5,3.060547,0.0,0.300049,18.359375
906972,1,2022-03-13 21:49:45,2022-03-13 21:56:50,2,1.099609,1,162,48,2,6.5,3.0,0.5,0.0,0.0,0.300049,10.296875
906973,1,2022-03-17 17:13:16,2022-03-17 17:47:12,2,8.601562,1,238,45,2,29.0,3.5,0.5,0.0,0.0,0.300049,33.3125
906974,2,2022-03-26 14:00:43,2022-03-26 14:18:28,1,3.169922,1,114,50,0,16.171875,0.0,0.5,3.160156,0.0,0.300049,22.625


In [21]:
def date_features(df):
    """Add trip-duration, calendar and speed columns to ``df`` in place.

    Reads ``tpep_pickup_datetime``, ``tpep_dropoff_datetime`` and
    ``trip_distance`` and writes, in this order:

    * ``time_taken``   - drop-off minus pick-up, in seconds
    * ``month_day``    - day of the month of the pick-up
    * ``week_day``     - day of the week of the pick-up (``.dt.dayofweek``)
    * ``hour``         - hour of the pick-up
    * ``avg_speed_ms`` - ``trip_distance / time_taken``; this is
      distance-units per second, so whether it really is m/s depends on the
      unit of ``trip_distance`` (TODO confirm). Zero-second trips give
      ``inf`` or ``NaN`` and negative durations give negative values.

    The frame is mutated; nothing is returned.
    """
    pickup = df["tpep_pickup_datetime"]
    elapsed_seconds = (df["tpep_dropoff_datetime"] - pickup).dt.total_seconds()
    df["time_taken"] = elapsed_seconds

    # (new column, datetime accessor attribute) pairs taken from the pick-up time
    calendar_parts = (
        ("month_day", "day"),
        ("week_day", "dayofweek"),
        ("hour", "hour"),
    )
    for column, attribute in calendar_parts:
        df[column] = getattr(pickup.dt, attribute)

    df["avg_speed_ms"] = df["trip_distance"] / df["time_taken"]

In [22]:
# Adds time_taken, month_day, week_day, hour and avg_speed_ms to `train` in place (returns None).
date_features(train)

In [23]:
# Mark unusable speeds as missing so that the later dropna() removes those rows:
#   * negative values (negative duration or negative distance)
#   * +inf (positive distance divided by a zero-second duration)
# A boolean mask is used instead of collecting index labels: it avoids copying the
# whole frame twice and still targets exactly the offending rows even if the index
# contains duplicate labels.
speed = train["avg_speed_ms"]
invalid_speed = (speed < 0) | (speed == np.inf)
train.loc[invalid_speed, "avg_speed_ms"] = np.nan

In [24]:
# Load the `test` frame and derive the same feature columns as for `train`.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
test = pd.read_pickle("data/test.pickle")
date_features(test)

# Negative or +inf speeds are unusable; flag them as missing. A boolean mask avoids
# the extra frame copies of a label round-trip and is safe with duplicate index labels.
speed = test["avg_speed_ms"]
invalid_speed = (speed < 0) | (speed == np.inf)
test.loc[invalid_speed, "avg_speed_ms"] = np.nan

# Drop every row that has a missing value in any column (includes the speeds flagged above).
test.dropna(inplace=True)

In [25]:
# Count missing values per column in `train`.
train.isna().sum()

VendorID                    0
tpep_pickup_datetime        0
tpep_dropoff_datetime       0
passenger_count             0
trip_distance               0
RatecodeID                  0
PULocationID                0
DOLocationID                0
payment_type                0
fare_amount                 0
extra                       0
mta_tax                     0
tip_amount                  0
tolls_amount                0
improvement_surcharge       0
total_amount                0
time_taken                  0
month_day                   0
week_day                    0
hour                        0
avg_speed_ms             2419
dtype: int64

In [26]:
# Count missing values per column in `test` (rows with NaN were already dropped when it was loaded).
test.isna().sum()

VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
time_taken               0
month_day                0
week_day                 0
hour                     0
avg_speed_ms             0
dtype: int64

In [27]:
# Remove, in place, every `train` row that has a missing value in any column.
train.dropna(inplace=True)

In [28]:
# Feature groups: `cat` columns are handled as categories, `cont` columns as
# continuous values.
cat = [
    "VendorID",
    "passenger_count",
    "RatecodeID",
    "PULocationID",
    "DOLocationID",
    "payment_type",
    "month_day",
    "week_day",
    "hour",
]
cont = [
    "trip_distance",
    "time_taken",
    "avg_speed_ms",
]

In [29]:
# (rows, columns) of `train` after dropping rows with missing values.
train.shape

(2718493, 21)

In [30]:
# Show only the categorical columns of `train`.
train.loc[:, cat]

Unnamed: 0,VendorID,passenger_count,RatecodeID,PULocationID,DOLocationID,payment_type,month_day,week_day,hour
906970,2,1,1,236,237,1,17,3,16
906971,2,1,1,162,48,1,27,6,13
906972,1,2,1,162,48,2,13,6,21
906973,1,2,1,238,45,2,17,3,17
906974,2,1,1,114,50,0,26,5,14
...,...,...,...,...,...,...,...,...,...
3627877,1,1,1,100,137,2,19,5,16
3627878,1,3,1,230,142,2,26,5,12
3627879,2,1,1,231,229,1,5,5,21
3627880,2,1,1,263,237,1,28,0,18


In [31]:
# For each categorical column print its name, its distinct values and how many
# there are, followed by a blank line.
for col in cat:
    values = train[col]
    print(col, values.unique(), values.nunique(), "", sep="\n")

VendorID
[2 1 6 5]
4

passenger_count
[1 2 3 5 0 6 4 7 8 9]
10

RatecodeID
[ 1  4  2 99  5  3  6]
7

PULocationID
[236 162 238 114 233 234 239 163  48 170  90 144 141  79 263 237 137 249
 142  68 161  13 230 138 140 211 132 100 148 166 107 229 224  74 231  88
  25  43 246 186 113 262 164  75 151  42  45 125 116  70  65 261 143  41
 158 193 264  63 181  87  24 209  12 244  50 265   4 189 146  76 232  97
  40  80  77  72 175 216 207  52  95 168 145 223 152 260  33  10  28 256
 196 123 205 255  82 226 106 179 243 215 183  64 228 165  78  17  66 111
 247  26  86   7  92  71 159  14  47  51  20 112 119 169 131 210 133  93
 134  35  61  39  69   1  83  38  49 127  36 213 197 109 129 194 219  89
 153 130 248 218 121 259 257 102 235 117 225 220 258  67 135  37 241 139
 253  21  18 195 227 208 101 136 177  60 212  55 167 203 188  91 202  11
  22 128  85 147 201  56 182 155 157  53 160 174 250  23  16  29   9 254
 126  62 124 217 108  54  32  34  19 185 149 252 171   5 222 122 150 242
 191 178  

In [32]:
# Hash-encode every categorical column on its own so that each column can have its
# own number of output components. One 2-D array per column is collected for the
# train frame and one for the test frame, in `cat` order.
cat = ["VendorID", "passenger_count", "RatecodeID", "PULocationID", "DOLocationID",
       "payment_type", "month_day", "week_day", "hour"]
# Output width per column, aligned position-by-position with `cat`.
comp = [2, 3, 3, 9, 9, 3, 5, 3, 5]
assert len(cat) == len(comp), "each categorical column needs a component count"

train_cat = []
test_cat = []
for category, n_components in zip(cat, comp):
    encoder = ce.HashingEncoder(cols=[category], n_components=n_components, max_sample=25000)
    train_cat.append(encoder.fit_transform(train[category]).to_numpy())
    # Transform the `test` frame here, not `train`: encoding `train` a second time
    # would make test_cat a copy of the training features with the wrong row count.
    test_cat.append(encoder.transform(test[category]).to_numpy())

In [43]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the continuous columns of `train` only, then
# apply that same fitted scaler to both frames.
scale = StandardScaler().fit(train[cont])
train_cont = scale.transform(train[cont])
test_cont = scale.transform(test[cont])

In [51]:
# Assemble the training matrix column-wise: every hashed categorical block from
# `train_cat` (in list order) followed by the scaled continuous columns.
# Unpacking the list instead of indexing 0..8 keeps this in sync if the set of
# encoded columns changes.
train_matrix = np.concatenate((*train_cat, train_cont), axis=1)

array([[1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0],
       ...,
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0]])