## **Feature Engineering**

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### **About Data**

In [3]:
# Load the training split from a local pickle file and preview its last rows.
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files that come from a trusted source.
nyc_taxi = pd.read_pickle("data/train.pickle")
nyc_taxi.tail()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,total_amount
2707852,2,2022-03-29 16:40:56,2022-03-29 16:56:03,1,2.65,1,170,263,1,12.5,17.8
2707853,2,2022-03-20 00:37:51,2022-03-20 00:46:23,1,1.69,1,263,239,1,8.0,14.16
2707854,1,2022-03-18 18:37:32,2022-03-18 18:47:10,2,0.9,1,234,164,3,7.5,11.8
2707855,2,2022-03-16 19:53:58,2022-03-16 20:03:23,1,2.06,1,234,170,1,8.0,13.3
2707856,2,2022-03-04 06:42:00,2022-03-04 06:55:00,1,3.24,1,41,244,0,13.33,16.4


*Extracting the day of the week from `tpep_pickup_datetime`*

In [4]:
# Weekday of the pickup timestamp as an integer (Monday=0 ... Sunday=6).
nyc_taxi["day_of_week"] = nyc_taxi["tpep_pickup_datetime"].dt.weekday

*Calculating time interval from pickup to dropoff*

In [5]:
# Trip duration as a timedelta: drop-off timestamp minus pickup timestamp.
diff = nyc_taxi["tpep_dropoff_datetime"].sub(nyc_taxi["tpep_pickup_datetime"])

*Extracting time interval in seconds*

In [6]:
# Convert the timedelta to float seconds and keep it both as `diff` and as
# the new `time_interval` column.
nyc_taxi["time_interval"] = diff = diff.dt.total_seconds()

*Extracting hour from pickup date time*

In [7]:
# Hour of day (0-23) of the pickup timestamp.
nyc_taxi["hour"] = nyc_taxi["tpep_pickup_datetime"].dt.hour

*Estimating `trip_distance` for trips whose recorded distance is zero*

In [8]:
# `pos`: index labels of trips whose recorded distance is exactly zero.
# `npos`: index labels of every other trip.
pos = nyc_taxi.loc[nyc_taxi["trip_distance"].eq(0.0)].index
npos = nyc_taxi.index[~nyc_taxi.index.isin(pos)]

In [9]:
from sklearn.linear_model import LinearRegression

# Learn trip_distance from (fare_amount, time_interval) on the trips that
# have a non-zero recorded distance.
reg = LinearRegression().fit(
    nyc_taxi.loc[npos, ["fare_amount", "time_interval"]],
    nyc_taxi.loc[npos, "trip_distance"],
)

# Estimate the distance of the zero-distance trips.
# NOTE(review): the estimate is halved; the reason for the factor 2 is not
# documented in this notebook — confirm it is intentional.
pred = reg.predict(
    nyc_taxi.loc[pos, ["fare_amount", "time_interval"]]
) / 2

In [10]:
# Write the regression estimate into every zero-distance trip.
# `pred` holds one value per label in `pos`, in the same order, so a single
# label-based assignment fills all of those rows at once. Assigning only a
# prefix of `pos` would leave the remaining trips at distance 0 (and thus
# at speed 0 further down).
assert len(pred) == len(pos), "expected one estimate per zero-distance trip"
nyc_taxi.loc[pos, "trip_distance"] = pred

*Removing trips whose `time_interval` is zero or negative*

In [11]:
# Trips with a zero or negative duration are dropped in place; `pos` is
# reused to hold their index labels.
pos = nyc_taxi.loc[nyc_taxi["time_interval"].le(0.0)].index
nyc_taxi.drop(index=pos, inplace=True)

*Converting miles into meters*

In [12]:
# Rescale trip_distance by 1609.0, the miles-to-meters factor used in this
# notebook (rounded; a statute mile is 1609.344 m).
nyc_taxi["trip_distance"] = nyc_taxi["trip_distance"].mul(1609.0)

*Calculating average speed in meters per second*

In [13]:
# Average speed = distance / duration (time_interval is in seconds).
nyc_taxi["avg_speed_ms"] = nyc_taxi["trip_distance"].div(nyc_taxi["time_interval"])

In [14]:
# (rows, columns) of the training frame after the feature and filter steps above.
nyc_taxi.shape

(2705427, 15)

In [15]:
# Preview the first rows, including the newly derived columns.
nyc_taxi.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,total_amount,day_of_week,time_interval,hour,avg_speed_ms
0,1,2022-03-23 16:23:45,2022-03-23 17:38:32,0,37972.4,4,161,265,1,98.0,115.3,2,4487.0,16,8.462759
1,2,2022-03-08 07:01:49,2022-03-08 07:42:58,1,28334.49,2,230,132,1,52.0,61.93,1,2469.0,7,11.4761
2,2,2022-03-24 14:40:13,2022-03-24 14:45:31,5,1319.38,1,164,234,1,5.5,10.56,3,318.0,14,4.148994
3,1,2022-03-16 16:32:38,2022-03-16 16:37:52,1,1287.2,1,239,142,1,5.0,12.3,2,314.0,16,4.099363
4,2,2022-03-04 08:42:02,2022-03-04 08:57:58,1,3008.83,1,246,230,1,11.5,17.76,4,956.0,8,3.147312


*Removing outliers from columns `trip_distance`, `time_interval` and `total_amount`* 

In [16]:
# Remove rows lying more than 4 standard deviations from the column mean,
# in either direction. Columns are processed one after another, so each
# z-score is computed on the frame as already filtered by the previous column.
from scipy import stats
cols = ["trip_distance", "time_interval", "total_amount"]
for col in cols:
    # abs() has to wrap the z-score itself; wrapping the comparison result
    # would only test the upper tail and keep extreme negative values.
    idx = nyc_taxi[np.abs(stats.zscore(nyc_taxi[col])) > 4].index
    nyc_taxi.drop(idx, axis=0, inplace=True)

In [17]:
# Store the `total_amount` column of the training split as the label array
# (np.save appends the ".npy" extension to the given path).
train_labels = np.array(nyc_taxi["total_amount"])
np.save(file="data/train_labels", arr=train_labels, allow_pickle=True)

**Encoding**

In [18]:
from category_encoders import BinaryEncoder

# Columns that are binary-encoded as categorical features.
# NOTE(review): "trip_distance" is listed here and is also standardized as a
# continuous feature in the scaling cell — confirm that treating it as a
# category is intended.
cat = [
    "VendorID",
    "passenger_count",
    "trip_distance",
    "RatecodeID",
    "PULocationID",
    "DOLocationID",
    "payment_type",
    "day_of_week",
    "hour",
]

# Fit the encoder on the training split; the fitted `encoder` is reused for
# the test split further down.
encoder = BinaryEncoder(cols=cat)
enc = encoder.fit_transform(nyc_taxi[cat])
encoded_cat = np.array(enc)

In [19]:
from sklearn.preprocessing import StandardScaler

# Continuous features, standardized to zero mean / unit variance using
# statistics learned from the training split only.
cont = ["trip_distance", "time_interval", "avg_speed_ms"]
scale = StandardScaler().fit(nyc_taxi[cont])
sca = scale.transform(nyc_taxi[cont])
scale_cont = np.array(sca)

In [20]:
# Place the encoded categorical columns and the scaled continuous columns
# side by side (column-wise) to form the training design matrix.
train_matrix = np.hstack((encoded_cat, scale_cont))

In [21]:
# Write the training design matrix to disk (saved as data/train_matrix.npy).
np.save("data/train_matrix", train_matrix, allow_pickle=True)

In [22]:
# Drop the references to the large training objects before the test split
# is loaded.
del train_matrix, encoded_cat, scale_cont, nyc_taxi

**Test**

In [23]:
# Load the test split from a local pickle file and preview its last rows.
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files that come from a trusted source.
test = pd.read_pickle("data/test.pickle")
test.tail()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,total_amount
899995,2,2022-03-18 18:37:22,2022-03-18 18:43:56,1,1.09,1,186,234,1,6.5,12.96
899996,1,2022-03-30 23:08:32,2022-03-30 23:12:32,2,0.6,1,230,170,1,5.0,11.0
899997,1,2022-03-30 20:20:43,2022-03-30 20:24:09,0,0.5,1,138,223,4,4.5,7.05
899998,2,2022-03-30 07:43:04,2022-03-30 07:47:25,1,0.91,1,236,238,1,5.0,9.96
899999,2,2022-03-08 22:55:28,2022-03-08 23:34:56,1,15.3,1,138,190,1,46.0,58.26


*Extracting the day of the week from `tpep_pickup_datetime`*

In [24]:
# Weekday of the pickup timestamp as an integer (Monday=0 ... Sunday=6).
test["day_of_week"] = test["tpep_pickup_datetime"].dt.weekday

*Calculating time interval from pickup to dropoff*

In [25]:
# Trip duration as a timedelta: drop-off timestamp minus pickup timestamp.
diff = test["tpep_dropoff_datetime"].sub(test["tpep_pickup_datetime"])

*Extracting time interval in seconds*

In [26]:
# Convert the timedelta to float seconds and keep it both as `diff` and as
# the new `time_interval` column.
test["time_interval"] = diff = diff.dt.total_seconds()

*Extracting hour from pickup date time*

In [27]:
# Hour of day (0-23) of the pickup timestamp.
test["hour"] = test["tpep_pickup_datetime"].dt.hour

*Estimating `trip_distance` for trips whose recorded distance is zero*

In [28]:
# `pos`: index labels of test trips whose recorded distance is exactly zero.
# `npos`: index labels of every other test trip.
pos = test.loc[test["trip_distance"].eq(0.0)].index
npos = test.index[~test.index.isin(pos)]

In [29]:
# Estimate the distance of zero-distance test trips with `reg`, the
# regressor fitted on the training split (no refit on test data).
# NOTE(review): the estimate is halved, as in the training section; the
# reason for the factor 2 is not documented — confirm it is intentional.
pred = reg.predict(
    test.loc[pos, ["fare_amount", "time_interval"]]
) / 2

In [30]:
# Write the regression estimate into every zero-distance test trip.
# `pred` holds one value per label in `pos`, in the same order, so a single
# label-based assignment fills all of those rows at once. Assigning only a
# prefix of `pos` would leave the remaining trips at distance 0 (and thus
# at speed 0 further down).
assert len(pred) == len(pos), "expected one estimate per zero-distance trip"
test.loc[pos, "trip_distance"] = pred

*Removing trips whose `time_interval` is zero or negative*

In [31]:
# Test trips with a zero or negative duration are dropped in place; `pos`
# is reused to hold their index labels.
pos = test.loc[test["time_interval"].le(0.0)].index
test.drop(index=pos, inplace=True)

*Converting miles into meters*

In [32]:
# Rescale trip_distance by 1609.0, the same miles-to-meters factor applied
# to the training split (rounded; a statute mile is 1609.344 m).
test["trip_distance"] = test["trip_distance"].mul(1609.0)

*Calculating average speed in meters per second*

In [33]:
# Average speed = distance / duration (time_interval is in seconds).
test["avg_speed_ms"] = test["trip_distance"].div(test["time_interval"])

In [34]:
# Store the `total_amount` column of the test split as the test label array
# (np.save appends the ".npy" extension to the given path).
test_labels = np.array(test["total_amount"])
np.save(file="data/test_labels", arr=test_labels, allow_pickle=True)

In [35]:
from category_encoders import BinaryEncoder

# Same categorical column list as used when fitting `encoder` on the
# training split.
cat = [
    "VendorID",
    "passenger_count",
    "trip_distance",
    "RatecodeID",
    "PULocationID",
    "DOLocationID",
    "payment_type",
    "day_of_week",
    "hour",
]

# Only transform here: the encoder fitted on the training data is reused so
# that train and test share one encoding.
enc = encoder.transform(test[cat])
encoded_cat = np.array(enc)

In [36]:
from sklearn.preprocessing import StandardScaler

# Standardize the continuous test features with `scale`, the scaler fitted
# on the training split (transform only, no refit).
cont = ["trip_distance", "time_interval", "avg_speed_ms"]
sca = scale.transform(test[cont])
scale_cont = np.array(sca)

In [37]:
# Place the encoded categorical columns and the scaled continuous columns
# side by side (column-wise) to form the test design matrix.
test_matrix = np.hstack((encoded_cat, scale_cont))

In [38]:
# Write the test design matrix to disk (saved as data/test_matrix.npy).
np.save("data/test_matrix", test_matrix, allow_pickle=True)

In [39]:
# Drop the references to the large test arrays now that they are saved.
del test_matrix, scale_cont, encoded_cat