## **Feature Engineering**

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### **About Data**

In [3]:
# Load the pickled training split and preview its last five rows.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
nyc_taxi = pd.read_pickle("data/train.pickle")
nyc_taxi.tail()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,total_amount
2707852,2,2022-03-24 21:47:05,2022-03-24 21:55:22,1,1.85,1,141,137,1,8.0,11.8
2707853,1,2022-03-14 10:36:01,2022-03-14 10:45:09,1,1.4,1,137,237,1,8.0,13.55
2707854,2,2022-03-18 09:19:17,2022-03-18 09:28:55,1,1.92,1,79,186,1,8.5,14.16
2707855,2,2022-03-30 11:30:47,2022-03-30 11:46:04,1,1.64,1,233,68,2,10.5,13.8
2707856,2,2022-03-09 14:56:36,2022-03-09 15:39:52,1,4.28,1,90,236,2,26.0,29.3


*Extract the day of the week from `tpep_pickup_datetime`*

In [4]:
# Day of the week of the pickup timestamp as an integer (pandas convention: Monday=0 ... Sunday=6).
nyc_taxi["day_of_week"] = nyc_taxi["tpep_pickup_datetime"].dt.dayofweek

*Calculating time `interval` from pickup to dropoff*

In [5]:
# Trip duration as dropoff minus pickup; negative when dropoff precedes pickup.
diff = nyc_taxi["tpep_dropoff_datetime"] - nyc_taxi["tpep_pickup_datetime"]

*Extracting time interval in `seconds`*

In [6]:
# Convert the duration to float seconds and store it as a new column.
# Note: `diff` is rebound from a timedelta Series to a float Series, so this cell
# cannot be re-run without first re-running the cell that computes `diff`.
diff = diff.dt.total_seconds()
nyc_taxi["time_interval"] = diff

*Extracting `hour` from pickup date time*

In [7]:
# Hour of day of the pickup timestamp.
nyc_taxi["hour"] = nyc_taxi["tpep_pickup_datetime"].dt.hour

*Estimating `trip_distance` for trips recorded with a distance of zero*

In [8]:
# `pos`: index labels of trips whose recorded distance is exactly zero.
# `npos`: index labels of every other trip.
pos = nyc_taxi.index[(nyc_taxi["trip_distance"] == 0.0).to_numpy()]
npos = nyc_taxi.index[~nyc_taxi.index.isin(pos)]

In [9]:
from sklearn.linear_model import LinearRegression

# Fit trip_distance ~ (total_amount, time_interval) on the non-zero-distance rows,
# then predict a distance for the zero-distance rows.
# NOTE(review): the prediction is halved; the reason for the factor 2 is not
# documented anywhere in this notebook — confirm or remove.
# NOTE(review): `total_amount` is later saved as the label, so using it as a
# predictor here leaks the target into the imputed `trip_distance` feature.
# NOTE(review): the fit runs before rows with non-positive `time_interval` are dropped.
reg = LinearRegression()
reg.fit(
    nyc_taxi.loc[npos, ["total_amount", "time_interval"]],
    nyc_taxi.loc[npos, "trip_distance"],
)
pred = reg.predict(nyc_taxi.loc[pos, ["total_amount", "time_interval"]]) / 2

In [10]:
# Write the regression estimates back into every zero-distance row.
# `pred` was computed from `nyc_taxi.loc[pos, ...]`, so it lines up with `pos`
# element by element and one vectorized assignment covers all rows.
# The earlier loop only updated `pos[:10]`, which left every other zero distance
# (and therefore a zero average speed) in the data.
nyc_taxi.loc[pos, "trip_distance"] = pred

*Removing negative and zero time instances from time_interval*

In [11]:
# Remove trips whose duration is zero or negative; they would produce an
# infinite or negative speed in the division performed later.
# `pos` is rebound here to the labels of the rows being removed.
pos = nyc_taxi.loc[nyc_taxi["time_interval"] <= 0.0].index
nyc_taxi.drop(index=pos, inplace=True)

*Converting miles into `meters`*

In [12]:
# Convert miles to meters using a factor of 1609.0 (1 mile is about 1609.34 m).
# Not idempotent: re-running this cell multiplies the column again.
nyc_taxi["trip_distance"] = nyc_taxi["trip_distance"] * 1609.0

*Calculating speed in `meter/second`*

In [13]:
# Average speed = distance (meters) / duration (seconds).
# Rows with non-positive duration were dropped above, so the divisor is > 0.
nyc_taxi["avg_speed_ms"] = nyc_taxi["trip_distance"] / nyc_taxi["time_interval"]

In [14]:
# (rows, columns) after dropping non-positive durations and adding the derived columns.
nyc_taxi.shape

(2705487, 15)

In [15]:
# Preview the first rows, including the four derived columns.
nyc_taxi.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,total_amount,day_of_week,time_interval,hour,avg_speed_ms
0,2,2022-03-14 12:06:11,2022-03-14 12:13:51,1,3008.83,1,229,263,2,8.0,11.3,0,460.0,12,6.540935
1,2,2022-03-09 17:56:22,2022-03-09 18:09:48,2,3105.37,1,140,239,1,10.0,15.3,2,806.0,17,3.852816
2,2,2022-03-24 22:11:00,2022-03-24 22:22:20,1,2928.38,1,239,263,1,9.5,15.96,3,680.0,22,4.306441
3,2,2022-03-15 20:54:50,2022-03-15 21:04:46,6,3137.55,1,163,186,1,9.0,15.36,1,596.0,20,5.264346
4,2,2022-03-27 16:59:01,2022-03-27 17:10:36,1,1657.27,1,186,230,2,8.5,11.8,6,695.0,16,2.384561


*Removing outliers from columns `trip_distance`, `time_interval` and `total_amount`* 

In [16]:
# Drop rows lying more than 4 standard deviations from the column mean.
# Columns are processed one after another, so each z-score is computed on the
# rows that survived the previous column's filter.
from scipy import stats
cols = ["trip_distance", "time_interval", "total_amount"]
for col in cols:
    # abs() must wrap the z-score itself, not the comparison: with
    # `np.abs(z > 4)` only the upper tail was removed and values more than
    # 4 sd below the mean stayed in the data.
    z_scores = stats.zscore(nyc_taxi[col])
    idx = nyc_taxi[np.abs(z_scores) > 4].index
    nyc_taxi.drop(idx, axis=0, inplace=True)

In [17]:
# Persist the training target (`total_amount`) as a NumPy array.
# np.save appends the ".npy" extension to the given path.
train_labels = np.array(nyc_taxi["total_amount"])
np.save(file="data/train_labels", arr=train_labels, allow_pickle=True)

In [18]:
# Release the label array; it has already been written to disk.
del train_labels

**Encoding**

In [19]:
from category_encoders import BinaryEncoder

# Columns treated as categorical and binary-encoded.
# NOTE(review): "trip_distance" is a continuous quantity that is also scaled in
# the continuous feature set; encoding it as a category is probably unintended.
# If it is removed here, the matching list in the test section must change too.
cat = [
    "VendorID",
    "passenger_count",
    "trip_distance",
    "RatecodeID",
    "PULocationID",
    "DOLocationID",
    "payment_type",
    "day_of_week",
    "hour",
]
# The fitted `encoder` is reused later to transform the test split.
encoder = BinaryEncoder(cols=cat)
enc = encoder.fit_transform(nyc_taxi[cat])
encoded_cat = np.array(enc)

### **Scaling** 

In [20]:
from sklearn.preprocessing import StandardScaler

# Standardize the continuous features; the fitted `scale` object is reused
# later so the test split is scaled with the training statistics.
cont = ["trip_distance", "time_interval", "avg_speed_ms"]
scale = StandardScaler()
sca = scale.fit(nyc_taxi[cont]).transform(nyc_taxi[cont])
scale_cont = np.array(sca)

In [21]:
# Place the encoded categorical columns and the scaled continuous columns side by side.
train_matrix = np.hstack((encoded_cat, scale_cont))

In [22]:
# Write the training feature matrix to disk (np.save appends ".npy" to the path).
np.save(file="data/train_matrix", arr=train_matrix, allow_pickle=True)

In [23]:
# Free the training frame and arrays before loading the test split.
# `reg`, `encoder` and `scale` are kept because the test section reuses them.
del nyc_taxi,  scale_cont, encoded_cat, train_matrix 

**Test**

In [24]:
# Load the pickled test split and preview its last five rows.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
test = pd.read_pickle("data/test.pickle")
test.tail()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,total_amount
899995,1,2022-03-31 09:00:44,2022-03-31 09:26:28,1,6.9,1,13,162,1,26.0,35.16
899996,2,2022-03-17 03:14:56,2022-03-17 03:34:42,1,3.84,1,170,87,2,16.0,19.8
899997,1,2022-03-31 13:31:46,2022-03-31 13:51:19,0,1.5,1,163,236,1,12.5,18.95
899998,1,2022-03-28 17:42:44,2022-03-28 17:58:20,1,3.9,1,140,179,1,14.5,23.5
899999,2,2022-03-15 20:24:19,2022-03-15 20:46:09,1,3.92,1,148,48,1,17.0,24.96


*Extract the day of the week from `tpep_pickup_datetime`*

In [25]:
# Day of the week of the pickup timestamp as an integer (pandas convention: Monday=0 ... Sunday=6).
test["day_of_week"] = test["tpep_pickup_datetime"].dt.dayofweek

*Calculating time interval from pickup to dropoff*

In [26]:
# Trip duration as dropoff minus pickup; negative when dropoff precedes pickup.
diff = test["tpep_dropoff_datetime"] - test["tpep_pickup_datetime"]

*Extracting time interval in seconds*

In [27]:
# Convert the duration to float seconds and store it as a new column.
# Note: `diff` is rebound from a timedelta Series to a float Series, so this cell
# cannot be re-run without first re-running the cell that computes `diff`.
diff = diff.dt.total_seconds()
test["time_interval"] = diff

*Extracting hour from pickup date time*

In [28]:
# Hour of day of the pickup timestamp.
test["hour"] = test["tpep_pickup_datetime"].dt.hour

*Estimating `trip_distance` for trips recorded with a distance of zero*

In [29]:
# `pos`: index labels of test trips whose recorded distance is exactly zero.
# `npos`: index labels of every other test trip.
pos = test.index[(test["trip_distance"] == 0.0).to_numpy()]
npos = test.index[~test.index.isin(pos)]

In [30]:
# Predict distances for the zero-distance test rows with the regression fitted
# on the training split, halved in the same way as in the training section.
# NOTE(review): the reason for the factor 2 is not documented — confirm.
# NOTE(review): `total_amount` is the test label, so this imputation uses the target.
pred = reg.predict(test.loc[pos,["total_amount", "time_interval"]])/2

In [32]:
# Write the regression estimates back into every zero-distance test row.
# `pred` was computed from `test.loc[pos, ...]`, so it lines up with `pos`
# element by element and one vectorized assignment covers all rows.
# The earlier loop only updated `pos[:10]`, which left every other zero distance
# (and therefore a zero average speed) in the test data.
test.loc[pos, "trip_distance"] = pred

*Removing negative and zero time instances from time_interval*

In [33]:
# Remove test trips whose duration is zero or negative; they would produce an
# infinite or negative speed in the division performed later.
# `pos` is rebound here to the labels of the rows being removed.
pos = test.loc[test["time_interval"] <= 0.0].index
test.drop(index=pos, inplace=True)

*Converting miles into meters*

In [34]:
# Convert miles to meters using a factor of 1609.0 (1 mile is about 1609.34 m).
# Not idempotent: re-running this cell multiplies the column again.
test["trip_distance"] = test["trip_distance"] * 1609.0

*Calculating speed in meter/second*

In [35]:
# Average speed = distance (meters) / duration (seconds).
# Rows with non-positive duration were dropped above, so the divisor is > 0.
test["avg_speed_ms"] = test["trip_distance"] / test["time_interval"]

In [36]:
# Persist the test target (`total_amount`) as a NumPy array.
# np.save appends the ".npy" extension to the given path.
# Unlike the training split, no outlier rows are removed from the test split first.
test_labels = np.array(test["total_amount"])
np.save(file="data/test_labels", arr=test_labels, allow_pickle=True)

In [37]:
from category_encoders import BinaryEncoder

# Must list the same columns, in the same order, that `encoder` was fitted on.
# NOTE(review): "trip_distance" is continuous; as a category, most test values
# will not have been seen during fitting. Change together with the training list.
cat = [
    "VendorID",
    "passenger_count",
    "trip_distance",
    "RatecodeID",
    "PULocationID",
    "DOLocationID",
    "payment_type",
    "day_of_week",
    "hour",
]
# Transform only: the encoder fitted on the training split is reused as is.
enc = encoder.transform(test[cat])
encoded_cat = np.array(enc)

In [38]:
from sklearn.preprocessing import StandardScaler

# Scale the continuous test features with the scaler fitted on the training
# split (transform only, so the training statistics are applied).
cont = ["trip_distance", "time_interval", "avg_speed_ms"]
sca = scale.transform(test[cont])
scale_cont = np.array(sca)

In [39]:
# Place the encoded categorical columns and the scaled continuous columns side by side.
test_matrix = np.hstack((encoded_cat, scale_cont))

In [40]:
# Write the test feature matrix to disk (np.save appends ".npy" to the path).
np.save(file="data/test_matrix", arr=test_matrix, allow_pickle=True)

In [41]:
# Free the test arrays now that they are saved; `test` and `test_labels` remain in memory.
del encoded_cat, scale_cont, test_matrix