## **Feature Engineering**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### **About Data**

*The data consists of trip records for New York's iconic yellow taxis, recorded and provided by the TLC (NYC Taxi and Limousine Commission). Each record includes the following fields:*
1. `VendorID` - code identifying the vendor that provided the record
2. `tpep_pickup_datetime` - pickup date and time
3. `tpep_dropoff_datetime` - drop off date and time
4. `passenger_count` - number of passengers in taxi
5. `trip_distance` - in miles
6. `RatecodeID` - rates codes
7. `PULocationID` - pickup Location
8. `DOLocationID` - dropoff Location
9. `payment_type` - mode of payment
10. `total_amount` - amount paid by passengers

In [None]:
# Read the March 2022 yellow-taxi trip file into a DataFrame and preview it.
TRIP_DATA_PATH = "data/yellow_tripdata_2022-03.parquet"
nyc_taxi = pd.read_parquet(TRIP_DATA_PATH)
nyc_taxi.head(5)

*Converting miles into meters*

In [None]:
# Miles-to-meters conversion factor applied to `trip_distance`.
# NOTE: the column is overwritten, so re-running this cell scales it again.
METERS_PER_MILE = 1609.34
nyc_taxi["trip_distance"] = nyc_taxi["trip_distance"].mul(METERS_PER_MILE)

*Calculating time interval from pickup to dropoff*

In [None]:
# Trip duration per row: dropoff timestamp minus pickup timestamp.
pickup_time = nyc_taxi["tpep_pickup_datetime"]
dropoff_time = nyc_taxi["tpep_dropoff_datetime"]
diff = dropoff_time - pickup_time

*Extracting time interval in seconds*

In [None]:
# Express each trip duration as a float number of seconds and store it as
# the `time_interval` column. `diff` is rebound to the seconds Series, so it
# no longer holds timedeltas after this cell runs.
elapsed_seconds = diff.dt.total_seconds()
diff = elapsed_seconds
nyc_taxi["time_interval"] = elapsed_seconds

*Extracting hour from pickup date time*

In [None]:
# Hour of day (taken from the pickup timestamp) at which each trip started.
pickup_time = nyc_taxi["tpep_pickup_datetime"]
nyc_taxi["hour"] = pickup_time.dt.hour

*Calculating average speed in meter/second for each trip*

In [None]:
# Average speed = distance / elapsed time, computed element-wise per trip.
# A zero `time_interval` yields inf (or NaN for 0/0); these are handled later.
nyc_taxi["avg_speed_ms"] = nyc_taxi["trip_distance"].div(nyc_taxi["time_interval"])

### **Statistics of important variables**

In [None]:
# Display pandas' default summary statistics (count, mean, std, min/max,
# quartiles) for the frame, including the engineered columns added above.
nyc_taxi.describe()

### **Data type**

In [None]:
# Print each column's name, non-null count and dtype, plus memory usage.
nyc_taxi.info()

*Since the variables `passenger_count` and `RatecodeID` are categorical, we need to convert them to an integer data type*

In [None]:
# Cast the categorical code columns to integers.
# This cell runs BEFORE missing values are dropped, and a plain
# `astype("int")` raises on NaN. The nullable "Int64" dtype keeps missing
# entries as <NA> and has a fixed 64-bit width on every platform.
nyc_taxi["passenger_count"] = nyc_taxi["passenger_count"].astype("Int64")
nyc_taxi["RatecodeID"] = nyc_taxi["RatecodeID"].astype("Int64")

*Replacing negative and infinite values in `avg_speed_ms` with `NA`, because such records are few in number or were incorrectly recorded (e.g. a zero or negative time interval)*

In [None]:
# Blank out physically impossible speeds.
# A negative speed means the distance or the time interval was negative;
# an infinite speed means a non-zero distance was divided by a zero time
# interval. A single boolean mask selects exactly those rows, without
# building filtered copies of the frame or relying on index labels being
# unique.
speed = nyc_taxi["avg_speed_ms"]
invalid_speed = (speed < 0) | np.isinf(speed)
nyc_taxi.loc[invalid_speed, "avg_speed_ms"] = np.nan

*Remove all rows that contain an `NA` value from the data*

In [None]:
# Drop, in place, every row that has a missing value in at least one column
# (not only in `avg_speed_ms`).
nyc_taxi.dropna(axis=0, how="any", inplace=True)