# Model development

I will test 3 models (RandomForest, KNN and Linear Regression)

In [29]:
import pandas as pd
from taxipred.utils.constants import DATA_PATH

# Load the cleaned taxi data set from the project's data directory.
df = pd.read_csv(DATA_PATH / "cleaned_taxi_trip_pricing.csv")

n_samples, n_columns = df.shape
print(f"{n_samples} samples")
# Trip_Price is the label, so it is not counted as a feature.
print(f"{n_columns - 1} features")

df.head()

683 samples
7 features


Unnamed: 0,Trip_Distance_km,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Trip_Duration_Minutes,Trip_Price
0,19.35,Weekday,3.0,Low,Clear,3.56,53.82,36.2624
1,36.87,Weekend,1.0,High,Clear,2.7,37.27,52.9032
2,8.64,Weekend,2.0,Medium,Clear,2.55,89.33,60.2028
3,3.85,Weekday,4.0,High,Rain,3.51,5.05,11.2645
4,30.45,Weekday,3.0,High,Clear,2.77,110.33,


Now I will split the data on Trip_Price into two separate dataframes: one holds every row where Trip_Price is known, and the other holds only the rows where Trip_Price is NaN, so that I can later predict those prices with the model I choose.

In [27]:
# Keep only the rows with a known Trip_Price (the labelled data).
has_price = df["Trip_Price"].notna()
df = df.loc[has_price].copy()
df

Unnamed: 0,Trip_Distance_km,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Trip_Duration_Minutes,Trip_Price
0,19.35,Weekday,3.0,Low,Clear,3.56,53.82,36.2624
1,36.87,Weekend,1.0,High,Clear,2.70,37.27,52.9032
2,8.64,Weekend,2.0,Medium,Clear,2.55,89.33,60.2028
3,3.85,Weekday,4.0,High,Rain,3.51,5.05,11.2645
6,41.79,Weekend,3.0,High,Clear,4.60,86.95,88.1328
...,...,...,...,...,...,...,...,...
678,5.49,Weekend,4.0,Medium,Clear,2.39,58.39,34.4049
679,45.95,Weekday,4.0,Medium,Clear,3.12,61.96,62.1295
680,7.70,Weekday,3.0,Low,Rain,2.08,54.18,33.1236
681,47.56,Weekday,1.0,Low,Clear,2.67,114.94,61.2090


In [26]:
# Collect the rows whose Trip_Price is missing; their prices are predicted later.
# The rows are taken from a fresh read of the CSV rather than from `df`, because
# the cell above has already removed every row without a Trip_Price from `df`.
# Filtering `df` here would give an empty frame on a top-to-bottom run.
df_all_rows = pd.read_csv(DATA_PATH / "cleaned_taxi_trip_pricing.csv")
df_nan_values = df_all_rows[df_all_rows["Trip_Price"].isna()].copy()
df_nan_values.head(5)

Unnamed: 0,Trip_Distance_km,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Trip_Duration_Minutes,Trip_Price
4,30.45,Weekday,3.0,High,Clear,2.77,110.33,
5,48.53,Weekday,3.0,Low,Clear,4.78,79.94,
17,4.19,Weekday,1.0,Low,Clear,4.07,69.06,
61,38.78,Weekday,3.0,Medium,Clear,3.08,90.14,
108,10.14,Weekday,4.0,Low,Rain,4.5,25.41,


In [31]:
# Show the column labels of df (the feature columns plus the Trip_Price label).
df.columns

Index(['Trip_Distance_km', 'Day_of_Week', 'Passenger_Count',
       'Traffic_Conditions', 'Weather', 'Base_Fare', 'Trip_Duration_Minutes',
       'Trip_Price'],
      dtype='object')

Now I will split the data into features (X) and label (y). I do the same for the rows where Trip_Price is NaN, keeping only their features.

In [32]:
# Features are every column except the label; the label is Trip_Price.
X = df.drop(columns="Trip_Price")
y = df["Trip_Price"]

(X.head(2), y.head(2))

(   Trip_Distance_km Day_of_Week  Passenger_Count Traffic_Conditions Weather  \
 0             19.35     Weekday              3.0                Low   Clear   
 1             36.87     Weekend              1.0               High   Clear   
 
    Base_Fare  Trip_Duration_Minutes  
 0       3.56                  53.82  
 1       2.70                  37.27  ,
 0    36.2624
 1    52.9032
 Name: Trip_Price, dtype: float64)

In [36]:
# Feature columns only, for the rows that have no Trip_Price.
X_nan_values = df_nan_values.drop(labels="Trip_Price", axis="columns")
X_nan_values.head()

Unnamed: 0,Trip_Distance_km,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Trip_Duration_Minutes
4,30.45,Weekday,3.0,High,Clear,2.77,110.33
5,48.53,Weekday,3.0,Low,Clear,4.78,79.94
17,4.19,Weekday,1.0,Low,Clear,4.07,69.06
61,38.78,Weekday,3.0,Medium,Clear,3.08,90.14
108,10.14,Weekday,4.0,Low,Rain,4.5,25.41


Now I need to encode the data, so I split the feature columns into int_cols (columns with a numeric dtype, both int and float) and str_cols (all remaining columns, i.e. object / str).

In [39]:
# Partition the feature columns by dtype.
# NOTE: despite its name, int_cols holds every numeric column (floats included);
# str_cols holds all the remaining, non-numeric columns.
numeric_features = X.select_dtypes(include=["number"])
non_numeric_features = X.select_dtypes(exclude=["number"])

int_cols = numeric_features.columns
str_cols = non_numeric_features.columns

This code is AI-generated; I only replaced the variable names with my own. It standardizes the numeric columns and converts the categorical columns to numbers with one-hot encoding.

In [40]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Build one transformer per column group, then combine them.
numeric_scaler = StandardScaler()
# handle_unknown="ignore": do not raise on categories that were not seen during fit.
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("int", numeric_scaler, int_cols),
        ("str", categorical_encoder, str_cols),
    ]
)


Now I will start with the linear regression.