In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Location of the untouched source data, relative to this notebook.
raw_path = '../01 - data/01 - raw/'
raw_csv = os.path.join(raw_path, 'taxi_trip_pricing.csv')
df = pd.read_csv(raw_csv)

In [None]:
# Quick structural overview of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [3]:
# Rows without a target value cannot be used for training or scoring; discard them.
df = df.dropna(subset=['Trip_Price'])

In [4]:
# Feature columns handled as categories (imputed and one-hot encoded downstream).
categorical_cols = [
    'Time_of_Day',
    'Day_of_Week',
    'Traffic_Conditions',
    'Weather',
]

# Feature columns handled as numbers (imputed downstream).
numerical_cols = [
    'Trip_Distance_km',
    'Passenger_Count',
    'Base_Fare',
    'Per_Km_Rate',
    'Per_Minute_Rate',
    'Trip_Duration_Minutes',
]

# Column the regressor learns to predict.
target = 'Trip_Price'

In [5]:
# Baseline strategy: numeric gaps take the column median; categorical gaps take
# the most frequent level and are then one-hot encoded.
simple_num_step = SimpleImputer(strategy='median')
simple_cat_step = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

simple_imputer = ColumnTransformer([
    ('num', simple_num_step, numerical_cols),
    ('cat', simple_cat_step, categorical_cols),
])

In [6]:
# KNN strategy: numeric gaps are filled by KNNImputer using 5 neighbours;
# categoricals are handled exactly as in the baseline (mode + one-hot).
knn_num_step = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
])
knn_cat_step = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

knn_imputer = ColumnTransformer([
    ('num', knn_num_step, numerical_cols),
    ('cat', knn_cat_step, categorical_cols),
])

In [7]:
# Iterative strategy: numeric gaps are filled by IterativeImputer (seeded so the
# run is repeatable); categoricals are handled as in the baseline (mode + one-hot).
iter_num_step = Pipeline([
    ('imputer', IterativeImputer(random_state=0)),
])
iter_cat_step = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

iterative_imputer = ColumnTransformer([
    ('num', iter_num_step, numerical_cols),
    ('cat', iter_cat_step, categorical_cols),
])

In [8]:
def evaluate_model(preprocessor, df, test_size=None, random_state=42):
    """Fit a random-forest pipeline on a train split and score it on the hold-out split.

    Parameters
    ----------
    preprocessor : scikit-learn transformer
        Preprocessing step (e.g. a ColumnTransformer) placed in front of the
        regressor. It is cloned before fitting, so the object passed in is
        not fitted or otherwise modified by this function.
    df : pandas.DataFrame
        Frame holding the feature columns plus the module-level ``target``
        column.
    test_size : float or int, optional
        Forwarded to ``train_test_split``; ``None`` keeps scikit-learn's
        default split size.
    random_state : int, default 42
        Seed used for both the train/test split and the random forest.

    Returns
    -------
    dict
        ``{'RMSE': ..., 'R2': ...}`` computed on the hold-out split.
    """
    # Local import: only this function needs it.
    from sklearn.base import clone

    X = df.drop(columns=[target])
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Pipeline fits its steps in place; cloning keeps the caller's preprocessor
    # unfitted so repeated or out-of-order calls never see leftover state.
    model = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('regressor', RandomForestRegressor(random_state=random_state))
    ])

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    return {'RMSE': rmse, 'R2': r2}

In [9]:
# Score the three imputation strategies on the same data, in a fixed order.
result_simple, result_knn, result_iter = (
    evaluate_model(candidate, df)
    for candidate in (simple_imputer, knn_imputer, iterative_imputer)
)

In [10]:
# One row per imputation strategy: label first, then its metrics.
strategy_scores = {
    'Simple (Moda/Mediana)': result_simple,
    'KNN Imputer': result_knn,
    'Iterative Imputer': result_iter,
}
results_df = pd.DataFrame(
    [{'Strategy': label, **scores} for label, scores in strategy_scores.items()]
)

# Lowest error first.
results_df.sort_values('RMSE')

Unnamed: 0,Strategy,RMSE,R2
0,Simple (Moda/Mediana),10.898476,0.945653
1,KNN Imputer,11.266891,0.941917
2,Iterative Imputer,11.616091,0.938261


In [13]:
# 1. Fit the baseline preprocessor on all feature rows and transform them
X_processed = simple_imputer.fit_transform(df.drop(columns=['Trip_Price']))

# ColumnTransformer can return a SciPy sparse matrix when the stacked output is
# sparse enough (see its `sparse_threshold`); densify so the DataFrame
# constructor below always receives a plain 2-D array.
if hasattr(X_processed, 'toarray'):
    X_processed = X_processed.toarray()

# 2. Recover the one-hot column names from the fitted encoder
ohe = simple_imputer.named_transformers_['cat'].named_steps['encoder']
encoded_cols = ohe.get_feature_names_out(categorical_cols)

# 3. Output columns follow transformer order: numeric first, then encoded
all_cols = numerical_cols + list(encoded_cols)

# 4. Wrap the transformed array in a labelled DataFrame
df_processed = pd.DataFrame(X_processed, columns=all_cols)

# 5. Attach the unchanged target positionally (.values): df's index may have
#    gaps after filtering, while df_processed has a fresh 0..n-1 index
df_processed['Trip_Price'] = df['Trip_Price'].values

# 6. Persist the result, creating the destination folder if it is missing
output_path_ready = '../01 - data/02 - processed/processed_data.csv'
os.makedirs(os.path.dirname(output_path_ready), exist_ok=True)
df_processed.to_csv(output_path_ready, index=False)

