In [64]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, mean_squared_error
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [65]:
df = pd.read_csv('../data/02_processed/df_cleaned.csv')

In [66]:
df.shape

(19607065, 19)

In [67]:
df.columns

Index(['id', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'pickup_datetime_dayofyear', 'pickup_datetime_month',
       'pickup_datetime_year', 'pickup_datetime_hour',
       'pickup_datetime_dayofweek', 'pickup_datetime_is_weekend',
       'is_late_night', 'is_night', 'is_early_morning', 'is_rush_hour',
       'trip_distance'],
      dtype='object')

In [68]:
df['fare_amount'].describe()

count    1.960706e+07
mean     1.293649e+01
std      1.076349e+01
min      1.000000e-02
25%      7.000000e+00
50%      9.500000e+00
75%      1.450000e+01
max      9.520000e+02
Name: fare_amount, dtype: float64

In [69]:
df = df.drop(columns=['id', 'pickup_datetime'])

In [70]:
df['hour_sin'] = np.sin(2 * np.pi * df['pickup_datetime_hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['pickup_datetime_hour'] / 24)

df['dow_sin'] = np.sin(2 * np.pi * df['pickup_datetime_dayofweek'] / 7)
df['dow_cos'] = np.cos(2 * np.pi * df['pickup_datetime_dayofweek'] / 7)

df['month_sin'] = np.sin(2 * np.pi * df['pickup_datetime_month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['pickup_datetime_month'] / 12)

df['doy_sin'] = np.sin(2 * np.pi * df['pickup_datetime_dayofyear'] / 366)
df['doy_cos'] = np.cos(2 * np.pi * df['pickup_datetime_dayofyear'] / 366)

In [71]:
df.drop(['pickup_datetime_hour', 'pickup_datetime_dayofweek', 
         'pickup_datetime_month', 'pickup_datetime_dayofyear'], axis=1, inplace=True)


In [72]:
numeric_features = ['pickup_longitude', 'pickup_latitude',
                    'dropoff_longitude', 'dropoff_latitude', 'trip_distance',
                    'passenger_count', 'pickup_datetime_year']

bool_features = ['pickup_datetime_is_weekend', 'is_late_night', 'is_night',
                 'is_early_morning', 'is_rush_hour']

cyclic_features = ['hour_sin', 'hour_cos', 'dow_sin', 'dow_cos',
                   'month_sin', 'month_cos', 'doy_sin', 'doy_cos']

all_features = numeric_features + bool_features + cyclic_features

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('passthrough_bool', 'passthrough', bool_features),
        ('passthrough_cyclic', 'passthrough', cyclic_features),
    ]
)

In [76]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

def calculate_vif(X):
    # Add a constant column for intercept
    X_with_const = add_constant(X)
    
    vif_data = pd.DataFrame()
    vif_data["feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X_with_const.values, i + 1)
                       for i in range(X.shape[1])]  # skip constant
    return vif_data.sort_values(by="VIF", ascending=False)


X = df.copy()
# Example usage:
vif_df = calculate_vif(X)
print(vif_df)

MemoryError: Unable to allocate 299. MiB for an array with shape (2, 19607065) and data type int64

### Model training

In [73]:
X = df.drop(columns=['fare_amount'])
y = df['fare_amount']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((15685652, 20), (3921413, 20), (15685652,), (3921413,))

In [None]:
version = "v4"
with mlflow.start_run(run_name=f"LinearRegression_{version}"):

    model = LinearRegression()
    model.fit(X_train_preprocessed, y_train)

    y_pred = model.predict(X_test_preprocessed)

    mse = mean_squared_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)

    # Print results
    print(f"Mean Squared Error: {mse}\n")
    print(f"Root Mean Squared Error: {rmse}\n")
    print(f"Coeficient: {model.coef_}\n")
    print(f"Intercept: {model.intercept_}\n")

    # Log parameters (no hyperparams in LinearRegression, but log fit_intercept etc.)
    mlflow.log_param("features", ", ".join(X_train.columns))

    # Log metrics
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)

    coefs = pd.DataFrame({
        "feature": X_train.columns,
        "coefficient": model.coef_
    })
    coefs_file = f"feature_importance_{version}.csv"
    coefs.to_csv(coefs_file, index=False)
    mlflow.log_artifact(coefs_file)

    # Log model
    signature = infer_signature(X_train, model.predict(X_train))

    # Optionally provide an example input (for documentation/UI)
    input_example = X_train.iloc[:5]

    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        input_example=input_example
    )

Mean Squared Error: 17.40546718430118

Root Mean Squared Error: 4.1719860000126054

Coeficient: [ 2.26031292e-01  2.99751251e-01 -3.74515856e-01 -4.61654390e-01
  9.97457126e+00  4.89317849e-02  1.90787257e-01 -2.90791419e-01
  3.24696157e-01  3.12050995e-01 -4.67046989e-01  6.56319153e-02
 -3.71826314e-01 -1.12939213e+00  1.34094081e-01 -2.57087724e-01
  1.18521725e-01 -1.09759603e-01 -2.74179352e-01 -1.20594265e-03]

Intercept: 12.823490951831921



