In [1]:
import pandas as pd
import numpy as np
import json

# Load the cleaned taxi-trip pricing data (path is relative to this notebook).
df = pd.read_csv("../data/cleaned_taxi_trip_pricing.csv")

# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
2,8.64,Afternoon,Weekend,2.0,Medium,Clear,2.55,1.71,0.48,89.33,60.2028
3,41.79,Night,Weekend,3.0,High,Clear,4.6,1.77,0.11,86.95,88.1328
4,9.91,Evening,Weekday,2.0,High,Clear,2.32,1.26,0.34,41.72,28.9914


In [2]:
# (rows, columns) of the loaded frame.
df.shape 

(562, 11)

In [3]:
# Separate the target column (Trip_Price) from the predictor columns.
y = df["Trip_Price"]
X = df.drop(columns="Trip_Price")

X.head()

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82
1,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27
2,8.64,Afternoon,Weekend,2.0,Medium,Clear,2.55,1.71,0.48,89.33
3,41.79,Night,Weekend,3.0,High,Clear,4.6,1.77,0.11,86.95
4,9.91,Evening,Weekday,2.0,High,Clear,2.32,1.26,0.34,41.72


In [4]:
# Peek at the first target values (Trip_Price).
y.head()

0    36.2624
1    52.9032
2    60.2028
3    88.1328
4    28.9914
Name: Trip_Price, dtype: float64

In [5]:
# Rebuild X from df with the categorical predictors one-hot encoded.
# drop_first=True keeps k-1 dummy columns per k-level category, so the
# dummies of one category are not perfectly collinear with each other.
X = pd.get_dummies(df.drop("Trip_Price", axis=1), drop_first=True)

In [6]:
# Target vector: the trip price the model will predict.
y = df["Trip_Price"]

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Report the shape of every split in a single call (one line per split).
print(
    f"{X_train.shape = }",
    f"{X_test.shape = }",
    f"{y_train.shape = }",
    f"{y_test.shape = }",
    sep="\n",
)


X_train.shape = (376, 14)
X_test.shape = (186, 14)
y_train.shape = (376,)
y_test.shape = (186,)


In [8]:
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min/max from the training split only, then apply the
# same scaling to the test split so no test statistics are used in fitting.
scaler = MinMaxScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

(scaled_X_train.shape, scaled_X_test.shape)


((376, 14), (186, 14))

In [9]:
# Sanity check: the scaler was fitted on this split, so its overall min/max
# should be exactly 0 and 1.
scaled_X_train.min(), scaled_X_train.max()

(np.float64(0.0), np.float64(1.0))

In [10]:
# The test split is scaled with the training min/max, so values slightly
# outside [0, 1] are possible when a test row exceeds the training range.
scaled_X_test.min(), scaled_X_test.max()

(np.float64(0.0), np.float64(1.0022147959745724))

In [11]:
from sklearn.linear_model import LinearRegression

# Linear regression with scikit-learn's default settings (not fitted yet).
model = LinearRegression()
model


0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [12]:
# Fit on the scaled training split, then display the learned coefficients
# (one per scaled feature column).
model.fit(scaled_X_train, y_train) 
model.coef_

array([259.49570199,  -0.31305689,  -0.92612626,  37.92785338,
        23.73432994,  34.41489611,  -0.73739574,   3.17731199,
        -0.26449685,   1.25676294,  -5.23725082,  -6.06872626,
        -0.8864775 ,   1.91026475])

In [13]:
# Intercept term learned by the fitted model.
model.intercept_

np.float64(-32.09671700551838)

## Predict

In [14]:
# Predict prices for the held-out rows using the scaled test features.
y_pred = model.predict(scaled_X_test)

# Show the first five predictions next to the corresponding true prices.
print(
    f"Predictions: {y_pred[:5]}",
    f"Actual values: {y_test.iloc[:5].to_numpy()}",
    sep="\n",
)

Predictions: [ 42.11412098 108.62657096  45.37989508  51.02153065  37.92092763]
Actual values: [36.0978 87.7201 43.8544 52.5934 43.6034]


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Score the hold-out predictions against the true prices.
# MAE  - mean absolute error, in the same units as Trip_Price.
# MSE  - mean squared error.
# RMSE - square root of the MSE; derived from the value already computed
#        instead of calling mean_squared_error a second time.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# NOTE(review): re-run this cell after editing so the saved output shows all
# three lines below.
print(f"Linear Regression RMSE: {rmse:.2f}")
print(f"mae: {mae:.2f}")
print(f"mse: {mse:.2f}")


Linear Regression RMSE: 15.13


In [16]:
# Refit the scaler and the regression on every row (train and test together).
# Both estimators' fit() return the estimator itself, so construction and
# fitting can be chained.
final_scaler = MinMaxScaler().fit(X)
scaled_X_final = final_scaler.transform(X)
final_model = LinearRegression().fit(scaled_X_final, y)

print(f"Final scaled data shape: {scaled_X_final.shape}")

Final scaled data shape: (562, 14)
