In [1]:
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

import xgboost as xgb


In [3]:
# Load the dataset
df = pd.read_csv("./data/dataset_to_train.csv")

# Display the first few rows of the dataset
print(df.head())

   passenger_count  trip_distance  pu_zone_id  do_zone_id  enter_airport  \
0                1           3.20         140          79              0   
1                1           1.18         237         145              0   
2                1           2.21         114         170              0   
3                1           2.10          68         107              0   
4                2           1.00         249          79              0   

   pickup_hour  pickup_minute  pickup_weekday  dropoff_hour  dropoff_minute  \
0            7             34               1             7              46   
1            7             23               1             7              29   
2            7             19               1             7              32   
3            7             34               1             7              43   
4            7             59               1             8               5   

   trip_duration_minutes  congestion_level  rain  
0              11

In [4]:
# Print the column names to verify the correct column name
print(df.columns)

Index(['passenger_count', 'trip_distance', 'pu_zone_id', 'do_zone_id',
       'enter_airport', 'pickup_hour', 'pickup_minute', 'pickup_weekday',
       'dropoff_hour', 'dropoff_minute', 'trip_duration_minutes',
       'congestion_level', 'rain'],
      dtype='object')


# Split Train, Test

In [5]:
X = df[['trip_distance', 'pu_zone_id', 'do_zone_id', 'pickup_weekday', 'pickup_hour', 
        'pickup_minute', 'rain', 'enter_airport']]
y = df['trip_duration_minutes']

In [6]:
# Perform the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (4141568, 8)
X_test shape: (1035393, 8)
y_train shape: (4141568,)
y_test shape: (1035393,)


In [7]:
# Display the first few rows of the training dataset
print("Training Dataset:")
print(X_train.head())

# Display the first few rows of the testing dataset
print("\nTesting Dataset:")
print(X_test.head())

Training Dataset:
         trip_distance  pu_zone_id  do_zone_id  pickup_weekday  pickup_hour  \
5050631           2.71         142         170               0            4   
599797            3.41         229         144               6           18   
3720050           2.61         186         141               1            2   
4640213           3.70         239         234               0           23   
1157552           1.16          48         237               2           16   

         pickup_minute  rain  enter_airport  
5050631             34     0              0  
599797              36     0              0  
3720050             32     0              0  
4640213             13     0              0  
1157552             21     0              0  

Testing Dataset:
         trip_distance  pu_zone_id  do_zone_id  pickup_weekday  pickup_hour  \
3392987           1.96          42          75               3            3   
3146301           4.80         113         239         

In [8]:
print(X_train.dtypes)

trip_distance     float64
pu_zone_id          int64
do_zone_id          int64
pickup_weekday      int64
pickup_hour         int64
pickup_minute       int64
rain                int64
enter_airport       int64
dtype: object


# Model Prediction

## Linear Regression

In [26]:
# Create the Linear Regression model
lin_model = LinearRegression()

# Train the model on the training data
lin_model.fit(X_train, y_train)

In [27]:
# Make predictions on the test data
y_pred_lin = lin_model.predict(X_test)

In [28]:
print(y_pred_lin)

[13.65424301 25.50684407 12.90493014 ... 12.44999348 15.1833014
 15.25445413]


In [29]:
# Calculate the Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred_lin)
print("Mean Absolute Error for Linear Regression:", mae)

# Calculate the Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred_lin)
print("Mean Squared Error for Linear Regression:", mse)

# Calculate the Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
print("Root Mean Squared Error for Linear Regression:", rmse)

# Calculate the R^2 score
r2 = r2_score(y_test, y_pred_lin)
print("R-squared (R²) for Linear Regression:", r2)



Mean Absolute Error for Linear Regression: 4.43130663585696
Mean Squared Error for Linear Regression: 64.22837658064188
Root Mean Squared Error for Linear Regression: 8.014260825593455
R-squared (R²) for Linear Regression: 0.3733321065718832


## XGBoost

In [30]:
# Create the XGBoost model
xgb_model = xgb.XGBRegressor()

# Train the model on the training data
xgb_model.fit(X_train, y_train)

In [31]:
# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)
print(y_pred_xgb)

[12.532698 31.852528 16.852839 ... 13.247412 13.993834 18.776003]


In [32]:
# Calculate Mean Absolute Error (MAE)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
print("Mean Absolute Error (MAE) for XGBoost:", mae_xgb)

# Calculate Mean Squared Error (MSE)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print("Mean Squared Error (MSE) for XGBoost:", mse_xgb)

RMSE_xgb = np.sqrt(mse_xgb)
print("RMSE for XGBoost: ",RMSE_xgb)

# Calculate R-squared (R²)
r2_xgb = r2_score(y_test, y_pred_xgb)
print("R-squared (R²) for XGBoost:", r2_xgb)

Mean Absolute Error (MAE) for XGBoost: 3.4337676799430525
Mean Squared Error (MSE) for XGBoost: 51.07423697198124
RMSE for XGBoost:  7.146624166134752
R-squared (R²) for XGBoost: 0.5016753311288494


## Random Forest

In [33]:
# Initialize the Random Forest model
rf = RandomForestRegressor(
    n_estimators=50,  # Number of trees
    max_depth=10,    # Unlimited depth for better accuracy
    min_samples_split=2,  # Minimum samples per split
    random_state=42  # Random seed for reproducibility
)

# Train the model
rf.fit(X_train, y_train)

KeyboardInterrupt: 

In [20]:
y_pred_rf= rf.predict(X_test)

In [22]:
# Calculate Mean Absolute Error (MAE)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print("Mean Absolute Error (MAE) for Random Forest:", mae_rf)

# Calculate Mean Squared Error (MSE)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print("Mean Squared Error (MSE) for Random Forest:", mse_rf)

# Calculate Root Mean Squared Error (RMSE)
RMSE_rf = np.sqrt(mse_rf)
print("RMSE for Random Forest:", RMSE_rf)

# Calculate R-squared (R²)
r2_rf = r2_score(y_test, y_pred_rf)
print("R-squared (R²) for Random Forest:", r2_rf)

Mean Absolute Error (MAE) for Random Forest: 3.8426439175460803
Mean Squared Error (MSE) for Random Forest: 57.155625230829436
RMSE for Random Forest: 7.560133942651376
R-squared (R²) for Random Forest: 0.4423400190412713


Model XGB has a smaller RMSE compared to Model Linear Regression and Random Forest, which indicates that, on average, the predictions made by Model XGB are closer to the true values than those made by Model Linear and Random Forest . A smaller RMSE value suggests that Model XGB has a better fit to the data and is more accurate in its predictions.

## Advanced KNN

### Feature Scaling

In [17]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Initialize KNN Regressor


In [18]:
knn = KNeighborsRegressor()

### Set the parameter grid for GridSearchCV

In [19]:
param_grid = {
    'n_neighbors': [3, 5, 7, 10],                
    'weights': ['uniform', 'distance'],       
    'metric': ['euclidean', 'manhattan', 'minkowski']  
}

### GridSearchCV for hyperparameter optimization

In [20]:
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

# Fit GridSearchCV to find the best hyperparameters
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters found by GridSearchCV
print(f"Best Parameters from GridSearchCV: {grid_search.best_params_}")

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=18.6min
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=18.7min
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=18.7min
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=18.7min
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=18.7min
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=18.8min
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=18.8min
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=18.8min
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=22.3min
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=23.6min
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=25.7min
[CV] END ...metric=euclidean, n_neighbors=5, we

### Get the best KNN model from the grid search

In [None]:
knn_advanced = KNeighborsRegressor(n_neighbors=10, metric='manhattan', weights='distance', n_jobs=-1)

In [21]:
#knn_advanced = grid_search.best_estimator_
y_pred_knn = knn_advanced.predict(X_test_scaled)
print(y_pred_knn)

[11.08741536 30.3023562  12.87976475 ... 13.99919352 14.58100134
 18.12939399]


In [None]:
# Calculate Mean Absolute Error (MAE)
mae_knn = mean_absolute_error(y_test, y_pred_knn)
print("Mean Absolute Error (MAE) for Advanced KNN:", mae_knn)

# Calculate Mean Squared Error (MSE)
mse_knn = mean_squared_error(y_test, y_pred_knn)
print("Mean Squared Error (MSE) for Advanced KNN:", mse_knn)

# Calculate Root Mean Squared Error (RMSE)
RMSE_knn = np.sqrt(mse_knn)
print("RMSE for Advanced KNN:", RMSE_knn)

# Calculate R-squared (R²)
r2_knn = r2_score(y_test, y_pred_knn)
print("R-squared (R²) for Advanced KNN:", r2_knn)

Mean Absolute Error (MAE) for Advanced KNN: 3.691612715381257
Mean Squared Error (MSE) for Advanced KNN: 57.86367222078017
RMSE for Advanced KNN: 7.606817483072679
R-squared (R²) for Advanced KNN: 0.435431696906063


## Original KNN

In [34]:
knn_standard = KNeighborsRegressor(n_neighbors=5, metric='euclidean', weights='uniform', n_jobs=-1)
knn_standard.fit(X_train, y_train)

y_pred_standard = knn_standard.predict(X_test)

# Calculate metrics for the standard kNN model
mae_standard = mean_absolute_error(y_test, y_pred_standard)
mse_standard = mean_squared_error(y_test, y_pred_standard)
rmse_standard = np.sqrt(mse_standard)
r2_standard = r2_score(y_test, y_pred_standard)

print("Standard kNN Performance:")
print(f"MAE: {mae_standard}")
print(f"MSE: {mse_standard}")
print(f"RMSE: {rmse_standard}")
print(f"R-squared: {r2_standard}")

print("\nAdvanced kNN Performance:")
print(f"MAE: {mae_knn}")
print(f"MSE: {mse_knn}")
print(f"RMSE: {RMSE_knn}")
print(f"R-squared: {r2_knn}")


Standard kNN Performance:
MAE: 3.9544796871010965
MSE: 62.91711152752842
RMSE: 7.932030731630357
R-squared: 0.38612594867575134

Advanced kNN Performance:
MAE: 3.691612715381257
MSE: 57.86367222078017
RMSE: 7.606817483072679
R-squared: 0.435431696906063


# Save trained model with joblib

In [24]:
filename = 'aknn_model.joblib'
joblib.dump(knn, filename)

['aknn_model.joblib']

In [25]:
from sklearn.metrics import classification_report

# Define bins for classification
bins = [60, 120, 180, 240, 300, 360, np.inf]

labels = [1, 2, 3, 4, 5, 6]

# Bin the actual and predicted values
y_test_binned = pd.cut(y_test, bins=bins, labels=labels)
y_predict_binned = pd.cut(y_pred_knn, bins=bins, labels=labels)

# Convert to Series to use cat accessor
y_test_binned = pd.Series(y_test_binned).cat.add_categories([0]).fillna(0)
y_predict_binned = pd.Series(y_predict_binned).cat.add_categories([0]).fillna(0)

# check results
print(classification_report(y_test_binned, y_predict_binned)) 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1032777
           1       0.12      0.02      0.04      2439
           2       0.00      0.00      0.00        83
           3       0.20      0.05      0.08        21
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        12
           6       1.00      0.02      0.04        48

    accuracy                           1.00   1035393
   macro avg       0.33      0.16      0.16   1035393
weighted avg       1.00      1.00      1.00   1035393

