In [27]:

# import package
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
import joblib

# Load the dataset
df = pd.read_csv("../data/dataset/dataset.csv")

# Display the first few rows of the dataset
print(df.head())




   trip_distance  PULocationID  DOLocationID  pickup_hour  speed_mph  \
0           4.24           100             4           15   6.888087   
1           1.50           230            68           15   8.282209   
2           1.30            50           142           15   8.224956   
3           1.90           141           161           15  11.672355   
4           2.40           249           163           15   7.592267   

   trip_duration_minutes  humidity  windgust  visibility  congestion_level  \
0              36.933333     43.34      47.8        16.0                 2   
1              10.866667     43.34      47.8        16.0                 2   
2               9.483333     43.34      47.8        16.0                 2   
3               9.766667     43.34      47.8        16.0                 2   
4              18.966667     43.34      47.8        16.0                 2   

   rain  
0     0  
1     0  
2     0  
3     0  
4     0  


In [28]:
# Print the column names to verify the correct column name
print(df.columns)

Index(['trip_distance', 'PULocationID', 'DOLocationID', 'pickup_hour',
       'speed_mph', 'trip_duration_minutes', 'humidity', 'windgust',
       'visibility', 'congestion_level', 'rain'],
      dtype='object')


# Split Train, Test

In [29]:
X = df[['trip_distance', 'congestion_level', 'pickup_hour', 'speed_mph', 'windgust', 
        'rain', 'visibility', 'humidity', 'PULocationID','DOLocationID']] # Features (replace 'target' with your actual column name)
y = df['trip_duration_minutes']

In [30]:
# Replace missing values in windgust, visibility, and humidity with their respective column mean
X['windgust'] = X['windgust'].fillna(X['windgust'].mean())
X['visibility'] = X['visibility'].fillna(X['visibility'].mean())
X['humidity'] = X['humidity'].fillna(X['humidity'].mean())

# Verify there are no more missing values in these columns
print(X[['windgust', 'visibility', 'humidity']].isna().sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['windgust'] = X['windgust'].fillna(X['windgust'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['visibility'] = X['visibility'].fillna(X['visibility'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['humidity'] = X['humidity'].fillna(X['humidity'].mean())


windgust      0
visibility    0
humidity      0
dtype: int64


In [31]:
# Perform the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (4841155, 10)
X_test shape: (1210289, 10)
y_train shape: (4841155,)
y_test shape: (1210289,)


In [32]:
# Display the first few rows of the training dataset
print("Training Dataset:")
print(X_train.head())

# Display the first few rows of the testing dataset
print("\nTesting Dataset:")
print(X_test.head())

Training Dataset:
         trip_distance  congestion_level  pickup_hour  speed_mph  windgust  \
1593171           1.64                 2           22   7.028571      43.5   
4923503          13.90                 2            4  21.176471       9.4   
4107482           1.60                 2            5  11.451292      16.6   
2204162           2.70                 2            0   8.066390      29.5   
2378851           1.42                 0           14  11.334812      25.9   

         rain  visibility  humidity  PULocationID  DOLocationID  
1593171     0        16.0     70.24           234            68  
4923503     0        16.0     83.05           161            92  
4107482     0        16.0     81.20           162           141  
2204162     1        11.1     92.57           262           143  
2378851     0        16.0     69.73           230            68  

Testing Dataset:
         trip_distance  congestion_level  pickup_hour  speed_mph  windgust  \
76913             1.6

In [33]:
print(X_train.dtypes)

trip_distance       float64
congestion_level      int64
pickup_hour           int64
speed_mph           float64
windgust            float64
rain                  int64
visibility          float64
humidity            float64
PULocationID          int64
DOLocationID          int64
dtype: object


# Model Prediction

### Linear Regression

In [34]:
# Create the Linear Regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

In [35]:
# Make predictions on the test data
y_pred = model.predict(X_test)

In [36]:
print(y_pred)

[10.17755965 17.63774414 22.50792527 ... 20.63774849  6.80796144
 26.94394017]


In [37]:
# Calculate the Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

# Calculate the Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Calculate the Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

# Calculate the R^2 score
r2 = r2_score(y_test, y_pred)
print("R^2 Score:", r2)



Mean Absolute Error: 3.489980016490799
Mean Squared Error: 101.95647705845887
Root Mean Squared Error: 10.09735000178061
R^2 Score: 0.6690518749776839


In [38]:
#Cal

### XGBoost

In [39]:
# Create the XGBoost model
xgb_model = xgb.XGBRegressor()

# Train the model on the training data
xgb_model.fit(X_train, y_train)

In [40]:
# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)
print(y_pred_xgb)

[ 9.011329  16.193684  25.582346  ... 20.993456   6.8126755 38.53247  ]


In [41]:
# Calculate Mean Absolute Error (MAE)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
print("Mean Absolute Error (MAE) for XGBoost:", mae_xgb)

# Calculate Mean Squared Error (MSE)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print("Mean Squared Error (MSE) for XGBoost:", mse_xgb)

RMSE_xgb = np.sqrt(mse_xgb)
print("RMSE for XGBoost: ",RMSE_xgb)

# Calculate R-squared (R²)
r2_xgb = r2_score(y_test, y_pred_xgb)
print("R-squared (R²) for XGBoost:", r2_xgb)

Mean Absolute Error (MAE) for XGBoost: 0.41157125527800853
Mean Squared Error (MSE) for XGBoost: 31.86700533371915
RMSE for XGBoost:  5.645086831371077
R-squared (R²) for XGBoost: 0.8965605131763866


Model XGB has a smaller RMSE compared to Model Linear Regression, which indicates that, on average, the predictions made by Model XGB are closer to the true values than those made by Model Linear. A smaller RMSE value suggests that Model XGB has a better fit to the data and is more accurate in its predictions.

# Make predictions

In [61]:
# save model with joblib 
filename = 'xgboost_model.sav'
joblib.dump(xgb_model, filename)

['xgboost_model.sav']

In [None]:
from sklearn.metrics import classification_report


# Define bins for classification
bins = []
for n in range(1201):
    if n % 60 == 0:
        bins.append(n)
bins.append(np.inf)
labels = [1, 2, 3, 4, 5, 6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21]

# Bin the actual and predicted values
y_test_binned = pd.cut(y_test, bins=bins, labels=labels)
y_predict_binned = pd.cut(y_pred_xgb, bins=bins, labels=labels)

# Convert to Series to use cat accessor
y_predict_binned = pd.Series(y_predict_binned).cat.add_categories([0]).fillna(0)

# check results
print(classification_report(y_test_binned, y_predict_binned)) 