In [1]:

# import package
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Load the dataset
df = pd.read_csv("../data/dataset_removed_csv/dataset.csv")

# Display the first few rows of the dataset
print(df.head())




   passenger_count  trip_distance  PULocationID  DOLocationID PU_Borough  \
0                1           3.20           140            79  Manhattan   
1                1           1.18           237           145  Manhattan   
2                1           2.21           114           170  Manhattan   
3                1           2.10            68           107  Manhattan   
4                2           1.00           249            79  Manhattan   

                   PU_Zone DO_Borough                         DO_Zone  \
0          Lenox Hill East  Manhattan                    East Village   
1    Upper East Side South     Queens  Long Island City/Hunters Point   
2  Greenwich Village South  Manhattan                     Murray Hill   
3             East Chelsea  Manhattan                        Gramercy   
4             West Village  Manhattan                    East Village   

   enter_airport pickup_date  ...  precip  windgust  windspeed  winddir  \
0              0  2024-07-01 

In [None]:
# Print the column names to verify the correct column name
print(df.columns)

Index(['passenger_count', 'trip_distance', 'PULocationID', 'DOLocationID',
       'PU_Borough', 'PU_Zone', 'DO_Borough', 'DO_Zone', 'enter_airport',
       'pickup_date', 'pickup_hour', 'pickup_minute', 'pickup_second',
       'pickup_weekday', 'pickup_month', 'pickup_day', 'pickup_week_hour',
       'pickup_time', 'dropoff_date', 'dropoff_hour', 'dropoff_minute',
       'dropoff_second', 'dropoff_weekday', 'dropoff_month', 'dropoff_day',
       'dropoff_time', 'trip_duration_seconds', 'speed_mph', 'temp',
       'humidity', 'precip', 'windgust', 'windspeed', 'winddir', 'cloudcover',
       'visibility', 'severerisk', 'day', 'congestion_level', 'rain'],
      dtype='object')


# Split Train, Test

In [2]:
X = df[['trip_distance', 'congestion_level', 'pickup_hour', 'speed_mph', 'windgust', 
        'rain', 'visibility', 'humidity', 'PULocationID','DOLocationID']] # Features (replace 'target' with your actual column name)
y = df['trip_duration_seconds']

In [3]:
# Replace missing values in windgust, visibility, and humidity with their respective column mean
X['windgust'] = X['windgust'].fillna(X['windgust'].mean())
X['visibility'] = X['visibility'].fillna(X['visibility'].mean())
X['humidity'] = X['humidity'].fillna(X['humidity'].mean())

# Verify there are no more missing values in these columns
print(X[['windgust', 'visibility', 'humidity']].isna().sum())

windgust      0
visibility    0
humidity      0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['windgust'] = X['windgust'].fillna(X['windgust'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['visibility'] = X['visibility'].fillna(X['visibility'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['humidity'] = X['humidity'].fillna(X['humidity'].mean())


In [4]:
# Perform the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (4841155, 10)
X_test shape: (1210289, 10)
y_train shape: (4841155,)
y_test shape: (1210289,)


In [None]:
# Display the first few rows of the training dataset
print("Training Dataset:")
print(X_train.head())

# Display the first few rows of the testing dataset
print("\nTesting Dataset:")
print(X_test.head())

Training Dataset:
         trip_distance  congestion_level  pickup_hour  speed_mph  windgust  \
1593171           6.63                 2           23  15.202548      13.0   
4923503           1.96                 2           21   8.301176      11.2   
4107482           5.90                 0           22   8.319624      16.6   
2204162           2.94                 2           17   7.697455      25.9   
2378851           2.11                 2            3   8.832558      25.9   

         rain  visibility  humidity  PULocationID  DOLocationID  
1593171     0        16.0     41.31            12           162  
4923503     0        16.0     70.56           234           161  
4107482     0        16.0     59.69           264           229  
2204162     0        16.0     46.60           161           211  
2378851     0        16.0     83.37            68           142  

Testing Dataset:
         trip_distance  congestion_level  pickup_hour  speed_mph  windgust  \
76913             2.3

In [6]:
print(X_train.dtypes)

trip_distance       float64
congestion_level      int64
pickup_hour           int64
speed_mph           float64
windgust            float64
rain                  int64
visibility          float64
humidity            float64
PULocationID          int64
DOLocationID          int64
dtype: object


# Model Prediction

### Linear Regression

In [7]:
# Create the Linear Regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

In [8]:
# Make predictions on the test data
y_pred = model.predict(X_test)

In [9]:
print(y_pred)

[1273.24748395  811.89873509  937.2788271  ... 1175.34202066 1113.03770198
  696.59940241]


In [11]:
# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared Score:", r2)

# Display the model's coefficients and intercept
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

Mean Squared Error: 310748.17293638294
R-squared Score: 0.7046904770738575
Coefficients: [ 2.55484403e+02 -1.06195584e+02  8.24003069e-01 -1.05649961e+02
  2.47476176e-01 -8.04193269e+00 -9.76763435e-01 -1.54865991e-01
 -1.37170030e-01 -1.25781228e-01]
Intercept: 1621.5095102237865


In [12]:
# Calculate the accuracy of the linear regression model
accuracy = 1 - (mean_absolute_error(y_test, y_pred) / y_test.mean())
print("Accuracy:", accuracy)

Accuracy: 0.8254410531908342


### XGBoost

In [13]:
# Create the XGBoost model
xgb_model = xgb.XGBRegressor()

# Train the model on the training data
xgb_model.fit(X_train, y_train)

In [22]:
# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)
print(y_pred_xgb)

[1082.9948   501.95682  903.5643  ...  842.08636 1156.5637   484.96133]


In [23]:

# Calculate Mean Squared Error (MSE)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print("Mean Squared Error (MSE) for XGBoost:", mse_xgb)

# Calculate Mean Absolute Error (MAE)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
print("Mean Absolute Error (MAE) for XGBoost:", mae_xgb)

# Calculate R-squared (R²)
r2_xgb = r2_score(y_test, y_pred_xgb)
print("R-squared (R²) for XGBoost:", r2_xgb)

Mean Squared Error (MSE) for XGBoost: 88142.57574302246
Mean Absolute Error (MAE) for XGBoost: 23.129135395296082
R-squared (R²) for XGBoost: 0.9162365198135376
