In [5]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read the ride records from disk and keep only rows with no missing values
DATA_PATH = "./uber.csv"  # Replace with your dataset path
df = pd.read_csv(DATA_PATH).dropna()


In [6]:
# !pip install geopy


In [7]:

# Calculate distance using the Haversine formula
def haversine(lon1, lat1, lon2, lat2):
    """Return the great-circle distance in kilometers between two points.

    Coordinates are given in decimal degrees. The body uses only NumPy
    element-wise functions, so each argument may be a scalar or an
    array-like (for example a whole pandas column).

    Parameters
    ----------
    lon1, lat1 : float or array-like
        Longitude and latitude of the first point, in degrees.
    lon2, lat2 : float or array-like
        Longitude and latitude of the second point, in degrees.

    Returns
    -------
    float or array-like
        Distance along the surface of a sphere of radius 6371 km.
    """
    R = 6371  # Radius of Earth in kilometers
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make sqrt(1 - a) return NaN.
    # Clamp it to the mathematically valid range before taking the roots.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

# Compute the trip distance (km) for every ride in one vectorized call.
# haversine is built entirely from NumPy element-wise functions, so it
# accepts whole columns and avoids the per-row Python overhead of
# DataFrame.apply(axis=1).
# NOTE(review): the coordinates are not range-checked before this step;
# confirm whether out-of-range latitudes/longitudes need filtering first.
df['distance'] = haversine(df['pickup_longitude'], df['pickup_latitude'],
                           df['dropoff_longitude'], df['dropoff_latitude'])


In [4]:

# Parse the pickup timestamp once, then derive the calendar features from it
pickup_ts = pd.to_datetime(df['pickup_datetime'])
df['pickup_datetime'] = pickup_ts
df['pickup_hour'] = pickup_ts.dt.hour
df['pickup_day'] = pickup_ts.dt.day
df['pickup_weekday'] = pickup_ts.dt.dayofweek  # alias of .dt.weekday

# Remove the identifier, the raw timestamp and the raw coordinates now that
# the derived columns exist
columns_to_discard = [
    'key',
    'pickup_datetime',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]
df.drop(columns_to_discard, axis=1, inplace=True)

# Model inputs (X) and the value to predict (y)
feature_columns = ['distance', 'pickup_hour', 'pickup_day', 'pickup_weekday', 'passenger_count']
X = df[feature_columns]
y = df['fare_amount']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline model: linear regression fitted on the training split
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)

# Random forest regression. n_jobs=-1 lets scikit-learn build the trees on
# all available CPU cores instead of one; random_state is unchanged so the
# run stays seeded.
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

def _regression_metrics(y_true, y_pred):
    """Return ``(r2, rmse)`` for a set of predictions against the true values."""
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return r2, rmse


# Score the linear baseline on the held-out split
linear_r2, linear_rmse = _regression_metrics(y_test, y_pred_linear)
print(f"Linear Regression -> R2 Score: {linear_r2:.3f}, RMSE: {linear_rmse:.3f}")

# Score the random forest on the same held-out split
rf_r2, rf_rmse = _regression_metrics(y_test, y_pred_rf)
print(f"Random Forest Regression -> R2 Score: {rf_r2:.3f}, RMSE: {rf_rmse:.3f}")


Linear Regression -> R2 Score: 0.001, RMSE: 10.194
Random Forest Regression -> R2 Score: 0.646, RMSE: 6.072
