In [None]:
# --------------------------------------------------------------
# UBER FARE PREDICTION (Complete Mini Project)
# --------------------------------------------------------------
# Problem Statement:
# Predict the price of an Uber ride using ML techniques.
#
# Tasks Covered:
# 1. Pre-process the dataset
# 2. Identify and remove outliers
# 3. Check correlation between features
# 4. Apply Linear Regression & Random Forest Regression
# 5. Evaluate using R2, RMSE and compare models
# --------------------------------------------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# -----------------------------
# 1. LOAD & PRE-PROCESS DATA
# -----------------------------

'''The dataset is first loaded and the pickup_datetime column is converted into a proper datetime format.
We then extract useful time-based features (month, day, hour, weekday), which help in understanding how time affects the fare.
Using the Haversine formula, we calculate the straight-line (great-circle) distance of each trip from the pickup to the drop-off location.
Finally, rows with missing values are removed to ensure clean and reliable data for model training.'''

# Read the raw ride records (the Kaggle CSV is expected next to this script)
df = pd.read_csv("./uber.csv")

# Parse the pickup timestamp; entries that cannot be parsed become NaT
pickup_ts = pd.to_datetime(df['pickup_datetime'], errors='coerce')
df['pickup_datetime'] = pickup_ts

# Derive one calendar feature per datetime component (feature engineering).
# dayofweek is encoded as Monday=0 ... Sunday=6.
for part in ('month', 'day', 'hour', 'dayofweek'):
    df['pickup_' + part] = getattr(pickup_ts.dt, part)


# Function to calculate distance between pickup & drop (Haversine Formula)
'''This block defines a function to calculate the geographical distance between pickup 
and drop-off points using the Haversine formula, which computes the shortest distance 
between two coordinates on a spherical surface (Earth).
Inside the function, map(np.radians, ...) converts latitude and longitude values from 
degrees to radians for trigonometric calculations. Variables dlat and dlon store the 
difference between coordinates, and the formula computes the value c, which is multiplied 
by Earth's radius (6371 km) to get the distance in kilometers.
A new feature column distance_km is then created by applying this function to the dataset,
and df.dropna() removes any rows containing missing values to ensure clean input data for the model.'''

def haversine(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Return the great-circle distance between two points on a sphere.

    The computation is element-wise, so every coordinate argument may be a
    scalar, a NumPy array or a pandas Series.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.
        radius_km: Radius of the sphere. Defaults to 6371 (Earth's radius in
            km), which makes the result a distance in kilometres.

    Returns:
        The distance(s) along the sphere's surface, in the unit of
        ``radius_km``. NaN inputs propagate to NaN outputs.
    """
    # Convert degrees to radians for the trigonometric functions
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Mathematically a lies in [0, 1], but floating-point rounding can push it
    # marginally outside that range for (near-)antipodal points, which would
    # make sqrt/arcsin return NaN. Clamp it back before taking the inverse.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c


# New feature: trip distance between the pickup and drop-off coordinates
pickup_lat, pickup_lon = df['pickup_latitude'], df['pickup_longitude']
dropoff_lat, dropoff_lon = df['dropoff_latitude'], df['dropoff_longitude']
df['distance_km'] = haversine(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

# Discard every row that has a missing value in any column
df = df.dropna()

print("After preprocessing shape:", df.shape)



# -----------------------------
# 2. OUTLIER DETECTION
# -----------------------------

'''1️⃣ Before Outlier Removal – Boxplot Explanation
The boxplot of the fare_amount column shows that there are several data points that lie far outside the normal range.
These extreme values are plotted as dots beyond the whiskers of the boxplot.
Such values are considered outliers, and they can negatively affect model performance because they mislead the learning algorithm.
'''

# Fare distribution before any filtering, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x=df['fare_amount'], ax=ax)
ax.set_title("Fare Amount - Before Outlier Removal")
plt.show()


'''We removed unrealistic fare values (0 or below, or 200 and above) and trips with an implausible distance (0 km, or 50 km and more).
This helps clean the data by eliminating invalid and extreme values so the model can learn more accurately.'''

# Keep only fares strictly between 0 and 200
valid_fare = df['fare_amount'].gt(0) & df['fare_amount'].lt(200)

# Keep only trips strictly between 0 km and 50 km
valid_distance = df['distance_km'].gt(0) & df['distance_km'].lt(50)

# Apply both conditions in a single row selection
df = df.loc[valid_fare & valid_distance]

print("After removing outliers shape:", df.shape)

'''The second boxplot shows the fare_amount column after removing the extreme values.
The number of dots (outliers) has significantly reduced, and the data is now more compact and closer to a normal range.
'''

# Fare distribution once the filters have been applied
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x=df['fare_amount'], color='orange', ax=ax)
ax.set_title("Fare Amount - After Outlier Removal")
plt.show()

# -----------------------------
# 3. CHECK CORRELATION
# -----------------------------
'''This code computes the correlation matrix for selected numerical features using the .corr() function, which measures the strength and direction of the linear relationship between variables.
By printing corr['fare_amount'], we specifically view how strongly each feature correlates with the target variable (fare_amount).
A heatmap is then plotted using plt.imshow(), where colors (from cmap='coolwarm') visually represent correlation values, and axis labels are set using .xticks() and .yticks() for readability.
This visualization helps identify which features have a strong positive or negative impact on fare prediction and supports feature selection for the model.'''

# Pairwise correlations between the target and the engineered features
corr_columns = ['fare_amount', 'distance_km', 'pickup_hour', 'pickup_dayofweek', 'pickup_month']
corr = df[corr_columns].corr()
print("\nCorrelation with Fare Amount:\n", corr['fare_amount'])

# Draw the matrix as a colour-coded grid with one labelled tick per column
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_title("Correlation Heatmap")
image = ax.imshow(corr, cmap='coolwarm', interpolation='nearest')
fig.colorbar(image, ax=ax)
tick_positions = range(len(corr.columns))
ax.set_xticks(tick_positions)
ax.set_xticklabels(corr.columns, rotation=45)
ax.set_yticks(tick_positions)
ax.set_yticklabels(corr.columns)
plt.show()

# -----------------------------
# 4. MODEL TRAINING
# -----------------------------
'''This section prepares the selected feature set (X) and target variable (y) for model 
training. The train_test_split() function divides the data into training (80%) and testing (20%) 
subsets to allow the model to learn and then be evaluated on unseen data.
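A ColumnTransformer with StandardScaler is applied to standardize the feature values for Linear Regression,
which puts the learned coefficients on a comparable scale (the predictions of an ordinary least-squares fit are not changed by this scaling).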
The Linear Regression model is created using a Pipeline, combining scaling and model training in a single 
workflow, while RandomForestRegressor is initialized with 200 decision trees to learn non-linear patterns.
Both models are trained using .fit(X_train, y_train), enabling them to learn relationships between input 
features and the target fare amount.'''

# Model inputs (engineered features) and the regression target
features = ['distance_km', 'pickup_hour', 'pickup_dayofweek', 'pickup_month']
X, y = df[features], df['fare_amount']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise every feature column ahead of the linear model
scaler = ColumnTransformer(
    transformers=[('scale', StandardScaler(), features)],
    remainder='passthrough',
)

# Linear Regression: scaling and estimator bundled in one pipeline, then fitted
lr_model = Pipeline(steps=[('scaler', scaler), ('model', LinearRegression())])
lr_model.fit(X_train, y_train)

# Random Forest: 200 trees, fitted directly on the unscaled features
rf_model = RandomForestRegressor(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

# -----------------------------
# 5. MODEL EVALUATION
# -----------------------------
'''This section evaluates the model performance by generating predictions for the test set using the .predict() method for both Linear Regression and Random Forest models.
The custom rmse() function calculates the Root Mean Squared Error (RMSE) by first computing Mean Squared Error (MSE) using mean_squared_error() and then taking its square root to measure average prediction error.
The r2_score() metric is used to compute the R² value, which indicates how well the model explains the variance of the target variable (higher R² = better fit).
Finally, the R² and RMSE scores for both models are printed, allowing a clear comparison of prediction accuracy and model performance.'''

# Score the held-out rows with each fitted model (linear first, then forest)
pred_lr, pred_rf = (model.predict(X_test) for model in (lr_model, rf_model))

# Evaluation Metrics
def rmse(y_true, y_pred):
    """Return the root mean squared error of the predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    # RMSE is derived by hand: square root of the mean squared error
    return mean_squared_error(y_true, y_pred) ** 0.5

print("\n--- MODEL PERFORMANCE ---")
# One report line per model: R2 first, then RMSE, both on the test split
for heading, predictions in (
    ("Linear Regression → R2:", pred_lr),
    ("Random Forest     → R2:", pred_rf),
):
    print(heading, r2_score(y_test, predictions), " | RMSE:", rmse(y_test, predictions))