In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error



In [None]:
# Read the trip records from the CSV file into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="./datasets/uber_9_10.csv")
# Leave the first five rows as the cell's displayed output.
df.head(n=5)


In [None]:
# Structural overview. DataFrame.info() writes its report directly to stdout
# and returns None, so it is called bare rather than wrapped in print(),
# which would append a stray "None" line after the report.
df.info()

# Summary statistics (numeric columns by default).
print(df.describe())

# Number of missing values in each column.
print(df.isnull().sum())


In [None]:
# Drop rows that have a missing value in any column.
df = df.dropna()

# Coarse coordinate sanity check: keep trips whose pickup and dropoff both lie
# strictly inside longitude (-80, -70) and latitude (35, 45). This is a wide
# box around the NYC region, meant to discard clearly bogus coordinates
# rather than to restrict trips to the city itself.
in_bounds = (
    (df['pickup_longitude'] > -80) & (df['pickup_longitude'] < -70)
    & (df['dropoff_longitude'] > -80) & (df['dropoff_longitude'] < -70)
    & (df['pickup_latitude'] > 35) & (df['pickup_latitude'] < 45)
    & (df['dropoff_latitude'] > 35) & (df['dropoff_latitude'] < 45)
)

# Keep fares strictly between 0 and 100: non-positive fares are invalid and
# fares of 100 or more are treated as outliers.
valid_fare = (df['fare_amount'] > 0) & (df['fare_amount'] < 100)

# .copy() turns the filtered result into an independent frame, so columns can
# be added to it later without pandas raising SettingWithCopyWarning.
df = df[in_bounds & valid_fare].copy()
df.head()


In [None]:
# Parse the pickup timestamp; values that cannot be parsed become NaT
# because of errors='coerce'.
pickup_ts = pd.to_datetime(df['pickup_datetime'], errors='coerce')

# Rows whose timestamp failed to parse are dropped before the calendar
# features are derived: otherwise hour/day/month/year would be NaN for those
# rows and the NaNs would flow into the regression features.
# assign() returns a new frame, so nothing is written into a filtered slice.
df = (
    df.assign(pickup_datetime=pickup_ts)
      .dropna(subset=['pickup_datetime'])
      .assign(
          hour=lambda frame: frame['pickup_datetime'].dt.hour,
          day=lambda frame: frame['pickup_datetime'].dt.day,
          month=lambda frame: frame['pickup_datetime'].dt.month,
          year=lambda frame: frame['pickup_datetime'].dt.year,
      )
)

# Haversine distance between pickup and dropoff (approximate)
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points, via the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float or array-like
        Distance in km on a sphere of radius 6371 km. Inputs are combined
        element-wise, so arrays / Series give one distance per element.
    """
    R = 6371  # radius of Earth in km
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)
    a = np.sin(dphi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2) ** 2
    # Mathematically a lies in [0, 1], but floating-point round-off can push it
    # marginally above 1 for (near-)antipodal points, which would make
    # sqrt(1 - a) NaN. Clamp it back into range before taking roots.
    a = np.clip(a, 0.0, 1.0)
    return 2 * R * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

# Trip length in km, computed column-wise from the pickup and dropoff
# coordinates.
df['distance_km'] = haversine(
    lat1=df['pickup_latitude'],
    lon1=df['pickup_longitude'],
    lat2=df['dropoff_latitude'],
    lon2=df['dropoff_longitude'],
)

# Keep only rows with a strictly positive distance (zero-length trips and
# rows whose distance is NaN fail the comparison and are removed).
df = df.loc[df['distance_km'].gt(0)]
df.head()

In [None]:
# Histogram of the fare column with a KDE overlay.
fig, ax = plt.subplots(figsize=(5, 4))
sns.histplot(df['fare_amount'], kde=True, ax=ax)
ax.set_title("Distribution of Fare Amounts")
plt.show()

# Fare plotted against trip distance.
fig, ax = plt.subplots(figsize=(5, 4))
sns.scatterplot(data=df, x='distance_km', y='fare_amount', ax=ax)
ax.set_title("Fare vs Distance")
plt.show()

# Fare distribution for each passenger count.
fig, ax = plt.subplots(figsize=(5, 4))
sns.boxplot(data=df, x='passenger_count', y='fare_amount', ax=ax)
ax.set_title("Passenger Count vs Fare")
plt.show()

In [None]:
# Predictor columns (coordinates, passenger count, distance, calendar fields)
# and the regression target.
feature_columns = [
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
    'passenger_count', 'distance_km', 'hour', 'day', 'month', 'year',
]
X = df[feature_columns]
y = df['fare_amount']

# Seeded 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear regression on the full scaled feature set.
lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)

mse_full = mean_squared_error(y_test, y_pred)
print("R² without PCA:", r2_score(y_test, y_pred))
print("RMSE without PCA:", np.sqrt(mse_full))


In [None]:
# Project the scaled features onto 5 principal components. The projection is
# fitted on the training split only and then applied to the test split.
# random_state pins the result for the case where scikit-learn picks its
# randomized SVD solver, which is otherwise non-deterministic between runs.
pca = PCA(n_components=5, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Cumulative explained variance, indexed by how many components are kept.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Start the x-axis at 1: the first point is the variance kept by ONE component.
# Plotting against the default 0-based index would shift every point left by one.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(6,4))
plt.plot(component_counts, cumulative_variance, marker='o')
plt.xticks(component_counts)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Variance Explained by PCA Components')
plt.grid(True)
plt.show()


print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total variance preserved:", sum(pca.explained_variance_ratio_))


In [None]:
# Fit a linear regression on the PCA-reduced training features and predict
# fares for the PCA-reduced test features.
lr_pca = LinearRegression().fit(X_train_pca, y_train)
y_pred_pca = lr_pca.predict(X_test_pca)

mse_pca = mean_squared_error(y_test, y_pred_pca)
print("R² with PCA:", r2_score(y_test, y_pred_pca))
print("RMSE with PCA:", np.sqrt(mse_pca))


In [None]:
# Test-set predictions of the two models, keyed by the label shown in the
# table and on the chart.
predictions = {"Without PCA": y_pred, "With PCA": y_pred_pca}

# One row per model: R² and RMSE against the held-out targets.
results = pd.DataFrame(
    [
        {
            "Model": label,
            "R2": r2_score(y_test, preds),
            "RMSE": np.sqrt(mean_squared_error(y_test, preds)),
        }
        for label, preds in predictions.items()
    ]
)

print(results)

# Bar chart comparing the R² of the two models.
fig, ax = plt.subplots()
ax.bar(results["Model"], results["R2"], color=['skyblue','orange'])
ax.set_ylabel("R² Score")
ax.set_title("Model Comparison")
plt.show()