# 1. Loading & Understanding the data

In [None]:
import pandas as pd
import numpy as np

# Load the training set. The path is relative, so 'train.csv' has to sit in
# the notebook's working directory.
df = pd.read_csv('train.csv')
# Preview the first rows to get a feel for the available columns.
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Summary statistics of the raw frame.
df.describe()

# 2. Cleaning the data

In [None]:
# Work on a copy so the raw frame `df` stays untouched by the cleaning steps.
data = df.copy()

In [None]:
# Number of missing values per column.
data.isna().sum()

In [None]:
# Share of missing entries per column, expressed in percent.
null_counts = data.isna().sum()
missing_percentage = (null_counts / data.shape[0]) * 100
missing_percentage

In [None]:
# Drop the rows that contain missing values. This mutates `data` in place:
# to undo it, re-run the cell that creates `data` from `df`.
data.dropna(inplace=True)

In [None]:
# Re-check the missing-value counts after the drop.
data.isna().sum()

In [None]:
# Parse both timestamp columns into proper datetime values.
for timestamp_col in ('pickup_datetime', 'dropoff_datetime'):
    data[timestamp_col] = pd.to_datetime(data[timestamp_col])

In [None]:
# Inspect the dtypes of the two timestamp columns after parsing.
data[['pickup_datetime', 'dropoff_datetime']].info()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# Show trips with a zero or negative duration (display only — nothing is
# removed from `data` here).
data[data['trip_duration'] <= 0]

In [None]:
# Distinct passenger counts present in the data.
data['passenger_count'].unique()


In [None]:
# Express the duration in whole minutes (trip_duration / 60).
# NOTE(review): astype(int) truncates the fractional part, so every trip
# shorter than 60 becomes 0 minutes — keep this in mind wherever this column
# is used as a divisor.
data['trip_duration_minutes'] = (data['trip_duration'] / 60).astype(int)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Boxplot of the trip durations (in minutes) to expose extreme values.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x=data['trip_duration_minutes'], ax=ax)
ax.set_xlabel('Trip Duration (minutes)')
ax.set_title('Boxplot of Trip Duration')
plt.show()


In [None]:
# Upper-tail quantiles (95%, 99%, 99.9%) of the duration in minutes.
data['trip_duration_minutes'].quantile([0.95, 0.99, 0.999])

95% of trips are under 35 minutes → Most trips are short.  
99% of trips are under 55 minutes → Almost all normal trips fit here.  
99.9% of trips are under 1386 minutes (~23 hours!) → A few extreme outliers exist.

In [None]:
# Keep only trips shorter than the 99th-percentile duration (55 minutes).
# The explicit .copy() turns the filtered result into an independent frame,
# so the column assignments in later cells (pickup_hour, trip_distance_km,
# avg_speed_kmh, ...) do not raise SettingWithCopyWarning or risk writing
# into a temporary slice.
MAX_TRIP_MINUTES = 55
data = data.loc[data['trip_duration_minutes'] < MAX_TRIP_MINUTES].copy()

In [None]:
# (rows, columns) remaining after the duration filter.
data.shape

In [None]:
# Same boxplot as before, now on the filtered durations.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x=data['trip_duration_minutes'], ax=ax)
ax.set_title("Trip Duration After Outlier Removal")
plt.show()

# 3. Feature Engineering

In [None]:
# Calendar features derived from the pickup timestamp.
pickup_dt = data['pickup_datetime'].dt
data['pickup_hour'] = pickup_dt.hour
data['pickup_day'] = pickup_dt.dayofweek
data['pickup_day_of_year'] = pickup_dt.dayofyear

In [None]:
# Create Distance Feature
from geopy.distance import geodesic

def haversine_distance(row):
    """Return the pickup-to-dropoff distance of one trip in kilometres.

    Despite its name, this helper delegates to geopy's ``geodesic`` instead
    of evaluating the haversine formula itself.

    Parameters
    ----------
    row : mapping-like
        Must provide 'pickup_latitude', 'pickup_longitude',
        'dropoff_latitude' and 'dropoff_longitude'.

    Returns
    -------
    The ``.km`` value of the geodesic between the two points.
    """
    # Build the (latitude, longitude) pair for each end of the trip.
    start, end = (
        (row[f'{stage}_latitude'], row[f'{stage}_longitude'])
        for stage in ('pickup', 'dropoff')
    )
    return geodesic(start, end).km  # Distance in km

# Compute the distance for every trip. apply(..., axis=1) calls the helper
# once per row in Python, so this cell can take a long time on a large frame.
data['trip_distance_km'] = data.apply(haversine_distance, axis=1)


In [None]:
# Average speed in km/h, computed from the exact duration in seconds.
# Dividing by the truncated `trip_duration_minutes` caused a division by zero
# (inf / NaN speeds) for every trip shorter than one minute, because those
# trips are stored as 0 minutes. Non-positive durations are masked to NaN so
# they can never produce an infinite speed; NaN rows are then excluded by the
# `< threshold` filter applied afterwards.
duration_hours = data['trip_duration'].where(data['trip_duration'] > 0) / 3600
data['avg_speed_kmh'] = data['trip_distance_km'] / duration_hours

In [None]:
# Boxplot of the derived average speed to spot implausible values.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x=data['avg_speed_kmh'], ax=ax)
ax.set_title("Average speed (kmh)")
plt.show()

In [None]:
# 95th percentile of the average speed; used as the upper cut-off for the
# speed filter in the next cell.
threshold = data['avg_speed_kmh'].quantile(0.95)
threshold

In [None]:
# Keep only the trips whose average speed is strictly below the cut-off.
is_below_cutoff = data['avg_speed_kmh'].lt(threshold)
data = data[is_below_cutoff]

In [None]:
# Columns available after cleaning and feature engineering.
data.columns

# 4. Data Visualization & Insights

In [None]:
# Histogram (with KDE overlay) of the trip duration in minutes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data['trip_duration_minutes'], bins=50, kde=True, color='royalblue', ax=ax)

ax.set_xlabel("Trip Duration (minutes)", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
ax.set_title("Distribution of Trip Duration", fontsize=14)

# Limit the x-axis to the 99th percentile so the long tail does not squash the plot.
upper_limit = data['trip_duration_minutes'].quantile(0.99)
ax.set_xlim(0, upper_limit)
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()

In [None]:
# Trip duration as a function of the hour of day at pickup.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    data=data,
    x='pickup_hour',
    y='trip_duration_minutes',
    marker="o",
    color="b",
    ax=ax,
)
ax.set_xlabel("Pickup Hour", fontsize=12)
ax.set_ylabel("Trip Duration (minutes)", fontsize=12)
ax.set_title("Trip Duration vs. Pickup Hour", fontsize=14)
ax.grid(True, linestyle="--", alpha=0.5)
plt.show()

In [None]:
# Pairwise correlations between all numeric columns.
numeric_data = data.select_dtypes(include='number')
corr_matrix = numeric_data.corr()

# Render the correlation matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title("Correlation Heatmap of Numerical Features", fontsize=14)
plt.show()

# 5. Pre-Modeling

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Features (X): everything except the identifier, the raw timestamps and the
# two encodings of the target. Target (y): the duration in minutes.
# NOTE(review): `avg_speed_kmh` stays in X although it is derived from the
# trip duration, i.e. from the target itself — this looks like target
# leakage; confirm before trusting the scores produced further down.
X = data.drop(
    columns=['trip_duration', 'trip_duration_minutes', 'id', 'pickup_datetime', 'dropoff_datetime']
)
y = data['trip_duration_minutes']

# Hold out 20% of the rows as the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Cast vendor_id to string in both splits (optional, to prevent dtype issues).
for split in (X_train, X_test):
    split['vendor_id'] = split['vendor_id'].astype(str)

# Integer-encode each categorical column with its own encoder: learn the
# mapping on the training split only, then apply it to both splits.
le_vendor = LabelEncoder().fit(X_train['vendor_id'])
X_train['vendor_id'] = le_vendor.transform(X_train['vendor_id'])
X_test['vendor_id'] = le_vendor.transform(X_test['vendor_id'])

le_store = LabelEncoder().fit(X_train['store_and_fwd_flag'])
X_train['store_and_fwd_flag'] = le_store.transform(X_train['store_and_fwd_flag'])
X_test['store_and_fwd_flag'] = le_store.transform(X_test['store_and_fwd_flag'])

# Standardise the features with statistics from the training split
# (the target y is left unscaled).
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Sanity check on both splits: per-column counts of infinities and NaNs.
for split in (X_train, X_test):
    print(split.isin([np.inf, -np.inf]).sum())  # Count infinities
    print(split.isna().sum())  # Count NaNs


In [None]:
# Overall mean and standard deviation across all scaled training values.
print("Mean:", X_train_scaled.mean())
print("Std Dev:", X_train_scaled.std())

# 6. Model Training & Evaluation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Candidate regressors, keyed by the label used in the printed report.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
}

# 5-fold cross-validated R² on the scaled training split, one model at a time.
for label, estimator in models.items():
    fold_scores = cross_val_score(estimator, X_train_scaled, y_train, scoring='r2', cv=5)
    print(f"{label} Cross-Validation R² Scores: {fold_scores}")
    print(f"Mean R² Score: {fold_scores.mean():.4f}")

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over forest size and tree depth, scored by 5-fold R².
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20],
}
grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_rf,
    scoring='r2',
    cv=5,
    n_jobs=-1,  # use every available core
)
grid_search_rf.fit(X_train_scaled, y_train)
print("Best Random Forest Parameters:", grid_search_rf.best_params_)

In [None]:
# Plot n_estimators vs R² score
# Convert GridSearchCV results to a DataFrame
cv_results = pd.DataFrame(grid_search_rf.cv_results_)

# Convert hyperparameters to integer type for plotting
cv_results["param_n_estimators"] = cv_results["param_n_estimators"].astype(int)
cv_results["param_max_depth"] = cv_results["param_max_depth"].astype(int)

# String label so seaborn treats the depth as a category (one colour and one
# legend entry per value) rather than a continuous scale.
cv_results["max_depth"] = cv_results["param_max_depth"].astype(str)

# Plot n_estimators vs Mean Test R² Score, one line per max_depth.
# Without the hue split, every n_estimators value has several scores (one
# per depth) and lineplot collapses them into a mean with an error band,
# which hides the effect of n_estimators behind the effect of max_depth.
plt.figure(figsize=(8, 5))
sns.lineplot(
    data=cv_results,
    x="param_n_estimators",
    y="mean_test_score",
    hue="max_depth",
    marker="o",
)
plt.title("Effect of n_estimators on Performance")
plt.xlabel("Number of Estimators")
plt.ylabel("Mean Test R² Score")
plt.grid()
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the tuned forest (refit by the grid search) on the held-out split.
best_model = grid_search_rf.best_estimator_
y_pred = best_model.predict(X_test_scaled)

test_r2 = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
print("Test Set R² Score:", test_r2)
print("Test Set MSE:", test_mse)

In [None]:
# Pair every feature name with the importance reported by the tuned forest,
# then rank from most to least important.
importance_table = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': best_model.feature_importances_,
})
feature_importance = importance_table.sort_values(by="Importance", ascending=False)

print(feature_importance)

In [None]:
# Keep only the top N features, read from the importance ranking instead of
# a hard-coded list so the selection always matches the fitted model.
N_TOP_FEATURES = 2
top_features = feature_importance['Feature'].head(N_TOP_FEATURES).tolist()

# Convert scaled arrays back to DataFrames. Reusing the original row index
# keeps the frames aligned with y_train / y_test for any later pandas
# operation (scikit-learn itself only uses row positions).
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

# Create new datasets with only these features
X_train_selected = X_train_scaled_df[top_features]
X_test_selected = X_test_scaled_df[top_features]

In [None]:
from sklearn.base import clone

# Train an unfitted copy of the tuned estimator (same hyperparameters) on the
# reduced feature set. Calling best_model.fit(...) directly would overwrite
# grid_search_rf.best_estimator_ in place, so re-running the earlier
# test-set evaluation or feature-importance cells would then fail on the
# mismatching number of features.
reduced_model = clone(best_model)
reduced_model.fit(X_train_selected, y_train)

# Evaluate performance
y_pred = reduced_model.predict(X_test_selected)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.4f}, R² Score: {r2:.4f}")

## Conclusion

In this project, I explored the NYC Taxi Trip Duration dataset to build a predictive model for trip duration. Below are the key steps taken:

1. **Data Preprocessing:**  
   - Loaded and examined the dataset.  
   - Handled missing values and outliers.  
   - Created new features such as `trip_distance_km` and `avg_speed_kmh`.  
   - Scaled numerical features for better model performance.  

2. **Feature Selection:**  
   - Performed feature importance analysis using Random Forest.  
   - Identified `trip_distance_km` and `avg_speed_kmh` as the most significant features.  
   - Reduced the dataset to only the most relevant features.  

3. **Model Training & Evaluation:**  
   - Compared multiple models (Linear Regression, Decision Tree, and Random Forest).  
   - Used **cross-validation** to evaluate model performance.  
   - Fine-tuned the Random Forest model using **GridSearchCV** to optimize hyperparameters.  

4. **Results & Insights:**  
   - Random Forest outperformed other models with better R² scores.  
   - Visualized hyperparameter tuning results to understand model behavior.  
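   - The final model effectively predicts trip duration based on key trip features. Note, however, that `avg_speed_kmh` is computed from the trip duration itself, so it leaks the target into the model: the reported scores are optimistic, and this feature would not be available when predicting the duration of a new trip.  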

This project demonstrated the end-to-end machine learning workflow, from **data exploration** to **model optimization**, providing valuable insights into taxi trip durations. 🚖📊  

---
