In [2]:
import os
from datetime import date

import pandas as pd

# --- Load the raw car listings into df_car ---
# Known locations are tried in order. The relative path assumes the notebook's
# working directory sits next to the 'data/' folder (e.g. inside 'notebooks/').
# NOTE(review): the second entry is a machine-specific absolute path kept only
# as a last-resort fallback; remove it once every environment uses the project layout.
csv_candidates = [
    "../data/car.csv",
    "C:\\Users\\LENOVO\\Documents\\My Projects\\ShadowFox\\Car_Price_Prediction\\data\\car.csv",
]

for file_path in csv_candidates:
    try:
        df_car = pd.read_csv(file_path)
        break
    except FileNotFoundError:
        continue  # this location does not exist; try the next one
else:
    # No candidate could be opened: report every path that was tried instead of
    # surfacing only the last (machine-specific) one.
    raise FileNotFoundError("car.csv not found. Tried: " + "; ".join(csv_candidates))

print("DataFrame 'df_car' loaded successfully.")

# 1. Feature Engineering: derive 'Years_of_Service' from the 'Year' column.
# The reference year is pinned (not read from the system clock) so that
# re-running the notebook later reproduces the same feature values.
current_year = 2025
df_car['Years_of_Service'] = current_year - df_car['Year']

# 2. Drop columns that are not fed to the model: 'Year' is superseded by
# 'Years_of_Service', and 'Car_Name' is treated as irrelevant here.
# Reassignment is used instead of inplace=True so the step is an explicit rebinding.
df_car = df_car.drop(columns=['Car_Name', 'Year'])

print("Feature Engineering Complete.")
print("Columns after cleanup:", df_car.columns.tolist())

DataFrame 'df_car' loaded successfully.
Feature Engineering Complete.
Columns after cleanup: ['Selling_Price', 'Present_Price', 'Kms_Driven', 'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner', 'Years_of_Service']


In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

# Create the output folder up front: savefig raises FileNotFoundError when the
# target directory is missing (e.g. on a fresh checkout).
# The path assumes the notebook runs from a sibling directory of 'visuals/'.
os.makedirs('../visuals', exist_ok=True)

# --- Plot 1: Selling Price vs. Age of Car ---
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x='Years_of_Service', y='Selling_Price', data=df_car, ax=ax)
ax.set_title('Selling Price vs. Age of Car (Years of Service)')
fig.savefig('../visuals/years_vs_price_scatter.png')
plt.close(fig)  # the figure is only needed on disk; release it

# --- Plot 2: Selling Price vs. Showroom Price (Present_Price) ---
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x='Present_Price', y='Selling_Price', data=df_car, ax=ax)
ax.set_title('Selling Price vs. Showroom Price (Present_Price)')
fig.savefig('../visuals/present_vs_price_scatter.png')
plt.close(fig)

print("Generated 'years_vs_price_scatter.png' and 'present_vs_price_scatter.png' in the visuals folder.")

Generated 'years_vs_price_scatter.png' and 'present_vs_price_scatter.png' in the visuals folder.


In [4]:
from sklearn.model_selection import train_test_split

# 1. Split the frame into the prediction target (y) and the model inputs (X)
y = df_car['Selling_Price']
X = df_car.drop(columns='Selling_Price')

# 2. One-hot encode the categorical inputs. drop_first=True omits one level
#    per category so the dummy columns are not perfectly collinear.
X = pd.get_dummies(X, drop_first=True)

# 3. Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Data Preprocessing (Encoding and Splitting) Complete.")
print(f"Features after Encoding: {list(X.columns)}")
print(f"Training Samples: {X_train.shape[0]}")
print(f"Testing Samples: {X_test.shape[0]}")

Data Preprocessing (Encoding and Splitting) Complete.
Features after Encoding: ['Present_Price', 'Kms_Driven', 'Owner', 'Years_of_Service', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol', 'Seller_Type_Individual', 'Transmission_Manual']
Training Samples: 240
Testing Samples: 61


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# --- 4.1 Define Hyperparameter Search Space ---
# Candidate values that the randomized search samples from.
n_estimators = np.linspace(100, 1200, num=12).astype(int).tolist()  # 100, 200, ..., 1200 trees
max_features = [1.0, 'sqrt']  # 1.0 = use every feature at each split
max_depth = np.linspace(5, 30, num=6).astype(int).tolist()  # 5, 10, ..., 30
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

# --- 4.2 Initialize and Tune Model ---
print("Starting Hyperparameter Tuning with Randomized Search (This may take a moment)...")

rf = RandomForestRegressor(random_state=42)

# Randomized search: 10 sampled parameter combinations, each scored with
# 5-fold cross-validation, run on all available processors (n_jobs=-1).
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=10,
    cv=5,
    verbose=0,
    random_state=42,
    n_jobs=-1,
)

# Run the search on the training split only
rf_random.fit(X_train, y_train)

# Keep the estimator exposed by the search as its best configuration
best_rf_model = rf_random.best_estimator_

print("\nHyperparameter Tuning Complete.")
print("Best Parameters Found:", rf_random.best_params_)


# --- 4.3 Final Evaluation and Metrics ---
# Score the selected model on the held-out test split
y_pred_tuned = best_rf_model.predict(X_test)

r2_tuned = r2_score(y_test, y_pred_tuned)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
rmse_tuned = np.sqrt(mse_tuned)  # RMSE is in the same units as Selling_Price

print("\n--- Final Model Evaluation ---")
print("Model: Random Forest Regressor (Hyperparameter Tuned)")
print(f"R-squared (R2) Score: {r2_tuned:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_tuned:.4f}")

Starting Hyperparameter Tuning with Randomized Search (This may take a moment)...

Hyperparameter Tuning Complete.
Best Parameters Found: {'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 25}

--- Final Model Evaluation ---
Model: Random Forest Regressor (Hyperparameter Tuned)
R-squared (R2) Score: 0.9423
Root Mean Squared Error (RMSE): 1.1526


In [6]:
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): this cell writes the same two EDA figures, to the same files,
# as the earlier plotting cell (In [3]); consider keeping only one of them.

# Create the output folder up front: savefig raises FileNotFoundError when the
# target directory is missing (e.g. on a fresh checkout).
os.makedirs('../visuals', exist_ok=True)

# (x-axis column, figure title, output file) for each scatter against Selling_Price
eda_scatter_specs = [
    (
        'Years_of_Service',
        'Selling Price vs. Age of Car (Years of Service)',
        '../visuals/years_vs_price_scatter.png',
    ),
    (
        'Present_Price',
        'Selling Price vs. Showroom Price (Present_Price)',
        '../visuals/present_vs_price_scatter.png',
    ),
]

for x_col, plot_title, out_path in eda_scatter_specs:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(x=x_col, y='Selling_Price', data=df_car, ax=ax)
    ax.set_title(plot_title)
    fig.savefig(out_path)
    plt.close(fig)  # the figure is only needed on disk; release it

print("Generated EDA plots.")

Generated EDA plots.


In [7]:
import matplotlib.pyplot as plt
import seaborn as sns

# Requires y_test (actual prices) and y_pred_tuned (predicted prices) from the
# model-evaluation cell (Step 4) to be present in the kernel.

# Create the output folder up front: savefig raises FileNotFoundError when the
# target directory is missing (e.g. on a fresh checkout).
os.makedirs('../visuals', exist_ok=True)

# Plot 3: Actual vs. Predicted Prices (Model Performance Check)
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x=y_test, y=y_pred_tuned, ax=ax)

# Ideal-prediction reference: the y = x diagonal spanning the combined range of
# actual and predicted values, so every point fits under/over the line.
min_val = min(y_test.min(), y_pred_tuned.min())
max_val = max(y_test.max(), y_pred_tuned.max())
ax.plot([min_val, max_val], [min_val, max_val], color='red', linestyle='--', label='Ideal Line')

ax.set_title('Actual vs. Predicted Car Selling Prices (Tuned RF)')
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.legend()

# Save the third, final plot and release the figure
fig.savefig('../visuals/actual_vs_predicted_prices_tuned.png')
plt.close(fig)

print("Generated final model performance plot.")

Generated final model performance plot.
