In [9]:
# STEP 1: Import all required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [10]:
# STEP 2: Load the dataset
# NOTE(review): this is an absolute, machine-specific path; the notebook will only
# run where this exact file exists. Consider a path relative to a data directory.
DATA_PATH = r"C:\Users\ADMIN\Documents\cardekho_dataset.csv"
data = pd.read_csv(DATA_PATH)

# STEP 3: Quick look at the first rows
data.head(5)

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [11]:
# STEP 4: Encode the two categorical columns used as model inputs as integer codes.
# Each encoder is fitted on its own column and kept in a variable so it can be reused.
fuel_encoder = LabelEncoder().fit(data['fuel_type'])
transmission_encoder = LabelEncoder().fit(data['transmission_type'])

# The encoded values overwrite the original columns of `data`. Re-running this cell
# after that point re-fits both encoders on the integer codes instead of the
# original labels, so run it once per fresh load of `data`.
data['fuel_type'] = fuel_encoder.transform(data['fuel_type'])
data['transmission_type'] = transmission_encoder.transform(data['transmission_type'])

# Show the first encoded rows
print(data[['fuel_type', 'transmission_type']].head())


   fuel_type  transmission_type
0          4                  1
1          4                  1
2          4                  1
3          4                  1
4          1                  1


In [14]:
# Columns fed to the model. Despite the name, the last two entries are the
# integer-encoded categorical columns (fuel_type, transmission_type).
# NOTE(review): vehicle_age appears in the dataset preview but is not listed here -
# confirm the omission is intentional.
numerical_cols = [
    'km_driven',
    'mileage',
    'engine',
    'max_power',
    'seats',
    'fuel_type',
    'transmission_type',
]
target = 'selling_price'

# Feature matrix (X) and target vector (y)
X = data.loc[:, numerical_cols]
y = data[target]



In [15]:
# STEP 5: Handle missing values (if any) without leaking test information.
# 5a) Rows with a missing target are dropped rather than imputed: filling the target
#     with its median would invent labels for both training and evaluation.
has_target = y.notna()
X = X.loc[has_target]
y = y.loc[has_target]

# STEP 6: Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5b) Missing feature values are filled with medians (robust to outliers) computed
#     on the training rows only, then applied to both splits. Computing the medians
#     before the split would let test rows influence the training data.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Keep the full feature frame free of NaNs as well, using the same training medians,
# for any later cell that reads X directly.
X = X.fillna(train_medians)

In [16]:
# STEP 7: Feature scaling (not required by Random Forest, kept as general practice).
# The scaler learns its statistics from the training split only and the same
# fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# HYPERPARAMETER TUNING WITH GRIDSEARCHCV

In [17]:
# STEP 8: Define the parameter grid for Random Forest
# Grid size: 3 * 4 * 3 * 3 * 2 * 2 = 432 candidate settings.
param_grid = {
    'n_estimators': [100, 200, 300],           # Number of trees
    'max_depth': [10, 20, 30, None],           # Maximum depth of tree (None = unlimited)
    'min_samples_split': [2, 5, 10],           # Min samples required to split
    'min_samples_leaf': [1, 2, 4],             # Min samples at leaf node
    # Number of features to consider at each split.
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it makes
    # every fit using it fail. For regressors it meant "all features", which is
    # written as 1.0 and is accepted by old and new versions alike.
    'max_features': [1.0, 'sqrt'],
    'bootstrap': [True, False]                 # Whether to bootstrap samples
}

In [18]:
# Explanation: Total combinations = 3 × 4 × 3 × 3 × 2 × 2 = 432 combinations (× 5 CV folds = 2,160 fits)!

# STEP 9: Initialize Random Forest Regressor
# random_state fixes the forest's randomness so results are repeatable.
# n_jobs is deliberately left at its default here: the search below already runs
# candidate fits on all cores, and asking every one of those fits to use all cores
# as well can oversubscribe the CPU. If parallel prediction is wanted afterwards,
# call grid_search.best_estimator_.set_params(n_jobs=-1) once the search is done.
rf = RandomForestRegressor(random_state=42)

# STEP 10: Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=rf,                    # Model to tune
    param_grid=param_grid,           # All combinations to try
    cv=5,                            # 5-fold cross-validation
    scoring='r2',                    # Use R² score (higher is better)
    verbose=1,                       # Show progress
    n_jobs=-1                        # Use all CPU cores (single level of parallelism)
)

In [21]:
# STEP 11: Run the grid search on the scaled training split.
# This is the expensive cell of the notebook: every candidate in param_grid is
# cross-validated before the result is available.
print("Starting GridSearchCV... (this may take several minutes)")
grid_search.fit(X=X_train_scaled, y=y_train)



Starting GridSearchCV... (this may take several minutes)
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


In [20]:

# STEP 12: Report the winning parameter combination and its mean CV score.
# Requires grid_search to have been fitted (STEP 11) in the current kernel session.
best_params = grid_search.best_params_
best_cv_r2 = round(grid_search.best_score_, 4)

print("\nGRIDSEARCHCV RESULTS:")
print("Best Parameters:", best_params)
print("Best Cross-Validation R² Score:", best_cv_r2)


GRIDSEARCHCV RESULTS:
Best Parameters: {'bootstrap': False, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}
Best Cross-Validation R² Score: 0.8193
