In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Read the dataset from the CSV file.
data = pd.read_csv('CAR DETAILS FROM CAR DEKHO.csv')

# Derive 'car_age' as the number of years between 2024 and the model year,
# then discard 'year', which carries the same information.
data = data.assign(car_age=2024 - data['year']).drop(columns=['year'])

# One-hot encode the categorical columns. drop_first=True keeps k-1
# indicator columns for a feature with k distinct levels.
categorical_columns = ['name', 'fuel', 'seller_type', 'transmission', 'owner']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# The selling price is the regression target; every other column is a
# predictor.
target = data['selling_price']
features = data.drop(columns=['selling_price'])

# Hold out 20% of the rows for testing. The fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Seed handed to both regressors so repeated runs print the same metrics.
RANDOM_STATE = 42

# Numeric columns that are imputed and standardised. Every other column is
# forwarded unchanged (remainder='passthrough').
numeric_features = ['km_driven', 'car_age']


def _make_numeric_transformer():
    """Return a new, unfitted median-impute + standardise pipeline."""
    return Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])


def _make_preprocessor():
    """Return a new, unfitted ColumnTransformer for the feature matrix.

    A fresh instance is built on every call so that each model pipeline owns
    its own transformer; Pipeline.fit fits its steps in place, so a shared
    instance would be refitted whenever either pipeline is fitted.
    """
    return ColumnTransformer(
        transformers=[
            ('num', _make_numeric_transformer(), numeric_features)
        ], remainder='passthrough')


# Stand-alone instances kept under their original names for any code that
# refers to them directly; the model pipelines below do not share them.
numeric_transformer = _make_numeric_transformer()
preprocessor = _make_preprocessor()

# define the Random Forest model pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', _make_preprocessor()),
    ('model', RandomForestRegressor(random_state=RANDOM_STATE)),
])

# define the Gradient Boosting model pipeline
gb_pipeline = Pipeline(steps=[
    ('preprocessor', _make_preprocessor()),
    ('model', GradientBoostingRegressor(random_state=RANDOM_STATE)),
])

def _fit_and_score(pipeline):
    """Fit ``pipeline`` on the training split and score it on the test split.

    Args:
        pipeline: An estimator exposing ``fit`` and ``predict``.

    Returns:
        A ``(predictions, mae, rmse)`` tuple: the test-set predictions, the
        mean absolute error, and the root mean squared error against
        ``y_test``.
    """
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    # RMSE is the square root of the MSE, putting it back in the target's units.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    return predictions, mae, rmse


# train and test Random Forest model
y_pred_rf, mae_rf, rmse_rf = _fit_and_score(rf_pipeline)
print(f"Random Forest - MAE: {mae_rf:.2f}, RMSE: {rmse_rf:.2f}")

# train and test Gradient Boosting model
y_pred_gb, mae_gb, rmse_gb = _fit_and_score(gb_pipeline)
print(f"Gradient Boosting - MAE: {mae_gb:.2f}, RMSE: {rmse_gb:.2f}")

# 5-fold cross-validation of the Random Forest pipeline on the full dataset.
# The scorer returns negated MSE, so the sign is flipped before taking the
# root; the reported value is the mean of the per-fold RMSEs.
cv_scores_rf = cross_val_score(
    rf_pipeline,
    features,
    target,
    cv=5,
    scoring='neg_mean_squared_error',
)
cv_rmse_rf = np.mean(np.sqrt(np.negative(cv_scores_rf)))
print(f"Random Forest - Cross-Validation RMSE: {cv_rmse_rf:.2f}")

# Same procedure for the Gradient Boosting pipeline.
cv_scores_gb = cross_val_score(
    gb_pipeline,
    features,
    target,
    cv=5,
    scoring='neg_mean_squared_error',
)
cv_rmse_gb = np.mean(np.sqrt(np.negative(cv_scores_gb)))
print(f"Gradient Boosting - Cross-Validation RMSE: {cv_rmse_gb:.2f}")


Random Forest - MAE: 118906.98, RMSE: 362611.62, 
Gradient Boosting - MAE: 168818.93, RMSE: 375402.55
Random Forest - Cross-Validation RMSE: 283046.99
Gradient Boosting - Cross-Validation RMSE: 314243.61
