In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Regression models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

In [20]:
# Read the raw car-price table from the CSV that sits next to the notebook
csv_path = "CarPrice_Assignment.csv"
df = pd.read_csv(csv_path)

In [21]:
# Quick look at the raw data: dimensions, the first rows, and the total number
# of missing cells summed over every column
preview_rows = df.head()
missing_total = df.isnull().sum().sum()

print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(preview_rows)
print("\nMissing Values:", missing_total)

Dataset Shape: (205, 26)

First 5 rows:
   car_ID  symboling                   CarName fueltype aspiration doornumber  \
0       1          3        alfa-romero giulia      gas        std        two   
1       2          3       alfa-romero stelvio      gas        std        two   
2       3          1  alfa-romero Quadrifoglio      gas        std        two   
3       4          2               audi 100 ls      gas        std       four   
4       5          2                audi 100ls      gas        std       four   

       carbody drivewheel enginelocation  wheelbase  ...  enginesize  \
0  convertible        rwd          front       88.6  ...         130   
1  convertible        rwd          front       88.6  ...         130   
2    hatchback        rwd          front       94.5  ...         152   
3        sedan        fwd          front       99.8  ...         109   
4        sedan        4wd          front       99.4  ...         136   

   fuelsystem  boreratio  stroke compres

In [22]:
# Preprocessing
# Drop the identifier-like columns (car_ID, CarName) so they are not used as
# predictors. errors="ignore" keeps this cell re-runnable: `df` is reassigned
# here, so on a second execution the columns are already gone and a plain
# drop would raise KeyError.
df = df.drop(columns=["car_ID", "CarName"], errors="ignore")

# Convert the remaining categorical variables to dummy variables.
# drop_first=True removes the first level of each one, giving k-1 indicator
# columns for k categories.
df = pd.get_dummies(df, drop_first=True)

In [23]:
# Separate the price target (y) from the predictor columns (X)
y = df["price"]
X = df.drop(columns="price")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [24]:
# Feature scaling: learn the scaling statistics from the training rows only,
# then apply that same fitted transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
# Candidate regressors, keyed by the display name used in the reports below.
# The estimators that accept a random_state share one seed so runs are repeatable.
# NOTE(review): SVR is created with default hyperparameters and is later fit on
# the unscaled price target — consider tuning it or scaling y before drawing
# conclusions from its score.
seed = 42
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Decision Tree", DecisionTreeRegressor(random_state=seed)),
    ("Random Forest", RandomForestRegressor(random_state=seed)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=seed)),
    ("Support Vector Regressor", SVR()),
])


In [26]:
# Fit every candidate on the scaled training data, score it on the held-out
# test split, and collect one metrics record per model in `results`
results = []

for name, model in models.items():
    # fit() returns the estimator, so training and prediction can be chained
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    # Error metrics on the test split, in the order MSE, MAE, R²
    mse, mae, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )

    # Report this model's scores
    print(f"\n{name} Results:")
    print(f"MSE: {mse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R²: {r2:.4f}")

    # Keep the record for the comparison table built afterwards
    results.append({"Model": name, "MSE": mse, "MAE": mae, "R²": r2})


Linear Regression Results:
MSE: 8482008.4844
MAE: 2089.3827
R²: 0.8926

Decision Tree Results:
MSE: 8300272.3561
MAE: 1886.3211
R²: 0.8949

Random Forest Results:
MSE: 3313969.6413
MAE: 1259.7577
R²: 0.9580

Gradient Boosting Results:
MSE: 5900138.7630
MAE: 1676.0276
R²: 0.9253

Support Vector Regressor Results:
MSE: 86829819.9835
MAE: 5697.6577
R²: -0.0999


In [27]:
# Gather the per-model metric records into one table and print it without
# the row index
results_df = pd.DataFrame(results)
comparison_table = results_df.to_string(index=False)

print("\nModel Comparison:")
print(comparison_table)


Model Comparison:
                   Model          MSE         MAE        R²
       Linear Regression 8.482008e+06 2089.382729  0.892557
           Decision Tree 8.300272e+06 1886.321146  0.894859
           Random Forest 3.313970e+06 1259.757695  0.958021
       Gradient Boosting 5.900139e+06 1676.027625  0.925262
Support Vector Regressor 8.682982e+07 5697.657697 -0.099891


In [28]:
# Rank models by R²: the row with the highest score is the best model and the
# row with the lowest score is the worst
r2_column = results_df['R²']
best_label = r2_column.idxmax()
worst_label = r2_column.idxmin()

best_model = results_df.loc[best_label]
worst_model = results_df.loc[worst_label]

print("\nBest Model:")
print(f"Model: {best_model['Model']}, R²: {best_model['R²']:.4f}")

print("\nWorst Model:")
print(f"Model: {worst_model['Model']}, R²: {worst_model['R²']:.4f}")


Best Model:
Model: Random Forest, R²: 0.9580

Worst Model:
Model: Support Vector Regressor, R²: -0.0999


In [29]:
# Feature Importance Analysis (for tree-based models)
# Reuse the Random Forest that was already trained and scored in the
# evaluation loop instead of fitting a second, identical copy (same seed, same
# training data). This avoids redundant training and guarantees that the
# importances belong to the exact model whose metrics were reported.
rf = models["Random Forest"]

# The forest was fit on the scaled training matrix, whose columns follow the
# column order of X, so X.columns lines up with feature_importances_.
feature_importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": rf.feature_importances_
}).sort_values(by="Importance", ascending=False)

print("\nTop 10 Important Features:")
print(feature_importance.head(10))


Top 10 Important Features:
       Feature  Importance
6   enginesize    0.544441
5   curbweight    0.299421
13  highwaympg    0.045694
10  horsepower    0.035511
3     carwidth    0.013533
2    carlength    0.009019
1    wheelbase    0.007541
11     peakrpm    0.006795
12     citympg    0.006493
8       stroke    0.004913


In [30]:
# Hyperparameter Tuning (Gradient Boosting as an example)
# Exhaustive search over a 3 x 3 x 3 grid, ranked by R² under 3-fold
# cross-validation on the training split
param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
)

gb = GradientBoostingRegressor(random_state=42)
grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, scoring="r2", cv=3)
grid_search.fit(X_train_scaled, y_train)

print("\nBest Parameters for Gradient Boosting:")
print(grid_search.best_params_)


Best Parameters for Gradient Boosting:
{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 50}


In [31]:
# Evaluate tuned model
# Predict on the held-out test split through the fitted grid search, then
# compute the same three metrics used for the untuned models
y_pred_tuned = grid_search.predict(X_test_scaled)

mse_tuned, mae_tuned, r2_tuned = (
    metric(y_test, y_pred_tuned)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("\nTuned Gradient Boosting Results:")
print(f"MSE: {mse_tuned:.4f}")
print(f"MAE: {mae_tuned:.4f}")
print(f"R²: {r2_tuned:.4f}")


Tuned Gradient Boosting Results:
MSE: 6086847.6572
MAE: 1698.2912
R²: 0.9229
