In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the dataset
file_path = 'CarPrice_Assignment.csv'
df = pd.read_csv(file_path)

# Preview the dataset
preview = df.head()

# Check for missing values
missing_values = df.isnull().sum()

# Separate features and target variable
X = df.drop(['price', 'car_ID', 'CarName'], axis=1)
y = df['price']

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

# Preprocessing for numerical data: impute missing values with median and scale
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: impute missing values with most frequent and one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit and transform the training data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

(preview, missing_values, X_train_processed.shape, X_test_processed.shape)

(   car_ID  symboling                   CarName fueltype aspiration doornumber  \
 0       1          3        alfa-romero giulia      gas        std        two   
 1       2          3       alfa-romero stelvio      gas        std        two   
 2       3          1  alfa-romero Quadrifoglio      gas        std        two   
 3       4          2               audi 100 ls      gas        std       four   
 4       5          2                audi 100ls      gas        std       four   
 
        carbody drivewheel enginelocation  wheelbase  ...  enginesize  \
 0  convertible        rwd          front       88.6  ...         130   
 1  convertible        rwd          front       88.6  ...         130   
 2    hatchback        rwd          front       94.5  ...         152   
 3        sedan        fwd          front       99.8  ...         109   
 4        sedan        4wd          front       99.4  ...         136   
 
    fuelsystem  boreratio  stroke compressionratio horsepower  pea

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'Support Vector Regressor': SVR()
}

# Train models
for name, model in models.items():
    model.fit(X_train_processed, y_train)

models

{'Linear Regression': LinearRegression(),
 'Decision Tree': DecisionTreeRegressor(random_state=42),
 'Random Forest': RandomForestRegressor(random_state=42),
 'Gradient Boosting': GradientBoostingRegressor(random_state=42),
 'Support Vector Regressor': SVR()}

In [3]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Evaluate models
results = {}
for name, model in models.items():
    y_pred = model.predict(X_test_processed)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    results[name] = {'R2': r2, 'MSE': mse, 'MAE': mae}

results

{'Linear Regression': {'R2': 0.87247536932174,
  'MSE': 10067307.106935194,
  'MAE': 2244.6017256584537},
 'Decision Tree': {'R2': 0.8958288846689278,
  'MSE': 8223686.70375339,
  'MAE': 1847.4349512195122},
 'Random Forest': {'R2': 0.9577276121364623,
  'MSE': 3337152.2701321063,
  'MAE': 1276.3987520325202},
 'Gradient Boosting': {'R2': 0.9266994158974956,
  'MSE': 5786642.841879208,
  'MAE': 1666.4004125353745},
 'Support Vector Regressor': {'R2': -0.09966346645387403,
  'MSE': 86811855.65633136,
  'MAE': 5694.47171546142}}

In [4]:
import numpy as np

# Feature importance from Random Forest (best model)
feature_names_num = numerical_cols
feature_names_cat = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols)
feature_names = np.concatenate([feature_names_num, feature_names_cat])

rf_model = models['Random Forest']
importances = rf_model.feature_importances_

# Create a sorted list of features by importance
feature_importance = sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True)

# Select significant features (importance > mean importance)
mean_importance = np.mean(importances)
significant_features = [f for f, imp in feature_importance if imp > mean_importance]

(significant_features, feature_importance[:10])

(['enginesize', 'curbweight', 'highwaympg', 'horsepower'],
 [('enginesize', 0.5521616138545801),
  ('curbweight', 0.294244881310119),
  ('highwaympg', 0.045324870531969104),
  ('horsepower', 0.031433980751227546),
  ('carwidth', 0.014010365442477296),
  ('carlength', 0.008621446759495418),
  ('wheelbase', 0.007588822392256461),
  ('peakrpm', 0.006911451989804732),
  ('citympg', 0.006623670771719535),
  ('stroke', 0.004688422270706298)])

In [5]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning for Random Forest Regressor
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, scoring='r2')
grid_search.fit(X_train_processed, y_train)

best_rf = grid_search.best_estimator_

# Evaluate the tuned model
y_pred_tuned = best_rf.predict(X_test_processed)
r2_tuned = r2_score(y_test, y_pred_tuned)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
mae_tuned = mean_absolute_error(y_test, y_pred_tuned)

(best_rf.get_params(), r2_tuned, mse_tuned, mae_tuned)

({'bootstrap': True,
  'ccp_alpha': 0.0,
  'criterion': 'squared_error',
  'max_depth': 10,
  'max_features': 'auto',
  'max_leaf_nodes': None,
  'max_samples': None,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'n_estimators': 200,
  'n_jobs': None,
  'oob_score': False,
  'random_state': 42,
  'verbose': 0,
  'warm_start': False},
 0.9587078371836437,
 3259769.3635406517,
 1247.733904787096)

In [6]:
# Due to timeout, reduce the parameter grid for faster tuning
param_grid_small = {
    'n_estimators': [100, 200],
    'max_depth': [None, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf = RandomForestRegressor(random_state=42)
grid_search_small = GridSearchCV(estimator=rf, param_grid=param_grid_small, cv=3, n_jobs=-1, scoring='r2')
grid_search_small.fit(X_train_processed, y_train)

best_rf_small = grid_search_small.best_estimator_

# Evaluate the tuned model
y_pred_tuned_small = best_rf_small.predict(X_test_processed)
r2_tuned_small = r2_score(y_test, y_pred_tuned_small)
mse_tuned_small = mean_squared_error(y_test, y_pred_tuned_small)
mae_tuned_small = mean_absolute_error(y_test, y_pred_tuned_small)

(best_rf_small.get_params(), r2_tuned_small, mse_tuned_small, mae_tuned_small)

({'bootstrap': True,
  'ccp_alpha': 0.0,
  'criterion': 'squared_error',
  'max_depth': None,
  'max_features': 'auto',
  'max_leaf_nodes': None,
  'max_samples': None,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'n_estimators': 200,
  'n_jobs': None,
  'oob_score': False,
  'random_state': 42,
  'verbose': 0,
  'warm_start': False},
 0.9584683052266159,
 3278678.979357608,
 1254.6977619047618)