In [None]:
#1. Loading and Preprocessing (5 marks)
 
#Load the dataset and perform necessary preprocessing steps.
import pandas as pd

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the raw car-price table from disk.
data = pd.read_csv('C:\\Users\\neenu\\Jupyter\\Assignments\\Machine Learning\\CarPriceAssignment.csv')

# Discard the two columns that are not used as model inputs.
data = data.drop(columns=['car_ID', 'CarName'])

# Columns that are integer-encoded below before modelling.
categorical_cols = [
    'fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel',
    'enginelocation', 'enginetype', 'cylindernumber', 'fuelsystem',
]

# Fit one LabelEncoder per categorical column and keep it under the column
# name, then overwrite each column with its integer codes.
label_encoders = {col: LabelEncoder().fit(data[col]) for col in categorical_cols}
for col, le in label_encoders.items():
    data[col] = le.transform(data[col])

# Target is the 'price' column; every remaining column is a feature.
y = data['price']
X = data.drop(columns='price')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [None]:
#2. Model Implementation (10 marks)
 #Implement the following five regression algorithms:

#1) Linear Regression

#2) Decision Tree Regressor


#3) Random Forest Regressor

#4) Gradient Boosting Regressor

#5) Support Vector Regressor

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# The five regressors under comparison, keyed by display name.
# Estimators that accept a seed are seeded so repeated runs give the same fit.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ('Support Vector', SVR()),
])

# Fit every regressor on the training split. `fit` hands back the estimator
# itself, so its return value is what gets stored under the model's name.
fitted_models = {}
for name, model in models.items():
    fitted_models[name] = model.fit(X_train, y_train)



In [None]:
#3. Model Evaluation (5 marks)
'''Compare the performance of all the models based on R-squared, Mean Squared Error (MSE), and Mean Absolute Error (MAE).
Identify the best performing model and justify why it is the best.'''

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Score the regressors that were already trained in the model-implementation
# cell (`fitted_models`) on the held-out test split. Re-declaring and re-fitting
# the same five estimators here would only duplicate that cell and double the
# training time; the seeds are fixed, so the metrics are the same either way.
results = {}

for name, model in fitted_models.items():
    y_pred = model.predict(X_test)

    # Error metrics are in price units (MSE in squared price units);
    # R-squared is unitless, higher is better.
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[name] = {
        'MSE': mse,
        'MAE': mae,
        'R-squared': r2
    }

# One row per model, best R-squared first so the ranking is visible at a glance.
results_df = pd.DataFrame(results).T.sort_values(by='R-squared', ascending=False)
print(results_df)

# Derive the winner from the computed metrics instead of asserting it by hand.
best_model_name = results_df['R-squared'].idxmax()
lowest_mse_name = results_df['MSE'].idxmin()
lowest_mae_name = results_df['MAE'].idxmin()
print(f"\nHighest R-squared: {best_model_name}")
print(f"Lowest MSE: {lowest_mse_name}")
print(f"Lowest MAE: {lowest_mae_name}")

'''Based on the results, we can determine which model performed the best:

Best Model: Random Forest Regressor
Justification: It has the lowest Mean Squared Error (MSE) and Mean Absolute Error (MAE), along with the highest R-squared value. 
This indicates that the Random Forest model captures the variability in car prices effectively, making it a robust choice for this regression problem.'''


In [None]:

#4. Feature Importance Analysis (2 marks)
'''Identify the significant variables affecting car prices (feature selection)'''
import matplotlib.pyplot as plt

# Fit a seeded random forest on the training split; its impurity-based
# importances are used to rank the features.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# One importance value per feature column, in the same order as X's columns.
importances = rf_model.feature_importances_
feature_names = X.columns

# Pair each feature with its importance and rank from most to least important.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the ten highest-ranked features.
print("Top Significant Features:")
print(importance_df.head(10))

# Horizontal bar chart of every feature's importance.
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(importance_df['Feature'], importance_df['Importance'], color='skyblue')
ax.set_xlabel('Importance')
ax.set_title('Feature Importance Analysis')
ax.invert_yaxis()  # first (most important) row is drawn at the top
plt.show()



In [None]:
#5. Hyperparameter Tuning (2 marks):
'''Perform hyperparameter tuning and check whether the performance of the model has increased.'''
from sklearn.model_selection import GridSearchCV

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Tune the random forest on the preprocessed split produced by the
# loading/preprocessing cell (X_train, X_test, y_train, y_test).
#
# The raw CSV is deliberately NOT re-read here: a fresh read brings back the
# string-valued columns (CarName, fueltype, carbody, ...) that the estimator
# cannot fit, and it would overwrite the encoded `data` / `X` / split variables
# that the earlier cells rely on.

# Untuned reference model, so there is something to compare the tuned result to.
baseline_rf = RandomForestRegressor(n_estimators=100, random_state=42)
baseline_rf.fit(X_train, y_train)
baseline_mse = mean_squared_error(y_test, baseline_rf.predict(X_test))

# Seeded estimator so the grid search gives the same answer on every run.
rf = RandomForestRegressor(random_state=42)

# Candidate hyperparameter values: 2 * 3 * 2 = 12 combinations.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

# 5-fold cross-validation on the training split only; the test split is kept
# untouched for the final comparison. The scorer is negated MSE (higher is better).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best combination found, and the estimator refit with it on the full training split.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test split.
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"Best Hyperparameters: {best_params}")
print(f"Baseline Test MSE: {baseline_mse}")
print(f"Test MSE: {mse}")

# State explicitly whether tuning helped on the test split.
if mse < baseline_mse:
    print(f"Tuning improved test MSE by {baseline_mse - mse}")
else:
    print(f"Tuning did not improve test MSE (difference: {mse - baseline_mse})")
