In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor


### ***EXPLORE THE DATA***

In [None]:
# Load the dataset
# NOTE(review): the absolute path looks like a Google Colab Drive mount -
# confirm the drive is mounted (or adjust the path) before running this cell.
data = pd.read_csv('/content/drive/MyDrive/car data.csv')

In [None]:
# Summary statistics from data.describe(), shown as a styled table whose
# cell backgrounds follow the 'inferno' colormap.
palette = sns.color_palette('inferno', as_cmap=True)
num_sum = data.describe()
num_sum.style.background_gradient(cmap=palette)

In [None]:
# Preview the first rows of the loaded dataset
data.head()

In [None]:
# shape of data: (number of rows, number of columns)
data.shape

## ***DATA CLEANING***

In [None]:
# Count the missing values in every column
missing = data.isna().sum()
print(missing)

# Either confirm the data is complete or list only the affected columns
if not missing.any():
    print('\nThere are no missing values in the dataset')
else:
    print("\nMissing values found in the dataset:")
    print(missing[missing > 0])

In [None]:
# Remove the 'Owner' column in place; it is not used as a model feature
data.drop(columns=['Owner'], inplace=True)

In [None]:
# info of data: prints the DataFrame summary produced by DataFrame.info()
data.info()

# ***IDENTIFY DUPLICATES***

In [None]:
# Count fully duplicated rows before cleaning
duplicates = data.duplicated(keep='first').sum()
print(f"Number of duplicate rows = {duplicates}")

# Drop the repeats in place (the first occurrence of each row is kept),
# then confirm that none remain
print("After dropping duplicates")
data.drop_duplicates(keep='first', inplace=True)
print(f"Number of duplicate rows = {data.duplicated(keep='first').sum()}")

## ***OUTLIERS***

In [None]:
# Box plot of Selling_Price; points beyond the whiskers are outlier candidates
plt.figure(figsize=(12, 6))
sns.boxplot(data=data, x='Selling_Price')
plt.title('Boxplot of Selling Price')
plt.show()


In [None]:
# Outlier treatment: keep only the rows whose Selling_Price lies inside the
# Tukey fences [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
# Both quartiles are taken from the same snapshot of the data and the filter
# is applied exactly once, so the IQR never mixes a quartile computed before
# filtering with one computed after it.
Q1 = data['Selling_Price'].quantile(0.25)
Q3 = data['Selling_Price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Series.between is inclusive on both ends by default (same as >= and <=)
data = data[data['Selling_Price'].between(lower_bound, upper_bound)]

# ***DATA PRE-PROCESSING***

In [None]:
# Skewness of the target column
skewness = data['Selling_Price'].skew()
print(f'Skewness of Selling_Price: {skewness}')

# Histogram with a KDE overlay to show the shape of the distribution
sns.histplot(data=data, x='Selling_Price', kde=True)
plt.title('Distribution of Selling_Price')
plt.show()

In [None]:
# Histogram grid drawn by DataFrame.hist: 50 bins each, 12x10 inch figure
data.hist(bins=50, figsize=(12, 10))
plt.show()

In [None]:
# Show the dtype of every column
print(data.dtypes)

In [None]:
# Pairwise correlations, computed on the numeric columns only
numerical_features = data.select_dtypes(include="number")
correlation_matrix = numerical_features.corr()

# Leave the matrix as the last expression so the notebook renders it
print("\nCorrelation Matrix:")
correlation_matrix

In [None]:
# Visualize the correlation matrix using a heatmap.
# Correlations are continuous values in [-1, 1], so use a diverging colormap
# pinned to that range and centred on 0. A qualitative palette such as 'Set1'
# gives unrelated colours to neighbouring values and hides sign and strength.
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
)
plt.title("Correlation Matrix")
plt.show()

# ***VISUALISATIONS***

In [None]:
def plotdata(data, col_name, col_type):
    """Draw a count plot for one column, but only when it is object-typed.

    Parameters
    ----------
    data
        Table handed to seaborn as ``data``.
    col_name
        Name of the column placed on the x axis.
    col_type
        Dtype of that column; nothing is drawn unless it equals ``'object'``.
    """
    is_object_column = col_type == 'object'
    if not is_object_column:
        return

    plt.figure(figsize=(15, 3))
    sns.countplot(data=data, x=col_name, palette='YlGnBu')
    plt.show()

In [None]:
# Print each column name and pass the column, with its dtype, to plotdata
for column in data.columns:
    print(column)
    plotdata(data, column, data[column].dtype)

In [None]:
# Swarm plot of Selling_Price for each Year
ax = plt.subplots(figsize=(20, 10))[1]
sns.swarmplot(data=data, x='Year', y='Selling_Price', ax=ax)
# Tilt the tick labels so neighbouring years do not overlap
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha='right')
plt.show()

## ***Encoding the Categorical Columns***

In [None]:
# One-hot encode the three categorical columns; drop_first=True omits the
# first level of each so the dummy columns are not perfectly collinear.
data = pd.get_dummies(
    data,
    columns=['Fuel_Type', 'Selling_type', 'Transmission'],
    drop_first=True,
)


In [None]:
# Preview the first rows again to inspect the current columns
data.head()

# ***SPLITTING THE DATA SET***

In [None]:
# Target is Selling_Price; features are every remaining column except Car_Name
Y = data['Selling_Price']
X = data.drop(columns=['Car_Name', 'Selling_Price'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# ***Model Training & Model Evaluation***

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Training the Random Forest model
# random_state=42 fixes the seed so repeated runs build the same forest
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# Predicting on the test set with the forest fitted on the training split
y_pred = rf_model.predict(X_test)

In [None]:
# Actual vs predicted prices; the dashed red diagonal marks perfect predictions
plt.figure(figsize=(12, 6))
plt.scatter(y_test, y_pred)
plt.plot(
    [y_test.min(), y_test.max()],
    [y_test.min(), y_test.max()],
    color='red',
    linestyle='--',
)
plt.title('Actual vs Predicted Selling Price')
plt.xlabel('Actual Selling Price')
plt.ylabel('Predicted Selling Price')
plt.show()

In [None]:
# Test-set metrics for the first Random Forest predictions
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# ***HYPERPARAMETER TUNING***

In [None]:
# Candidate hyperparameters: 2 * 3 * 3 = 18 combinations
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Exhaustive 3-fold cross-validated search; n_jobs=-1 uses every available core
grid_search = GridSearchCV(rf_model, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

In [None]:
# Best parameters selected by the grid search
print("Best parameters found: ", grid_search.best_params_)

In [None]:
# Re-evaluate the model with best parameters:
# take the grid search's best estimator and predict on the same test split
best_rf_model = grid_search.best_estimator_
y_pred_best = best_rf_model.predict(X_test)

In [None]:
# Test-set metrics for the tuned model's predictions
r2_best = r2_score(y_test, y_pred_best)
mse_best = mean_squared_error(y_test, y_pred_best)

print(f'Best Model - Mean Squared Error: {mse_best}')
print(f'Best Model - R-squared: {r2_best}')

In [None]:
# Feature importance of the tuned Random Forest.
# Build the table the chart reads from: the fitted model's
# feature_importances_ paired with the training column names, largest first.
# Without this the cell fails with NameError on `feature_importance`.
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_rf_model.feature_importances_,
}).sort_values('importance', ascending=False)

plt.figure(figsize=(12, 6))
plt.bar(feature_importance['feature'], feature_importance['importance'])
plt.xticks(rotation=45)
plt.title('Feature Importance (Random Forest)')
plt.tight_layout()
plt.show()


# ***Conclusion***
In conclusion, the analysis of the car dataset revealed valuable insights into the factors influencing car selling prices, with a successful implementation of a Random Forest Regressor to predict these prices based on various features. Through exploratory data analysis, we identified and addressed outliers and missing values, leading to a cleaner dataset for modeling. The initial model performance metrics indicated room for improvement, which was achieved through hyperparameter tuning using Grid Search, resulting in enhanced accuracy and reduced error rates. Visualizations of actual versus predicted selling prices illustrated the model's effectiveness, and the overall process underscored the significance of data preprocessing, feature selection, and model evaluation in building robust predictive models. This structured approach not only provided a solid foundation for understanding the car market but also highlighted opportunities for further exploration and refinement in predictive analytics.