In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Load the raw listings into the working frame used by every later cell.
df = pd.read_csv(filepath_or_buffer="Car details v3.csv")

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Convert the three measurement columns from "<number> <unit>" text to floats.
#
# The leading number is pulled out with a regex instead of deleting one exact
# suffix (' kmpl', ' CC', ' bhp'). With suffix deletion, any value written in a
# different unit was coerced to NaN and then removed by the later dropna().
# NOTE(review): presumably the CNG/LPG rows report mileage in "km/kg" -- confirm
# against the CSV; their number is kept as-is, without unit conversion.
# Values that contain no number at all still become NaN.
#
# Casting to str first lets the cell be re-run after the columns are numeric.
for unit_column in ('mileage', 'engine', 'max_power'):
    leading_number = df[unit_column].astype(str).str.extract(r'(\d*\.?\d+)', expand=False)
    df[unit_column] = pd.to_numeric(leading_number, errors='coerce')

# 'torque' is not used anywhere else in this notebook, so it is dropped here.
# errors='ignore' keeps a second run of this cell from raising KeyError.
df.drop(columns=['torque'], inplace=True, errors='ignore')

In [None]:
# Look at the first fifteen rows to check the converted numeric columns.
df.iloc[:15]

In [None]:
# Display the frame's dimensions as a (rows, columns) tuple.
df.shape

In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Discard, in place, every row that has at least one missing value, then
# show the per-column missing counts again to confirm they are all zero.
df.dropna(axis=0, how='any', inplace=True)
df.isna().sum()

In [None]:
# Replace the 'year' column with 'num_year': the number of years between the
# listed year and the fixed reference year 2026.
reference_year = 2026
df['num_year'] = reference_year - df['year']
df.drop(columns=['year'], inplace=True)

In [None]:
# Preview the first five rows with the new 'num_year' column in place.
df.iloc[:5]

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()

In [None]:
# Remove repeated rows in place, keeping the first occurrence of each.
# The index is left as-is (not reset).
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Print the column names, non-null counts and dtypes of the working frame.
df.info()

In [None]:
# Frequency of each fuel category, most common first.
df.loc[:, 'fuel'].value_counts()

In [None]:
# Frequency of each seller type, most common first.
df.loc[:, 'seller_type'].value_counts()

In [None]:
# Frequency of each transmission type, most common first.
df.loc[:, 'transmission'].value_counts()

In [None]:
# Frequency of each ownership category, most common first.
df.loc[:, 'owner'].value_counts()

In [None]:
# Summary statistics for the numeric columns, with the quartiles spelled out
# explicitly (these are the same percentiles describe() uses by default).
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Bar chart of how many listings fall into each fuel category.
sns.countplot(data=df, x='fuel')
plt.show()

In [None]:
# Bar chart of how many listings fall into each seller type.
sns.countplot(data=df, x='seller_type')
plt.show()

In [None]:
# Bar chart of how many listings fall into each transmission type.
sns.countplot(data=df, x='transmission')
plt.show()

In [None]:
# Bar chart of how many listings fall into each ownership category.
sns.countplot(data=df, x='owner')
plt.show()

In [None]:
# Integer codes for each categorical column, keyed by column name.
category_codes = {
    'fuel': {'Diesel': 0, 'Petrol': 1, 'CNG': 2, 'LPG': 3},
    'seller_type': {'Individual': 0, 'Dealer': 1, 'Trustmark Dealer': 2},
    'transmission': {'Manual': 0, 'Automatic': 1},
    'owner': {
        'First Owner': 0,
        'Second Owner': 1,
        'Third Owner': 2,
        'Fourth & Above Owner': 3,
        'Test Drive Car': 4,
    },
}

# Swap each label for its code, then force a numeric dtype. Any label that is
# missing from the table above is left untouched by replace() and therefore
# turns into NaN under errors='coerce'.
for column, codes in category_codes.items():
    df[column] = pd.to_numeric(df[column].replace(codes), errors='coerce')

In [None]:
# 2x2 grid of scatter plots with fitted regression lines, one feature per panel,
# each plotted against the selling price.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 10))
fig.suptitle('Visualization of Numerical columns')

# axes.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right.
panel_columns = ['num_year', 'seller_type', 'mileage', 'owner']
for panel_ax, panel_column in zip(axes.flat, panel_columns):
    sns.regplot(data=df, x=panel_column, y='selling_price', ax=panel_ax)

In [None]:
# Target: the selling price. Features: every remaining column except the
# target itself and the car name.
Y = df['selling_price']
X = df.drop(columns=['selling_price', 'name'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffled
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    shuffle=True,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fit ordinary least squares on the scaled training data (fit() returns the
# estimator itself), then predict prices for the held-out rows.
lin_reg = LinearRegression().fit(X_train, Y_train)
Y_pred_lr = lin_reg.predict(X_test)

In [None]:
# Report the held-out error metrics for the linear model, one per line.
print("===== Linear Regression Results =====")
lr_scores = [
    ("MAE:", mean_absolute_error(Y_test, Y_pred_lr)),
    ("RMSE:", np.sqrt(mean_squared_error(Y_test, Y_pred_lr))),
    ("R2 Score:", r2_score(Y_test, Y_pred_lr)),
]
for lr_label, lr_score in lr_scores:
    print(lr_label, lr_score)

In [None]:
# Fit a random forest with a fixed seed on the same training data (fit()
# returns the estimator itself), then predict prices for the held-out rows.
rf_model = RandomForestRegressor(random_state=2).fit(X_train, Y_train)
Y_pred_rf = rf_model.predict(X_test)

In [None]:
# Report the held-out error metrics for the random forest, one per line.
print("\n===== Random Forest Results =====")
rf_scores = [
    ("MAE:", mean_absolute_error(Y_test, Y_pred_rf)),
    ("RMSE:", np.sqrt(mean_squared_error(Y_test, Y_pred_rf))),
    ("R2 Score:", r2_score(Y_test, Y_pred_rf)),
]
for rf_label, rf_score in rf_scores:
    print(rf_label, rf_score)

In [None]:
# Horizontal bar chart of the random forest's importance score per feature.
# Labels come from X (still a DataFrame), whose column order matches the
# array the model was trained on.
features = X.columns
importances = rf_model.feature_importances_

importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
importance_ax.barh(features, importances)
importance_ax.set_title("Feature Importance (Random Forest)")
importance_ax.set_xlabel("Importance Score")
plt.show()