# 🚗 Car Price Prediction Project

### Import libraries

In [1]:

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings

In [2]:
warnings.filterwarnings('ignore')

### Load Dataset

In [3]:
# Upload the dataset through the Colab file picker when running on Google
# Colab. Outside Colab the `google.colab` package cannot be imported, so skip
# the upload instead of crashing; quikr_car.csv is then expected to already be
# in the working directory.
try:
    from google.colab import files
except ImportError:
    uploaded = {}
else:
    uploaded = files.upload()


Saving quikr_car.csv to quikr_car.csv


# New section

In [None]:
car=pd.read_csv('quikr_car.csv')

In [None]:
car.head()

### Initial Data Exploration

In [None]:
car.shape

In [None]:
car.info()

#### Unique Values Check

In [None]:
car['year'].unique()

In [None]:
car['Price'].unique()

In [None]:
car['kms_driven'].unique()

In [None]:
car['fuel_type'].unique()

## Quality
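* `year` has many non-year values
* `year` should be converted from object to int
* `Price` contains the placeholder "Ask For Price"
* `Price` should be converted from object to int
* `kms_driven` values have a "kms" suffix after the number
* `kms_driven` should be converted from object to int
* `kms_driven` has NaN values
* `fuel_type` has NaN values
* keep only the first 3 words of `name`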

## Data Cleaning

In [None]:
backup=car.copy()

#### Clean 'year' Column

In [None]:
# Keep only the rows whose 'year' consists purely of digits. The column is cast
# to str first so the mask is strictly boolean: a missing value is then treated
# as non-numeric (row dropped) rather than yielding a NaN in the mask, which
# pandas rejects as a boolean indexer.
car = car[car['year'].astype(str).str.isnumeric()]

In [None]:
car['year']=car['year'].astype(np.int32)

#### Clean 'Price' Column

In [None]:
car=car[car['Price']!="Ask For Price"]

In [None]:
# Strip the thousands separators from the textual prices and store the result
# as 32-bit integers.
price_text = car['Price'].astype(str)
car['Price'] = price_text.str.replace(',', '', regex=True).astype(np.int32)


#### Clean 'kms_driven' Column

In [None]:
# Keep only the text before the first space of each entry (presumably dropping
# a trailing "kms" unit -- confirm against the raw data), then remove the
# commas so the remainder can later be cast to an integer.
leading_token = car['kms_driven'].str.split(' ').str.get(0)
car['kms_driven'] = leading_token.str.replace(',', '')

In [None]:
# Keep only the rows whose 'kms_driven' consists purely of digits. The column
# is cast to str first so the mask is strictly boolean: a missing value is then
# treated as non-numeric (row dropped) rather than yielding a NaN in the mask,
# which pandas rejects as a boolean indexer.
car = car[car['kms_driven'].astype(str).str.isnumeric()]

In [None]:
car['kms_driven']=car['kms_driven'].astype(np.int32)

In [None]:
car.info()

#### Remove Null Fuel Types

In [None]:
# Discard every row that has no fuel type recorded.
car = car.dropna(subset=['fuel_type'])

#### Normalize 'name' Column

In [None]:
def _first_three_words(value):
    """Return the first three items of *value* joined by single spaces.

    A list is truncated as-is; any other value is converted to ``str`` and
    split on whitespace before truncating.
    """
    parts = value if isinstance(value, list) else str(value).split()
    return ' '.join(parts[:3])


# Shorten every car name to its first three words.
car['name'] = car['name'].apply(_first_three_words)


In [None]:
car=car.reset_index(drop=True)

In [None]:
car.describe()

#### Filter Out Extreme Prices

In [None]:
# Drop the listings priced at 6,000,000 or more and renumber the remaining
# rows from 0.
price_cap = 6e6
below_cap = car['Price'] < price_cap
car = car.loc[below_cap].reset_index(drop=True)

In [None]:
car

### Save Cleaned Data

In [None]:
car.to_csv("Cleaned Car.csv",index=False)

### Data Visualization

In [None]:

# Histogram of the listing prices (40 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=car, x='Price', kde=True, bins=40, ax=ax)
ax.set_title('Distribution of Car Prices')
ax.set_xlabel('Price (INR)')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Scatter plot of price against year, one point per listing.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=car, x='year', y='Price', ax=ax)
ax.set_title('Car Price vs year')
plt.show()


In [None]:
# Bar chart of the mean price for each fuel type.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=car, x='fuel_type', y='Price', estimator=np.mean, ax=ax)
ax.set_title('Average Price by Fuel Type')
ax.set_xlabel('Fuel Type')
ax.set_ylabel('Average Price')
plt.show()


In [None]:
# Line chart of the mean price per year.
yearly_mean_price = car.groupby('year')['Price'].mean().reset_index()

fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=yearly_mean_price, x='year', y='Price', ax=ax)
ax.set_title('Average Price by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Average Price')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Scatter plot of price against kilometres driven, coloured by fuel type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=car, x='kms_driven', y='Price', hue='fuel_type', ax=ax)
ax.set_title('KMs Driven vs Price')
ax.set_xlabel('KMs Driven')
ax.set_ylabel('Selling Price')
plt.show()


## Model Preparation

In [None]:
X=car.drop(columns='Price')
y=car['Price']

### Model Training

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


#### Identify Numeric and Categorical Columns

In [None]:
# Partition the feature columns by dtype: numeric columns versus all others.
numeric_features = X.select_dtypes(include=np.number)
other_features = X.select_dtypes(exclude=np.number)
num_cols = numeric_features.columns.tolist()
cat_cols = other_features.columns.tolist()
num_cols, cat_cols


## Build Pipeline

In [None]:
# Numeric columns are standardised; the remaining columns are one-hot encoded,
# with categories unseen during fitting ignored at transform time.
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Chain the preprocessing and a linear regression into a single estimator,
# then fit it on the training split.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)
model.fit(X_train, y_train)

print('Model trained.')

### Model Evaluation

In [None]:
# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

# Print the four metrics as an aligned report.
report_lines = (
    f"R² Score       : {r2:.4f}",
    f"MAE            : {mae:.2f}",
    f"MSE            : {mse:.2f}",
    f"RMSE           : {rmse:.2f}",
)
print("Accuracy:")
print("\n".join(report_lines))

In [None]:
# Bar chart comparing the evaluation metrics of the fitted model.
# The bars use the r2 / mae / mse / rmse values computed during evaluation
# rather than numbers copied from an earlier run, so the chart always matches
# the model that was actually trained.
metrics = ['R² Score', 'MAE', 'MSE', 'RMSE']
values = [r2, mae, mse, rmse]
plt.figure(figsize=(10,6))
bars = plt.bar(metrics, values, color=['green', 'orange', 'red', 'blue'], alpha=0.8)
# Annotate each bar with its value, centred just above the bar top.
for bar, val in zip(bars, values):
    plt.text(bar.get_x() + bar.get_width()/2, val, f'{val:.2f}',
             ha='center', va='bottom', fontsize=10, fontweight='bold')
# The metrics differ by many orders of magnitude, so use the logarithmic axis
# that the y-label announces. 'symlog' is used instead of 'log' so that an
# R² score of zero or below can still be drawn.
plt.yscale('symlog')
plt.title('Model Accuracy and Error Metrics Comparison', fontsize=14)
plt.xlabel('Metrics', fontsize=12)
plt.ylabel('Values (Log Scale)', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.6)

plt.show()

In [None]:
# Scatter plot of predicted against actual prices for the test split.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred, color='blue', alpha=0.6, ax=ax)

# Diagonal reference line spanning the range of the actual prices; points on
# it would be predictions equal to the actual value.
price_range = [y_test.min(), y_test.max()]
ax.plot(price_range, price_range,
        color='red', linestyle='--', linewidth=2, label='Perfect Prediction Line')

ax.set_title('Actual Price vs Predicted Price')
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.legend()
plt.show()

## Gradio

In [None]:
def predict_price(year, kms_driven, fuel_type, company, name):
    """Predict a car's price from the form inputs and format it for display.

    Args:
        year: Year value from the form; converted with ``int``.
        kms_driven: Kilometres-driven value from the form; converted with
            ``int``.
        fuel_type: Fuel type, passed to the model unchanged.
        company: Company, passed to the model unchanged.
        name: Car name; only sent to the model when the feature frame ``X``
            has a ``name`` column.

    Returns:
        ``'Estimated Price: ₹ <amount>'`` with thousands separators, or a
        short message asking for valid numbers when ``year`` or
        ``kms_driven`` cannot be converted to an integer.
    """
    try:
        year_value = int(year)
        kms_value = int(kms_driven)
    except (TypeError, ValueError, OverflowError):
        # int() raises TypeError for None (presumably what an empty numeric
        # field delivers -- confirm against the UI), ValueError for NaN or
        # non-numeric text, and OverflowError for infinity. Report the problem
        # to the user instead of surfacing a traceback.
        return 'Please enter valid numbers for Year and Kms Driven.'

    data = {
        'year': [year_value],
        'kms_driven': [kms_value],
        'fuel_type': [fuel_type],
        'company': [company]
    }
    # Include name only if available in the training set
    if 'name' in X.columns:
        data['name'] = [name]
    input_df = pd.DataFrame(data)
    pred = model.predict(input_df)[0]
    return f'Estimated Price: ₹ {int(pred):,}'

# Dropdown options are taken from the values present in the cleaned data.
fuel_choices = sorted(car['fuel_type'].dropna().unique())
company_choices = sorted(car['company'].dropna().unique())

# Offer the 50 most frequent car names when the data has a 'name' column.
name_choices = []
if 'name' in car.columns:
    name_counts = car['name'].value_counts().head(50)
    name_choices = list(name_counts.index)

# Form fields, in the same order as the parameters of predict_price.
inputs = [
    gr.Number(label='Year'),
    gr.Number(label='Kms Driven'),
    gr.Dropdown(choices=fuel_choices, label='Fuel Type'),
    gr.Dropdown(choices=company_choices, label='Company'),
]
# Car name: a dropdown when there are known names, otherwise a free-text box.
if name_choices:
    inputs.append(gr.Dropdown(choices=name_choices, label='Car Name'))
else:
    inputs.append(gr.Textbox(label='Car Name (optional)', value=''))

demo = gr.Interface(
    fn=predict_price,
    inputs=inputs,
    outputs=gr.Textbox(label='Predicted Price'),
)

# Launch the interface with Gradio's debug mode enabled.
demo.launch(debug=True)
