<a href="https://colab.research.google.com/github/nkr9/Data-Analyst/blob/main/ML%20using%20Cars_Market_Analysis_from_Cars24..ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the Cars24 dataset by its Kaggle handle and keep the value returned by
# dataset_download in a variable (presumably the local directory holding the
# downloaded files -- TODO confirm against the kagglehub documentation).
amanrajput16_used_car_price_data_from_cars24_path = kagglehub.dataset_download('amanrajput16/used-car-price-data-from-cars24')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
# Nothing is printed when the directory does not exist (os.walk yields nothing).
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Title: Used Cars Market Analysis from Cars24 Data

#### Description: Analyzing trends in the used car market using scraped data from Cars24. Insights include price distribution, popular brands, demand patterns, and factors affecting resale value.

## Import dataset

In [None]:
# Load the Cars24 listings into a DataFrame.
# On Kaggle the dataset is mounted under /kaggle/input. Elsewhere (e.g. Colab)
# that directory does not exist, so fall back to the path stored by the
# kagglehub download cell. The fallback name is only evaluated when the Kaggle
# directory is missing, so this still works on Kaggle if that cell was deleted.
KAGGLE_INPUT_DIR = '/kaggle/input/used-car-price-data-from-cars24'
dataset_dir = (
    KAGGLE_INPUT_DIR
    if os.path.isdir(KAGGLE_INPUT_DIR)
    else amanrajput16_used_car_price_data_from_cars24_path
)
df = pd.read_csv(os.path.join(dataset_dir, 'cars24data.csv'))

In [None]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

In [None]:
# Preview the last rows as well, to spot trailing junk or truncated records.
df.tail()

In [None]:
# Summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Descriptive statistics for the frame's columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# dtype of every column.
df.dtypes

In [None]:
# Pairwise correlation between the numeric columns.
# The method has to be *called*; a bare `df.corr` only displays the bound
# method object. `numeric_only=True` is required because the string columns
# have not been encoded yet at this point in the notebook.
df.corr(numeric_only=True)

In [None]:
# (rows, columns) of the loaded dataset.
df.shape

In [None]:
# Column labels, for reference when selecting features in later cells.
df.columns

## Data visualizations

In [None]:
# Histogram of car prices (30 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x='Price', bins=30, kde=True, color='darkblue', ax=ax)
ax.set_title('Distribution of Car Prices')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Box plot of price for each manufacturing year; x tick labels are rotated so
# the year labels do not overlap.
# NOTE(review): recent seaborn releases deprecate `palette` without `hue` --
# confirm the installed version before switching to `hue=...` with `legend=False`.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=df, x='Manufacturing_year', y='Price', palette='Blues', ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Car Price Distribution by Manufacturing Year')
ax.set_xlabel('Manufacturing Year')
ax.set_ylabel('Price')
plt.show()

In [None]:
# Scatter of price against engine capacity with a fitted regression line;
# points are semi-transparent so dense regions remain readable.
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(
    data=df,
    x='Engine capacity',
    y='Price',
    scatter_kws={'alpha': 0.5},
    color='darkgreen',
    ax=ax,
)
ax.set_title('Price vs. Engine Capacity')
ax.set_xlabel('Engine Capacity (cc)')
ax.set_ylabel('Price')
plt.show()

In [None]:
# Number of listings per transmission type.
# NOTE(review): recent seaborn releases deprecate `palette` without `hue` --
# confirm the installed version before switching to `hue=...` with `legend=False`.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Transmission', palette='coolwarm', ax=ax)
ax.set_title('Count of Manual vs. Automatic Cars')
ax.set_xlabel('Transmission Type')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Scatter of price against kilometres driven with a fitted regression line;
# points are semi-transparent so dense regions remain readable.
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(
    data=df,
    x='KM driven',
    y='Price',
    scatter_kws={'alpha': 0.5},
    color='darkred',
    ax=ax,
)
ax.set_title('Price vs. KM Driven')
ax.set_xlabel('KM Driven')
ax.set_ylabel('Price')
plt.show()

In [None]:
# Number of listings per ownership category.
# NOTE(review): recent seaborn releases deprecate `palette` without `hue` --
# confirm the installed version before switching to `hue=...` with `legend=False`.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Ownership', palette='viridis', ax=ax)
ax.set_title('Ownership Type Distribution')
ax.set_xlabel('Ownership Type')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Box plot of price for each fuel type.
# NOTE(review): recent seaborn releases deprecate `palette` without `hue` --
# confirm the installed version before switching to `hue=...` with `legend=False`.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='Fuel type', y='Price', palette='coolwarm', ax=ax)
ax.set_title('Price Distribution Across Fuel Types')
ax.set_xlabel('Fuel Type')
ax.set_ylabel('Price')
plt.show()

In [None]:
# Horizontal count plot of the 'Imperfections' values, ordered from most to
# least frequent (value_counts() sorts by descending count).
# NOTE(review): recent seaborn releases deprecate `palette` without `hue` --
# confirm the installed version before switching to `hue=...` with `legend=False`.
fig, ax = plt.subplots(figsize=(8, 5))
imperfection_order = df['Imperfections'].value_counts().index
sns.countplot(
    data=df,
    y='Imperfections',
    order=imperfection_order,
    palette='Blues_r',
    ax=ax,
)
ax.set_title('Count of Different Imperfections')
ax.set_xlabel('Count')
ax.set_ylabel('Imperfection Type')
plt.show()

In [None]:
# Pie chart showing the share of each 'Repainted Parts' value.
fig, ax = plt.subplots(figsize=(6, 6))
repainted_counts = df['Repainted Parts'].value_counts()
repainted_counts.plot.pie(
    ax=ax,
    autopct='%1.1f%%',
    colors=['skyblue', 'orange'],
    startangle=90,
)
ax.set_title('Proportion of Cars with Repainted Parts')
ax.set_ylabel('')  # drop the default y-label that pandas derives from the series name
plt.show()

## Predictive modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
# Impute missing values in two passes, modifying `df` in place:
#   1. numeric columns are filled with their column median;
#   2. whatever is still missing is filled with the first mode of its column.
# NOTE(review): both statistics are computed on the full frame, i.e. before the
# train/test split performed later in the notebook.
numeric_medians = df.median(numeric_only=True)
df.fillna(numeric_medians, inplace=True)

column_modes = df.mode().iloc[0]  # first row of mode() holds one mode per column
df.fillna(column_modes, inplace=True)

In [None]:
# Encode categorical columns as integer codes, replacing them in `df`.
# A separate LabelEncoder is fitted for every column and kept in
# `label_encoders`, so each column's code -> original value mapping stays
# available (e.g. label_encoders['Fuel type'].inverse_transform(...)).
# Refitting one shared encoder would keep only the last column's mapping.
cat_cols = ['Model Name', 'Spare key', 'Transmission', 'Ownership', 'Fuel type', 'Imperfections', 'Repainted Parts']
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# Separate the prediction target ('Price') from the predictors (every other column).
y = df['Price']
X = df.drop('Price', axis=1)

In [None]:
# Train-test split (80-20)
# test_size=0.2 holds out 20% of the rows for evaluation; random_state is fixed
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Standardize the selected columns. The scaler is fitted on the training rows
# only and then applied unchanged to the test rows, so no test-set statistics
# feed into the scaling.
scale_cols = ['Engine capacity', 'KM driven', 'Manufacturing_year']
scaler = StandardScaler()
X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])

In [None]:
# Candidate regressors keyed by display name. Insertion order is the order in
# which they are trained and reported. Estimators that take a seed have it
# fixed to 42.
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Decision Tree", DecisionTreeRegressor(random_state=42)),
    ("Random Forest", RandomForestRegressor(n_estimators=100, random_state=42)),
    ("XGBoost", XGBRegressor(n_estimators=100, random_state=42)),
])

In [None]:
# Fit every candidate model on the training split and score it on the test
# split. `results` maps model name -> {"R² Score": ..., "MAE": ...}.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = {
        "R² Score": r2_score(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
    }


In [None]:
# Tabulate the metrics: pd.DataFrame(results) has one column per model, so it
# is transposed to get one row per model and one column per metric.
metrics_by_model = pd.DataFrame(results)
results_df = metrics_by_model.transpose()
print(results_df)

In [None]:
# Compare the models on a common 0-100 scale.
# The metrics are read from `results` (filled by the evaluation loop) instead
# of being typed in by hand, so the chart always matches the latest run. The
# `models` dict of estimators is deliberately left untouched so the training
# cell can be re-run afterwards.
model_names = list(results)
r2_scores = [results[model_name]["R² Score"] for model_name in model_names]
mae_values = [results[model_name]["MAE"] for model_name in model_names]

# Convert R² scores to percentages
r2_percent = [r * 100 for r in r2_scores]

# Normalize MAE to percentage (lower is better, so we reverse the scale):
# the model with the largest MAE maps to 0%, smaller errors map to higher values.
max_mae = max(mae_values, default=0)
mae_percent = [(1 - (m / max_mae)) * 100 if max_mae else 0.0 for m in mae_values]

# Create DataFrame for plotting
data = pd.DataFrame({"Model": model_names, "R² Score (%)": r2_percent, "MAE (%)": mae_percent})

# Plot the percentage bar graph. Both series are drawn on the same axes, with
# the inverted-MAE bars drawn on top of the R² bars.
plt.figure(figsize=(10, 5))
sns.barplot(x="Model", y="R² Score (%)", data=data, color="darkblue", label="R² Score")
sns.barplot(x="Model", y="MAE (%)", data=data, color="darkred", label="MAE (Inverted)")

plt.title("Model Performance Comparison in Percentage")
plt.ylabel("Percentage (%)")
plt.legend()
plt.ylim(0, 100)  # Set percentage limit
plt.show()


## Thank you! Please upvote if you found this notebook helpful.