# Car Price Data Analysis
This notebook explores factors affecting car prices using visualization and statistical analysis.

Original dataset: [Car Price Dataset on Kaggle](https://www.kaggle.com/datasets/asinow/car-price-dataset)


In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load the dataset.
# The CSV location is configurable so the notebook is not tied to one machine:
# set the CAR_PRICE_DATASET environment variable to the file's path, or place
# car_price_dataset.csv in the notebook's working directory (the default).
file_path = os.environ.get("CAR_PRICE_DATASET", "car_price_dataset.csv")
df = pd.read_csv(file_path)

ModuleNotFoundError: No module named 'seaborn'

## Data Overview

In [None]:
# Print the frame's summary, then preview its first five rows
df.info()
df.head(n=5)

In [None]:
# Apply seaborn's "whitegrid" style to the plots drawn from here on
sns.set_style(style="whitegrid")

## Distribution of Car Prices

In [None]:
# Histogram of the Price column (50 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df["Price"], bins=50, kde=True, color="blue", ax=ax)
ax.set_title("Distribution of Car Prices", fontsize=14)
ax.set_xlabel("Price", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
plt.show()

## Correlation Analysis

In [None]:
# Pairwise correlations of the numeric columns, drawn as an annotated heatmap
corr_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Matrix of Numerical Features", fontsize=14)
plt.show()

## Price Distribution by Brand

In [None]:
# Restrict the plot to the ten most frequent brands in the data
top_brands = df["Brand"].value_counts().nlargest(10).index
top_brand_rows = df[df["Brand"].isin(top_brands)]

# One box per brand showing its price spread
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=top_brand_rows, x="Brand", y="Price", ax=ax)
plt.xticks(rotation=45)  # tilt the brand names so they do not overlap
ax.set_title("Car Price Distribution by Top 10 Brands", fontsize=14)
ax.set_xlabel("Brand", fontsize=12)
ax.set_ylabel("Price", fontsize=12)
plt.show()

## Price Distribution by Fuel Type

In [None]:
# Compare the price spread across fuel types, one coloured box per category.
# seaborn 0.13 deprecates passing `palette` without `hue` (slated for removal
# in 0.14), so the x variable is also assigned to `hue`. `dodge=False` keeps a
# single centred box per category, and any legend is dropped because it would
# only repeat the x-axis labels.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(
    data=df,
    x="Fuel_Type",
    y="Price",
    hue="Fuel_Type",
    palette="Set2",
    dodge=False,
    ax=ax,
)
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title("Car Price Distribution by Fuel Type", fontsize=14)
ax.set_xlabel("Fuel Type", fontsize=12)
ax.set_ylabel("Price", fontsize=12)
plt.show()

## Price Distribution by Transmission

In [None]:
# Compare the price spread across transmission types, one coloured box each.
# seaborn 0.13 deprecates passing `palette` without `hue` (slated for removal
# in 0.14), so the x variable is also assigned to `hue`. `dodge=False` keeps a
# single centred box per category, and any legend is dropped because it would
# only repeat the x-axis labels.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(
    data=df,
    x="Transmission",
    y="Price",
    hue="Transmission",
    palette="coolwarm",
    dodge=False,
    ax=ax,
)
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title("Car Price Distribution by Transmission Type", fontsize=14)
ax.set_xlabel("Transmission", fontsize=12)
ax.set_ylabel("Price", fontsize=12)
plt.show()

## Data Quality Check

In [None]:
# Count missing entries per column and report them
missing_values = df.isna().sum()
print("Missing Values:", missing_values)

# Count rows that duplicate an earlier row and report the total
duplicate_rows = df.duplicated().sum()
print("\nDuplicate Rows:", duplicate_rows)