In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset from the CSV file (path is relative to the working directory).
DATA_FILE = "data.csv"
df = pd.read_csv(DATA_FILE)

In [None]:
# Show the freshly loaded DataFrame as this cell's output for a first look at the data.
df

In [None]:
# Print pandas' summary of the loaded frame (df.info()) before any cleaning is applied.
df.info()

In [None]:
# Impute missing values column by column. The fill values are computed from
# the current data first, then `df` is rebound to the filled frame returned by
# fillna() instead of mutating the existing object with `inplace=True`.
missing_value_fills = {
    'Engine Fuel Type': df['Engine Fuel Type'].mode()[0],  # most frequent value
    'Engine HP': df['Engine HP'].median(),
    'Engine Cylinders': df['Engine Cylinders'].median(),
    'Number of Doors': df['Number of Doors'].median(),
    'Market Category': 'Unknown',  # literal placeholder for a missing category
}
df = df.fillna(missing_value_fills)

In [None]:
# Drop fully duplicated rows, rebinding `df` to the de-duplicated frame rather
# than mutating the existing object with `inplace=True`.
df = df.drop_duplicates()

In [None]:
# --- Exploratory Data Analysis (EDA) ---
print("Basic Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() would emit a stray "None" line.
df.info()
print("\nSummary Statistics:")
print(df.describe())

In [None]:
# --- Visualizations ---
# Apply seaborn's "whitegrid" style to the figures drawn below. `set_theme` is
# the preferred entry point; `sns.set` is only a legacy alias for it.
sns.set_theme(style="whitegrid")

In [None]:
# Price distribution
# Only prices up to the cap are shown. The rows are filtered *before* binning:
# histplot spreads its bins across the full range of the data it receives, so
# merely cropping the x-axis afterwards would leave only the few bins that
# happen to fall inside the window visible.
msrp_cap = 100000
msrp_shown = df.loc[df['MSRP'].between(0, msrp_cap), 'MSRP']

fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(msrp_shown, bins=50, kde=True, ax=ax)
ax.set_title("MSRP Distribution")
ax.set_xlabel("Price ($)")
ax.set_ylabel("Frequency")
ax.set_xlim(0, msrp_cap)  # Keep the axis pinned to the displayed price range
plt.show()

In [None]:
# Fuel efficiency vs price: one semi-transparent point per row of `df`.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x='city mpg', y='MSRP', alpha=0.5, ax=ax)
ax.set(
    title="City MPG vs MSRP",
    xlabel="City MPG",
    ylabel="Price ($)",
    ylim=(0, 100000),  # crop the price axis; points above this are not shown
)
plt.show()

In [None]:
# Popular car brands: the ten makes with the most rows in `df`.
top_brands = df['Make'].value_counts().nlargest(10)

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=top_brands.index, y=top_brands.values, ax=ax)
ax.set_title("Top 10 Most Popular Car Brands")
ax.set_xlabel("Brand")
ax.set_ylabel("Count")
ax.tick_params(axis='x', labelrotation=45)  # tilt brand names so they do not overlap
plt.show()



In [None]:
# Correlation heatmap (Only numeric columns)
numeric_features = df.select_dtypes(include=['number'])
correlations = numeric_features.corr()

fig, ax = plt.subplots(figsize=(10, 6))
# annot/fmt print each coefficient in its cell, rounded to two decimals.
sns.heatmap(correlations, annot=True, cmap="coolwarm", fmt='.2f', ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()


In [None]:
# Additional Insights
# Count rows per make, then report the five most frequent.
brand_counts = df['Make'].value_counts()
print("Top 5 Most Popular Car Brands:")
print(brand_counts.head(5))

In [None]:
# Mean MSRP within each "Vehicle Size" group.
mean_msrp_by_size = df.groupby("Vehicle Size")['MSRP'].mean()
print("Average MSRP by Vehicle Size:")
print(mean_msrp_by_size)