<a href="https://colab.research.google.com/github/namankathuria21/EDA/blob/main/EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Bike Details Dataset
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the bike listings; the CSV is expected in the working directory.
df = pd.read_csv("bike_details.csv")

# df.info() prints its own report and returns None, so a literal "None"
# line appears between the schema summary and the five-row preview.
print(df.info(), df.head(), sep="\n")

In [None]:
#Q1. What is the range of selling prices in the dataset?
# The range is reported as a (lowest, highest) pair of selling prices.
lowest_price = df['selling_price'].min()
highest_price = df['selling_price'].max()
price_range = (lowest_price, highest_price)
print("Range of Selling Prices:", price_range)

In [None]:
#Q2. What is the median selling price for bikes in the dataset?
# Middle value of the selling_price column.
selling_prices = df['selling_price']
median_price = selling_prices.median()
print("Median Selling Price:", median_price)


In [None]:
#Q3. What is the most common seller type?
# mode() returns every value tied for most frequent; the first entry is reported.
seller_modes = df['seller_type'].mode()
common_seller = seller_modes[0]
print("Most Common Seller Type:", common_seller)

In [None]:
#Q4. How many bikes have driven more than 50,000 kilometers?
# Summing the boolean mask counts the rows strictly above the threshold.
count_high_km = df['km_driven'].gt(50000).sum()
print("Bikes driven more than 50,000 km:", count_high_km)

In [None]:
#Q5. What is the average km_driven value for each ownership type?
# One mean km_driven figure per distinct value of the owner column.
km_by_owner = df.groupby('owner')['km_driven']
avg_km_by_owner = km_by_owner.agg('mean')
print(avg_km_by_owner)

In [None]:
#Q6. What proportion of bikes are from the year 2015 or older?
# The mean of a boolean mask is the fraction of rows where it is True.
prop_old = df['year'].le(2015).mean()
print("Proportion of bikes from 2015 or older:", prop_old)

In [None]:
#Q7. What is the trend of missing values across the dataset?
# Build the null mask once: its column sums give the per-column counts and
# the mask itself is drawn as a heatmap (one row per record, one column per field).
null_mask = df.isnull()
missing_values = null_mask.sum()
print("Missing Values:\n", missing_values)
sns.heatmap(null_mask, cbar=False)
plt.show()

In [None]:
#Q8. What is the highest ex_showroom_price recorded, and for which bike?
# Locate the row label holding the largest ex_showroom_price, then show
# just the bike name and that price for the row.
wanted_columns = ['name','ex_showroom_price']
max_price_idx = df['ex_showroom_price'].idxmax()
print(df.loc[max_price_idx, wanted_columns])

In [None]:
#Q9. What is the total number of bikes listed by each seller type?
# Frequency table of the seller_type column, most frequent first.
seller_types = df['seller_type']
seller_counts = seller_types.value_counts()
print(seller_counts)

In [None]:
#Q10. What is the relationship between selling_price and km_driven for first-owner bikes?
# Keep only rows whose owner field is exactly "1st owner", then plot
# distance driven (x) against selling price (y).
is_first_owner = df['owner'].eq("1st owner")
first_owner = df.loc[is_first_owner]
sns.scatterplot(x="km_driven", y="selling_price", data=first_owner)
plt.title("Selling Price vs KM Driven (1st Owner Bikes)")
plt.show()

In [None]:
#Q11. Identify and remove outliers in the km_driven column using the IQR method.
# Rows are kept when km_driven lies within 1.5 * IQR of the quartiles
# (both fences inclusive). df is left untouched; the result is `filtered`.
km = df['km_driven']
Q1, Q3 = km.quantile(0.25), km.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5*IQR
upper_fence = Q3 + 1.5*IQR
filtered = df[km.between(lower_fence, upper_fence)]
print("Before:", len(df), "After Removing Outliers:", len(filtered))

In [None]:
#Q12. Perform a bivariate analysis to visualize the relationship between year and selling_price.
# One box of selling prices per manufacturing year.
sns.boxplot(x="year", y="selling_price", data=df)
# Rotate the year labels so they do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
#Q13. What is the average depreciation in selling price based on the bike's age?
import datetime

# Age is measured against the year in which the cell is executed, so the
# age labels shift when the notebook is re-run in a later year.
current_year = datetime.date.today().year
df['age'] = df['year'].rsub(current_year)

# Depreciation = showroom price minus resale price.
df['depreciation'] = df['ex_showroom_price'].sub(df['selling_price'])

# Mean depreciation for each age value.
avg_depreciation = df['depreciation'].groupby(df['age']).mean()
print(avg_depreciation)

In [None]:
#Q14. Which bike names are priced significantly above the average price for their manufacturing year?
# Mean selling price per manufacturing year (kept as a lookup table).
avg_year_price = df.groupby('year')['selling_price'].mean()

# Broadcast each year's mean back onto its rows and compare column-wise.
# This replaces a row-by-row df.apply with a Python lambda. A row with a
# missing year now compares as False instead of raising KeyError on the
# avg_year_price lookup.
year_mean_per_row = df.groupby('year')['selling_price'].transform('mean')
df['above_avg'] = df['selling_price'] > year_mean_per_row

# Distinct names of the bikes flagged as above their year's average.
above_avg_bikes = df.loc[df['above_avg'], 'name'].unique()
print("Bikes priced above average:", above_avg_bikes)

In [None]:
#Q15. Develop a correlation matrix for numeric columns and visualize it using a heatmap.
# NOTE(review): when the notebook is run top to bottom, df also carries the
# age, depreciation and above_avg columns added by the Q13/Q14 cells, so they
# are expected to show up in this matrix alongside the original columns.
corr = df.corr(numeric_only=True)
sns.heatmap(corr, cmap="coolwarm", annot=True)
plt.show()


In [None]:
#EDA – 2: Car Sales Dataset
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the car sales records; the CSV is expected in the working directory.
cars = pd.read_csv("car_sales.csv")
# cars.info() prints its own report and returns None, so a literal "None"
# line precedes the five-row preview.
print(cars.info(), cars.head(), sep="\n")

In [None]:
#Q1. What is the average selling price of cars for each dealer, and how does it compare across different dealers?
# Mean price per dealer, ordered from the highest average to the lowest.
price_by_dealer = cars.groupby("Dealer_Name")["Price ($)"]
avg_price_dealer = price_by_dealer.mean().sort_values(ascending=False)
print(avg_price_dealer)

# Bar chart of the same ranking.
plt.figure(figsize=(10,5))
avg_price_dealer.plot.bar()
plt.ylabel("Average Price ($)")
plt.title("Average Car Price per Dealer")
plt.show()

In [None]:
#Q2. Which car brand (Company) has the highest variation in prices, and what does this tell us about the pricing trends?
# Variation is measured as the standard deviation of price within each brand;
# the ranking is descending, so its first entry is the most variable brand.
brand_groups = cars.groupby("Company")
variation = brand_groups["Price ($)"].std().sort_values(ascending=False)
print("Brand with highest variation:", variation.head(1))

In [None]:
#Q3. What is the distribution of car prices for each transmission type, and how do the interquartile ranges compare?
# One box per transmission type; the box height is that type's interquartile range.
sns.boxplot(x="Transmission", y="Price ($)", data=cars)
plt.title("Car Prices by Transmission Type")
plt.show()

In [None]:
#Q4. What is the distribution of car prices across different regions?
# One box of prices per dealer region.
sns.boxplot(x="Dealer_Region", y="Price ($)", data=cars)
plt.title("Car Price Distribution Across Regions")
plt.show()

In [None]:
#Q5. What is the distribution of cars based on body styles?
# Bars are ordered by frequency, most common body style first.
body_style_order = cars["Body Style"].value_counts().index
sns.countplot(x="Body Style", order=body_style_order, data=cars)
plt.title("Car Count by Body Style")
plt.xticks(rotation=45)
plt.show()

In [None]:
#Q6. How does the average selling price of cars vary by customer gender and annual income?
# Only the gender split is computed in this cell: mean price per gender,
# followed by the full price distribution per gender.
avg_price_gender = cars["Price ($)"].groupby(cars["Gender"]).mean()
print(avg_price_gender)

sns.boxplot(x="Gender", y="Price ($)", data=cars)
plt.title("Car Price by Gender")
plt.show()

In [None]:
#Q7. What is the distribution of car prices by region, and how does the number of cars sold vary by region?
# Price distribution per dealer region (no title is set on this figure).
sns.boxplot(x="Dealer_Region", y="Price ($)", data=cars)
plt.show()

# Number of rows (sales) recorded for each region.
dealer_regions = cars["Dealer_Region"]
region_sales = dealer_regions.value_counts()
print(region_sales)

In [None]:
#Q8. How does the average car price differ between cars with different engine sizes?
# Mean price per value of the Engine column, highest first.
price_by_engine = cars.groupby("Engine")["Price ($)"]
avg_price_engine = price_by_engine.mean().sort_values(ascending=False)
print(avg_price_engine)

# Same comparison as a bar chart; seaborn computes the mean per engine itself.
sns.barplot(x="Engine", y="Price ($)", estimator="mean", data=cars)
plt.title("Average Car Price by Engine Type")
plt.show()

In [None]:
#Q9. How do car prices vary based on the customer’s annual income bracket?
# Bracket edges: (0, 50k], (50k, 100k], (100k, 150k], (150k, +inf).
# The top bracket is open-ended. With a hard upper edge of 200000, any
# income above it would fall outside every bin, be labelled NaN, and be
# silently omitted from the box plot. Incomes up to 200000 are bracketed
# exactly as before.
income_edges = [0, 50000, 100000, 150000, float("inf")]
income_labels = ["Low","Medium","High","Very High"]
cars["Income_Bracket"] = pd.cut(cars["Annual Income"], bins=income_edges, labels=income_labels)

# Price distribution within each income bracket.
sns.boxplot(data=cars, x="Income_Bracket", y="Price ($)")
plt.title("Car Price vs Customer Income Bracket")
plt.show()

In [None]:
#Q10. What are the top 5 car models with the highest number of sales, and how does their price distribution look?
# The five most frequent values of the Model column.
top_models = cars["Model"].value_counts().head(5).index

# Restrict the data to those models and draw one price box per model.
top_model_sales = cars[cars["Model"].isin(top_models)]
sns.boxplot(x="Model", y="Price ($)", data=top_model_sales)
plt.title("Price Distribution of Top 5 Selling Car Models")
plt.show()

In [None]:
#Q11. How does car price vary with engine size across different car colors, and which colors have the highest price variation?
# Boxes are grouped by colour on the x axis and split by engine within each colour.
sns.boxplot(x="Color", y="Price ($)", hue="Engine", data=cars)
plt.xticks(rotation=90)
plt.title("Price Variation by Engine and Color")
plt.show()

In [None]:
#Q12. Is there any seasonal trend in car sales based on the date of sale?
# Parse the Date column in place, then derive the calendar month (1-12)
# as a new Month column on cars.
sale_dates = pd.to_datetime(cars["Date"])
cars["Date"] = sale_dates
cars["Month"] = sale_dates.dt.month

# Number of sales recorded in each month.
sns.countplot(x="Month", data=cars)
plt.title("Car Sales by Month")
plt.show()

In [None]:
#Q13. How does the car price distribution change when considering different combinations of body style and transmission type?
# Body style on the x axis, with one box per transmission type inside each style.
sns.boxplot(x="Body Style", y="Price ($)", hue="Transmission", data=cars)
plt.xticks(rotation=45)
plt.title("Price Distribution by Body Style & Transmission")
plt.show()

In [None]:
#Q14. What is the correlation between car price, engine size, and annual income of customers, and how do these features interact?
# Only price and annual income enter the matrix; the Engine column is not included here.
price_income = cars[["Price ($)", "Annual Income"]]
num_corr = price_income.corr()
sns.heatmap(num_corr, cmap="coolwarm", annot=True)
plt.show()

In [None]:
#Q15. How does the average car price vary across different car models and engine types?
# Mean price for every (Model, Engine) pair, highest first; the ten most
# expensive combinations are printed.
price_by_model_engine = cars.groupby(["Model","Engine"])["Price ($)"]
avg_price_model_engine = price_by_model_engine.mean().sort_values(ascending=False)
print(avg_price_model_engine.head(10))

In [None]:
#EDA – 3: Amazon Sales Dataset
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the Amazon product listings; the CSV is expected in the working directory.
amazon = pd.read_csv("amazon_sales.csv")
# amazon.info() prints its own report and returns None, so a literal "None"
# line precedes the five-row preview.
print(amazon.info(), amazon.head(), sep="\n")

In [None]:
#Q1. What is the average rating for each product category?
# Mean rating per category, best-rated category first.
# NOTE(review): mean() needs `rating` to be numeric — confirm the CSV column
# is parsed as a number rather than text.
rating_by_category = amazon.groupby("category")["rating"]
avg_rating = rating_by_category.mean().sort_values(ascending=False)
print(avg_rating)

# Bar chart of the same ranking.
plt.figure(figsize=(10,5))
avg_rating.plot.bar()
plt.title("Average Rating by Category")
plt.ylabel("Average Rating")
plt.show()

In [None]:
#Q2. What are the top rating_count products by category?
# For each category, find the row with the largest rating_count and report
# which product it is. A plain per-group nlargest(1) only yields the
# category, a row index and the count, so the product is never named.
# Rows without a rating_count cannot win and are skipped, so a category
# with no counts at all is simply absent from the result.
# NOTE(review): assumes rating_count is numeric — confirm against the CSV.
rated = amazon.dropna(subset=["rating_count"])
top_idx = rated.groupby("category")["rating_count"].idxmax()
top_products = amazon.loc[top_idx, ["category", "product_name", "rating_count"]].reset_index(drop=True)
print(top_products)

In [None]:
#Q3. What is the distribution of discounted prices vs. actual prices?
# Each point is one product: list price on x, discounted price on y.
# Points are semi-transparent so dense regions remain readable.
plt.figure(figsize=(8,5))
sns.scatterplot(x="actual_price", y="discounted_price", alpha=0.5, data=amazon)
plt.xlabel("Actual Price")
plt.ylabel("Discounted Price")
plt.title("Discounted vs Actual Price")
plt.show()

In [None]:
#Q4. How does the average discount percentage vary across categories?
# Mean discount percentage per category, largest first.
# NOTE(review): mean() needs discount_percentage to be numeric — confirm the
# CSV column is parsed as a number rather than text.
discount_by_category = amazon.groupby("category")["discount_percentage"]
avg_discount = discount_by_category.mean().sort_values(ascending=False)
print(avg_discount)

avg_discount.plot.bar(figsize=(10,5))
plt.title("Average Discount Percentage by Category")
plt.ylabel("Discount %")
plt.show()

In [None]:
#Q5. What are the most popular product names?
# Popularity here means how often a product_name value occurs in the data;
# the ten most frequent names are kept.
name_counts = amazon["product_name"].value_counts()
popular_products = name_counts.head(10)
print(popular_products)

popular_products.plot.barh(figsize=(8,5))
plt.title("Top 10 Popular Product Names")
plt.show()

In [None]:
#Q6. What are the most popular product keywords?
from collections import Counter

# Drop missing names and force everything to str before joining:
# str.join raises TypeError as soon as it meets a NaN (float) entry.
product_names = amazon["product_name"].dropna().astype(str)

# Lower-case the combined text and split on whitespace to get the words.
keywords = " ".join(product_names).lower().split()

# The 15 most frequent words as (word, count) pairs.
common_keywords = Counter(keywords).most_common(15)
print(common_keywords)

In [None]:
#Q7. What are the most popular product reviews?
# The ten review titles that occur most often in the data.
review_title_counts = amazon["review_title"].value_counts()
popular_reviews = review_title_counts.head(10)
print(popular_reviews)

In [None]:
#Q8. What is the correlation between discounted_price and rating?
# 2x2 correlation matrix of the two columns.
# NOTE(review): rebinds the name `corr` used by the bike correlation cell.
price_and_rating = amazon[["discounted_price","rating"]]
corr = price_and_rating.corr()
print(corr)

# Scatter of the same two columns for a visual check.
sns.scatterplot(x="discounted_price", y="rating", alpha=0.5, data=amazon)
plt.title("Correlation: Discounted Price vs Rating")
plt.show()

In [None]:
#Q9. What are the Top 5 categories based on the highest ratings?
# Relies on avg_rating from the Q1 cell, which is already sorted from the
# highest mean rating down, so its first five entries are the top five.
top5_cats = avg_rating.iloc[:5]
print(top5_cats)

Q10. Identify any potential areas for improvement or optimization based on the data analysis.

Categories with low ratings may need better product quality or description.

Categories with high discounts but low sales may point to a pricing-strategy issue.

Products with many reviews but a low average rating should be prioritized for quality improvement.

Marketing should target top-rated + most reviewed categories for maximum sales.

In [None]:
#EDA – 4: Spotify Dataset
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the Spotify track table; the CSV is expected in the working directory.
spotify = pd.read_csv("spotify_tracks.csv")
# spotify.info() prints its own report and returns None, so a literal "None"
# line precedes the five-row preview.
print(spotify.info(), spotify.head(), sep="\n")

In [None]:
#Q1. Read the dataframe, check null value if present then do the needful, check duplicate row, if present then do the needful
# (The question line must be a comment: as bare text it is a SyntaxError
# and the cell cannot run.)

# Per-column count of missing values, then drop every row that has any.
print(spotify.isnull().sum())
spotify = spotify.dropna()   # remove nulls

# Count fully duplicated rows, drop them, and confirm none remain.
print("Duplicates before:", spotify.duplicated().sum())
spotify = spotify.drop_duplicates()
print("Duplicates after:", spotify.duplicated().sum())

In [None]:
#Q2. What is the distribution of popularity among the tracks in the dataset? Visualize it using a histogram
# (The question line must be a comment: as bare text it is a SyntaxError.)
# Histogram of the Popularity column using 20 bins with outlined bars.
plt.hist(spotify["Popularity"], bins=20, edgecolor="black")
plt.title("Distribution of Track Popularity")
plt.xlabel("Popularity Score")
plt.ylabel("Frequency")
plt.show()

In [None]:
#Q3. Is there any relationship between the popularity and the duration of tracks? Explore this using a scatter plot
# (The question line must be a comment: as bare text it is a SyntaxError.)
# Each point is one track: duration on x, popularity on y.
sns.scatterplot(data=spotify, x="Duration (ms)", y="Popularity", alpha=0.5)
plt.title("Popularity vs Duration of Tracks")
plt.show()

In [None]:
#Q4. Which artist has the highest number of tracks in the dataset? Display the count of tracks for each artist using a countplot
# (The question line must be a comment: as bare text it is a SyntaxError.)
# Track count per artist; the ten artists with the most tracks are kept.
top_artists = spotify["Artist"].value_counts().head(10)
print(top_artists)

# Horizontal count plot limited to those ten artists, in the same order.
# The order is taken from top_artists instead of recomputing value_counts.
sns.countplot(data=spotify, y="Artist", order=top_artists.index)
plt.title("Top 10 Artists by Track Count")
plt.show()

In [None]:
#Q5. What are the top 5 least popular tracks in the dataset? Provide the artist name and track name for each
# (The question line must be a comment: as bare text it is a SyntaxError.)
# Sort ascending by Popularity and keep the first five rows, showing only
# the artist, track name and popularity score.
least_popular = spotify.sort_values("Popularity").head(5)[["Artist","Track Name","Popularity"]]
print(least_popular)

In [None]:
#Q6. Among the top 5 most popular artists, which artist has the highest popularity on average?
# (The question line must be a comment: as bare text it is a SyntaxError.)
# The "top 5" artists are selected by number of tracks in the dataset.
top5_artists = spotify["Artist"].value_counts().head(5).index

# Mean Popularity of each of those artists, printed highest first.
avg_popularity = spotify[spotify["Artist"].isin(top5_artists)].groupby("Artist")["Popularity"].mean()
print(avg_popularity.sort_values(ascending=False))

In [None]:
#Q7. For the top 5 most popular artists, what are their most popular tracks?
# (The question line must be a comment: as bare text it is a SyntaxError.)
# Relies on top5_artists from the Q6 cell. For each of those artists, take
# the row label of the track with the highest Popularity and show that row.
most_popular_tracks = spotify[spotify["Artist"].isin(top5_artists)].groupby("Artist")["Popularity"].idxmax()
print(spotify.loc[most_popular_tracks, ["Artist","Track Name","Popularity"]])

In [None]:
#Q8. Visualize relationships between multiple numerical variables simultaneously using a pair plot
# (The question line must be a comment: as bare text it is a SyntaxError.)
# Pairwise plots of the two columns selected here: Popularity and Duration (ms).
sns.pairplot(spotify[["Popularity","Duration (ms)"]])
plt.show()

In [None]:
#Q9. Does the duration of tracks vary significantly across different artists? Explore this visually using a box plot or violin plot
# (The question line must be a comment: as bare text it is a SyntaxError.)
# One box of track durations per artist; labels are rotated to limit overlap.
sns.boxplot(data=spotify, x="Artist", y="Duration (ms)")
plt.xticks(rotation=90)
plt.title("Duration Variation Across Artists")
plt.show()

In [None]:
#Q10. How does the distribution of track popularity vary for different artists? Visualize this using a swarm plot or a violin plot
# (The question line must be a comment: as bare text it is a SyntaxError.)
# One violin of Popularity per artist, with quartile lines drawn inside each.
sns.violinplot(data=spotify, x="Artist", y="Popularity", inner="quartile")
plt.xticks(rotation=90)
plt.title("Popularity Distribution by Artist")
plt.show()