#Assignment - I
#EDA-1

In [None]:
#1.What is the range of selling prices in the dataset ?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
# Configuration for the bike analysis: input CSV, output folder, reference year.
DATA_PATH = "BIKE DETAILS.csv"
OUTPUT_DIR = "bike_analysis_outputs"
CURRENT_YEAR = 2025

# Create the output folder up front; an existing folder is not an error.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the data and strip surrounding whitespace from the header names.
df = pd.read_csv(DATA_PATH)
df.columns = df.columns.str.strip()
print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())

# Q1: range of selling prices. Abort when the column is absent.
if 'selling_price' not in df.columns:
    raise KeyError("Column 'selling_price' not found")
selling_prices = df['selling_price']
sp_min = selling_prices.min()
sp_max = selling_prices.max()
print(f"1) Selling price range: min = {sp_min}, max = {sp_max}")


In [None]:
# Q2: median selling price across all bikes in the dataset.
selling_prices = df['selling_price']
median_sp = selling_prices.median()
print("2) Median selling price: {}".format(median_sp))


In [None]:
# Q3: most common seller type (first entry of the mode when several tie).
if 'seller_type' not in df.columns:
    print("Column 'seller_type' not found")
else:
    seller_modes = df['seller_type'].mode()
    most_common_seller = seller_modes.iloc[0]
    print(f"3) Most common seller_type: {most_common_seller}")


In [None]:
# Q4: number of bikes driven more than 50,000 km.
if 'km_driven' not in df.columns:
    print("Column 'km_driven' not found")
else:
    # Coerce to numeric in place; unparseable entries become NaN and never
    # satisfy the comparison below.
    df['km_driven'] = pd.to_numeric(df['km_driven'], errors='coerce')
    over_50k = df['km_driven'] > 50000
    count_gt50k = int(over_50k.sum())
    print(f"4) Bikes with km_driven > 50,000: {count_gt50k}")


In [None]:
# Q5: average km_driven for each ownership type, highest average first.
if 'ownership' not in df.columns:
    print("Column 'ownership' not found")
else:
    mean_km = df.groupby('ownership')['km_driven'].mean()
    avg_km_by_ownership = mean_km.reset_index().sort_values('km_driven', ascending=False)
    print("5) Average km_driven per ownership type:")
    print(avg_km_by_ownership)
    out_path = os.path.join(OUTPUT_DIR, "avg_km_by_ownership.csv")
    avg_km_by_ownership.to_csv(out_path, index=False)



In [None]:
# Q6: share of bikes manufactured in 2015 or earlier.
if 'year' not in df.columns:
    print("Column 'year' not found")
else:
    # Coerce to numeric in place; rows whose year cannot be parsed become NaN.
    # They still count in the denominator but never in the numerator.
    df['year'] = pd.to_numeric(df['year'], errors='coerce')
    total = len(df)
    older_count = int((df['year'] <= 2015).sum())
    proportion_older = older_count / total
    print(f"6) Proportion of bikes from year 2015 or older: {older_count}/{total} = {proportion_older:.3f}")


In [None]:
# Q7: missing values per column, as absolute counts and as a percentage of rows.
null_mask = df.isnull()
missing_counts = null_mask.sum().sort_values(ascending=False)
missing_percent = (missing_counts / len(df) * 100).round(2)
missing_df = pd.DataFrame({
    'missing_count': missing_counts,
    'missing_percent': missing_percent,
})

# Only columns that actually have gaps are printed; the CSV keeps every column.
has_gaps = missing_df['missing_count'] > 0
print("7) Missing values by column:")
print(missing_df[has_gaps])
summary_path = os.path.join(OUTPUT_DIR, "missing_values_summary.csv")
missing_df.to_csv(summary_path)


In [None]:
# Q8: highest ex_showroom_price on record and the bike it belongs to.
has_price_and_name = 'ex_showroom_price' in df.columns and 'name' in df.columns
if not has_price_and_name:
    print("Columns 'ex_showroom_price' and/or 'name' not found")
else:
    df['ex_showroom_price'] = pd.to_numeric(df['ex_showroom_price'], errors='coerce')
    # idxmax returns the row label of the largest price.
    idxmax = df['ex_showroom_price'].idxmax()
    top_row = df.loc[idxmax]
    max_price = df.loc[idxmax, 'ex_showroom_price']
    bike_name_max = top_row['name']
    print(f"8) Highest ex_showroom_price = {max_price}, bike: {bike_name_max} (index {idxmax})")


In [None]:
# Q9: number of bikes listed by each seller type.
if 'seller_type' not in df.columns:
    print("Column 'seller_type' not found")
else:
    seller_counts = df['seller_type'].value_counts()
    counts_by_seller = seller_counts.reset_index()
    # Overwrite both headers so the CSV layout is fixed.
    counts_by_seller.columns = ['seller_type', 'count']
    print("9) Total number of bikes by seller type:")
    print(counts_by_seller)
    out_path = os.path.join(OUTPUT_DIR, "counts_by_seller_type.csv")
    counts_by_seller.to_csv(out_path, index=False)


In [None]:
# Q10: relationship between selling_price and km_driven for first-owner bikes.
if 'ownership' not in df.columns:
    print("Column 'ownership' not found")
else:
    # Case-insensitive match on "First"; missing ownership values do not match.
    is_first_owner = df['ownership'].astype(str).str.contains('First', case=False, na=False)
    first_owner_df = df[is_first_owner]
    if first_owner_df.empty:
        print("No first-owner bikes found")
    else:
        # Correlation computed on rows where both values are present.
        complete_pairs = first_owner_df[['selling_price','km_driven']].dropna()
        corr = complete_pairs.corr().iloc[0,1]
        print(f"10) Correlation (selling_price vs km_driven) for first-owner bikes: {corr:.4f}")

        # Scatter plot with a fitted regression line, saved to the output folder.
        plot_path = os.path.join(OUTPUT_DIR, "selling_vs_km_first_owner.png")
        plt.figure(figsize=(8,6))
        sns.regplot(
            data=first_owner_df,
            x='km_driven',
            y='selling_price',
            scatter_kws={'s':10},
            line_kws={'lw':2},
        )
        plt.title("selling_price vs km_driven (First-owner bikes)")
        plt.tight_layout()
        plt.savefig(plot_path)
        plt.close()


In [None]:
# Q11: identify and remove outliers in km_driven using the IQR method.
# Rows outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are exported as outliers; the
# remaining rows go to a separate DataFrame and CSV. df itself is not modified.
if 'km_driven' in df.columns:
    q1 = df['km_driven'].quantile(0.25)
    q3 = df['km_driven'].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5*iqr
    upper_bound = q3 + 1.5*iqr

    outliers = df[(df['km_driven'] < lower_bound) | (df['km_driven'] > upper_bound)].copy()
    print(f"11) IQR method: lower={lower_bound}, upper={upper_bound}")
    print(f"Outliers detected in km_driven: {outliers.shape[0]}")
    # The export was fused onto the print line above (a syntax error); it is
    # its own statement now.
    outliers[['km_driven','name','year']].to_csv(os.path.join(OUTPUT_DIR,"km_driven_outliers.csv"), index=False)

    # Rows with a missing km_driven fail both comparisons, so they appear in
    # neither the outlier set nor the cleaned set.
    df_no_outliers = df[(df['km_driven'] >= lower_bound) & (df['km_driven'] <= upper_bound)].copy()
    print(f"Rows before: {len(df)}, after removing outliers: {len(df_no_outliers)}")
    df_no_outliers.to_csv(os.path.join(OUTPUT_DIR, "data_km_no_outliers.csv"), index=False)
else:
    print("Column 'km_driven' not found")


In [None]:
# Q12: bivariate analysis of year vs selling_price. Saves a per-year boxplot
# and a scatter plot overlaid with a LOWESS trend line.
if 'year' not in df.columns:
    print("Column 'year' not found")
else:
    # Boxplot: one box per manufacturing year, in ascending year order.
    years_sorted = sorted(df['year'].dropna().unique())
    boxplot_path = os.path.join(OUTPUT_DIR, "boxplot_selling_by_year.png")
    plt.figure(figsize=(10,6))
    sns.boxplot(data=df, x='year', y='selling_price', order=years_sorted)
    plt.xticks(rotation=45)
    plt.title("Selling price distribution by manufacturing year")
    plt.tight_layout()
    plt.savefig(boxplot_path)
    plt.close()

    # Scatter of the raw points; regplot draws only the smoothed curve
    # (scatter=False) on the same axes.
    scatter_path = os.path.join(OUTPUT_DIR, "scatter_selling_by_year.png")
    plt.figure(figsize=(8,6))
    sns.scatterplot(data=df, x='year', y='selling_price', alpha=0.6, s=20)
    sns.regplot(data=df, x='year', y='selling_price', scatter=False, lowess=True)
    plt.title("Year vs Selling Price (scatter with lowess)")
    plt.tight_layout()
    plt.savefig(scatter_path)
    plt.close()
    print("12) Bivariate plots saved.")


In [None]:
# Q13: average depreciation per year of age, where age = CURRENT_YEAR - year
# and depreciation = ex_showroom_price - selling_price.
required_cols = ('year', 'selling_price', 'ex_showroom_price')
if all(col in df.columns for col in required_cols):
    # Work on a copy so the helper columns do not leak into df.
    df_age = df.copy()
    df_age['age'] = CURRENT_YEAR - df_age['year']
    # A non-positive age would make the per-year figures meaningless or
    # undefined, so blank it out.
    non_positive_age = df_age['age'] <= 0
    df_age.loc[non_positive_age, 'age'] = np.nan

    df_age['abs_dep_total'] = df_age['ex_showroom_price'] - df_age['selling_price']
    df_age['abs_dep_per_year'] = df_age['abs_dep_total'] / df_age['age']
    fraction_lost = df_age['abs_dep_total'] / df_age['ex_showroom_price']
    df_age['pct_dep_per_year'] = fraction_lost / df_age['age'] * 100

    avg_abs_dep_per_year = df_age['abs_dep_per_year'].mean()
    avg_pct_dep_per_year = df_age['pct_dep_per_year'].mean()
    print(f"13) Average absolute depreciation per year: {avg_abs_dep_per_year:.2f}")
    print(f"    Average percentage depreciation per year: {avg_pct_dep_per_year:.2f} %")

    export_cols = ['name','year','ex_showroom_price','selling_price','age','abs_dep_per_year','pct_dep_per_year']
    df_age[export_cols].to_csv(os.path.join(OUTPUT_DIR, "depreciation_per_bike.csv"), index=False)
else:
    print("One of required columns ('year','selling_price','ex_showroom_price') not found")


In [None]:
# Q14: bikes priced significantly above the average for their manufacturing year.
# "Significantly" means a z-score above 2, computed against the mean and
# standard deviation of selling_price within the same year.
if 'year' in df.columns and 'selling_price' in df.columns and 'name' in df.columns:
    # transform() broadcasts the per-year statistic back onto every row.
    # Assigning the columns directly (instead of merging a summary table into
    # df) keeps the cell idempotent: a re-run overwrites the columns rather
    # than producing year_mean_x / year_mean_y duplicates.
    prices_by_year = df.groupby('year')['selling_price']
    year_mean = prices_by_year.transform('mean')
    year_std = prices_by_year.transform('std')
    df['year_mean'] = year_mean
    df['year_std'] = year_std
    df['z_score_year'] = (df['selling_price'] - df['year_mean']) / df['year_std']

    significantly_above = df[df['z_score_year'] > 2].sort_values('z_score_year', ascending=False)
    print(f"14) Bikes priced significantly above average for their year (z > 2): {len(significantly_above)} found")
    # The export was fused onto the print line above (a syntax error).
    significantly_above[['name','year','selling_price','year_mean','z_score_year']].to_csv(os.path.join(OUTPUT_DIR, "significantly_above_by_year.csv"), index=False)
else:
    print("Required columns for analysis 14 not present")


In [None]:
# Q15: correlation matrix over every numeric column, exported as CSV and
# rendered as an annotated heatmap.
numeric_frame = df.select_dtypes(include=[np.number])
numeric_cols = numeric_frame.columns.tolist()
print("Numeric columns used for correlation:", numeric_cols)

corr_matrix = df[numeric_cols].corr()
corr_matrix.to_csv(os.path.join(OUTPUT_DIR, "correlation_matrix.csv"))

heatmap_path = os.path.join(OUTPUT_DIR, "correlation_heatmap.png")
plt.figure(figsize=(10,8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    square=True,
    cmap="vlag",
    cbar_kws={'shrink':0.7},
)
plt.title("Correlation matrix (numeric columns)")
plt.tight_layout()
plt.savefig(heatmap_path)
plt.close()
print("All outputs saved in folder:", OUTPUT_DIR)


# EDA -2

In [None]:
# 1.What is the average selling price of cars for each dealer, and how does it compare across different dealers?
import pandas as pd, numpy as np, os, matplotlib.pyplot as plt
# Configuration for the car analysis: input CSV (placeholder name) and output folder.
DATA_PATH = "your_car_file.csv"
OUTPUT_DIR = "car_analysis_outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the data and strip surrounding whitespace from the header names.
df = pd.read_csv(DATA_PATH)
df.columns = df.columns.str.strip()

# Use the first known spelling of the price column that is present.
price_col = None
for c in ['selling_price','price','SalePrice','sellingPrice']:
    if c in df.columns:
        price_col = c
        break
if price_col is None:
    raise ValueError("No price column found. Rename your selling price column to 'selling_price' or 'price'.")

# Coerce the numeric columns used by later cells; unparseable values become NaN.
for c in [price_col, 'engine_size', 'annual_income']:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

# Q1: average selling price per dealer, with count/median/std for comparison.
if 'dealer' in df.columns:
    avg_price_by_dealer = df.groupby('dealer')[price_col].agg(['count','mean','median','std']).reset_index()
    avg_price_by_dealer.columns = ['dealer','count','avg_price','median_price','std_price']
    # The export was fused onto the line above (a syntax error).
    avg_price_by_dealer.to_csv(os.path.join(OUTPUT_DIR,"avg_price_by_dealer.csv"), index=False)

    # Bar chart of the average price per dealer.
    plt.figure(figsize=(10,5))
    plt.bar(avg_price_by_dealer['dealer'].astype(str), avg_price_by_dealer['avg_price'])
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR,"avg_price_by_dealer.png"))
    plt.close()


In [None]:
# Q2: price variation per car brand; the brand with the largest standard
# deviation ends up in the first row of the export.
if 'company' in df.columns:
    per_brand = df.groupby('company')[price_col].agg(['count','mean','std'])
    brand_stats = per_brand.reset_index().sort_values('std', ascending=False)
    out_path = os.path.join(OUTPUT_DIR,"brand_price_variation.csv")
    brand_stats.to_csv(out_path, index=False)


In [None]:
# Q3: price distribution per transmission type and a comparison of the IQRs.
if 'transmission' in df.columns:
    # Iterate the groupby directly so each item is (transmission, price Series).
    # groupby(...).apply(lambda x: x.dropna()) flattens everything into one
    # Series, whose .items() yields single values rather than per-group data.
    trans = {
        name: prices.dropna()
        for name, prices in df.groupby('transmission')[price_col]
    }

    iqr_rows = []
    labels = []
    data = []
    for name, series in trans.items():
        if len(series) == 0:
            continue  # no usable prices for this transmission type
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr_rows.append({'transmission':name,'q1':q1,'q3':q3,'IQR':q3-q1,'count':len(series)})
        labels.append(str(name))
        data.append(series.values)

    # Write the table once, after every group has been processed. Explicit
    # columns keep the header even when no group had data.
    pd.DataFrame(iqr_rows, columns=['transmission','q1','q3','IQR','count']).to_csv(os.path.join(OUTPUT_DIR,"transmission_iqr.csv"), index=False)

    if data:
        plt.figure(figsize=(8,5))
        plt.boxplot(data, labels=labels, showfliers=False)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(OUTPUT_DIR,"boxplot_price_by_transmission.png"))
        plt.close()



In [None]:
# Q4: price distribution across regions. Exports summary statistics and draws
# one histogram for each of the six regions with the most rows.
if 'region' in df.columns:
    region_stats = (
        df.groupby('region')[price_col]
        .agg(['count','mean','median','std'])
        .reset_index()
        .sort_values('mean',ascending=False)
    )
    region_stats.to_csv(os.path.join(OUTPUT_DIR,"region_price_stats.csv"), index=False)

    busiest = region_stats.sort_values('count',ascending=False).head(6)
    top_regions = busiest['region'].tolist()
    if top_regions:
        plt.figure(figsize=(12,8))
        # 3x2 grid: one panel per region, filled in order of row count.
        for panel, region_name in enumerate(top_regions, start=1):
            plt.subplot(3,2,panel)
            arr = df.loc[df['region']==region_name, price_col].dropna()
            plt.hist(arr, bins=15)
            plt.title(f"{region_name} (n={len(arr)})")
        plt.tight_layout()
        plt.savefig(os.path.join(OUTPUT_DIR,"hist_price_top_regions.png"))
        plt.close()


In [None]:
# Q5: number of cars per body style, exported as a two-column CSV
# (body_style, count).
if 'body_style' in df.columns:
    # Name the index and the value column explicitly. This fixes the missing
    # quote in the old rename dict and avoids relying on the default headers
    # of value_counts().reset_index(), which differ between pandas versions.
    body_style_counts = (
        df['body_style']
        .value_counts()
        .rename_axis('body_style')
        .reset_index(name='count')
    )
    body_style_counts.to_csv(os.path.join(OUTPUT_DIR,"body_style_distribution.csv"), index=False)


In [None]:
# Q6: average selling price by customer gender and annual-income bracket.
if 'gender' in df.columns and 'annual_income' in df.columns:
    # Bucket incomes into labelled brackets. The -1 lower edge puts an income
    # of 0 into the first bracket; 1e9 acts as an open-ended upper bound.
    income_bins = [-1,25000,50000,75000,100000,150000,1e9]
    income_labels = ['0-25k','25k-50k','50k-75k','75k-100k','100k-150k','150k+']
    df['income_bracket'] = pd.cut(
        pd.to_numeric(df['annual_income'],errors='coerce'),
        bins=income_bins,
        labels=income_labels,
    )
    # income_bracket is categorical. observed=False states explicitly what the
    # implicit default did (keep unobserved gender/bracket combinations) and
    # avoids pandas' deprecation warning about that default.
    pivot = (
        df.groupby(['gender','income_bracket'], observed=False)[price_col]
        .mean()
        .reset_index()
        .rename(columns={price_col:'avg_price'})
    )
    pivot.to_csv(os.path.join(OUTPUT_DIR,"avg_price_by_gender_income.csv"), index=False)


In [None]:
# Q7: price distribution by region and number of cars sold per region.
# The region with the most rows comes first.
# The old code referenced an undefined name `region_col`; the 'region' column
# is used directly, behind the same presence guard as the other cells.
if 'region' in df.columns:
    region_stats = (
        df.groupby('region')[price_col]
        .agg(
            count='count',
            mean='mean',
            median='median',
            std='std',
            q1=lambda x: x.quantile(0.25),
            q3=lambda x: x.quantile(0.75),
        )
        .reset_index()
        .sort_values('count', ascending=False)
    )
    print(region_stats.head(200))
    # Written into OUTPUT_DIR like every other export of this analysis.
    region_stats.to_csv(os.path.join(OUTPUT_DIR, "region_price_distribution.csv"), index=False)
else:
    print("Column 'region' not found")


In [None]:
# Q8: average car price per engine-size bucket.
if 'engine_size' in df.columns:
    df['engine_bucket'] = pd.cut(
        pd.to_numeric(df['engine_size'],errors='coerce'),
        bins=[-1,1000,1500,2000,3000,1e9],
        labels=['<=1000','1000-1500','1500-2000','2000-3000','3000+'],
    )
    # The aggregation was fused onto the pd.cut line (a syntax error); it is
    # its own statement now. observed=False keeps empty buckets in the output,
    # which is what the implicit default did, and avoids pandas' deprecation
    # warning about that default.
    engine_price_stats = df.groupby('engine_bucket', observed=False)[price_col].agg(['count','mean','median','std'])
    engine_price_stats.to_csv(os.path.join(OUTPUT_DIR,"engine_price_stats.csv"))


In [None]:
# Q9: car price distribution per annual-income bracket, as a boxplot.
# Runs only if an 'income_bracket' column exists on df.
if 'income_bracket' in df.columns:
    import matplotlib.pyplot as plt

    brackets = df['income_bracket'].cat.categories
    # One price Series per bracket, in category order.
    groups = [df[df['income_bracket']==b][price_col].dropna() for b in brackets]
    bracket_labels = [str(b) for b in brackets]

    plt.figure(figsize=(8,5))
    plt.boxplot(groups, labels=bracket_labels, showfliers=False)
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR,"boxplot_price_by_income_bracket.png"))
    plt.close()


In [None]:
# Q10: the five best-selling models and the price distribution of each.
if 'model' in df.columns:
    top5 = df['model'].value_counts().head(5).index.tolist()
    df[df['model'].isin(top5)][[ 'model', price_col ]].to_csv(os.path.join(OUTPUT_DIR,"top5_models_prices.csv"), index=False)

    data = [df[df['model']==m][price_col].dropna().values for m in top5]
    plt.figure(figsize=(8,5))
    plt.boxplot(data, labels=top5, showfliers=False)
    plt.xticks(rotation=45)
    plt.tight_layout()
    # savefig/close sat outside the `if`, so they ran even when no figure had
    # been drawn. They belong to the plot.
    plt.savefig(os.path.join(OUTPUT_DIR,"top5_models_price_boxplot.png"))
    plt.close()



In [None]:
# Q11: price variation per car color; the color with the largest standard
# deviation ends up in the first row of the export.
if 'color' in df.columns and 'engine_size' in df.columns:
    rows = []
    for color, g in df.groupby('color'):
        prices = pd.to_numeric(g[price_col],errors='coerce').dropna()
        # std() of fewer than two values is NaN; report 0 instead.
        std_price = float(prices.std()) if len(prices) > 1 else 0.0
        rows.append({'color':color,'count':len(g),'std_price':std_price})

    # Write once, after the loop; this statement was fused onto the append
    # line (a syntax error). Explicit columns keep sort_values working even
    # when there are no groups.
    color_stats = pd.DataFrame(rows, columns=['color','count','std_price'])
    color_stats.sort_values('std_price',ascending=False).to_csv(os.path.join(OUTPUT_DIR,"color_price_variation.csv"), index=False)


In [None]:
# Q12: seasonal trend — number of sales per calendar month.
if 'date_of_sale' in df.columns:
    # Unparseable dates become NaT.
    df['date_of_sale_parsed'] = pd.to_datetime(df['date_of_sale'], errors='coerce')
    sale_month = df['date_of_sale_parsed'].dt.to_period('M')
    monthly_counts = df.groupby(sale_month).size().reset_index(name='count')
    monthly_counts.to_csv(os.path.join(OUTPUT_DIR,"monthly_sales_counts.csv"), index=False)


In [None]:
# Q13: price statistics for every body style / transmission combination.
if 'body_style' in df.columns and 'transmission' in df.columns:
    combo_prices = df.groupby(['body_style','transmission'])[price_col]
    combo_stats = combo_prices.agg(['count','mean','median','std']).reset_index()
    combo_stats.to_csv(os.path.join(OUTPUT_DIR,"price_by_body_trans_combo.csv"), index=False)


In [None]:
# Q14: correlation between price, engine size and annual income.
# Only the columns actually present are used; at least two are needed.
candidate_cols = [price_col,'engine_size','annual_income']
corr_cols = [c for c in candidate_cols if c in df.columns]
if len(corr_cols)>=2:
    corr_table = df[corr_cols].corr()
    corr_table.to_csv(os.path.join(OUTPUT_DIR,"corr_price_engine_income.csv"))


In [None]:
# Q15: average car price per model and engine-size bucket ("engine type").
if 'model' in df.columns and 'engine_size' in df.columns:
    df['engine_type'] = pd.cut(
        pd.to_numeric(df['engine_size'], errors='coerce'),
        bins=[-1,1000,1500,2000,3000,1e9],
        labels=['<=1000','1000-1500','1500-2000','2000-3000','3000+'],
    )
    # engine_type is categorical. observed=False states explicitly what the
    # implicit default did (keep model/bucket combinations with no rows) and
    # avoids pandas' deprecation warning about that default.
    model_engine_stats = (
        df.groupby(['model','engine_type'], observed=False)[price_col]
        .agg(['count','mean'])
        .reset_index()
    )
    model_engine_stats.to_csv(os.path.join(OUTPUT_DIR,"avg_price_by_model_engine.csv"), index=False)
print("All outputs saved in:", OUTPUT_DIR)


#EDA-3

In [None]:
# 1.What is the average rating for each product category?
import pandas as pd
# Load the product data (placeholder file name), then compute the mean rating
# of each product category.
df = pd.read_csv("your_file.csv")
rating_by_category = df.groupby("category")["rating"]
avg_rating = rating_by_category.mean().reset_index()
print(avg_rating)


In [None]:
# Q2: the five products with the highest rating_count in each category.
# Sorting first means groupby(...).head(5) keeps each category's top five.
# Parentheses replace the broken backslash continuations (a syntax error).
top_rating_count = (
    df.sort_values("rating_count", ascending=False)
    .groupby("category")
    .head(5)[["category", "product_name", "rating_count"]]
)
print(top_rating_count)


In [None]:
# 3.What is the distribution of discounted prices vs. actual prices?
import matplotlib.pyplot as plt
# Scatter plot of discounted price (y) against actual price (x).
actual_prices = df["actual_price"]
discounted_prices = df["discounted_price"]
plt.scatter(actual_prices, discounted_prices)
plt.title("Distribution: Discounted vs Actual Price")
plt.xlabel("Actual Price")
plt.ylabel("Discounted Price")
plt.show()



In [None]:
# Q4: average discount percentage per category.
# discount % = (actual - discounted) / actual * 100, stored on df.
price_cut = df["actual_price"] - df["discounted_price"]
df["discount_percentage"] = (price_cut / df["actual_price"]) * 100
discount_by_category = df.groupby("category")["discount_percentage"]
avg_discount = discount_by_category.mean().reset_index()
print(avg_discount)


In [None]:
# Q5: the ten most popular products, with popularity measured by rating_count.
# Parentheses replace the broken backslash continuation (a syntax error).
popular_products = (
    df.sort_values("rating_count", ascending=False)
    .head(10)[["product_name", "rating_count"]]
)
print(popular_products)


In [None]:
# 6.What are the most popular product keywords?
from collections import Counter
# Split each product's comma-separated keyword string, normalise every keyword
# (trim, lower-case) and report the 20 most frequent ones.
keywords_series = df["product_keywords"].dropna().str.split(",")
keywords_flat = []
for keyword_list in keywords_series:
    for raw_keyword in keyword_list:
        keywords_flat.append(raw_keyword.strip().lower())
keyword_counts = Counter(keywords_flat)
top_keywords = keyword_counts.most_common(20)
print(top_keywords)



In [None]:
# Q7: reviews of the ten products with the highest rating_count.
# Parentheses replace the broken backslash continuation (a syntax error).
popular_reviews = (
    df.sort_values("rating_count", ascending=False)
    .head(10)[["product_name", "review"]]
)
print(popular_reviews)


In [None]:
# Q8: correlation between discounted_price and rating (Series.corr default method).
discounted_prices = df["discounted_price"]
ratings = df["rating"]
correlation = discounted_prices.corr(ratings)
print("Correlation:", correlation)


In [None]:
# Q9: the five categories with the highest mean rating.
# Parentheses replace the broken backslash continuation (a syntax error).
top_categories = (
    df.groupby("category")["rating"]
    .mean()
    .sort_values(ascending=False)
    .head(5)
)
print(top_categories)


In [None]:
# Q10: collect indicators of potential problem areas in the catalogue.
# Recompute the discount percentage so this cell does not depend on Q4.
price_cut = df["actual_price"] - df["discounted_price"]
df["discount_percentage"] = (price_cut / df["actual_price"]) * 100

# Products discounted by more than 40% that are still rated below 3.
deep_discount = df["discount_percentage"] > 40
poorly_rated = df["rating"] < 3
report_cols = ["product_name", "rating", "discount_percentage"]

issues = {
    # Rows without any rating.
    "missing_ratings": df["rating"].isna().sum(),
    # Rows with fewer than 10 ratings.
    "low_interaction_products": df[df["rating_count"] < 10].shape[0],
    "high_discount_low_rating": df[deep_discount & poorly_rated][report_cols],
}
print(issues)


# EDA-4

In [None]:
# 1.Read the dataframe, check null value if present then do the needful, check duplicate row , if present then do the needful.
import pandas as pd
# Load the track data (placeholder file name), fill missing values and drop
# duplicate rows.
df = pd.read_csv("your_file.csv")
print("Null values:\n", df.isnull().sum())

# Numeric columns are filled with their mean, all others with their most
# frequent value.
for col in df.columns:
    if not df[col].isnull().any():
        continue  # nothing to fill in this column
    # is_numeric_dtype covers every numeric dtype, not only int64/float64.
    if pd.api.types.is_numeric_dtype(df[col]):
        fill_value = df[col].mean()
    else:
        modes = df[col].mode()
        if modes.empty:
            continue  # column is entirely null; there is no mode to fill with
        fill_value = modes.iloc[0]
    # Assign the result back instead of calling fillna(inplace=True) on
    # df[col]: the in-place call acts on an intermediate object and is not
    # guaranteed to update df.
    df[col] = df[col].fillna(fill_value)

print("Duplicate rows:", df.duplicated().sum())
df.drop_duplicates(inplace=True)
print("After cleaning → Shape:", df.shape)


In [None]:
# 2.What is the distribution of popularity among the tracks in the dataset? Visualize it using a histogram.
import seaborn as sns
import matplotlib.pyplot as plt
# Histogram of track popularity with a kernel-density overlay.
popularity_values = df["popularity"]
plt.figure(figsize=(8,5))
sns.histplot(popularity_values, kde=True)
plt.xlabel("Popularity")
plt.ylabel("Count")
plt.title("Distribution of Track Popularity")
plt.show()



In [None]:
# Q3: relationship between track duration (x, in ms) and popularity (y),
# shown as a scatter plot.
scatter_axes = {"x": "duration_ms", "y": "popularity"}
plt.figure(figsize=(8,5))
sns.scatterplot(data=df, **scatter_axes)
plt.xlabel("Duration (ms)")
plt.ylabel("Popularity")
plt.title("Popularity vs Duration of Tracks")
plt.show()


In [None]:
# Q4: number of tracks per artist as a horizontal count plot. Artists are
# ordered by track count, so the artist with the most tracks is listed first.
artists_by_track_count = df["artist"].value_counts().index
plt.figure(figsize=(10,6))
sns.countplot(data=df, y="artist", order=artists_by_track_count)
plt.xlabel("Count")
plt.ylabel("Artist")
plt.title("Number of Tracks per Artist")
plt.show()


In [None]:
# Q5: the five least popular tracks, with artist and track name.
# Parentheses replace the broken backslash continuation (a syntax error).
least_popular_tracks = (
    df.sort_values("popularity", ascending=True)
    .head(5)[["artist", "track_name", "popularity"]]
)
print(least_popular_tracks)


In [None]:
# Q6: the five artists with the highest average track popularity, best first.
# Parentheses replace the broken backslash continuation (a syntax error).
top5_avg_popularity = (
    df.groupby("artist")["popularity"]
    .mean()
    .sort_values(ascending=False)
    .head(5)
)
print("Top 5 Artists by Average Popularity:")
print(top5_avg_popularity)


In [None]:
# Q7: the most popular track of each of the top five artists.
# Uses top5_avg_popularity from the previous cell (artists are its index).
top_artists = top5_avg_popularity.index
for artist in top_artists:
    # Sort this artist's tracks by popularity and take the first row.
    # Intermediate variables replace the broken backslash continuations
    # (a syntax error).
    artist_tracks = df[df["artist"] == artist]
    ranked_tracks = artist_tracks.sort_values("popularity", ascending=False)
    top_track = ranked_tracks.iloc[0][["track_name", "popularity"]]
    print(f"{artist} → Most Popular Track: {top_track['track_name']} (Popularity: {top_track['popularity']})")


In [None]:
# Q8: pair plot over a fixed set of numerical track features.
num_cols = [
    "popularity",
    "duration_ms",
    "danceability",
    "energy",
    "tempo",
]
numeric_features = df[num_cols]
sns.pairplot(numeric_features)
plt.show()


In [None]:
# Q9: does track duration vary across artists? Shown as a box plot and as a
# violin plot. The plot-type headings were bare text (a syntax error) and are
# comments now.

# Box plot
plt.figure(figsize=(12,6))
sns.boxplot(data=df, x="artist", y="duration_ms")
plt.xticks(rotation=90)
plt.title("Track Duration Across Artists")
plt.show()

# Violin plot
plt.figure(figsize=(12,6))
sns.violinplot(data=df, x="artist", y="duration_ms")
plt.xticks(rotation=90)
plt.title("Track Duration Across Artists")
plt.show()


In [None]:
# Q10: how does track popularity vary across artists? Shown as a swarm plot
# and as a violin plot. The plot-type headings were bare text (a syntax error)
# and are comments now.

# Swarm plot
plt.figure(figsize=(12,6))
sns.swarmplot(data=df, x="artist", y="popularity", size=3)
plt.xticks(rotation=90)
plt.title("Popularity Distribution Across Artists")
plt.show()

# Violin plot
plt.figure(figsize=(12,6))
sns.violinplot(data=df, x="artist", y="popularity")
plt.xticks(rotation=90)
plt.title("Popularity Violin Plot for Artists")
plt.show()
