# E-commerce Sales Prediction - Exploratory Data Analysis

This notebook contains comprehensive exploratory data analysis for the e-commerce sales prediction project. We'll analyze various aspects of the data to understand patterns and relationships that can help in predicting weekly sales.

## Setup and Data Loading

In [1]:
!pip install pyspark
!pip install findspark
!pip install seaborn

[0m[31mERROR: Could not find a version that satisfies the requirement findspark (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for findspark[0m[31m


In [2]:
# Import required libraries
import findspark
findspark.init

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col, when, count, mean, stddev
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# Initialize Spark session
spark = SparkSession.builder \
    .appName("SalesPrediction_EDA") \
    .master("local[*]") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

# Load the dataset
df = spark.read.csv('/kaggle/input/basalam-comments-and-products/BaSalam.products.csv', header=True, inferSchema=True)

# Display basic information
print("Dataset Overview:")
print(f"Number of records: {df.count():,}")
print(f"Number of features: {len(df.columns)}")
print("\nSchema:")
df.printSchema()

ModuleNotFoundError: No module named 'findspark'

## Missing Data Analysis

In [None]:
import pandas as pd

total_count = df.count()
missing_counts = []

for column in df.columns:
    missing_count = df.filter(col(column).isNull()).count()
    missing_percentage = (missing_count / total_count) * 100
    missing_counts.append({
        'column': column,
        'missing_percentage': missing_percentage
    })

missing_df = pd.DataFrame(missing_counts)
missing_df = missing_df.sort_values('missing_percentage', ascending=True)

In [None]:
plt.figure(figsize=(15, 10))
plt.barh(missing_df['column'], missing_df['missing_percentage'])
plt.xlabel('Missing Percentage')
plt.title('Missing Values Analysis')
plt.show()

# Get detailed missing value statistics
missing_stats = df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).toPandas()
missing_stats = missing_stats.T.reset_index()
missing_stats.columns = ['Column', 'Missing Count']
missing_stats['Missing Percentage'] = (missing_stats['Missing Count'] / df.count()) * 100
missing_stats = missing_stats.sort_values('Missing Percentage', ascending=False)

print("\nDetailed Missing Value Analysis:")
print(missing_stats[missing_stats['Missing Percentage'] > 0])

## 2. Descriptive Statistics

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import NumericType, StringType, BooleanType, IntegerType

# Assuming `df` is your DataFrame
df_schema = df.dtypes

# Lists to hold the column names
numerical_columns = []
categorical_columns = []
binary_columns = []

# Loop through the schema and classify the columns
for column, dtype in df_schema:
    if isinstance(df.schema[column].dataType, NumericType):
        numerical_columns.append(column)
    elif isinstance(df.schema[column].dataType, StringType):
        categorical_columns.append(column)
    elif isinstance(df.schema[column].dataType, BooleanType):
        binary_columns.append(column)

# Show the results
print("Numerical Columns:", numerical_columns)
print("Categorical Columns:", categorical_columns)
print("Binary Columns:", binary_columns)

In [None]:
from pyspark.sql.functions import col

# Changing the currency to EGP
exchange_rate = 0.00038  # 1 Iranian Rial = 0.00038 Egyptian Pound

# Update the 'price', 'primaryPrice', 'vendor_freeShippingToIran', and 'vendor_freeShippingToSameCity' columns
df = df.withColumn('price', col('price') * exchange_rate) \
       .withColumn('primaryPrice', col('primaryPrice') * exchange_rate) \
       .withColumn('vendor_freeShippingToIran', col('vendor_freeShippingToIran') * exchange_rate) \
       .withColumn('vendor_freeShippingToSameCity', col('vendor_freeShippingToSameCity') * exchange_rate)


In [None]:
# Calculate descriptive statistics for numerical features
numeric_stats = df.select(numerical_columns).describe().toPandas()
print("Numerical Features Statistics:")
display(numeric_stats)

# Display distribution plots for numerical features
# numerical_data = df.select(numerical_columns).toPandas()

# fig, axes = plt.subplots(3, 3, figsize=(15, 15))
# axes = axes.ravel()

# for idx, col in enumerate(numerical_columns):
#     if idx < len(axes):
#         sns.histplot(data=numerical_data, x=col, ax=axes[idx])
#         axes[idx].set_title(f'Distribution of {col}')

# plt.tight_layout()
# plt.show()

# Key Insights
#### 1. Price vs Score

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from pyspark.sql import functions as F

# Define the number of bins you want
num_bins = 10

# Calculate bins for each price column
price_bins = df.select(F.min("price"), F.max("price")).collect()
min_price, max_price = price_bins[0][0], price_bins[0][1]
price_bin_width = (max_price - min_price) / num_bins

# Create binned columns
df = df.withColumn("price_bin", F.floor((F.col("price") - min_price) / price_bin_width))
# Aggregate average score by price bin
price_score = df.groupBy("price_bin").agg(
    F.avg("_score").alias("avg_score"),
    F.count("*").alias("count")
).orderBy("price_bin")
# Convert aggregated data to Pandas DataFrames
price_pd = price_score.toPandas()
# Calculate bin centers
price_pd['bin_center'] = min_price + (price_pd['price_bin'] + 0.5) * price_bin_width


# Assuming df is your PySpark DataFrame
# First convert the necessary columns to Pandas
pdf = df.select("price", "_score").toPandas()

# Set up the figure
plt.figure(figsize=(15, 6))

# 1. Price vs Score histogram
plt.subplot(1, 2, 1)

# Create bins for price
price_bins = np.linspace(pdf['price'].min(), pdf['price'].max(), num_bins + 1)
pdf['price_bin'] = pd.cut(pdf['price'], bins=price_bins)

# Calculate average score per bin
price_stats = pdf.groupby('price_bin')['_score'].agg(['mean', 'count']).reset_index()

# Plot
plt.bar(price_stats['price_bin'].apply(lambda x: x.mid), 
        price_stats['mean'], 
        width=(price_bins[1]-price_bins[0])*0.9,  # 90% of bin width for spacing
        alpha=0.7)
plt.title('Average Score by Price Bin')
plt.xlabel('Price Range')
plt.ylabel('Average Score')
plt.xticks(rotation=45)
plt.grid(True)

# 2. PrimaryPrice vs Score histogram
plt.subplot(1, 2, 1)

plt.tight_layout()
plt.show()

#### 2. Discount Percentage vs score

In [None]:
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## 1. Calculate Discount Percentage
# If primaryPrice is 0, set discount to 0, otherwise calculate (primaryPrice - price)/primaryPrice
df = df.withColumn(
    "discount_pct",
    F.when(F.col("primaryPrice") == 0, 0)
     .otherwise((F.col("primaryPrice") - F.col("price")) / F.col("primaryPrice") * 100
))

## 2. Filter out invalid discounts (optional)
# Remove cases where discount is negative (price > primaryPrice) or >100%
df = df.filter((F.col("discount_pct") >= 0) & (F.col("discount_pct") <= 100))

## 3. Bin the discount percentages
num_bins = 10  # You can adjust this
df = df.withColumn("discount_bin", F.floor(F.col("discount_pct") / (100/num_bins)))

## 4. Aggregate by bins
discount_stats = df.groupBy("discount_bin").agg(
    F.avg("_score").alias("avg_score"),
    F.count("*").alias("count")
).orderBy("discount_bin")

## 5. Convert to Pandas for plotting
discount_pd = discount_stats.toPandas()

# Calculate bin centers (middle of each range)
discount_pd['bin_center'] = (discount_pd['discount_bin'] + 0.5) * (100/num_bins)

## 6. Create the plot
plt.figure(figsize=(12, 6))

# Main bar plot
bars = plt.bar(discount_pd['bin_center'], 
               discount_pd['avg_score'], 
               width=(100/num_bins)*0.8,
               alpha=0.7,
               color='green')

# Add count labels if desired
for bar, count in zip(bars, discount_pd['count']):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{int(count)}',
             ha='center', va='bottom', fontsize=8)

plt.title('Average Score by Discount Percentage')
plt.xlabel('Discount Percentage (%)')
plt.ylabel('Average Score')
plt.grid(True, alpha=0.3)
plt.xticks(np.arange(0, 101, 10))
plt.xlim(0, 100)

plt.tight_layout()
plt.show()

#### 3. Weight vs Price

In [None]:
from pyspark.sql import functions as F

# Calculate IQR for both weight and price
quantiles = df.approxQuantile(["weight", "price"], [0.25, 0.75], 0.05)
weight_q1, weight_q3 = quantiles[0][0], quantiles[0][1]
price_q1, price_q3 = quantiles[1][0], quantiles[1][1]

# Define bounds (1.5*IQR rule)
weight_iqr = weight_q3 - weight_q1
price_iqr = price_q3 - price_q1

df_clean = df.filter(
    (F.col("weight") >= weight_q1 - 1.5*weight_iqr) &
    (F.col("weight") <= weight_q3 + 1.5*weight_iqr) &
    (F.col("price") >= price_q1 - 1.5*price_iqr) &
    (F.col("price") <= price_q3 + 1.5*price_iqr)
)

print(f"Removed {df.count() - df_clean.count()} outliers")

In [None]:
# Re-bin the cleaned data
weight_bins = df_clean.select(
    F.min("weight").alias("min_w"),
    F.max("weight").alias("max_w")
).collect()
min_w, max_w = weight_bins[0][0], weight_bins[0][1]
bin_width = (max_w - min_w) / 20  # Adjust bin count as needed

df_binned = df_clean.withColumn(
    "weight_bin", 
    F.floor((F.col("weight") - min_w) / bin_width)
).groupBy("weight_bin").agg(
    F.avg("price").alias("avg_price"),
    F.count("*").alias("count")
).orderBy("weight_bin")

hist_data = df_binned.toPandas()
hist_data["bin_center"] = min_w + (hist_data["weight_bin"] + 0.5) * bin_width

In [None]:
plt.figure(figsize=(12, 6))
bars = plt.bar(
    hist_data["bin_center"], 
    hist_data["avg_price"], 
    width=bin_width*0.8,
    color="teal",
    alpha=0.7
)

# Add count labels (only for bins with reasonable counts)
for idx, row in hist_data.iterrows():
    if row["count"] > 10:  # Only label meaningful bins
        plt.text(
            row["bin_center"], 
            row["avg_price"] * 1.05,  # Position above bar
            f"n={int(row['count']):,}",
            ha="center",
            fontsize=8
        )

plt.title("Average Price by Weight Bin (Outliers Removed)", fontsize=14)
plt.xlabel("Weight (units)", fontsize=12)
plt.ylabel("Average Price", fontsize=12)
plt.grid(axis="y", linestyle="--", alpha=0.3)

# Set y-axis limit based on cleaned data
plt.ylim(0, hist_data["avg_price"].max() * 1.2)

plt.show()

#### 4. isFreeShipping vs Score

In [None]:
from pyspark.sql import functions as F

# Calculate average scores by free shipping status
score_comparison = df.groupBy("isFreeShipping").agg(
    F.avg("_score").alias("average_score"),
    F.count("*").alias("product_count"),
    F.stddev("_score").alias("score_stddev"),
    F.percentile_approx("_score", 0.5).alias("median_score")
).orderBy("isFreeShipping")

# Show results
score_comparison.show()

#### 5. has_variation vs Score

In [None]:
from pyspark.sql import functions as F

# Calculate average scores by free shipping status
score_comparison = df.groupBy("has_variation").agg(
    F.avg("_score").alias("average_score"),
    F.count("*").alias("product_count"),
    F.stddev("_score").alias("score_stddev"),
    F.percentile_approx("_score", 0.5).alias("median_score")
).orderBy("has_variation")

# Show results
score_comparison.show()

#### 6. isFreeShipping vs Weight

In [None]:
from pyspark.sql import functions as F

# Weight vs isFreshipping
score_comparison = df.groupBy("isFreeShipping").agg(
    F.avg("weight").alias("average_weight"),
    F.count("*").alias("product_count"),
    F.stddev("weight").alias("weight_stddev"),
    F.percentile_approx("_score", 0.5).alias("median_weight")
).orderBy("isFreeShipping")

# Show results
score_comparison.show()

In [None]:
from pyspark.sql import functions as F

# Weight vs isFreshipping
score_comparison = df.groupBy("isFreeShipping").agg(
    F.avg("price").alias("average_price"),
    F.count("*").alias("product_count"),
).orderBy("isFreeShipping")

# Show results
score_comparison.show()

#### 7. has_delivery vs score

In [None]:
from pyspark.sql import functions as F

# Calculate average scores by free shipping status
score_comparison = df.groupBy("has_delivery").agg(
    F.avg("_score").alias("average_score"),
    F.count("*").alias("product_count"),
    F.stddev("_score").alias("score_stddev"),
    F.percentile_approx("_score", 0.5).alias("median_score")
).orderBy("has_delivery")

# Show results
score_comparison.show()

#### 8. top vendors that has highest scores

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Calculate vendor statistics
vendor_stats = df.groupBy("vendor_id", "vendor_name").agg(
    F.avg("_score").alias("avg_score"),
    F.count("*").alias("product_count"),
    F.stddev("_score").alias("score_stddev"),
    F.min("_score").alias("min_score"),
    F.max("_score").alias("max_score")
).filter(F.col("product_count") > 10)  # Only vendors with sufficient products

# Add consistency metric (lower stddev = more consistent)
vendor_stats = vendor_stats.withColumn(
    "consistency",
    1/F.col("score_stddev")  # Higher value = more consistent
)

# Rank vendors by average score
window = Window.orderBy(F.desc("avg_score"))
vendor_stats = vendor_stats.withColumn("rank", F.rank().over(window))

# Show top 20 vendors
top_vendors = vendor_stats.filter(F.col("rank") <= 20).orderBy("rank")
top_vendors.show(truncate=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert to Pandas for visualization
top_vendors_pd = top_vendors.toPandas()

plt.figure(figsize=(14, 8))

# Plot 1: Top Vendors by Average Score
plt.subplot(2, 1, 1)
sns.barplot(
    x="avg_score", 
    y="vendor_name", 
    data=top_vendors_pd,
    palette="viridis",
    hue="product_count"
)
plt.title("Top Vendors by Average Score (Bubble Size = Product Count)")
plt.xlabel("Average Score")
plt.ylabel("Vendor Name")
plt.xlim(top_vendors_pd["avg_score"].min()*0.98, 5.0)  # Assuming 5-point scale

# Plot 2: Score Consistency
plt.subplot(2, 1, 2)
sns.scatterplot(
    x="avg_score",
    y="score_stddev",
    size="product_count",
    hue="vendor_name",
    data=top_vendors_pd,
    legend=False,
    sizes=(40, 400)
)

plt.title("Score Consistency (Lower StdDev = More Consistent)")
plt.xlabel("Average Score")
plt.ylabel("Score Standard Deviation")

plt.tight_layout()
plt.show()

#### 8. Top Categories that has highest scores

In [None]:
# Translation dictionary for category titles
# Updated Translation dictionary for category titles
category_translation = {
    'گام شمار': 'Pedometer',
    'کتاب چاپی': 'Printed Books',
    'طلق موتور': 'Motor Oil',
    'کفش و دمپایی زنانه': 'Women\'s Shoes and Slippers',
    'غذای ماهی و میگو': 'Fish and Shrimp Food',
    'کفش، دمپایی مردانه': 'Men\'s Shoes and Slippers',
    'بذر و تخم گیاهان': 'Seeds and Plant Seeds',
    'ذخیره سازی مبتنی بر نوار': 'Tape-based Storage',
    'عطر و ادکلن زنانه و مردانه': 'Women\'s and Men\'s Perfume and Cologne',
    'سایر': 'Other',
    'ادویه': 'Spices',
    'زیورآلات زنانه': 'Women\'s Jewelry',
    'لباس زیر زنانه': 'Women\'s Lingerie',
    'گیاهان دارویی': 'Medicinal Plants',
    'مانتو و تونیک': 'Manto and Tunic'
}


# Analyze sales by category with product count threshold
category_analysis = df.groupBy('categoryTitle').agg(
    F.avg('_score').alias('avg_score'),
    F.count('*').alias('product_count'),
    F.avg('price').alias('avg_price'),
    F.avg('rating_average').alias('avg_rating')
).toPandas()

# Apply threshold to filter out categories with less than 5000 products
category_analysis = category_analysis[category_analysis['product_count'] >= 5000]

# Translate category titles
category_analysis['categoryTitle'] = category_analysis['categoryTitle'].map(category_translation).fillna(category_analysis['categoryTitle'])

# Plot top categories by average score
plt.figure(figsize=(15, 6))
top_categories = category_analysis.nlargest(10, 'avg_score')
sns.barplot(data=top_categories, x='categoryTitle', y='avg_score')
plt.xticks(rotation=45, ha='right')
plt.title('Average Score by Top 10 Categories (with 5000+ Products)')
plt.tight_layout()
plt.show()

# Display category statistics
print("\nCategory Statistics:")
display(category_analysis.sort_values('avg_score', ascending=False).head(10))


#### 9. vendor city vs score

In [None]:
from pyspark.sql import functions as F

# Calculate city statistics (only for cities with sufficient vendors)
city_stats = df.groupBy("vendor_cityId", "vendor_owner_city").agg(
    F.avg("_score").alias("avg_score"),
    F.countDistinct("vendor_id").alias("vendor_count"),
    F.count("*").alias("product_count"),
    F.stddev("_score").alias("score_stddev")
).filter(F.col("product_count") >= 10)  # Only cities with 5+ vendors

# Rank cities by average score
window = Window.partitionBy().orderBy(F.desc("avg_score"))  # Explicit empty partition
city_stats = city_stats.withColumn("rank", F.rank().over(window))

# Show top/bottom 10 cities
display(city_stats.orderBy("rank").limit(10))
display(city_stats.orderBy(F.desc("rank")).limit(10))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert top cities to Pandas
top_cities_pd = city_stats.filter(F.col("rank") <= 15).toPandas()

# Plot: Bar Chart of Average Scores for Top Cities
plt.figure(figsize=(14, 6))
sns.barplot(
    x="vendor_owner_city",
    y="avg_score",
    data=top_cities_pd,
    palette="viridis",
    order=top_cities_pd.sort_values("avg_score", ascending=False)["vendor_owner_city"]
)
plt.xticks(rotation=45, ha='right')
plt.title("Top Cities by Average Product Score")
plt.xlabel("City")
plt.ylabel("Average Score")
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

#### 10. Determine which vendors receive the highest customer satisfaction ratings. 

Adjusted Rating= ((avg_rating×rating_count)+(global_mean×min_samples)) / (rating_count+min_samples)

In [None]:
from pyspark.sql import functions as F

# Calculate global mean and minimum sample threshold
global_mean = df.select(F.avg("rating_average")).first()[0]
min_samples = 100  # Adjust based on your data

vendor_stats = df.groupBy("vendor_id", "vendor_name").agg(
    F.avg("rating_average").alias("avg_rating"),
    F.sum("rating_count").alias("total_ratings")
).filter(F.col("total_ratings") > min_samples)

# Apply Bayesian Average
vendor_stats = vendor_stats.withColumn(
    "adjusted_rating",
    (F.col("avg_rating") * F.col("total_ratings") + global_mean * min_samples) / 
    (F.col("total_ratings") + min_samples)
)

# Rank by adjusted rating
window = Window.orderBy(F.desc("adjusted_rating"))
vendor_stats = vendor_stats.withColumn("rank", F.rank().over(window))

In [None]:
top_vendors_pd = vendor_stats.orderBy("rank").limit(10).toPandas()

plt.figure(figsize=(12, 6))
sns.barplot(
    x="vendor_name",
    y="adjusted_rating",  # or "wilson_score"/"weighted_score"
    data=top_vendors_pd,
    palette="viridis"
)
plt.xticks(rotation=45)
plt.title("Top Vendors by Bayesian equation")
plt.ylabel("Weighted Average")
plt.xlabel("Vendor")
plt.show()

#### 11. score vs average rating

In [None]:
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import numpy as np

# Group by exact rating values (0-5)
rating_analysis = df.groupBy("rating_average") \
    .agg(
        F.avg("_score").alias("avg_score"),
        F.count("*").alias("product_count"),
        F.stddev("_score").alias("score_stddev")
    ) \
    .filter(F.col("rating_average").isin([0, 1, 2, 3, 4, 5])) \
    .orderBy("rating_average") \
    .toPandas()

# Plot
plt.figure(figsize=(12, 6))
plt.bar(
    rating_analysis["rating_average"].astype(str),  # Convert ratings to strings for labels
    rating_analysis["avg_score"],
    yerr=rating_analysis["score_stddev"],  # Show standard deviation as error bars
    capsize=5,
    color="skyblue",
    edgecolor="black"
)

# Annotate bars with product counts
for i, row in rating_analysis.iterrows():
    plt.text(
        i,
        row["avg_score"] + 0.5,  # Position above bar
        f"n={row['product_count']:,}",
        ha="center",
        fontsize=10
    )

plt.xlabel("Exact Rating (0-5 scale)")
plt.ylabel("Average Score")
plt.title("Average Score by Exact Rating Value")
plt.grid(axis="y", linestyle="--", alpha=0.3)
plt.tight_layout()
plt.show()

#### 12. score vs sales

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from pyspark.sql import functions as F

# Define the number of bins you want
num_bins = 10

# Calculate bins for each sales column
sales_bins = df.select(F.min("sales_count_week"), F.max("sales_count_week")).collect()
min_sales, max_sales = sales_bins[0][0], sales_bins[0][1]
sales_bin_width = (max_sales - min_sales) / num_bins

# Create binned columns
df = df.withColumn("sales_bin", F.floor((F.col("sales_count_week") - min_sales) / sales_bin_width))
# Aggregate average score by sales bin
sales_score = df.groupBy("sales_bin").agg(
    F.avg("_score").alias("avg_score"),
    F.count("*").alias("count")
).orderBy("sales_bin")
# Convert aggregated data to Pandas DataFrames
sales_pd = sales_score.toPandas()
# Calculate bin centers
sales_pd['bin_center'] = min_sales + (sales_pd['sales_bin'] + 0.5) * sales_bin_width


# Assuming df is your PySpark DataFrame
# First convert the necessary columns to Pandas
pdf = df.select("sales_count_week", "_score").toPandas()

# Set up the figure
plt.figure(figsize=(15, 6))

# 1. Sales vs Score histogram
plt.subplot(1, 2, 1)

# Create bins for Sales
sales_bins = np.linspace(pdf['sales_count_week'].min(), pdf['sales_count_week'].max(), num_bins + 1)
pdf['sales_bin'] = pd.cut(pdf['sales_count_week'], bins=sales_bins)

# Calculate average score per bin
sales_stats = pdf.groupby('sales_bin')['_score'].agg(['mean', 'count']).reset_index()

# Plot
plt.bar(sales_stats['sales_bin'].apply(lambda x: x.mid), 
        sales_stats['mean'], 
        width=(sales_bins[1]-sales_bins[0])*0.9,  # 90% of bin width for spacing
        alpha=0.7)
plt.title('Average Score by Sales Bin')
plt.xlabel('Sales Range')
plt.ylabel('Average Score')
plt.xticks(rotation=45)
plt.grid(True)

# 2. Sales vs Score histogram
plt.subplot(1, 2, 1)

plt.tight_layout()
plt.show()

## 7. Correlation Analysis

In [None]:

# Calculate and plot correlation matrix

def plot_correlation_matrix(df, columns):
    """
    Plot correlation matrix for specified columns.
    
    Args:
        df: Spark DataFrame
        columns: List of column names to include in correlation matrix
    """
    correlation_data = df.select(columns).toPandas()
    correlation_matrix = correlation_data.corr()
    
    plt.figure(figsize=(25, 15))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.show()
    
# Define binary columns
binary_columns = ['has_delivery', 'has_variation', 'vendor_has_delivery', 'isFreeShipping', 'IsAvailable', 'IsSaleable']

# Combine numerical and binary features
all_features = numerical_columns + binary_columns

# Plot correlation matrix with all features
plot_correlation_matrix(df, all_features)