In [1]:
# hypothesis_testing_main.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, f_oneway
import os

# Load data
df = pd.read_csv('../data/Walmart.csv')

# 1. EDA Summary
print("\n📊 Dataset Info")
print(df.info())
print("\n📈 Descriptive Stats")
print(df.describe())

# 2. T-Test: Holiday vs Non-Holiday
holiday_sales = df[df['Holiday_Flag'] == 1]['Weekly_Sales']
non_holiday_sales = df[df['Holiday_Flag'] == 0]['Weekly_Sales']

t_stat, p_val = ttest_ind(holiday_sales, non_holiday_sales, equal_var=False)
print(f"\n🧪 T-Test (Holiday vs Non-Holiday):\nT-statistic = {t_stat:.4f}, P-value = {p_val:.4f}")
if p_val < 0.05:
    print("✅ Reject Null Hypothesis — Sales differ between holidays and non-holidays")
else:
    print("❌ Fail to Reject Null — No significant difference")

# 3. Boxplot
plt.figure(figsize=(8, 5))
sns.boxplot(x='Holiday_Flag', y='Weekly_Sales', data=df)
plt.xticks([0, 1], ['Non-Holiday', 'Holiday'])
plt.title("Weekly Sales: Holiday vs Non-Holiday")
plt.xlabel("Week Type")
plt.ylabel("Weekly Sales")
os.makedirs('../visuals', exist_ok=True)
plt.savefig('../visuals/holiday_sales_boxplot.png')
plt.close()



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.11/site-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.11/site-p

AttributeError: _ARRAY_API not found

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [2]:
# Dataset shape
print("Dataset Shape:", df.shape)

# Column-wise info
df.info()

# Summary statistics
df.describe()

# Check missing values
df.isnull().sum()


Dataset Shape: (6435, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         6435 non-null   int64  
 1   Date          6435 non-null   object 
 2   Weekly_Sales  6435 non-null   float64
 3   Holiday_Flag  6435 non-null   int64  
 4   Temperature   6435 non-null   float64
 5   Fuel_Price    6435 non-null   float64
 6   CPI           6435 non-null   float64
 7   Unemployment  6435 non-null   float64
dtypes: float64(5), int64(2), object(1)
memory usage: 402.3+ KB


Store           0
Date            0
Weekly_Sales    0
Holiday_Flag    0
Temperature     0
Fuel_Price      0
CPI             0
Unemployment    0
dtype: int64

In [3]:
for col in df.columns:
    print(f"{col} - Unique Values: {df[col].nunique()}")


Store - Unique Values: 45
Date - Unique Values: 143
Weekly_Sales - Unique Values: 6435
Holiday_Flag - Unique Values: 2
Temperature - Unique Values: 3528
Fuel_Price - Unique Values: 892
CPI - Unique Values: 2145
Unemployment - Unique Values: 349


In [1]:
from scipy.stats import ttest_ind

# Split data into holiday and non-holiday
holiday_sales = df[df['Holiday_Flag'] == 1]['Weekly_Sales']
non_holiday_sales = df[df['Holiday_Flag'] == 0]['Weekly_Sales']

# Perform t-test
t_stat, p_value = ttest_ind(holiday_sales, non_holiday_sales, equal_var=False)

print("T-statistic:", t_stat)
print("P-value:", p_value)

if p_value < 0.05:
    print("✅ Reject Null Hypothesis — Sales differ between holidays and non-holidays")
else:
    print("❌ Fail to Reject Null — No significant difference in sales")


NameError: name 'df' is not defined