# Week 7: Statistical Business Analysis
### Goal: Use inferential statistics to validate business assumptions and predict revenue.

import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

# Read the sales CSV and convert the Date column with pd.to_datetime
df = pd.read_csv('sales_data (4).csv')
df = df.assign(Date=pd.to_datetime(df['Date']))

print("Data successfully loaded. Dataset contains", len(df), "records.")
# As the last expression of a notebook cell, this displays the first rows
df.head()

## 1. Descriptive Statistics
We calculate the Mean, Median, Mode, and Standard Deviation to understand the central tendency and spread of our sales.

# Central tendency and spread of Total_Sales, computed in one aggregation
mean_val, median_val, std_val = df['Total_Sales'].agg(['mean', 'median', 'std'])
# mode() may return several values; keep the first one it reports
mode_val = df['Total_Sales'].mode().iloc[0]

print(f"Mean Sales: ${mean_val:,.2f}")
print(f"Median Sales: ${median_val:,.2f}")
print(f"Standard Deviation: ${std_val:,.2f}")
print(f"Most Frequent Sale (Mode): ${mode_val:,.2f}")

# Histogram of Total_Sales with a kernel density estimate drawn on top
plt.figure(figsize=(10, 5))
sns.histplot(data=df, x='Total_Sales', kde=True, color='teal')
plt.title('Frequency Distribution of Total Sales')
plt.show()

## 2. Hypothesis Testing
We perform an independent two-sample **t-test** to check whether mean `Total_Sales` differs significantly between the **North** and **South** regions, using a significance level of α = 0.05.
* **Null Hypothesis (H0):** Mean sales are the same in North and South.
* **Alternative Hypothesis (H1):** Mean sales differ between North and South (two-sided).

# Total_Sales for each region. Missing values are dropped up front so a
# single NaN cannot turn the whole test statistic into NaN.
north = df.loc[df['Region'] == 'North', 'Total_Sales'].dropna()
south = df.loc[df['Region'] == 'South', 'Total_Sales'].dropna()

# Welch's t-test (equal_var=False): unlike the pooled-variance Student test,
# it does not assume both regions share the same variance, and it stays
# valid when the two groups have different sizes.
t_stat, p_val = stats.ttest_ind(north, south, equal_var=False)

print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_val:.4f}")

# A NaN p-value means the test could not be computed (e.g. a region has
# fewer than two valid observations). Without this branch, `NaN < 0.05`
# is False and the result would be misreported as "no difference".
if np.isnan(p_val):
    print("Conclusion: Test could not be computed (not enough valid data in one or both regions)")
elif p_val < 0.05:
    print("Conclusion: Significant difference found (Reject H0)")
else:
    print("Conclusion: No significant difference (Fail to reject H0)")

## 3. Correlation & Regression
We check whether **Price** is a strong linear predictor of **Total Sales**.

# Pearson's r between Price and Total_Sales; the p-value is discarded
corr, _ = stats.pearsonr(x=df['Price'], y=df['Total_Sales'])
print(f"Pearson Correlation: {corr:.4f}")

# Ordinary least squares: Total_Sales regressed on Price plus an intercept
y = df['Total_Sales']             # response (dependent variable)
X = sm.add_constant(df['Price'])  # predictor with an added constant column

model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

# Scatter of Price against Total_Sales with the fitted line drawn in red
sns.regplot(data=df, x='Price', y='Total_Sales', line_kws=dict(color='red'))
plt.title('Linear Regression: Price vs Sales')
plt.show()

## 4. 95% Confidence Intervals
Using the t-distribution, we compute a range that we are 95% confident contains the true population mean of sales.

# 95% confidence interval for the mean of Total_Sales, using the
# t-distribution with n - 1 degrees of freedom.
#
# Missing values are dropped once so that the mean, the standard error and
# the degrees of freedom all describe the same observations. Otherwise
# pandas' mean() skips NaN while stats.sem() propagates it (giving a NaN
# interval) and len(df) counts rows that have no sales value.
valid_sales = df['Total_Sales'].dropna()

mean_sales = valid_sales.mean()
st_error = stats.sem(valid_sales)
# The confidence level is passed positionally because its keyword name
# differs between SciPy versions (alpha vs. confidence).
ci = stats.t.interval(0.95, len(valid_sales) - 1, loc=mean_sales, scale=st_error)

print(f"95% Confidence Interval: ${ci[0]:,.2f} to ${ci[1]:,.2f}")