# **AI TECH INSTITUTE** · *Intermediate AI & Data Science*
### Week 01 · Notebook 04 — Exploratory Data Analysis (EDA)
**Instructor:** Amir Charkhi  |  **Goal:** Discover patterns, anomalies, and insights in real data.

> Format: systematic exploration → visualization preview → statistical insights → storytelling.


---
## EDA: The Detective Work of Data Science
Before modeling or visualization, we need to understand our data deeply.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Notebook-wide configuration: plot theme first, then DataFrame printing.
plt.style.use('seaborn-v0_8-darkgrid')

# Show every column and two decimal places whenever a DataFrame is printed.
display_options = {
    'display.max_columns': None,
    'display.precision': 2,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

print("Ready to explore! 🔍")

## 1. Load and First Look at Real Data

In [None]:
# Build a synthetic e-commerce dataset: a customers table, an orders table,
# and their merge (`df`), which the rest of the notebook explores.
# The random draws are made in a fixed order (age, city, member type, then
# the order columns) so that seed 42 always reproduces the same data.
np.random.seed(42)
n_customers = 1000
n_orders = 5000
last_day = '2025-08-25'

city_names = ['Perth', 'Sydney', 'Melbourne', 'Brisbane', 'Adelaide']
city_weights = [0.15, 0.3, 0.25, 0.2, 0.1]
tier_names = ['Basic', 'Premium', 'VIP']
tier_weights = [0.6, 0.3, 0.1]
category_names = ['Electronics', 'Clothing', 'Books', 'Home', 'Sports']
category_weights = [0.25, 0.3, 0.15, 0.2, 0.1]

# --- Customers: one signup per calendar day, ending on `last_day` ---
customer_ages = np.random.normal(35, 12, n_customers).clip(18, 70).astype(int)
customer_cities = np.random.choice(city_names, n_customers, p=city_weights)
customer_tiers = np.random.choice(tier_names, n_customers, p=tier_weights)

customers = pd.DataFrame({
    'customer_id': range(1, n_customers + 1),
    'age': customer_ages,
    'city': customer_cities,
    'member_type': customer_tiers,
    'signup_date': pd.date_range(end=last_day, periods=n_customers).to_list(),
})

# --- Orders: one order per calendar day, each assigned to a random customer ---
buyer_ids = np.random.choice(customers['customer_id'], n_orders)
order_amounts = np.random.lognormal(4, 1, n_orders).clip(10, 1000)
item_counts = np.random.poisson(3, n_orders).clip(1, 20)
order_categories = np.random.choice(category_names, n_orders, p=category_weights)

orders = pd.DataFrame({
    'order_id': range(1, n_orders + 1),
    'customer_id': buyer_ids,
    'order_date': pd.date_range(end=last_day, periods=n_orders),
    'amount': order_amounts,
    'items': item_counts,
    'category': order_categories,
})

# Attach the customer attributes to every order row.
df = pd.merge(orders, customers, on='customer_id')

# First look: size, sample rows, column types and missing-value counts.
print("Dataset shape:", df.shape)
first_look = (
    ("\nFirst few rows:", df.head()),
    ("\nData types:", df.dtypes),
    ("\nMissing values:", df.isnull().sum()),
)
for heading, content in first_look:
    print(heading)
    print(content)

## 2. Statistical Summary & Distribution Analysis

In [None]:
# Descriptive statistics for every numeric column.
print("Numerical Summary:")
print(df.describe())

# A closer look at order amounts: location, spread and shape of the distribution.
print("\nExtended Statistics for Amount:")
amount = df['amount']
amount_modes = amount.mode()
amount_mean = amount.mean()
amount_std = amount.std()
lower_quartile = amount.quantile(0.25)
upper_quartile = amount.quantile(0.75)

amount_stats = {
    'Mean': amount_mean,
    'Median': amount.median(),
    'Mode': amount_modes[0] if len(amount_modes) > 0 else None,
    'Std Dev': amount_std,
    'Variance': amount.var(),
    'Skewness': amount.skew(),
    'Kurtosis': amount.kurtosis(),
    'IQR': upper_quartile - lower_quartile,
    'CV': amount_std / amount_mean,  # Coefficient of variation
}

for label in amount_stats:
    print(f"{label:15}: {amount_stats[label]:.2f}")

# Frequency table for each categorical column.
print("\nCategorical Variables:")
for column_name in ('city', 'member_type', 'category'):
    print(f"\n{column_name}:")
    print(df[column_name].value_counts())

**Exercise 1 — Distribution Detective (medium)**  
Check whether order amounts follow a normal distribution using multiple methods (for example a histogram, a Q-Q plot, and a Shapiro-Wilk test).


In [None]:
# Your turn


<details>
<summary><b>Solution</b></summary>

```python
# Three side-by-side visual checks: raw histogram, Q-Q plot against a normal
# distribution, and a histogram of the log-transformed amounts.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
raw_ax, qq_ax, log_ax = axes

amounts = df['amount']
log_amounts = np.log(amounts)
hist_style = dict(bins=50, edgecolor='black', alpha=0.7)

raw_ax.hist(amounts, **hist_style)
raw_ax.set_title('Distribution of Order Amounts')
raw_ax.set_xlabel('Amount')

stats.probplot(amounts, dist="norm", plot=qq_ax)
qq_ax.set_title('Q-Q Plot')

log_ax.hist(log_amounts, **hist_style)
log_ax.set_title('Distribution of Log(Amount)')
log_ax.set_xlabel('Log(Amount)')

plt.tight_layout()
plt.show()

# Shapiro-Wilk on at most 5000 sampled observations.
sample_size = min(5000, len(df))

statistic, p_value = stats.shapiro(amounts.sample(sample_size))
normal_verdict = 'Rejected' if p_value < 0.05 else 'Not rejected'
print(f"Shapiro-Wilk test: statistic={statistic:.4f}, p-value={p_value:.4f}")
print(f"Normal distribution: {normal_verdict}")

# Same test on log(amount): normality of the logs means the amounts are log-normal.
log_stat, log_p = stats.shapiro(log_amounts.sample(sample_size))
lognormal_verdict = 'Rejected' if log_p < 0.05 else 'Not rejected'
print(f"\nLog-normal test: statistic={log_stat:.4f}, p-value={log_p:.4f}")
print(f"Log-normal distribution: {lognormal_verdict}")
```
</details>

## 3. Correlation and Relationships

In [None]:
# Pairwise correlations between the numeric columns.
numeric_df = df[['amount', 'items', 'age']].copy()
corr_matrix = numeric_df.corr()
print("Correlation Matrix:")
print(corr_matrix)

# Heatmap of the same matrix, colour scale centred on zero.
heatmap_style = dict(annot=True, cmap='coolwarm', center=0,
                     square=True, linewidths=1)
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, **heatmap_style)
plt.title('Correlation Heatmap')
plt.show()

# Categorical vs numerical: order amount summarised per membership tier.
print("\nAverage order amount by member type:")
member_analysis = (
    df.groupby('member_type')[['amount']]
    .agg(['mean', 'median', 'std', 'count'])
    .round(2)
)
print(member_analysis)

# One-way ANOVA: one array of order amounts per membership tier.
member_types = df['member_type']
groups = [df.loc[member_types == tier, 'amount'].values
          for tier in member_types.unique()]
f_stat, p_value = stats.f_oneway(*groups)
print(f"\nANOVA test for member types: F={f_stat:.2f}, p={p_value:.4f}")

## 4. Outlier Detection and Analysis

In [None]:
# Multiple outlier detection methods
def detect_outliers(data, column):
    """Flag outlier rows of ``data`` in ``column`` using three methods.

    Parameters
    ----------
    data : DataFrame holding the values to inspect.
    column : name of the numeric column to test.

    Returns
    -------
    dict mapping the method name ('IQR', 'Z-score', 'Isolation Forest')
    to the subset of ``data`` rows that method flags.
    """
    # Imported here so scikit-learn is only needed when this function runs.
    from sklearn.ensemble import IsolationForest

    values = data[column]

    # IQR method: anything beyond 1.5 * IQR outside the quartiles.
    first_q, third_q = values.quantile(0.25), values.quantile(0.75)
    spread = third_q - first_q
    lower = first_q - 1.5 * spread
    upper = third_q + 1.5 * spread
    outside_fences = (values < lower) | (values > upper)

    # Z-score method: more than 3 standard deviations from the mean.
    abs_z = np.abs(stats.zscore(values))

    # Isolation Forest (preview of ML): fit_predict labels outliers as -1.
    forest = IsolationForest(contamination=0.05, random_state=42)
    forest_labels = forest.fit_predict(data[[column]])

    return {
        'IQR': data[outside_fences],
        'Z-score': data[abs_z > 3],
        'Isolation Forest': data[forest_labels == -1],
    }

# Run all three detectors on the order amount.
outliers = detect_outliers(df, 'amount')

# Per method: how many rows were flagged and the amount range they cover.
print("Outlier Detection Results:")
for method, outlier_df in outliers.items():
    print(f"\n{method}: {len(outlier_df)} outliers ({len(outlier_df)/len(df)*100:.1f}%)")
    if len(outlier_df) > 0:
        print(f"  Range: ${outlier_df['amount'].min():.2f} - ${outlier_df['amount'].max():.2f}")

# Visualize outliers
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Boxplot
# The `labels=` keyword of boxplot is deprecated since Matplotlib 3.9 (renamed
# `tick_labels`), so the tick labels are set separately; this works on both
# older and newer Matplotlib versions.
axes[0].boxplot([df['amount'], df['items']])
axes[0].set_xticklabels(['Amount', 'Items'])
axes[0].set_title('Boxplot - Outlier Visualization')
axes[0].set_ylabel('Value')

# Scatter with outliers highlighted (IQR-flagged rows drawn on top in red)
axes[1].scatter(df['items'], df['amount'], alpha=0.5, label='Normal')
axes[1].scatter(outliers['IQR']['items'], outliers['IQR']['amount'], 
               color='red', label='Outliers (IQR)', s=50)
axes[1].set_xlabel('Items')
axes[1].set_ylabel('Amount')
axes[1].set_title('Outliers in Context')
axes[1].legend()

plt.tight_layout()
plt.show()

## 5. Temporal Patterns and Trends

In [None]:
# Calendar features derived from the order timestamp.
order_dates = df['order_date'].dt
df['order_month'] = order_dates.to_period('M')
df['order_weekday'] = order_dates.day_name()
df['order_day'] = order_dates.day

# Revenue, average order, order count and distinct customers per month.
monthly_stats = df.groupby('order_month').agg(
    Total_Revenue=('amount', 'sum'),
    Avg_Order=('amount', 'mean'),
    Order_Count=('amount', 'count'),
    Unique_Customers=('customer_id', 'nunique'),
).round(2)
print("Monthly Performance:")
print(monthly_stats.tail())

# Average order amount and order count per weekday, in calendar order
# (groupby alone would sort the day names alphabetically).
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_stats = (
    df.groupby('order_weekday')['amount']
    .agg(['mean', 'count'])
    .reindex(weekday_order)
)
print("\nWeekday Patterns:")
print(weekday_stats)

# Per-customer spend, order count and first/last order dates.
customer_stats = df.groupby('customer_id').agg(
    Total_Spent=('amount', 'sum'),
    Avg_Order=('amount', 'mean'),
    Order_Count=('amount', 'count'),
    First_Order=('order_date', 'min'),
    Last_Order=('order_date', 'max'),
)
active_span = customer_stats['Last_Order'] - customer_stats['First_Order']
customer_stats['Days_Active'] = active_span.dt.days

# Split customers into four equally sized groups by total spend.
clv_labels = ['Low', 'Medium', 'High', 'VIP']
customer_stats['CLV_Category'] = pd.qcut(customer_stats['Total_Spent'],
                                          q=4, labels=clv_labels)

print("\nCustomer Lifetime Value Distribution:")
print(customer_stats['CLV_Category'].value_counts())

**Exercise 2 — Cohort Analysis (hard)**  
Create a cohort analysis showing customer retention by signup month.


In [None]:
# Your turn


<details>
<summary><b>Solution</b></summary>

```python
# Prepare cohort data.
# `signup_date` is already present on every order row after the merge.
# Assigning from the `customers` frame instead would align on row index
# (not customer_id): most rows would become NaT and the rest would receive
# another customer's signup month.
df['signup_month'] = pd.to_datetime(df['signup_date']).dt.to_period('M')
df['order_month_dt'] = df['order_date'].dt.to_period('M')

# Create cohort table: distinct active customers per (signup month, order month)
cohort_data = df.groupby(['signup_month', 'order_month_dt']).agg({
    'customer_id': 'nunique'
}).reset_index()

# Calculate periods (whole months) since signup
cohort_data['period_number'] = (cohort_data['order_month_dt'] - 
                                cohort_data['signup_month']).apply(lambda x: x.n)

# Orders dated before the signup month yield negative periods (possible in
# this synthetic data). Retention only counts activity from the signup month
# onwards, so period 0 is always the signup month itself.
cohort_data = cohort_data[cohort_data['period_number'] >= 0]

# Pivot for cohort table
cohort_pivot = cohort_data.pivot_table(
    index='signup_month',
    columns='period_number',
    values='customer_id'
)

# Calculate retention rates.
# The denominator is the number of customers who signed up in each month,
# not the first pivot column (which only counts customers who happened to
# order in that period and may be missing for some cohorts).
customer_signup_month = pd.to_datetime(customers['signup_date']).dt.to_period('M')
cohort_size = (
    customers.groupby(customer_signup_month)['customer_id']
    .nunique()
    .reindex(cohort_pivot.index)
)
retention = cohort_pivot.divide(cohort_size, axis=0) * 100

print("Cohort Retention Rates (%)")
print(retention.iloc[:5, :5].round(1))  # First 5 cohorts, first 5 periods

# Visualize
plt.figure(figsize=(12, 6))
sns.heatmap(retention.iloc[:10, :10], annot=True, fmt='.0f', 
            cmap='YlOrRd', vmin=0, vmax=100)
plt.title('Customer Retention Cohort Analysis')
plt.xlabel('Periods Since Signup')
plt.ylabel('Signup Cohort')
plt.show()
```
</details>

## 6. Feature Engineering & Derived Insights

In [None]:
# Order-level features.
order_amount = df['amount']
df['revenue_per_item'] = order_amount / df['items']
df['is_weekend'] = df['order_date'].dt.dayofweek.isin([5, 6])  # 5 = Saturday, 6 = Sunday
df['is_high_value'] = order_amount > order_amount.quantile(0.75)


def most_frequent_category(categories):
    """Return the most frequent value, or 'Unknown' when there is none."""
    modes = categories.mode()
    return modes[0] if len(modes) > 0 else 'Unknown'


# Customer segmentation features: one row per customer.
customer_features = df.groupby('customer_id').agg(
    Total_Revenue=('amount', 'sum'),
    Avg_Order_Value=('amount', 'mean'),
    Order_Volatility=('amount', 'std'),
    Avg_Items=('items', 'mean'),
    Order_Frequency=('order_id', 'count'),
    Favorite_Category=('category', most_frequent_category),
).round(2)

# RFM Analysis (Recency, Frequency, Monetary).
# Recency is measured in days back from the latest order in the dataset.
current_date = df['order_date'].max()
rfm = df.groupby('customer_id').agg(
    Recency=('order_date', lambda dates: (current_date - dates.max()).days),
    Frequency=('order_id', 'count'),
    Monetary=('amount', 'sum'),
)

# Quartile scores from '1' (worst) to '4' (best). Recency uses the reversed
# labels because a smaller recency is better; Frequency is ranked first so
# tied counts cannot produce duplicate bin edges.
ascending_labels = ['1', '2', '3', '4']
descending_labels = ['4', '3', '2', '1']
rfm['R_Score'] = pd.qcut(rfm['Recency'], q=4, labels=descending_labels)
rfm['F_Score'] = pd.qcut(rfm['Frequency'].rank(method='first'), q=4, labels=ascending_labels)
rfm['M_Score'] = pd.qcut(rfm['Monetary'], q=4, labels=ascending_labels)

# Concatenate the three digits into a segment code such as '443'.
r_text = rfm['R_Score'].astype(str)
f_text = rfm['F_Score'].astype(str)
m_text = rfm['M_Score'].astype(str)
rfm['RFM_Score'] = r_text + f_text + m_text

print("RFM Segmentation Summary:")
print(rfm.head(10))

# Most common segment codes.
print("\nTop Customer Segments:")
print(rfm['RFM_Score'].value_counts().head(10))

## 7. Data Quality & Integrity Checks

In [None]:
# Comprehensive data quality report
def comprehensive_eda_report(df):
    """Collect data-quality facts about ``df`` into a dictionary.

    Always present: 'shape', 'memory_usage' (MB), 'missing_values',
    'missing_percentage', 'duplicate_rows', 'dtypes' (column count per
    dtype) and 'unique_counts'.
    'numeric_summary' is added only when numeric columns exist, and
    'categorical_summary' only when object/category columns exist.
    """

    def describe_category(series):
        # Number of distinct values plus the mode and the top frequency.
        modes = series.mode()
        counts = series.value_counts()
        return {
            'unique': series.nunique(),
            'most_common': modes[0] if len(modes) > 0 else None,
            'frequency': counts.iloc[0] if len(counts) > 0 else 0,
        }

    null_counts = df.isnull().sum()

    report = {
        'shape': df.shape,
        'memory_usage': df.memory_usage(deep=True).sum() / 1024**2,  # MB
        'missing_values': null_counts.to_dict(),
        'missing_percentage': (null_counts / len(df) * 100).to_dict(),
        'duplicate_rows': df.duplicated().sum(),
        'dtypes': df.dtypes.value_counts().to_dict(),
        'unique_counts': df.nunique().to_dict(),
    }

    # describe() output for the numeric columns, if there are any.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        report['numeric_summary'] = df[numeric_cols].describe().to_dict()

    # One small summary per object/category column, if there are any.
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(cat_cols) > 0:
        report['categorical_summary'] = {
            col: describe_category(df[col]) for col in cat_cols
        }

    return report

# Build the report for the merged dataset and print its headline figures.
eda_report = comprehensive_eda_report(df)

banner = "=" * 50
print(banner, "COMPREHENSIVE EDA REPORT", banner, sep="\n")
print(f"\nDataset Shape: {eda_report['shape']}")
print(f"Memory Usage: {eda_report['memory_usage']:.2f} MB")
print(f"Duplicate Rows: {eda_report['duplicate_rows']}")

# Number of columns per dtype.
print("\nData Types Distribution:")
for dtype_name, column_count in eda_report['dtypes'].items():
    print(f"  {dtype_name}: {column_count}")

## 8. Key Insights & Recommendations

In [None]:
# Turn the earlier aggregates into five headline findings.

# Insight 1: membership tier with the highest average order value.
avg_by_member = df.groupby('member_type')['amount'].mean()
best_segment = avg_by_member.idxmax()
best_value = avg_by_member.max()

# Insight 2: weekday with the highest average order value.
peak_day = weekday_stats['mean'].idxmax()

# Insight 3: share of revenue coming from the top 10% of customers by spend.
top_count = int(len(customer_stats) * 0.1)
top_10_pct = customer_stats.nlargest(top_count, 'Total_Spent')
revenue_concentration = top_10_pct['Total_Spent'].sum() / customer_stats['Total_Spent'].sum()

# Insight 4: product category with the highest average order value.
category_performance = df.groupby('category')['amount'].agg(['mean', 'count', 'sum'])
best_category = category_performance['mean'].idxmax()

# Insight 5: relative revenue change between the first and last month.
monthly_revenue = monthly_stats['Total_Revenue']
first_month_revenue = monthly_revenue.iloc[0]
last_month_revenue = monthly_revenue.iloc[-1]
growth_rate = (last_month_revenue - first_month_revenue) / first_month_revenue

insights = [
    f"1. {best_segment} members have highest avg order value: ${best_value:.2f}",
    f"2. {peak_day} has highest average order value",
    f"3. Top 10% of customers generate {revenue_concentration:.1%} of revenue",
    f"4. {best_category} has highest average order value",
    f"5. Revenue grew {growth_rate:.1%} from first to last month",
]

divider = "="*50
print("📊 KEY INSIGHTS FROM EDA")
print(divider)
print(*insights, sep="\n")

print("\n📈 RECOMMENDED ACTIONS")
print(divider)
recommendations = [
    "• Focus marketing efforts on converting Basic to Premium members",
    f"• Increase inventory and promotions for {peak_day}s",
    "• Implement VIP loyalty program for top 10% customers",
    f"• Expand {best_category} product line",
    "• Investigate and replicate factors driving growth",
]
print(*recommendations, sep="\n")

## 9. Mini-Challenges
- **M1 (easy):** Find the customer with highest average order value
- **M2 (medium):** Identify seasonal patterns in the data
- **M3 (hard):** Build a customer scoring system based on multiple factors

In [None]:
# Your turn - try the challenges!


<details>
<summary><b>Solutions</b></summary>

```python
# M1 - Top customer by average order
# Sort the per-customer averages in descending order; the first entry wins.
customer_avg = df.groupby('customer_id')['amount'].mean().sort_values(ascending=False)
top_customer, top_average = customer_avg.index[0], customer_avg.iloc[0]
print(f"Customer {top_customer} has highest avg order: ${top_average:.2f}")


# M2 - Seasonal patterns
def season_for_month(month_number):
    """Map a calendar month to its season (December-February is 'Summer')."""
    if month_number in (12, 1, 2):
        return 'Summer'
    if month_number in (3, 4, 5):
        return 'Autumn'
    if month_number in (6, 7, 8):
        return 'Winter'
    return 'Spring'


df['month'] = df['order_date'].dt.month
df['season'] = df['month'].apply(season_for_month)

# Average, total and count of order amounts per season.
seasonal_analysis = df.groupby('season')['amount'].agg(['mean', 'sum', 'count'])
print("\nSeasonal Patterns:")
print(seasonal_analysis)

# M3 - Customer scoring
def customer_score(row):
    """Return a 0-100 score from a row's 'Recency', 'Frequency' and 'Monetary'.

    Each dimension awards the points of the first tier it satisfies
    (0 when none applies): recency up to 30, frequency up to 30 and
    monetary up to 40.
    """
    recency = row['Recency']
    frequency = row['Frequency']
    monetary = row['Monetary']

    # (threshold, points) pairs, listed from best tier to worst.
    recency_tiers = ((30, 30), (60, 20), (90, 10))        # days, lower is better
    frequency_tiers = ((10, 30), (5, 20), (2, 10))        # order count, higher is better
    monetary_tiers = ((1000, 40), (500, 25), (200, 15))   # total spend, higher is better

    recency_points = next(
        (points for limit, points in recency_tiers if recency <= limit), 0)
    frequency_points = next(
        (points for floor, points in frequency_tiers if frequency >= floor), 0)
    monetary_points = next(
        (points for floor, points in monetary_tiers if monetary >= floor), 0)

    return recency_points + frequency_points + monetary_points

# Score every customer row, then bucket the scores into letter grades.
rfm['Customer_Score'] = rfm.apply(customer_score, axis=1)
# pd.cut bins are open on the left by default, so without include_lowest a
# score of exactly 0 would fall outside (0, 30] and receive a NaN grade.
rfm['Customer_Grade'] = pd.cut(rfm['Customer_Score'], 
                               bins=[0, 30, 60, 90, 100],
                               labels=['D', 'C', 'B', 'A'],
                               include_lowest=True)

print("\nCustomer Scoring Distribution:")
print(rfm['Customer_Grade'].value_counts())
print("\nTop 5 Customers by Score:")
print(rfm.nlargest(5, 'Customer_Score')[['Customer_Score', 'Customer_Grade']])
```
</details>

## Wrap-Up & Next Steps
✅ You've completed a comprehensive EDA workflow  
✅ You can identify patterns, outliers, and relationships  
✅ You generated actionable business insights  
✅ You're ready to visualize these findings beautifully  

**Week 2 Preview:** Transform these insights into interactive dashboards and compelling visualizations!

**Assignment:** Apply this EDA workflow to your own dataset and create a report with 5 key insights.
