<a href="https://colab.research.google.com/github/mallelamanojkumar90/AIML/blob/main/Week2_Day3_Exploratory_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Week 2, Day 3: Exploratory Data Analysis (EDA)

## Learning Objectives
- Understand the importance of EDA
- Learn key EDA techniques
- Master data summarization methods
- Practice identifying patterns and insights


In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set display options
pd.set_option('display.max_columns', None)
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old 'seaborn' alias, so use whichever name
# this installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

## 1. Data Loading and Initial Exploration

In [None]:
# Create sample dataset
np.random.seed(42)  # fixed seed so every run produces the same synthetic rows
n_samples = 1000

# Generate synthetic e-commerce data.
# The columns are drawn in this order from the global NumPy random state, so
# reordering the entries would change the generated values.
# NOTE: 'age' is an unbounded normal draw (float), so a few values can fall
# at or below zero.
data = {
    'customer_id': range(1, n_samples + 1),
    'age': np.random.normal(35, 12, n_samples),
    'purchase_amount': np.random.exponential(100, n_samples),
    'items_bought': np.random.poisson(3, n_samples),
    'customer_type': np.random.choice(['New', 'Regular', 'VIP'], n_samples, p=[0.3, 0.5, 0.2]),
    'satisfaction_score': np.random.randint(1, 6, n_samples)  # upper bound exclusive -> scores 1..5
}

df = pd.DataFrame(data)

# Initial exploration
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nData Info:")
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
df.info()
print("\nBasic Statistics:")
print(df.describe())

## 2. Data Quality Analysis

In [None]:
def analyze_data_quality(df):
    """Print a quick data-quality report for ``df``.

    The report covers, in order: per-column missing-value counts, the number
    of fully duplicated rows, the frequency of each ``customer_type`` value,
    and the number of IQR-rule outliers in ``age``, ``purchase_amount`` and
    ``items_bought``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``customer_type``, ``age``,
        ``purchase_amount`` and ``items_bought``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("Missing Values:")
    null_counts = df.isnull().sum()
    print(null_counts)

    duplicate_rows = df.duplicated().sum()
    print("\nNumber of duplicates:", duplicate_rows)

    print("\nCustomer Type Distribution:")
    type_counts = df['customer_type'].value_counts()
    print(type_counts)

    print("\nNumber of outliers:")
    for name in ('age', 'purchase_amount', 'items_bought'):
        values = df[name]
        lower_q = values.quantile(0.25)
        upper_q = values.quantile(0.75)
        spread = upper_q - lower_q
        # IQR rule: a value is an outlier when it lies strictly more than
        # 1.5 * IQR below the first quartile or above the third quartile.
        too_low = values < lower_q - 1.5 * spread
        too_high = values > upper_q + 1.5 * spread
        print(f"{name}: {int((too_low | too_high).sum())}")

# Print missing-value, duplicate, category and IQR-outlier counts for df.
analyze_data_quality(df)

## 3. Univariate Analysis

In [None]:
def perform_univariate_analysis(df):
    """Draw one distribution plot per variable on a single 2x3 figure.

    Grid slots 1-5 hold, in order: histograms with a KDE overlay for ``age``
    and ``purchase_amount``, then count plots for ``items_bought``,
    ``customer_type`` and ``satisfaction_score``. The sixth slot is unused.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the five columns listed above.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    fig = plt.figure(figsize=(15, 10))

    def draw_hist(column, ax):
        # Continuous variable: histogram plus kernel-density estimate.
        sns.histplot(df[column], kde=True, ax=ax)

    def draw_counts(column, ax):
        # Discrete / categorical variable: one bar per distinct value.
        sns.countplot(data=df, x=column, ax=ax)

    panels = [
        (draw_hist, 'age', 'Age Distribution'),
        (draw_hist, 'purchase_amount', 'Purchase Amount Distribution'),
        (draw_counts, 'items_bought', 'Items Bought Distribution'),
        (draw_counts, 'customer_type', 'Customer Type Distribution'),
        (draw_counts, 'satisfaction_score', 'Satisfaction Score Distribution'),
    ]

    for slot, (draw, column, title) in enumerate(panels, start=1):
        ax = fig.add_subplot(2, 3, slot)
        draw(column, ax)
        ax.set_title(title)

    plt.tight_layout()
    plt.show()

# Render the five single-variable distribution panels for df.
perform_univariate_analysis(df)

## 4. Bivariate Analysis

In [None]:
def perform_bivariate_analysis(df):
    """Draw five two-variable views of ``df`` on a single 2x3 figure.

    Grid slots 1-5 hold, in order: a scatter plot of ``age`` against
    ``purchase_amount``; box plots of ``purchase_amount`` split by
    ``customer_type`` and by ``satisfaction_score``; a violin plot of ``age``
    split by ``customer_type``; and an annotated heat map of the correlation
    matrix of the four numeric columns. The sixth slot is unused.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``age``, ``purchase_amount``, ``items_bought``,
        ``satisfaction_score`` and ``customer_type``.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    fig = plt.figure(figsize=(15, 10))
    corr_columns = ['age', 'purchase_amount', 'items_bought', 'satisfaction_score']

    # (title, drawing callback) for each grid slot, in display order.
    panels = (
        ('Age vs Purchase Amount',
         lambda ax: sns.scatterplot(data=df, x='age', y='purchase_amount', alpha=0.5, ax=ax)),
        ('Purchase Amount by Customer Type',
         lambda ax: sns.boxplot(data=df, x='customer_type', y='purchase_amount', ax=ax)),
        ('Purchase Amount by Satisfaction',
         lambda ax: sns.boxplot(data=df, x='satisfaction_score', y='purchase_amount', ax=ax)),
        ('Age Distribution by Customer Type',
         lambda ax: sns.violinplot(data=df, x='customer_type', y='age', ax=ax)),
        ('Correlation Matrix',
         lambda ax: sns.heatmap(df[corr_columns].corr(), annot=True, cmap='coolwarm', ax=ax)),
    )

    for position, (title, draw) in enumerate(panels, start=1):
        ax = fig.add_subplot(2, 3, position)
        draw(ax)
        ax.set_title(title)

    plt.tight_layout()
    plt.show()

# Render the four pairwise plots plus the correlation heat map for df.
perform_bivariate_analysis(df)

## 5. Advanced Analysis and Insights

In [None]:
def generate_insights(df):
    """Segment customers and print three summary tables.

    Printed tables, in order:
      1. mean purchase amount, items bought and satisfaction per customer type;
      2. a cross-tabulation of purchase segment against satisfaction score;
      3. mean/count of purchase amount and mean satisfaction per age group.

    Side effects
    ------------
    Adds two categorical columns to ``df`` **in place**:

    * ``purchase_segment`` - terciles of ``purchase_amount`` labelled
      Low / Medium / High.
    * ``age_group`` - ``age`` binned into (0, 25], (25, 35], (35, 50] and
      (50, 100]. Ages outside (0, 100] fall in no bin, become NaN and are
      therefore left out of the age-group table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``purchase_amount``, ``items_bought``,
        ``satisfaction_score``, ``customer_type`` and ``age``.

    Returns
    -------
    None
    """
    # Customer segmentation by purchase amount
    df['purchase_segment'] = pd.qcut(df['purchase_amount'], q=3, labels=['Low', 'Medium', 'High'])

    # Average metrics by customer type
    customer_metrics = df.groupby('customer_type').agg({
        'purchase_amount': 'mean',
        'items_bought': 'mean',
        'satisfaction_score': 'mean'
    }).round(2)

    print("Average Metrics by Customer Type:")
    print(customer_metrics)

    # Satisfaction analysis
    satisfaction_by_segment = pd.crosstab(df['purchase_segment'], df['satisfaction_score'])
    print("\nSatisfaction Distribution by Purchase Segment:")
    print(satisfaction_by_segment)

    # Age group analysis
    df['age_group'] = pd.cut(df['age'], bins=[0, 25, 35, 50, 100], labels=['Young', 'Adult', 'Middle-aged', 'Senior'])
    # 'age_group' is categorical. Spell out observed=False so every age group
    # stays in the table even when empty, and so pandas does not emit the
    # FutureWarning about the changing default.
    age_analysis = df.groupby('age_group', observed=False).agg({
        'purchase_amount': ['mean', 'count'],
        'satisfaction_score': 'mean'
    }).round(2)

    print("\nMetrics by Age Group:")
    print(age_analysis)

# NOTE: besides printing, this call adds the 'purchase_segment' and
# 'age_group' columns to df in place.
generate_insights(df)

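## Practical Exercises

> **Note:** Exercise 1 uses the `age_group` column that Section 5 (`generate_insights`) adds to `df`, so run Section 5 before these cells.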

In [None]:
# Exercise 1: Customer Behavior Analysis

def analyze_customer_behavior(df):
    """
    Chart and print the mean purchase amount per age group and customer type.

    Builds a pivot table (rows: ``age_group``, columns: ``customer_type``,
    cells: mean ``purchase_amount``), shows it as a grouped bar chart and then
    prints the table rounded to two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``purchase_amount``, ``customer_type`` and ``age_group``.
        This function does not create ``age_group``; ``generate_insights(df)``
        adds it.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``df`` has no ``age_group`` column.
    """
    if 'age_group' not in df.columns:
        raise KeyError("'age_group' column not found; run generate_insights(df) first to create it")

    # 1. Calculate average purchase amount by age group and customer type.
    #    'age_group' is categorical; observed=False pins the current grouping
    #    behavior and avoids the pandas FutureWarning about the default.
    avg_purchase = pd.pivot_table(df,
                                  values='purchase_amount',
                                  index='age_group',
                                  columns='customer_type',
                                  aggfunc='mean',
                                  observed=False)

    # 2. Visualize the results.
    #    DataFrame.plot() opens its own figure unless given an Axes, which
    #    would leave a blank 12x6 figure behind and ignore the requested
    #    size, so draw on explicitly created axes.
    fig, ax = plt.subplots(figsize=(12, 6))
    avg_purchase.plot(kind='bar', ax=ax)
    ax.set_title('Average Purchase Amount by Age Group and Customer Type')
    ax.set_xlabel('Age Group')
    ax.set_ylabel('Average Purchase Amount')
    ax.legend(title='Customer Type')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    plt.show()

    # 3. Print key findings
    print("Key Findings:")
    print(avg_purchase.round(2))

# Requires df['age_group'], which is created when generate_insights(df) runs
# in section 5.
analyze_customer_behavior(df)

In [None]:
# Exercise 2: Satisfaction Analysis

def analyze_satisfaction(df):
    """
    Analyze factors affecting customer satisfaction.

    Buckets ``satisfaction_score`` into three levels, prints purchase
    statistics per level and shows how the levels are distributed across
    customer types.

    Side effects
    ------------
    Adds a categorical ``satisfaction_level`` column to ``df`` **in place**:
    scores in (0, 2] are 'Low', (2, 3] 'Medium' and (3, 5] 'High'. Scores
    outside (0, 5] fall in no bin and become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``satisfaction_score``, ``purchase_amount``,
        ``items_bought`` and ``customer_type``.

    Returns
    -------
    None
    """
    # 1. Create satisfaction segments
    df['satisfaction_level'] = pd.cut(df['satisfaction_score'],
                                     bins=[0, 2, 3, 5],
                                     labels=['Low', 'Medium', 'High'])

    # 2. Analyze purchase patterns by satisfaction.
    #    'satisfaction_level' is categorical; observed=False keeps all three
    #    levels in the table and avoids the pandas FutureWarning about the
    #    changing default.
    satisfaction_analysis = df.groupby('satisfaction_level', observed=False).agg({
        'purchase_amount': ['mean', 'count'],
        'items_bought': 'mean'
    }).round(2)

    print("Satisfaction Analysis:")
    print(satisfaction_analysis)

    # 3. Visualize satisfaction distribution by customer type
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x='satisfaction_level', hue='customer_type', ax=ax)
    ax.set_title('Satisfaction Distribution by Customer Type')
    ax.set_xlabel('Satisfaction Level')
    ax.set_ylabel('Count')
    ax.legend(title='Customer Type')
    fig.tight_layout()
    plt.show()

# NOTE: besides printing and plotting, this call adds a 'satisfaction_level'
# column to df in place.
analyze_satisfaction(df)

## MCQ Quiz

1. What is the primary purpose of EDA?
   - a) To clean data
   - b) To understand patterns in data
   - c) To build models
   - d) To create reports

2. Which plot is best for showing the distribution of a continuous variable?
   - a) Bar plot
   - b) Histogram
   - c) Scatter plot
   - d) Line plot

3. What does df.describe() show?
   - a) Column names
   - b) Missing values
   - c) Statistical summary
   - d) Data types

4. Which of the following methods can be used to detect missing values?
   - a) df.missing()
   - b) df.isnull()
   - c) df.isna()
   - d) Both b and c

5. What is a box plot useful for?
   - a) Showing correlations
   - b) Detecting outliers
   - c) Time series analysis
   - d) Categorical counts

6. Which plot shows the relationship between two continuous variables?
   - a) Bar plot
   - b) Scatter plot
   - c) Box plot
   - d) Pie chart

7. What does IQR stand for?
   - a) Internal Quality Range
   - b) Interquartile Range
   - c) Internal Quantity Review
   - d) Index Quality Rating

8. Which method creates a frequency table?
   - a) value_counts()
   - b) frequency()
   - c) count()
   - d) groupby()

9. What is a heat map used for?
   - a) Showing missing values
   - b) Showing correlations
   - c) Showing distributions
   - d) Showing trends

10. Which type of analysis examines two variables together?
    - a) Univariate analysis
    - b) Bivariate analysis
    - c) Multivariate analysis
    - d) Single analysis

Answers: 1-b, 2-b, 3-c, 4-d, 5-b, 6-b, 7-b, 8-a, 9-b, 10-b