<a href="https://colab.research.google.com/github/mallelamanojkumar90/AIML/blob/main/Week2_Day6_EDA_Hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Week 2, Day 6: Hackathon Challenge - Exploratory Data Analysis

## Challenge Overview
In this hackathon, you will perform a comprehensive exploratory data analysis on a synthetic dataset that mimics real-world e-commerce data. You'll apply the concepts learned throughout Week 2:
- Data cleaning and preprocessing
- Statistical analysis
- Data visualization
- Probability distributions
- Insight generation

## Dataset: E-commerce Customer Behavior
We'll analyze a synthetically generated e-commerce dataset containing customer behavior and purchase information.

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Set display options
# Show every column when printing wide DataFrames instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and newer releases no longer accept the bare 'seaborn' name, so use
# whichever of the two this Matplotlib installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

## Part 1: Data Generation and Loading

In [None]:
def generate_ecommerce_data(n_customers=1000, seed=42):
    """Generate a synthetic e-commerce customer-behaviour dataset.

    Args:
        n_customers: Number of customer rows to generate.
        seed: Value passed to ``np.random.seed`` before sampling. The default
            (42) reproduces the dataset this function has always produced;
            pass another integer for a different reproducible sample, or
            ``None`` to let NumPy reseed itself unpredictably.

    Returns:
        A ``pandas.DataFrame`` with one row per customer and the columns
        ``customer_id``, ``age``, ``time_spent_mins``, ``pages_visited``,
        ``cart_value``, ``items_purchased``, ``customer_type``, ``device``
        and ``satisfaction_score``.

    Note:
        Seeding goes through ``np.random.seed``, so calling this function
        resets NumPy's global random state.
    """
    np.random.seed(seed)

    # The draws below happen in dictionary order; keep that order stable so a
    # given seed always yields the same dataset.
    data = {
        'customer_id': range(1, n_customers + 1),
        'age': np.random.normal(35, 12, n_customers),
        'time_spent_mins': np.random.exponential(30, n_customers),
        'pages_visited': np.random.poisson(8, n_customers),
        'cart_value': np.random.normal(100, 50, n_customers),
        'items_purchased': np.random.poisson(3, n_customers),
        'customer_type': np.random.choice(['New', 'Returning', 'Loyal'], n_customers, p=[0.3, 0.5, 0.2]),
        'device': np.random.choice(['Mobile', 'Desktop', 'Tablet'], n_customers, p=[0.45, 0.4, 0.15]),
        'satisfaction_score': np.random.randint(1, 6, n_customers),  # integers 1..5
    }

    # Add some correlations and constraints:
    # scale the base cart value by basket size (relative to the Poisson mean
    # of 3 items) and add a little Gaussian noise.
    data['cart_value'] = data['cart_value'] * (data['items_purchased'] / 3) + np.random.normal(0, 10, n_customers)
    # The normal draws and the noise can go negative; a cart cannot.
    data['cart_value'] = np.maximum(0, data['cart_value'])

    return pd.DataFrame(data)

# Generate the dataset with the function's default arguments; `df` is the raw
# frame that the cleaning step below receives.
df = generate_ecommerce_data()
# Quick sanity check: dimensions as (rows, columns), then the first rows.
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

## Challenge Tasks

### Task 1: Data Cleaning and Preprocessing

In [None]:
def clean_data(df):
    """Clean and preprocess the dataset.

    Placeholder for the participant's own solution: as written it performs
    no work and returns ``None``. The comments below list the steps the
    task expects to be implemented.
    """
    # Your code here:
    # 1. Handle missing values
    # 2. Remove duplicates
    # 3. Handle outliers
    # 4. Feature engineering
    pass

# Example solution structure:
def example_cleaning(df):
    """Report basic data-quality checks and return an enriched copy of ``df``.

    The input frame is not modified. The function prints the per-column
    count of missing values and the number of fully duplicated rows, then
    adds the following columns to a copy:

    * ``age_outlier``, ``time_spent_mins_outlier``, ``cart_value_outlier``:
      boolean flags, True where the value lies more than 1.5 * IQR below the
      first quartile or above the third quartile of its column. Outliers are
      only flagged, never removed or altered.
    * ``avg_item_value``: ``cart_value / items_purchased``. Rows with zero
      items purchased get NaN rather than ``inf`` or a 0/0 result, so the
      column stays usable in summary statistics.

    Args:
        df: DataFrame containing at least the columns ``age``,
            ``time_spent_mins``, ``cart_value`` and ``items_purchased``.

    Returns:
        The enriched copy of ``df``.
    """
    df_clean = df.copy()

    # Check for missing values
    print("Missing values:")
    print(df_clean.isnull().sum())

    # Check for duplicates
    print("\nDuplicate rows:", df_clean.duplicated().sum())

    # Flag outliers using the IQR method
    numeric_cols = ['age', 'time_spent_mins', 'cart_value']
    for col in numeric_cols:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        df_clean[f'{col}_outlier'] = ((df_clean[col] < (Q1 - 1.5 * IQR)) |
                                      (df_clean[col] > (Q3 + 1.5 * IQR)))

    # Feature engineering
    # A customer may have purchased zero items; mask those counts to NaN
    # before dividing so the average is "undefined" instead of infinite.
    items = df_clean['items_purchased']
    df_clean['avg_item_value'] = df_clean['cart_value'] / items.where(items != 0)

    return df_clean

# Run the example cleaning step; the later example cells all operate on `df_clean`.
df_clean = example_cleaning(df)

### Task 2: Statistical Analysis

In [None]:
def perform_statistical_analysis(df):
    """Perform comprehensive statistical analysis.

    Placeholder for the participant's own solution: as written it performs
    no work and returns ``None``. The comments below list the analyses the
    task expects to be implemented.
    """
    # Your code here:
    # 1. Descriptive statistics
    # 2. Correlation analysis
    # 3. Hypothesis testing
    # 4. Distribution analysis
    pass

# Example solution structure:
def example_analysis(df):
    """Print summary statistics and plot correlations and distributions.

    Prints ``df.describe()``, shows an annotated heatmap of the pairwise
    correlations between ``age``, ``time_spent_mins``, ``cart_value`` and
    ``items_purchased``, then shows a row of three histograms (with KDE
    overlay) for the first three of those columns. Returns ``None``.
    """
    # Descriptive statistics
    print("Basic Statistics:")
    summary = df.describe()
    print(summary)

    # Pairwise correlations of the numeric behaviour columns
    numeric_cols = ['age', 'time_spent_mins', 'cart_value', 'items_purchased']
    correlation = df[numeric_cols].corr()

    _, heatmap_ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(correlation, annot=True, cmap='coolwarm', ax=heatmap_ax)
    heatmap_ax.set_title('Correlation Matrix')
    plt.show()

    # One histogram per column, side by side
    hist_cols = numeric_cols[:3]
    _, hist_axes = plt.subplots(1, 3, figsize=(15, 5))
    for axis, col in zip(hist_axes, hist_cols):
        sns.histplot(df[col], kde=True, ax=axis)
        axis.set_title(f'{col} Distribution')
    plt.tight_layout()
    plt.show()

# Run the example statistical analysis on the cleaned frame.
example_analysis(df_clean)

### Task 3: Customer Segmentation Analysis

In [None]:
def analyze_customer_segments(df):
    """Analyze different customer segments.

    Placeholder for the participant's own solution: as written it performs
    no work and returns ``None``. The comments below list the steps the
    task expects to be implemented.
    """
    # Your code here:
    # 1. Segment customers by type
    # 2. Analyze behavior patterns
    # 3. Compare metrics across segments
    pass

# Example solution structure:
def example_segmentation(df):
    """Summarise and plot key metrics for each ``customer_type``.

    Prints a table, rounded to two decimals, with the mean and row count of
    ``cart_value`` and the means of ``items_purchased`` and
    ``satisfaction_score`` per customer type, then shows three side-by-side
    box plots of those same columns split by customer type. Returns ``None``.
    """
    # Per-segment aggregates
    metrics_by_segment = {
        'cart_value': ['mean', 'count'],
        'items_purchased': 'mean',
        'satisfaction_score': 'mean',
    }
    segment_analysis = df.groupby('customer_type').agg(metrics_by_segment).round(2)

    print("Customer Segment Analysis:")
    print(segment_analysis)

    # One box plot per metric, laid out left to right in a 1x3 grid
    panels = [
        ('cart_value', 'Cart Value by Customer Type'),
        ('items_purchased', 'Items Purchased by Customer Type'),
        ('satisfaction_score', 'Satisfaction Score by Customer Type'),
    ]
    plt.figure(figsize=(15, 5))
    for position, (column, title) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        sns.boxplot(data=df, x='customer_type', y=column)
        plt.title(title)

    plt.tight_layout()
    plt.show()

# Run the example segmentation analysis on the cleaned frame.
example_segmentation(df_clean)

### Task 4: Advanced Visualization

In [None]:
def create_visualizations(df):
    """Create advanced visualizations.

    Placeholder for the participant's own solution: as written it performs
    no work and returns ``None``. The comments below list the kinds of
    plots the task expects to be implemented.
    """
    # Your code here:
    # 1. Create multi-dimensional plots
    # 2. Time-based analysis
    # 3. Interactive visualizations
    pass

# Example solution structure:
def example_visualization(df):
    """Show two example plots of customer behaviour.

    1. A scatter plot of ``cart_value`` against ``time_spent_mins``, coloured
       by ``customer_type`` and sized by ``items_purchased``.
    2. A stacked bar chart counting customers per ``device``, split by
       ``customer_type``.

    Args:
        df: DataFrame containing the columns named above.

    Returns:
        None. The figures are displayed with ``plt.show()``.
    """
    # Scatter plot with multiple dimensions
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df, x='time_spent_mins', y='cart_value',
                    hue='customer_type', size='items_purchased')
    plt.title('Customer Behavior Analysis')
    plt.show()

    # Device usage by customer type
    device_customer = pd.crosstab(df['device'], df['customer_type'])
    # DataFrame.plot opens its own figure unless it is handed an Axes, which
    # would leave a separately created figure blank and ignore its size.
    # Create the Axes explicitly and draw the bars onto it.
    _, ax = plt.subplots(figsize=(10, 6))
    device_customer.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title('Device Usage by Customer Type')
    ax.legend(title='Customer Type')
    plt.show()

# Run the example visualizations on the cleaned frame.
example_visualization(df_clean)

### Task 5: Insights and Recommendations

In [None]:
def generate_insights(df):
    """Generate insights and recommendations.

    Placeholder for the participant's own solution: as written it performs
    no work and returns ``None``. The comments below list what the task
    expects the write-up to cover.
    """
    # Your code here:
    # 1. Key findings
    # 2. Business recommendations
    # 3. Future analysis suggestions
    pass

# Example solution structure:
def example_insights(df):
    """Print headline metrics followed by a fixed list of recommendations.

    The report contains the mean cart value and mean items per purchase,
    the mean ``satisfaction_score`` per ``customer_type``, and the share of
    rows per ``device``. The three recommendation lines are static text and
    do not depend on ``df``. Returns ``None``.
    """
    print("Key Insights:")

    # Section 1: spending behaviour
    print("\n1. Customer Behavior:")
    mean_cart_value = df['cart_value'].mean()
    print(f"Average cart value: ${mean_cart_value:.2f}")
    mean_items = df['items_purchased'].mean()
    print(f"Average items per purchase: {mean_items:.2f}")

    # Section 2: mean satisfaction within each customer type
    print("\n2. Customer Satisfaction:")
    print(df.groupby('customer_type')['satisfaction_score'].mean())

    # Section 3: relative frequency of each device
    print("\n3. Device Usage:")
    print(df['device'].value_counts(normalize=True))

    print("\nRecommendations:")
    recommendations = (
        "1. Focus on mobile optimization",
        "2. Implement loyalty program improvements",
        "3. Enhance customer engagement strategies",
    )
    for recommendation in recommendations:
        print(recommendation)

# Print the example insights report for the cleaned frame.
example_insights(df_clean)

## Evaluation Criteria

Your analysis will be evaluated based on:
1. Data Cleaning and Preprocessing (20%)
   - Handling missing values
   - Dealing with outliers
   - Feature engineering

2. Statistical Analysis (20%)
   - Descriptive statistics
   - Correlation analysis
   - Distribution analysis

3. Visualization Quality (20%)
   - Appropriate chart types
   - Clear presentation
   - Meaningful insights

4. Customer Segmentation (20%)
   - Segment identification
   - Behavior analysis
   - Actionable insights

5. Insights and Recommendations (20%)
   - Clear findings
   - Business value
   - Actionable recommendations

## Submission Guidelines
1. Complete all tasks in this notebook
2. Add comments explaining your analysis
3. Include a summary of findings
4. Submit the completed notebook