<a href="https://colab.research.google.com/github/mallelamanojkumar90/AIML/blob/main/Week2_Day5_Probability_Fundamentals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Week 2, Day 5: Probability Fundamentals

## Learning Objectives
- Understand basic probability concepts
- Learn probability distributions
- Master Bayes' Theorem
- Apply probability concepts in ML context


In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

## 1. Basic Probability Concepts

In [None]:
def coin_flip_simulation(n_flips=1000, seed=None):
    """Simulate fair coin flips and plot the running share of heads.

    Prints the observed share of heads and of tails, then plots how the
    cumulative share of heads evolves flip by flip against a dashed
    reference line at 0.5.

    Args:
        n_flips: Number of flips to simulate; must be at least 1.
        seed: Optional seed for NumPy's global random generator. When None
            (the default) the generator is not reseeded, so every call
            draws a fresh sequence.

    Raises:
        ValueError: If n_flips is smaller than 1.
    """
    # Zero flips would otherwise print NaN and emit a RuntimeWarning.
    if n_flips < 1:
        raise ValueError("n_flips must be at least 1")
    if seed is not None:
        np.random.seed(seed)

    # Simulate coin flips
    flips = np.random.choice(['Heads', 'Tails'], size=n_flips)
    is_heads = flips == 'Heads'  # computed once, reused for stats and plot

    # Calculate probabilities
    prob_heads = np.mean(is_heads)
    prob_tails = np.mean(~is_heads)

    print(f"Probability of Heads: {prob_heads:.3f}")
    print(f"Probability of Tails: {prob_tails:.3f}")

    # Running share of heads after each flip: heads so far / flips so far
    cumulative_prob = np.cumsum(is_heads) / np.arange(1, n_flips + 1)

    plt.figure(figsize=(10, 5))
    plt.plot(cumulative_prob)
    plt.axhline(y=0.5, color='r', linestyle='--')  # theoretical value
    plt.title('Convergence of Coin Flip Probability')
    plt.xlabel('Number of Flips')
    plt.ylabel('Probability of Heads')
    plt.show()

coin_flip_simulation()

## 2. Probability Distributions

In [None]:
def demonstrate_distributions():
    """Sample, plot and summarize normal, binomial and Poisson data.

    Draws 1000 values from each distribution with a fixed seed, shows the
    three histograms side by side, then prints each sample's mean and
    variance.
    """
    np.random.seed(42)
    n_draws = 1000

    # The draw order (normal, binomial, poisson) is part of the seeded
    # result, so the entries must stay in this order.
    samples = {
        'Normal': np.random.normal(loc=0, scale=1, size=n_draws),
        'Binomial': np.random.binomial(n=10, p=0.5, size=n_draws),
        'Poisson': np.random.poisson(lam=3, size=n_draws),
    }

    # Continuous data gets a KDE overlay; count data gets integer-wide bins.
    hist_options = {
        'Normal': {'kde': True},
        'Binomial': {'discrete': True},
        'Poisson': {'discrete': True},
    }

    _, axes = plt.subplots(1, 3, figsize=(15, 5))
    for axis, (label, values) in zip(axes, samples.items()):
        sns.histplot(values, ax=axis, **hist_options[label])
        axis.set_title(f'{label} Distribution')

    plt.tight_layout()
    plt.show()

    # Summary statistics for each sample
    for label, values in samples.items():
        print(f"\n{label} Distribution:")
        print(f"Mean: {values.mean():.3f}")
        print(f"Variance: {values.var():.3f}")

demonstrate_distributions()

## 3. Bayes' Theorem

In [None]:
def demonstrate_bayes_theorem():
    """Work through Bayes' Theorem on a medical-test example.

    Uses a 1% disease prevalence, 95% sensitivity and 90% specificity to
    compute the probability of disease given a positive test, prints the
    inputs and the result, and draws a two-level probability tree.
    """
    # Prior: share of the population that has the disease
    P_disease = 0.01

    # Test characteristics
    P_positive_given_disease = 0.95    # Sensitivity
    P_negative_given_healthy = 0.90    # Specificity

    # Total probability of a positive test = true positives + false positives
    true_positive = P_positive_given_disease * P_disease
    false_positive = (1 - P_negative_given_healthy) * (1 - P_disease)
    P_positive = true_positive + false_positive

    # Bayes' Theorem: P(disease | positive)
    P_disease_given_positive = true_positive / P_positive

    print("Medical Test Example:")
    print(f"Probability of having disease: {P_disease:.1%}")
    print(f"Test sensitivity: {P_positive_given_disease:.1%}")
    print(f"Test specificity: {P_negative_given_healthy:.1%}")
    print(f"\nIf test is positive, probability of having disease: {P_disease_given_positive:.1%}")

    # Tree diagram on a blank canvas
    plt.figure(figsize=(12, 6))
    plt.axis('off')

    # (x, y, label, probability) for every node: roots first, then leaves
    nodes = [
        (0.1, 0.8, 'Disease', P_disease),
        (0.1, 0.2, 'No Disease', 1-P_disease),
        (0.5, 0.9, 'Test +ve', P_positive_given_disease),
        (0.5, 0.7, 'Test -ve', 1-P_positive_given_disease),
        (0.5, 0.3, 'Test +ve', 1-P_negative_given_healthy),
        (0.5, 0.1, 'Test -ve', P_negative_given_healthy),
    ]
    for x_pos, y_pos, label, prob in nodes:
        plt.text(x_pos, y_pos, f'{label}\n{prob:.1%}', ha='center')

    # Branches: (root height, leaf height); x runs from 0.15 to 0.45
    branches = [(0.8, 0.9), (0.8, 0.7), (0.2, 0.3), (0.2, 0.1)]
    for y_root, y_leaf in branches:
        plt.plot([0.15, 0.45], [y_root, y_leaf], 'k-')

    plt.title("Probability Tree Diagram")
    plt.show()

demonstrate_bayes_theorem()

## 4. Probability in Machine Learning

In [None]:
def demonstrate_ml_probability():
    """Show class-conditional densities on synthetic spam/ham email lengths.

    Generates seeded normal samples of email length for each class, plots
    their kernel density estimates, then evaluates a normal density fitted
    to each class at a test length of 80 and prints both values together
    with their ratio.
    """
    np.random.seed(42)
    n_samples = 1000
    per_class = n_samples // 2

    # Feature (e.g., email length); spam is drawn first, then ham
    spam_length = np.random.normal(100, 20, per_class)
    ham_length = np.random.normal(50, 15, per_class)

    # One row per email, spam rows followed by ham rows
    emails = pd.DataFrame({
        'length': np.concatenate([spam_length, ham_length]),
        'type': np.repeat(['spam', 'ham'], per_class),
    })

    plt.figure(figsize=(10, 5))
    sns.kdeplot(data=emails, x='length', hue='type')
    plt.title('Email Length Distribution by Type')
    plt.xlabel('Email Length')
    plt.ylabel('Density')
    plt.show()

    # Example email length
    test_length = 80

    # Normal density at test_length, using each class's sample mean and std
    p_spam = stats.norm.pdf(test_length, spam_length.mean(), spam_length.std())
    p_ham = stats.norm.pdf(test_length, ham_length.mean(), ham_length.std())

    print(f"\nFor an email of length {test_length}:")
    print(f"P(length|spam): {p_spam:.4f}")
    print(f"P(length|ham): {p_ham:.4f}")
    print(f"Likelihood ratio (spam/ham): {p_spam/p_ham:.4f}")

demonstrate_ml_probability()

## Practical Exercises

In [None]:
# Exercise 1: Spam Filter Probability

def spam_filter_exercise():
    """Estimate spam-related probabilities from synthetic keyword data.

    Generates 1000 seeded emails with two binary keyword features, labels
    an email as spam when it contains both keywords or when random noise
    fires, prints P(spam), P(urgent|spam) and P(money|spam), and shows a
    heatmap of the keyword contingency table.
    """
    # Create synthetic email data
    np.random.seed(42)
    n_emails = 1000

    # Features (0/1 integer arrays)
    contains_urgent = np.random.binomial(1, 0.7, n_emails)  # 1 if contains "urgent"
    contains_money = np.random.binomial(1, 0.6, n_emails)   # 1 if contains "money"

    # Generate labels (spam/ham) as a genuine boolean mask. Combining the
    # integer features directly with `|` would yield an int array of 0/1,
    # and indexing with that selects elements 0 and 1 over and over
    # (integer indexing) instead of filtering the spam rows.
    has_both_keywords = (contains_urgent == 1) & (contains_money == 1)
    random_noise = np.random.random(n_emails) < 0.2  # Some random noise
    is_spam = has_both_keywords | random_noise

    # Calculate probabilities; the boolean mask restricts to spam emails
    p_spam = np.mean(is_spam)
    p_urgent_given_spam = np.mean(contains_urgent[is_spam])
    p_money_given_spam = np.mean(contains_money[is_spam])

    print("Email Classification Probabilities:")
    print(f"P(spam): {p_spam:.3f}")
    print(f"P(urgent|spam): {p_urgent_given_spam:.3f}")
    print(f"P(money|spam): {p_money_given_spam:.3f}")

    # Visualize relationships
    plt.figure(figsize=(10, 5))

    # Contingency table: rows are "urgent" values, columns are "money" values
    contingency = pd.crosstab(contains_urgent, contains_money)

    sns.heatmap(contingency, annot=True, fmt='d', cmap='YlOrRd')
    plt.title('Relationship between "urgent" and "money" keywords')
    plt.xlabel('Contains "money"')
    plt.ylabel('Contains "urgent"')
    plt.show()

spam_filter_exercise()

In [None]:
# Exercise 2: Customer Purchase Probability

def customer_purchase_exercise():
    """Simulate customer purchases and compare buyers with non-buyers.

    Draws seeded ages and incomes for 1000 customers, derives each
    customer's purchase probability as the product of the normal CDFs of
    age and income, samples a purchase decision from it, prints the overall
    purchase rate, and shows box plots of age and income split by decision.
    """
    np.random.seed(42)
    n_customers = 1000

    # Customer features (age is drawn before income)
    age = np.random.normal(35, 10, n_customers)
    income = np.random.normal(50000, 20000, n_customers)

    # Each CDF maps a feature to its percentile in [0, 1]; their product is
    # used as the per-customer probability of buying.
    age_percentile = stats.norm.cdf(age, 35, 10)
    income_percentile = stats.norm.cdf(income, 50000, 20000)
    purchased = np.random.binomial(1, age_percentile * income_percentile)

    customer_data = pd.DataFrame({
        'Age': age,
        'Income': income,
        'Purchased': purchased,
    })

    # Share of customers who purchased
    p_purchase = purchased.mean()

    print("Customer Purchase Analysis:")
    print(f"Overall purchase probability: {p_purchase:.3f}")

    # One box plot per feature, side by side
    plt.figure(figsize=(12, 5))
    for position, feature in ((121, 'Age'), (122, 'Income')):
        plt.subplot(position)
        sns.boxplot(data=customer_data, x='Purchased', y=feature)
        plt.title(f'{feature} Distribution by Purchase Decision')

    plt.tight_layout()
    plt.show()

customer_purchase_exercise()

## MCQ Quiz

1. What is the sum of probabilities for all possible outcomes in a sample space?
   - a) 0
   - b) 0.5
   - c) 1
   - d) 100

2. Which probability distribution is symmetric and bell-shaped?
   - a) Uniform
   - b) Normal
   - c) Exponential
   - d) Poisson

3. What does Bayes' Theorem calculate?
   - a) Joint probability
   - b) Conditional probability
   - c) Marginal probability
   - d) Independent probability

4. In a binomial distribution, what must be true about each trial?
   - a) Continuous outcomes
   - b) Independent events
   - c) Variable probability
   - d) Dependent events

5. What is the probability of getting heads OR tails in a fair coin flip?
   - a) 0.5
   - b) 1.0
   - c) 0
   - d) 2.0

6. Which term describes the probability of A given that B has occurred?
   - a) Joint probability
   - b) Conditional probability
   - c) Independent probability
   - d) Marginal probability

7. What is the expected value of a fair six-sided die roll?
   - a) 3
   - b) 3.5
   - c) 6
   - d) 7

8. In machine learning, what does the likelihood function measure?
   - a) Model accuracy
   - b) Probability of data given parameters
   - c) Error rate
   - d) Feature importance

9. What is the variance of a constant?
   - a) 1
   - b) The constant value
   - c) 0
   - d) Undefined

10. Which probability concept is most relevant to Naive Bayes classification?
    - a) Central Limit Theorem
    - b) Law of Large Numbers
    - c) Conditional Independence
    - d) Joint Distribution

Answers: 1-c, 2-b, 3-b, 4-b, 5-b, 6-b, 7-b, 8-b, 9-c, 10-c