In [None]:
# Solution for Exam 2021, 1MS041 (Introduction to Data Science)
# Implemented step by step for Problems 1-6

import numpy as np
import math
from scipy.stats import binom

# Problem 1: Probability Warmup
# ----------------------------
p = 11 / 20  # Probability of knowing the correct answer
n_questions = 20


def _pass_tail_probabilities(n_questions, p_know, min_known, p_guess=0.5):
    """Return P(N < min_known, Y >= T) and P(N >= min_known, Y >= T) for all T.

    N ~ Binomial(n_questions, p_know) is the number of answers the student
    knows; given N, Z ~ Binomial(n_questions - N, p_guess) is the number of
    correct guesses on the remaining questions, and Y = N + Z is the score.

    Returns two arrays indexed by the threshold T = 0, ..., n_questions.
    """
    known = np.arange(n_questions + 1)
    p_known = binom.pmf(known, n_questions, p_know)

    # joint[k, y] = P(N = k, Y = y).  Y can never be smaller than N, so the
    # entries with y < k stay zero.
    joint = np.zeros((n_questions + 1, n_questions + 1))
    for k in known:
        guesses = np.arange(n_questions - k + 1)
        joint[k, k:] = p_known[k] * binom.pmf(guesses, n_questions - k, p_guess)

    # A reverse cumulative sum over y turns P(N = k, Y = y) into
    # P(N = k, Y >= T) for every threshold T at once.
    tail = np.cumsum(joint[:, ::-1], axis=1)[:, ::-1]
    return tail[:min_known].sum(axis=0), tail[min_known:].sum(axis=0)


# One joint table serves both parts; the previous version recomputed the same
# triple loop of scalar pmf evaluations twice.
_weak_and_pass, _strong_and_pass = _pass_tail_probabilities(
    n_questions, p, min_known=10
)
_pass = _weak_and_pass + _strong_and_pass  # P(Y >= T)

# Part 1: Compute probabilities P(N < 10 | Y >= T) for T = 0, ..., n_questions
problem11_probabilities = [
    float(weak / total) if total > 0 else 0
    for weak, total in zip(_weak_and_pass, _pass)
]

# Part 2: Find smallest T where P(N >= 10 | Y >= T) >= 0.9 (None if no such T)
problem12_T = next(
    (
        T
        for T in range(n_questions + 1)
        if _pass[T] > 0 and _strong_and_pass[T] / _pass[T] >= 0.9
    ),
    None,
)

# Problem 2: Random Variable Generation
# -------------------------------------
# Part 1: Linear Congruential Generator (LCG)
def problem2_LCG(size=None, seed=0):
    """Generate ``size`` pseudo-random numbers with a linear congruential generator.

    The state follows x <- (a * x + c) mod m starting from ``seed``; each new
    state is divided by m before being returned, so the outputs are floats
    rather than raw integer states.

    Parameters
    ----------
    size : int
        Number of values to produce.
    seed : int
        Initial state x_0 (not itself part of the output).

    Returns
    -------
    list of float
        The successive states scaled by 1 / m.
    """
    # c is odd (coprime with m = 2**32) and a - 1 is a multiple of 4, so the
    # Hull-Dobell conditions for a full period hold.
    multiplier, increment, modulus = 1664525, 1013904223, 2**32

    def _states(state, count):
        # Yield the next `count` normalised states of the recurrence.
        for _ in range(count):
            state = (multiplier * state + increment) % modulus
            yield state / modulus

    return list(_states(seed, size))

# Part 2: Uniform generator using LCG
def problem2_uniform(generator=None, period=2**32, size=None, seed=0):
    """Scale the output of ``generator`` by ``1 / period``.

    Parameters
    ----------
    generator : callable
        Called as ``generator(size=size, seed=seed)``; must return an iterable
        of numbers.
    period : number
        Divisor applied to every generated value.
    size, seed
        Forwarded unchanged to ``generator``.

    Returns
    -------
    list
        Each generated value divided by ``period``.

    NOTE(review): the result lies in [0, 1) only if ``generator`` yields values
    in [0, period).  ``problem2_LCG`` in this file already divides by its
    modulus, so pairing it with the default ``period`` would rescale twice --
    confirm the intended contract before combining the two.
    """
    raw_values = generator(size=size, seed=seed)
    scaled = []
    for value in raw_values:
        scaled.append(value / period)
    return scaled

# Part 3: Accept-Reject Sampling
def problem2_accept_reject(uniformGenerator=None, size=None, seed=0):
    """Sample from p0(x) = (pi / 2) * |sin(2 * pi * x)| on [0, 1] by rejection.

    A candidate x and an independent uniform u are drawn from
    ``uniformGenerator``; x is accepted when u <= p0(x) / M, where
    M = pi / 2 is the maximum of p0 and the proposal is Uniform(0, 1)
    (density 1).

    Parameters
    ----------
    uniformGenerator : callable
        Called as ``uniformGenerator(2, seed=s)``; must return two numbers
        that behave like independent Uniform(0, 1) draws.
    size : int
        Number of accepted samples to return.
    seed : int
        Seeds the stream of sub-seeds handed to ``uniformGenerator``.

    Returns
    -------
    list
        ``size`` accepted candidates, in the order they were accepted.
    """
    envelope = math.pi / 2  # M = sup p0(x); the uniform proposal has density 1

    # A private RandomState yields the same sub-seed sequence as seeding the
    # global NumPy generator, without disturbing the caller's random state.
    seed_source = np.random.RandomState(seed)

    samples = []
    while len(samples) < size:
        u1, u2 = uniformGenerator(2, seed=seed_source.randint(1, 100000))
        x = u1  # Candidate sample
        target_density = (math.pi / 2) * abs(math.sin(2 * math.pi * x))
        # Accept with probability p0(x) / M.  Comparing u2 with the raw
        # density would accept every x with p0(x) >= 1 and flatten the peaks.
        if u2 <= target_density / envelope:
            samples.append(x)

    return samples

# Problem 3: Concentration of Measure
# -----------------------------------
# NOTE(review): the integers below index the numbered scenarios on the exam
# sheet, which is not reproduced in this file -- confirm both lists against it.
# Scenarios answered as concentrating exponentially fast.
problem3_answer_1 = [1, 5]  # Sub-Gaussian for mean and variance concentrate exponentially
# Scenarios answered as concentrating only in a weaker sense.
# NOTE(review): averages of sub-exponential variables still satisfy exponential
# (Bernstein-type) tail bounds, so double-check whether any scenario listed
# here for that reason belongs in problem3_answer_1 instead.
problem3_answer_2 = [2, 3, 6]  # Sub-exponential and finite variance concentrate in weaker sense

# Problem 4: SMS Spam Filtering
# -----------------------------
# Function to calculate probabilities and Hoeffding intervals
def calculate_probability_and_interval(spam_no_spam, condition, alpha=0.1):
    """Estimate P(spam | condition holds) with a Hoeffding half-width.

    Parameters
    ----------
    spam_no_spam : iterable
        Items whose element 0 is the message text and element 1 the label
        (1 marks spam).
    condition : callable
        Predicate applied to the lower-cased message text.
    alpha : float, optional
        Allowed failure probability; the interval has confidence 1 - alpha.
        The default 0.1 gives the 90% interval.

    Returns
    -------
    (hatP, l)
        ``hatP`` is the fraction of matching messages labelled 1 and ``l`` is
        sqrt(log(2 / alpha) / (2 n)) with n the number of matching messages.
        When nothing matches, ``(0, math.inf)`` is returned: with no
        observations the bound carries no information.
    """
    # Evaluate the predicate once per message and keep only the labels.
    matching_labels = [x[1] for x in spam_no_spam if condition(x[0].lower())]
    n_matching = len(matching_labels)

    if n_matching == 0:
        # Previously hatP was guarded but the half-width divided by zero.
        return 0, math.inf

    n_spam = sum(1 for label in matching_labels if label == 1)
    hatP = n_spam / n_matching
    l = math.sqrt(math.log(2 / alpha) / (2 * n_matching))

    return hatP, l

# Example usage
# Assuming `spam_no_spam` is loaded with SMS data
# problem4_hatP, problem4_l = calculate_probability_and_interval(spam_no_spam, lambda text: "free" in text or "prize" in text)
# problem4_hatP2, problem4_l2 = calculate_probability_and_interval(spam_no_spam, lambda text: text.count("free") >= 2)

# Problem 5: Markovian Travel
# ---------------------------
import pandas as pd

# Load travel data
def load_travel_data(file_path):
    """Read the CSV at ``file_path`` with ``pd.read_csv`` defaults and return the DataFrame."""
    return pd.read_csv(file_path)

# Generate transition matrix
def generate_transition_matrix(data):
    """Estimate a transition matrix from observed trips.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'start_city'`` and ``'end_city'``; every
        row counts as one observed transition.

    Returns
    -------
    transition_matrix : numpy.ndarray
        Entry (i, j) is the fraction of trips leaving city i that ended in
        city j.  A city with no observed departures gets an all-zero row
        instead of the NaN row that a 0 / 0 normalisation would produce.
    city_to_index : dict
        Maps each city name to its row/column index (cities sorted by name).
    """
    cities = sorted(set(data['start_city']).union(data['end_city']))
    n_cities = len(cities)
    city_to_index = {city: idx for idx, city in enumerate(cities)}

    # Count transitions; zipping the two columns avoids building a Series
    # object per row as iterrows() does.
    counts = np.zeros((n_cities, n_cities))
    for start_city, end_city in zip(data['start_city'], data['end_city']):
        counts[city_to_index[start_city], city_to_index[end_city]] += 1

    # Normalize to probabilities, skipping rows without any departures.
    row_totals = counts.sum(axis=1, keepdims=True)
    transition_matrix = np.divide(
        counts, row_totals, out=np.zeros_like(counts), where=row_totals > 0
    )
    return transition_matrix, city_to_index

# Compute stationary distribution
def compute_stationary_distribution(transition_matrix):
    """Return a left eigenvector of ``transition_matrix`` for eigenvalue 1, scaled to sum to 1.

    The eigen-decomposition is taken of the transpose, so the selected column
    v satisfies v @ transition_matrix = v up to floating-point error.  When
    several eigenvalues are numerically equal to 1, the first one reported by
    ``np.linalg.eig`` is used; an IndexError is raised if none is.
    """
    eigenvalues, eigenvectors = np.linalg.eig(transition_matrix.T)

    # Positions of the eigenvalues that are numerically equal to 1.
    unit_positions = np.flatnonzero(np.isclose(eigenvalues, 1))
    candidate = eigenvectors[:, unit_positions[0]]

    # eig fixes neither sign nor scale; dividing by the sum fixes both.
    normalised = candidate / candidate.sum()
    return normalised.real

# Compute return probability
def compute_return_probability(transition_matrix, city_to_index, start_city, steps):
    """Return the probability of being in ``start_city`` again after ``steps`` transitions.

    The chain starts in ``start_city`` with probability 1 and its distribution
    is pushed through ``transition_matrix`` ``steps`` times; the mass left on
    the start city is returned.  Intermediate visits are not excluded, so
    this is the ``steps``-step transition probability from the city to
    itself, not a first-return probability.
    """
    origin = city_to_index[start_city]

    # Row `origin` of the identity matrix is the point mass on the start city.
    distribution = np.eye(transition_matrix.shape[0])[origin]

    for _ in range(steps):
        distribution = distribution @ transition_matrix

    return distribution[origin]

# Problem 6: Black Box Testing
# ----------------------------
def compute_metrics(y_true, y_pred):
    """Compute precision, recall and accuracy for binary labels coded 0 / 1.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels of equal length.  Lists, tuples and NumPy
        arrays are all accepted; both are converted with ``np.asarray``.

    Returns
    -------
    (precision, recall, accuracy)
        Any ratio whose denominator is zero (no predicted positives, no
        actual positives, or no samples at all) is reported as 0.
    """
    # Plain lists compare to a scalar as a whole (``[1, 0] == 1`` is False),
    # so convert first to get element-wise masks.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    actual_positive = y_true == 1
    actual_negative = y_true == 0
    predicted_positive = y_pred == 1
    predicted_negative = y_pred == 0

    true_positive = np.count_nonzero(actual_positive & predicted_positive)
    false_positive = np.count_nonzero(actual_negative & predicted_positive)
    false_negative = np.count_nonzero(actual_positive & predicted_negative)
    true_negative = np.count_nonzero(actual_negative & predicted_negative)

    n_predicted_positive = true_positive + false_positive
    n_actual_positive = true_positive + false_negative
    n_samples = len(y_true)

    precision = true_positive / n_predicted_positive if n_predicted_positive > 0 else 0
    recall = true_positive / n_actual_positive if n_actual_positive > 0 else 0
    # Guarded like the other two ratios so an empty test set does not raise.
    accuracy = (true_positive + true_negative) / n_samples if n_samples > 0 else 0

    return precision, recall, accuracy

# Compute confidence intervals using Hoeffding's inequality
def compute_confidence_interval(metric, n, delta=0.05):
    """Return the two-sided Hoeffding interval ``(lower, upper)`` around ``metric``.

    The half-width is sqrt(log(2 / delta) / (2 n)), i.e. the Hoeffding bound
    for the mean of ``n`` observations bounded in [0, 1] at confidence
    1 - delta.  The end points are not clipped to [0, 1].
    """
    log_term = math.log(2 / delta)
    half_width = math.sqrt(log_term / (2 * n))
    lower = metric - half_width
    upper = metric + half_width
    return lower, upper

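# Example usage for Problem 6
# precision, recall, accuracy = compute_metrics(Y_test_problem6, predictions_problem6)
# NOTE: precision is a frequency among the predicted positives and recall a
# frequency among the actual positives, so those counts (not the size of the
# whole test set) are the sample sizes for their Hoeffding intervals.
# n_predicted_positive = int(np.sum(predictions_problem6 == 1))
# n_actual_positive = int(np.sum(Y_test_problem6 == 1))
# precision_interval = compute_confidence_interval(precision, n_predicted_positive)
# recall_interval = compute_confidence_interval(recall, n_actual_positive)
# accuracy_interval = compute_confidence_interval(accuracy, len(Y_test_problem6))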


Below is the solution approach for each of the 6 problems. I'll work on each step and provide complete Python implementations.

---

### **Problem 1: Probability Warmup**
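This involves calculating, for every pass threshold \( T \), the probability that a student who passes knew fewer than 10 answers, and then finding the smallest threshold at which a passing student knew at least 10 answers with probability at least 90%.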

I'll calculate and provide:
1. The list of probabilities \( P(N < 10 | Y \geq T) \).
2. The smallest threshold \( T \) such that \( P(N \geq 10 | Y \geq T) \geq 0.9 \).

---

### **Problem 2: Random Variable Generation and Transformation**
This problem involves:
1. Implementing a Linear Congruential Generator (LCG).
2. Sampling uniformly from [0, 1].
3. Using an accept-reject method to sample from the density \( p_0(x) = \frac{\pi}{2} |\sin(2\pi x)| \) on \( [0, 1] \).

---

### **Problem 3: Concentration of Measure**
Analyze which scenarios exhibit:
1. Exponential concentration.
2. Weaker concentration bounds.

---

### **Problem 4: SMS Spam Filtering**
This involves:
1. Calculating the probability \( P(Y=1 | \text{"free" or "prize"} \in X) \).
2. Constructing a 90% confidence interval using Hoeffding's inequality.
3. Repeating both steps for messages in which "free" appears at least twice.

---

### **Problem 5: Markovian Travel**
Tasks:
1. Load and analyze the travel dataset (`flights.csv`).
2. Compute the transition matrix for travel between cities.
3. Derive the stationary distribution.
4. Calculate the probability of returning to a specific city after three steps.

---

### **Problem 6: Black Box Testing**
Tasks:
1. Calculate precision, recall, and accuracy metrics for a provided black-box model.
2. Provide confidence intervals using Hoeffding's inequality.
3. Analyze how using a VC-dimension-3 classifier would affect confidence intervals.

---

I'll start implementing the solutions for all six problems now. Stay tuned!

The initial structure for solving all 6 problems is prepared. Key sections include implementations for probability calculations, LCG generation, accept-reject sampling, and concentration measures. Further steps will involve refining and completing the code for Problems 4 through 6.

The code for Problems 5 and 6 has been added, including tentative Python implementations for:

- Loading travel data, generating a Markov chain transition matrix, computing the stationary distribution, and calculating return probabilities for Problem 5.
- Computing precision, recall, and accuracy metrics, along with Hoeffding's confidence intervals, for Problem 6.