In [None]:
# Rejection Sampling and Monte Carlo
import numpy as np
import matplotlib.pyplot as plt

def problem1_inversion(n_samples=1):
    """
    Produces samples from the distribution:
        F(x) = 0, x <= 0
               (exp(x^2) - 1) / (exp(1) - 1), 0 < x < 1
               1, x >= 1

    Uses inverse-transform sampling. Solving u = F(x) for x on (0, 1) gives
        x = sqrt(log(1 + u * (e - 1))),
    so applying that map to U ~ Uniform(0, 1) yields draws whose CDF is F.

    Parameters
    ----------
    n_samples : int
        Number of independent samples to draw.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_samples`` with values in [0, 1].
    """
    # F is a CDF, not a density, so it must be inverted rather than used as
    # the target of an accept/reject step.
    u = np.random.uniform(0.0, 1.0, size=n_samples)
    # expm1(1) == e - 1 and log1p(t) == log(1 + t); both stay accurate for small u.
    return np.sqrt(np.log1p(u * np.expm1(1.0)))

# Part 2: Generate 100,000 samples
problem1_samples = problem1_inversion(n_samples=100000)

# Plot histogram with true density.
# A density-normalised histogram estimates the pdf f = F', not the CDF F, so
# the reference curve has to be
#     f(x) = 2x * exp(x^2) / (e - 1)   on (0, 1).
x_vals = np.linspace(0, 1, 500)
true_density = 2 * x_vals * np.exp(x_vals**2) / (np.exp(1) - 1)
plt.hist(problem1_samples, bins=50, density=True, alpha=0.6, label="Samples")
plt.plot(x_vals, true_density, label="True Density", linewidth=2)
plt.xlabel("x")
plt.ylabel("Density")
plt.legend()
plt.title("Histogram and True Density")
plt.show()

# Part 3: Monte Carlo Integration
# If the samples X follow the density f above, then mean(h(X)) estimates
# the integral of h(x) * f(x) over (0, 1). The density factor must therefore
# NOT appear inside the function that is averaged.
# NOTE(review): this assumes the target integral is
#     int_0^1 sin(x) * 2x * exp(x^2) / (e - 1) dx = E[sin(X)];
# confirm against the problem statement.
integrand = lambda x: np.sin(x)
values = integrand(problem1_samples)
problem1_integral = np.mean(values)

# Part 4: Hoeffding's Inequality for Confidence Interval
# Hoeffding for n i.i.d. values in an interval of width (b - a):
#     P(|mean - E| >= eps) <= 2 * exp(-2 * n * eps^2 / (b - a)^2).
# sin(x) on [0, 1] lies in [0, sin(1)], a subset of [0, 1], so width 1 is a
# valid (slightly conservative) bound and eps = sqrt(log(2 / delta) / (2 n)).
confidence_level = 0.95
epsilon = np.sqrt(np.log(2 / (1 - confidence_level)) / (2 * len(values)))
problem1_interval = [problem1_integral - epsilon, problem1_integral + epsilon]

# Part 5: Second Distribution

def problem1_inversion_2(n_samples=1):
    """
    Produces samples from the distribution:
        F(x) = 0, x <= 0
               20x * exp(20 - 1/x), 0 < x < 1/20
               1, x >= 1/20

    The expression 20x * exp(20 - 1/x) is increasing and equals 1 exactly at
    x = 1/20, so the support has to end there for F to be a valid CDF.

    F has no elementary inverse, so the samples are produced by rejection
    sampling against the density

        f(x) = F'(x) = 20 * exp(20 - 1/x) * (1 + 1/x),   0 < x < 1/20.

    f is increasing on its support with maximum f(1/20) = 420. The proposal
    is Uniform(0, 1/20) with density 20, so the envelope constant is
    M = 420 / 20 = 21 and a proposed x is accepted with probability
    f(x) / (M * 20) = exp(20 - 1/x) * (1 + 1/x) / 21.

    Parameters
    ----------
    n_samples : int
        Number of independent samples to draw.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_samples`` with values in (0, 1/20].
    """
    upper = 1.0 / 20.0
    envelope = 21.0  # sup f / proposal density; expected draws per accepted sample

    accepted = np.empty(0)
    while accepted.size < n_samples:
        missing = n_samples - accepted.size
        # Oversample by the expected rejection rate plus slack; cap the batch
        # so very large requests do not allocate everything at once.
        batch = min(int(missing * envelope * 1.2) + 16, 1_000_000)
        # 1 - U lies in (0, 1], which keeps x strictly positive so 1/x is finite.
        x = upper * (1.0 - np.random.uniform(0.0, 1.0, size=batch))
        u = np.random.uniform(0.0, 1.0, size=batch)
        inv_x = 1.0 / x
        keep = u * envelope <= np.exp(20.0 - inv_x) * (1.0 + inv_x)
        accepted = np.concatenate([accepted, x[keep]])

    return accepted[:n_samples]

# Report the Monte Carlo estimate and its confidence interval
print("Generated values:")
for _label, _value in (
    ("Integral Estimate:", problem1_integral),
    ("Confidence Interval:", problem1_interval),
):
    print(_label, _value)


In [None]:
# Logistic Spam Model
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from scipy.optimize import minimize
from scipy.stats import norm

# Part 1: Read the spam data and carve it into train / calibration / test subsets
# Point 'data/spam.csv' at the real location of the CSV file before running
import pandas as pd
data = pd.read_csv('data/spam.csv')

# Feature matrix and label vector
problem2_X = data[['free', 'prize', 'win']].values  # presumed to play the role of X1, X2, X3
problem2_Y = data['spam'].values  # presumed to play the role of Y

# First cut keeps 40% for training and holds out 60%; the second cut sends
# one third of the held-out rows to calibration and two thirds to test.
X_train, X_temp, Y_train, Y_temp = train_test_split(
    problem2_X,
    problem2_Y,
    test_size=0.6,
    random_state=42,
)
X_calib, X_test, Y_calib, Y_test = train_test_split(
    X_temp,
    Y_temp,
    test_size=2/3,
    random_state=42,
)

problem2_X_train = X_train
problem2_X_calib = X_calib
problem2_X_test = X_test
problem2_Y_train = Y_train
problem2_Y_calib = Y_calib
problem2_Y_test = Y_test

# Shapes of the three feature blocks followed by the three label blocks
print(*(
    part.shape
    for part in (
        problem2_X_train, problem2_X_calib, problem2_X_test,
        problem2_Y_train, problem2_Y_calib, problem2_Y_test,
    )
))

# Part 2: Implementing ProportionalSpam class
class ProportionalSpam:
    """Logistic model P(Y = 1 | X) = G(b0 + X @ b) with G(t) = 1 / (1 + exp(-t)).

    The parameters are fitted by minimising the mean negative log-likelihood
    with scipy's conjugate-gradient optimiser.
    """

    def __init__(self):
        # Fitted parameters [intercept, slope_1, ..., slope_d]; None until fit() has run.
        self.coeffs = None
        # OptimizeResult returned by scipy.optimize.minimize in the last fit(); None before.
        self.result = None

    @staticmethod
    def _sigmoid(z):
        """Numerically stable logistic function 1 / (1 + exp(-z))."""
        # exp(z) / (1 + exp(z)) evaluates to inf / inf = nan for large z;
        # going through log(1 + exp(-z)) never overflows.
        return np.exp(-np.logaddexp(0.0, -z))

    def loss(self, X, Y, coeffs):
        """Mean negative log-likelihood of labels Y under the logistic model.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        Y : array-like, shape (n_samples,), entries in {0, 1}
        coeffs : array-like, shape (n_features + 1,)
            Intercept first, then one slope per feature.

        Returns
        -------
        float
        """
        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y, dtype=float)
        logits = np.dot(X, coeffs[1:]) + coeffs[0]
        # -[y log G(z) + (1 - y) log(1 - G(z))] simplifies to log(1 + e^z) - y z,
        # which needs no epsilon inside the logarithm and cannot produce nan.
        return np.mean(np.logaddexp(0.0, logits) - Y * logits)

    def fit(self, X, Y):
        """Estimate the coefficients from training data; stores coeffs and result."""
        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y, dtype=float)
        # One intercept plus one slope per feature column, started at zero.
        initial_arguments = np.zeros(X.shape[1] + 1)
        opt_loss = lambda coeffs: self.loss(X, Y, coeffs)
        self.result = minimize(opt_loss, initial_arguments, method='CG')
        self.coeffs = self.result.x

    def predict(self, X):
        """Return P(Y = 1 | X) for each row of X, or None if the model is unfitted."""
        if self.coeffs is None:
            return None
        X = np.asarray(X, dtype=float)
        logits = np.dot(X, self.coeffs[1:]) + self.coeffs[0]
        return self._sigmoid(logits)

# Part 3: Fit the logistic model, then a calibrator on top of its scores
problem2_ps = ProportionalSpam()
problem2_ps.fit(problem2_X_train, problem2_Y_train)

# Model scores on the calibration split, shaped as a single-feature column
problem2_X_pred = problem2_ps.predict(problem2_X_calib)[:, np.newaxis]

# Regression tree mapping raw score -> observed label on the calibration split
# (fit returns the estimator itself, so construction and fitting can be chained)
problem2_calibrator = DecisionTreeRegressor().fit(problem2_X_pred, problem2_Y_calib)

# Part 4: Score the test split, threshold, and quantify the 0-1 loss
# Raw model scores on the test split as a single-feature column
predicted_probabilities = problem2_ps.predict(problem2_X_test)[:, np.newaxis]
calibrated_probabilities = problem2_calibrator.predict(predicted_probabilities)

# Classify as spam (1) whenever the calibrated probability reaches one half
problem2_final_predictions = (calibrated_probabilities >= 0.5).astype(int)

# 0-1 loss = fraction of test rows whose predicted label differs from the truth
problem2_01_loss = (problem2_final_predictions != problem2_Y_test).mean()

# 99% normal-approximation interval: loss +/- z * sqrt(loss * (1 - loss) / n)
n_test = problem2_Y_test.shape[0]
loss_variance = (problem2_01_loss * (1 - problem2_01_loss)) / n_test
z_score = norm.ppf(0.995)  # two-sided 99% leaves 0.5% in each tail
margin_of_error = z_score * np.sqrt(loss_variance)
problem2_interval = (
    problem2_01_loss - margin_of_error,
    problem2_01_loss + margin_of_error,
)

# Report
print("0-1 Loss:", problem2_01_loss)
print("99% Confidence Interval:", problem2_interval)


In [None]:
# Markov Chain Analysis
import numpy as np

# PART 1: TRANSITION MATRIX
# Transition matrices for Markov Chains A, B, C, and D.
# Entry [i, j] is the probability of moving from state i to state j, so every
# row has to sum to 1.
problem3_A = np.array([
    [0.8, 0.2, 0.0, 0.0],  # A -> A, B, C, D
    [0.6, 0.2, 0.2, 0.0],  # B -> A, B, C, D
    [0.0, 0.4, 0.6, 0.0],  # C -> A, B, C, D
    [0.0, 0.0, 0.2, 0.8],  # D -> A, B, C, D
])

problem3_B = np.array([
    [0.8, 0.0, 0.0, 0.2],
    # NOTE(review): this row previously read [0.2, 0.0, 1.0, 0.0], which sums
    # to 1.2. Dropping the 0.2 is the single-entry repair under which states
    # B and C form a closed class of period 2 and the chain is reducible;
    # confirm against the original chain diagram.
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 1.0, 0.0, 0.0],
    [0.0, 0.5, 0.5, 0.0],
])

problem3_C = np.array([
    # FIXME(review): the first row sums to 1.2, so this is not a valid
    # transition matrix as written. Nothing visible here identifies which
    # entry is wrong; correct it from the original chain diagram.
    [0.2, 0.3, 0.2, 0.0, 0.5],
    [0.3, 0.2, 0.0, 0.4, 0.1],
    [0.0, 0.6, 0.0, 0.0, 0.4],
    [0.4, 0.0, 0.6, 0.0, 0.0],
    [0.6, 0.0, 0.0, 0.4, 0.0],
])

problem3_D = np.array([
    [0.8, 0.2, 0.0, 0.0],
    [0.6, 0.2, 0.2, 0.0],
    [0.0, 0.4, 0.6, 0.0],
    [0.1, 0.0, 0.7, 0.2],
])

# PART 2: IRREDUCIBLE
# Irreducibility and periods are derived from the transition graph (an edge
# i -> j exists when P[i, j] > 0) instead of being typed in by hand, so the
# answers always agree with the matrices they describe.

def _is_irreducible(P):
    """Return True when every state can reach every other state."""
    adjacency = (np.asarray(P) > 0).astype(int)
    n_states = adjacency.shape[0]
    # reachable[i, j] == 1 once j is known to be reachable from i; each pass
    # extends the known paths by one step, and n - 1 steps are enough.
    reachable = np.eye(n_states, dtype=int)
    for _ in range(n_states - 1):
        reachable = ((reachable + reachable @ adjacency) > 0).astype(int)
    return bool(reachable.all())


def _state_periods(P):
    """Return the period of each state: gcd of all possible return times.

    A state that can never return to itself has no return times; it is
    reported with period 0 (the gcd of an empty set).
    """
    adjacency = (np.asarray(P) > 0).astype(int)
    n_states = adjacency.shape[0]
    periods = np.zeros(n_states, dtype=int)
    walks = np.eye(n_states, dtype=int)
    # Closed walks of length up to 3n already generate the gcd: any simple
    # cycle in a state's class (length <= n) can be spliced into a round trip
    # through that state of length <= 2(n - 1).
    for length in range(1, 3 * n_states + 1):
        walks = ((walks @ adjacency) > 0).astype(int)
        returns = np.diag(walks) > 0
        periods[returns] = np.gcd(periods[returns], length)
    return periods


problem3_A_irreducible = _is_irreducible(problem3_A)
problem3_B_irreducible = _is_irreducible(problem3_B)
problem3_C_irreducible = _is_irreducible(problem3_C)
problem3_D_irreducible = _is_irreducible(problem3_D)

# PART 3: APERIODICITY
# Periods per state, then the chain-level flag: a chain counts as aperiodic
# when every state that can return to itself has period 1.
problem3_A_periods = _state_periods(problem3_A)
problem3_B_periods = _state_periods(problem3_B)
problem3_C_periods = _state_periods(problem3_C)
problem3_D_periods = _state_periods(problem3_D)

problem3_A_is_aperiodic = bool(np.all(problem3_A_periods[problem3_A_periods > 0] == 1))
problem3_B_is_aperiodic = bool(np.all(problem3_B_periods[problem3_B_periods > 0] == 1))
problem3_C_is_aperiodic = bool(np.all(problem3_C_periods[problem3_C_periods > 0] == 1))
problem3_D_is_aperiodic = bool(np.all(problem3_D_periods[problem3_D_periods > 0] == 1))

# PART 4: STATIONARY DISTRIBUTION
# Existence flags: each of the four chains is marked as having a stationary distribution
problem3_A_has_stationary = problem3_B_has_stationary = True
problem3_C_has_stationary = problem3_D_has_stationary = True

# Calculate stationary distributions
def stationary_distribution(P):
    """Return a stationary distribution pi of transition matrix P (pi @ P == pi).

    Solves the linear system  pi (P - I) = 0,  sum(pi) = 1  by least squares.
    This always yields exactly one vector of length n: the unique stationary
    distribution when there is one, and the minimum-norm solution when several
    exist (chains with more than one closed class).

    Parameters
    ----------
    P : array-like, shape (n, n)
        Row-stochastic transition matrix.

    Returns
    -------
    numpy.ndarray
        Real 1-D array of length n.

    Raises
    ------
    ValueError
        If P is not a square 2-D matrix.

    Warns
    -----
    RuntimeWarning
        If some row of P does not sum to 1. The system then generally has no
        exact solution and the returned vector is only a least-squares fit,
        not a stationary distribution.
    """
    import warnings

    P = np.asarray(P, dtype=float)
    if P.ndim != 2 or P.shape[0] != P.shape[1]:
        raise ValueError("P must be a square 2-D transition matrix")
    n_states = P.shape[0]

    if not np.allclose(P.sum(axis=1), 1.0):
        warnings.warn(
            "rows of P do not all sum to 1; result is not a stationary distribution",
            RuntimeWarning,
            stacklevel=2,
        )

    # Stack the n balance equations (P^T - I) pi = 0 on top of the
    # normalisation row 1^T pi = 1 and solve them together.
    system = np.vstack([P.T - np.eye(n_states), np.ones((1, n_states))])
    target = np.zeros(n_states + 1)
    target[-1] = 1.0
    pi, *_ = np.linalg.lstsq(system, target, rcond=None)

    # Remove round-off noise such as -1e-17 on states with zero mass.
    pi[np.abs(pi) < 1e-12] = 0.0
    return pi

# Stationary distribution of each chain, evaluated in the order A, B, C, D
(
    problem3_A_stationary_dist,
    problem3_B_stationary_dist,
    problem3_C_stationary_dist,
    problem3_D_stationary_dist,
) = (
    stationary_distribution(chain)
    for chain in (problem3_A, problem3_B, problem3_C, problem3_D)
)

# PART 5: REVERSIBILITY
# Reversibility checks (Detailed balance condition)
def is_reversible(P, pi):
    """Check detailed balance pi[i] * P[i, j] == pi[j] * P[j, i] for all i, j.

    Pairs are compared with np.isclose in row-major order over
    range(len(pi)); the scan stops at the first pair that fails.
    Returns True when no pair fails (including when pi is empty).
    """
    size = len(pi)
    return all(
        np.isclose(pi[row] * P[row, col], pi[col] * P[col, row])
        for row in range(size)
        for col in range(size)
    )

# Detailed-balance verdict for each chain against its stationary distribution
(
    problem3_A_is_reversible,
    problem3_B_is_reversible,
    problem3_C_is_reversible,
    problem3_D_is_reversible,
) = (
    is_reversible(chain, dist)
    for chain, dist in zip(
        (problem3_A, problem3_B, problem3_C, problem3_D),
        (
            problem3_A_stationary_dist,
            problem3_B_stationary_dist,
            problem3_C_stationary_dist,
            problem3_D_stationary_dist,
        ),
    )
)

# Collect everything in one dict; every list is ordered A, B, C, D
results = dict([
    ("Transition Matrices", [problem3_A, problem3_B, problem3_C, problem3_D]),
    ("Irreducibility", [
        problem3_A_irreducible,
        problem3_B_irreducible,
        problem3_C_irreducible,
        problem3_D_irreducible,
    ]),
    ("Aperiodicity", [
        problem3_A_is_aperiodic,
        problem3_B_is_aperiodic,
        problem3_C_is_aperiodic,
        problem3_D_is_aperiodic,
    ]),
    ("Periods", [
        problem3_A_periods,
        problem3_B_periods,
        problem3_C_periods,
        problem3_D_periods,
    ]),
    ("Stationary Distributions", [
        problem3_A_stationary_dist,
        problem3_B_stationary_dist,
        problem3_C_stationary_dist,
        problem3_D_stationary_dist,
    ]),
    ("Reversibility", [
        problem3_A_is_reversible,
        problem3_B_is_reversible,
        problem3_C_is_reversible,
        problem3_D_is_reversible,
    ]),
])

# Bare last expression so the notebook renders the dict
results
