In [None]:
# Markov Chain Analysis
import numpy as np

# Transition matrix: entry [i, j] is the one-step probability of moving
# from state i (row) to state j (column).
# State order: 0 = Downtown, 1 = Suburbs, 2 = Countryside.
transition_matrix = np.array([
    [0.3, 0.4, 0.3],  # Downtown
    [0.2, 0.5, 0.3],  # Suburbs
    [0.4, 0.3, 0.3]   # Countryside
])

# Part 1: Probability of being in downtown after two steps from suburbs
# Row vector of the start distribution times P^2 gives the two-step distribution.
initial_state_suburbs = np.array([0, 1, 0])  # Start in suburbs
prob_two_steps = np.dot(initial_state_suburbs, np.linalg.matrix_power(transition_matrix, 2))
problem1_p1 = prob_two_steps[0]

# Part 2: Probability of first being in downtown after two steps from suburbs
# "First" means the chain must NOT be in downtown after step one, so the
# intermediate state may be either suburbs or countryside:
#   P(first visit at step 2) = sum over k != downtown of P[suburbs, k] * P[k, downtown]
prob_first_downtown_two_steps = (
    transition_matrix[1, 1] * transition_matrix[1, 0]    # suburbs -> suburbs -> downtown
    + transition_matrix[1, 2] * transition_matrix[2, 0]  # suburbs -> countryside -> downtown
)
problem1_p2 = prob_first_downtown_two_steps

# Part 3: Is the Markov chain irreducible?
# The Markov chain is irreducible if it's possible to go from any state to any other state.
def is_irreducible(matrix):
    """Return whether every state can reach every other state.

    Parameters
    ----------
    matrix : numpy.ndarray
        Square transition matrix; entry [i, j] > 0 means state j can be
        reached from state i in one step.

    Returns
    -------
    numpy.bool_
        True when the chain is irreducible, False otherwise.

    Notes
    -----
    Checking ``matrix**(n-1) > 0`` is not sufficient: a periodic chain (for
    example a two-state chain that always flips) is irreducible even though
    no single power of its matrix is strictly positive.  Instead we track
    reachability in *at most* k steps by adding the identity (the zero-step
    path) to the one-step adjacency pattern.
    """
    n = matrix.shape[0]
    # reach[i, j] is True when j is reachable from i in at most `covered` steps.
    reach = (matrix > 0) | np.eye(n, dtype=bool)
    covered = 1
    # Any reachable state is reachable within n - 1 steps; squaring the
    # reachability pattern doubles the path length covered on each pass.
    while covered < n - 1:
        hops = reach.astype(np.int64)
        # Threshold after every product so entries never grow beyond n.
        reach = (hops @ hops) > 0
        covered *= 2
    return np.all(reach)

# Evaluate irreducibility for the three-state chain defined above.
problem1_irreducible = is_irreducible(transition_matrix)

# Part 4: Stationary distribution
# Find the row vector \pi with \pi * P = \pi whose entries add up to 1.
def stationary_distribution(matrix):
    """Least-squares solution of the stationarity equations for ``matrix``.

    The system stacks the balance equations ``(P^T - I) pi = 0`` on top of
    the normalisation row ``sum(pi) = 1`` and hands the resulting
    (n + 1) x n system to ``numpy.linalg.lstsq``.

    Parameters
    ----------
    matrix : numpy.ndarray
        Square transition matrix (rows index the current state).

    Returns
    -------
    numpy.ndarray
        Vector of length n returned by the least-squares solver.
    """
    size = matrix.shape[0]

    # Left-hand side: n balance rows followed by one row of ones.
    balance_rows = matrix.T - np.identity(size)
    normalisation_row = np.ones((1, size))
    lhs = np.concatenate([balance_rows, normalisation_row], axis=0)

    # Right-hand side: zeros for the balance rows, 1 for the normalisation.
    rhs = np.zeros(size + 1)
    rhs[-1] = 1

    solution, *_ = np.linalg.lstsq(lhs, rhs, rcond=None)
    return solution

# Stationary distribution of the three-state chain defined above.
problem1_stationary = stationary_distribution(transition_matrix)

# Part 5: Expected number of steps to enter the suburbs starting from downtown
# Let h_i be the expected number of steps until the chain first reaches the
# suburbs when started in state i.  Conditioning on the first step gives, for
# every non-suburb state i,
#     h_i = 1 + sum over k != suburbs of P[i, k] * h_k,
# i.e. (I - Q) h = 1, where Q is P restricted to the non-suburb states.
# Entries of P^n cannot be used as hitting probabilities: P^n[0, 1] is the
# probability of *being* in the suburbs at step n, not of arriving there for
# the first time.
start_state, target_state = 0, 1  # Downtown -> Suburbs
other_states = [s for s in range(transition_matrix.shape[0]) if s != target_state]
Q = transition_matrix[np.ix_(other_states, other_states)]
mean_hitting_times = np.linalg.solve(
    np.eye(len(other_states)) - Q,
    np.ones(len(other_states)),
)
# Pick the entry that corresponds to the starting state (downtown).
expected_hitting_time = mean_hitting_times[other_states.index(start_state)]
problem1_ET = expected_hitting_time

# Report the answers to all five parts, one per line
print(
    f"Part 1: Probability of being in downtown after two steps = {problem1_p1}",
    f"Part 2: Probability of first being in downtown after two steps = {problem1_p2}",
    f"Part 3: Is the Markov chain irreducible? {problem1_irreducible}",
    f"Part 4: Stationary distribution = {problem1_stationary}",
    f"Part 5: Expected number of steps to enter the suburbs = {problem1_ET}",
    sep="\n",
)


In [None]:
# Abalone Regression
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from scipy.stats import binom

# Part 1: Read the abalone CSV into a DataFrame
file_path = 'data/abalone.csv'
problem2_df = pd.read_csv(file_path)

# Columns used as predictors, and the column the model should predict
problem2_features = [
    'Length',
    'Diameter',
    'Height',
    'WholeWeight',
    'ShuckedWeight',
    'VisceraWeight',
    'ShellWeight',
]
problem2_target = 'Rings'

# Part 2: Hold out 20% of the rows for testing (fixed seed so the split repeats)
X = problem2_df.loc[:, problem2_features]
y = problem2_df.loc[:, problem2_target]
(
    problem2_X_train,
    problem2_X_test,
    problem2_y_train,
    problem2_y_test,
) = train_test_split(X, y, test_size=0.2, random_state=42)

# Part 3: Fit a linear regression on the training split
problem2_model = LinearRegression()
problem2_model.fit(problem2_X_train, problem2_y_train)

# Part 4: Score the model on the held-out rows
# Predictions for the test split
problem2_y_pred = problem2_model.predict(problem2_X_test)

# Mean Absolute Error (MAE) between true and predicted ring counts
problem2_mae = mean_absolute_error(y_true=problem2_y_test, y_pred=problem2_y_pred)

# Residuals (true minus predicted), used for the EDF plot
residuals = problem2_y_test - problem2_y_pred

# Function to compute EDF and confidence bands using the DKW inequality
def makeEDF(data, alpha=0.05):
    """Compute the empirical distribution function and its DKW band half-width.

    Parameters
    ----------
    data : array-like
        One-dimensional sample.
    alpha : float, optional
        Significance level of the band; the band has coverage ``1 - alpha``.
        Defaults to 0.05 (a 95% band).

    Returns
    -------
    sorted_data : numpy.ndarray
        The sample in ascending order (the jump locations of the EDF).
    y : numpy.ndarray
        EDF value attained at each sorted point: ``1/n, 2/n, ..., 1``.
    epsilon : float
        Half-width of the band from the Dvoretzky-Kiefer-Wolfowitz
        inequality, ``sqrt(log(2 / alpha) / (2 * n))``.

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly between 0 and 1.
    """
    if not 0 < alpha < 1:
        raise ValueError("alpha must lie strictly between 0 and 1")
    sorted_data = np.sort(data)
    n = len(data)
    y = np.arange(1, n + 1) / n
    epsilon = np.sqrt(np.log(2 / alpha) / (2 * n))
    return sorted_data, y, epsilon

def plotEDF(data):
    """Plot the empirical distribution function of ``data`` with its confidence band.

    The sorted sample, EDF heights and band half-width come from ``makeEDF``.
    The figure is shown immediately via ``plt.show()``; nothing is returned.
    """
    x, y, epsilon = makeEDF(data)
    # An EDF is right-continuous: it jumps to y[i] at x[i] and stays there
    # until x[i + 1].  That is matplotlib's 'post' step style; the default
    # 'pre' would draw every jump one interval too early.
    plt.step(x, y, where='post', label='EDF')
    # Clip the band to [0, 1] and draw it with the same step style as the EDF.
    plt.fill_between(
        x,
        np.maximum(0, y - epsilon),
        np.minimum(1, y + epsilon),
        step='post',
        color='blue',
        alpha=0.2,
        label='95% Confidence Band',
    )
    plt.xlabel('Residuals')
    plt.ylabel('EDF')
    plt.title('Empirical Distribution Function of Residuals')
    plt.legend()
    plt.show()

# Draw the EDF of the test-set residuals
plotEDF(residuals)

# Part 5: Predicted (x axis) against true (y axis) values on the test split
true_value_range = [min(problem2_y_test), max(problem2_y_test)]
plt.scatter(problem2_y_pred, problem2_y_test, alpha=0.6)
# Dashed y = x reference line: a perfect prediction would fall exactly on it
plt.plot(true_value_range, true_value_range, color='red', linestyle='--', label='Ideal Fit')
plt.xlabel('Predicted Values')
plt.ylabel('True Values')
plt.title('Predicted vs True Values')
plt.legend()
plt.show()

# Part 6: Discussion (header line followed by the two numbered remarks)
discussion_lines = (
    "Discussion:\n",
    f"1. The Mean Absolute Error (MAE) is {problem2_mae:.2f}. This provides an indication of the average deviation of predictions from the true values.",
    "2. The scatter plot shows how well the predictions align with the true values. Ideally, the points should cluster around the diagonal red line. Deviations from this line indicate prediction errors.",
)
print(*discussion_lines, sep="\n")


In [None]:
# Poisson Regression
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy import optimize
from sklearn.metrics import mean_absolute_error

# Part 1: Read the visits data and choose the target and feature columns
data_path = 'data/visits_clean.csv'  # Update the file path as necessary
problem3_df = pd.read_csv(data_path)

# Target: "ofp" (number of physician office visits)
problem3_target = "ofp"

# Predictors. "ofnp", "opp", "opnp", "emr" and "hosp" are deliberately left
# out because they may correlate directly.
problem3_features = [
    "exclhlth",
    "poorhealth",
    "numchron",
    "adldiff",
    "noreast",
    "midwest",
    "west",
    "age",
    "male",
    "married",
    "school",
    "faminc",
    "employed",
    "privins",
    "medicaid",
]

# Part 2: Build plain NumPy arrays for the design matrix and the response
problem3_X = problem3_df.loc[:, problem3_features].to_numpy()
problem3_y = problem3_df.loc[:, problem3_target].to_numpy()

# Keep 20% of the rows aside for testing; the seed makes the split repeatable
(
    problem3_X_train,
    problem3_X_test,
    problem3_y_train,
    problem3_y_test,
) = train_test_split(problem3_X, problem3_y, test_size=0.2, random_state=42)

# Part 3: Implement the loss function
class PoissonRegression:
    """Poisson regression with a log link, fitted by maximum likelihood.

    The model is ``E[Y | x] = exp(x . w + b)``.  After ``fit`` the parameters
    are stored in ``coeffs`` as ``[w_1, ..., w_p, b]`` (intercept last) and
    the raw ``scipy.optimize.minimize`` result is kept in ``result``.
    """

    def __init__(self):
        # Both attributes stay None until fit() has been called.
        self.coeffs = None
        self.result = None

    def fit(self, X, Y):
        """Estimate the coefficients by minimising the negative log-likelihood.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Design matrix (without an intercept column).
        Y : array-like of shape (n_samples,)
            Observed counts.
        """
        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y, dtype=float)

        def loss(coeffs):
            # Negative Poisson log-likelihood, dropping the log(Y!) term that
            # does not depend on the coefficients.  log(lam) equals the linear
            # predictor eta, so eta is used directly: taking log(exp(eta))
            # yields -inf/nan as soon as exp(eta) underflows to 0.
            eta = X @ coeffs[:-1] + coeffs[-1]
            return -np.sum(Y * eta - np.exp(eta))

        def gradient(coeffs):
            # d(loss)/d(eta_i) = exp(eta_i) - Y_i; the chain rule through
            # eta = X w + b gives the weight and intercept components.
            eta = X @ coeffs[:-1] + coeffs[-1]
            residual = np.exp(eta) - Y
            return np.append(X.T @ residual, residual.sum())

        initial_arguments = np.zeros(shape=X.shape[1] + 1)  # Initial guess as 0
        # Passing the exact gradient avoids the finite-difference
        # approximation that conjugate gradient would otherwise fall back on.
        self.result = optimize.minimize(loss, initial_arguments, jac=gradient, method='cg')
        self.coeffs = self.result.x

    def predict(self, X):
        """Return the predicted mean count for each row of ``X``.

        Returns None when the model has not been fitted yet.
        """
        if self.coeffs is None:
            return None
        X = np.asarray(X, dtype=float)
        return np.exp(X @ self.coeffs[:-1] + self.coeffs[-1])

# Part 4: Fit the Poisson model on the training split
problem3_model = PoissonRegression()
problem3_model.fit(X=problem3_X_train, Y=problem3_y_train)

# Report the success flag from the stored optimizer result
print("Optimization success:", problem3_model.result.success)

# Part 5: Score the model on the held-out split
# Metric: Mean Absolute Error (MAE) between observed and predicted values
problem3_y_pred = problem3_model.predict(X=problem3_X_test)
problem3_metric = mean_absolute_error(y_true=problem3_y_test, y_pred=problem3_y_pred)
print("Mean Absolute Error (MAE):", problem3_metric)

# Discussion on naive model comparison
# Baseline: predict the training-set mean for every test row.
naive_prediction = np.mean(problem3_y_train)
# Build the constant prediction explicitly as floats.  np.full_like would
# inherit the dtype of problem3_y_test, and for an integer-valued target that
# silently truncates the mean before the error is computed.
naive_predictions = np.full(problem3_y_test.shape, naive_prediction, dtype=float)
naive_mae = mean_absolute_error(problem3_y_test, naive_predictions)
print("Naive Mean Absolute Error (MAE):", naive_mae)

# Interpretation: lower MAE is better
if problem3_metric < naive_mae:
    print("The Poisson regression model performs better than the naive model.")
else:
    print("The Poisson regression model does not outperform the naive model.")
