In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

def true_function(x):
    """Evaluate the noise-free target f(x) = x + sin(1.5 * x).

    `x` may be a scalar or a NumPy array; arrays are handled element-wise
    and the result has the same shape as the input.
    """
    oscillation = np.sin(1.5 * x)
    return x + oscillation

## P1.2



In [None]:
import numpy as np
import matplotlib.pyplot as plt


# Sample a small noisy dataset from the true function and plot it next to
# the noise-free curve.
np.random.seed(42)  # fixed seed so the sampled noise is reproducible

size = 20        # number of observations
noise_std = 0.3  # standard deviation of the additive Gaussian noise

# Evenly spaced inputs on [0, 10].
x_values = np.linspace(0, 10, size)

# Noise-free targets; reuse the notebook's helper rather than re-typing the formula.
f_x = true_function(x_values)

# Observations: y = f(x) + epsilon, epsilon ~ N(0, noise_std**2)
epsilon = np.random.normal(0, noise_std, size)
y_values = f_x + epsilon

# The true curve is drawn on a dense grid: joining only the 20 sample points
# would render the sine term as a coarse polyline.
x_dense = np.linspace(0, 10, 200)

# Plotting
plt.figure(figsize=(10, 6))
plt.scatter(x_values, y_values, label='Generated Data (y)', color='blue', alpha=0.7)
plt.plot(x_dense, true_function(x_dense), label='True Function f(x)', color='red')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Generated Dataset and True Function')
plt.legend()
plt.show()

## P1.3

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Fit polynomial regressors of degree 1, 3 and 10 to one noisy sample of the
# true function and compare them visually with f(x).
np.random.seed(42)  # For reproducibility
size = 20           # Dataset size
noise_std = 0.3     # Standard deviation of the Gaussian noise

x_values = np.linspace(0, 10, size)
f_x = true_function(x_values)  # noise-free targets at the sampled inputs

# y = f(x) + epsilon, epsilon ~ N(0, noise_std**2)
epsilon = np.random.normal(0, noise_std, size)
y_values = f_x + epsilon
x_reshaped = x_values.reshape(-1, 1)  # scikit-learn expects a 2-D feature matrix

# Dense grid used only for drawing. Evaluating the fits at just the 20
# training inputs would hide how the degree-10 polynomial behaves between them.
x_dense = np.linspace(0, 10, 200)
x_dense_reshaped = x_dense.reshape(-1, 1)

degrees = [1, 3, 10]
estimators = []    # (degree, predictions at the training inputs)
dense_curves = []  # (degree, predictions on the dense plotting grid)

for degree in degrees:
    poly = PolynomialFeatures(degree)
    X_poly = poly.fit_transform(x_reshaped)

    # Ordinary least squares on the polynomial features
    model = LinearRegression()
    model.fit(X_poly, y_values)

    y_pred = model.predict(X_poly)
    estimators.append((degree, y_pred))
    dense_curves.append((degree, model.predict(poly.transform(x_dense_reshaped))))

# Plotting
plt.figure(figsize=(10, 6))
plt.scatter(x_values, y_values, label='Sampled Data', color='black', alpha=0.7)
plt.plot(x_dense, true_function(x_dense), label='f(x)', color='red', linewidth=2)

colors = ['green', 'pink', 'purple']
for (degree, y_curve), color in zip(dense_curves, colors):
    plt.plot(x_dense, y_curve, label=f'Estimator g{degree}(x)', color=color)

plt.xlabel('x')
plt.ylabel('y')
plt.title('True Function and Polynomial Estimators')
plt.legend()
plt.show()

# Compact numeric summary instead of dumping the raw prediction arrays.
for degree, y_pred in estimators:
    train_mse = np.mean((y_pred - y_values) ** 2)
    print(f"g{degree}(x): training MSE = {train_mse:.4f}")

## P1.4


In [None]:
# Estimate squared bias, variance and expected error of polynomial
# regressors of degree 1..max_degree by refitting each one on many
# independently sampled training sets and predicting on one fixed test grid.
num_datasets = 100   # number of independent training sets per degree
dataset_size = 50
test_size = 10
train_size = dataset_size - test_size
max_degree = 15
noise_std = 0.3

# Fixed test inputs shared by every fitted model.
x_test_fixed = np.linspace(0, 10, test_size)
f_test_fixed = true_function(x_test_fixed)  # noise-free targets
# Noisy test observations. They are not used for the bias term below; the
# draw is kept so the names stay available and the random stream is unchanged.
epsilon_test = np.random.normal(0, noise_std, test_size)
y_test_fixed = f_test_fixed + epsilon_test

bias_squared = []
variance = []
errors = []

# Loop-invariant pieces, built once.
x_pool = np.linspace(0, 10, 1000)  # candidate training inputs
x_test_reshaped = x_test_fixed.reshape(-1, 1)
degree_range = range(1, max_degree + 1)

for degree in degree_range:
    # The polynomial expansion depends only on the degree and the number of
    # input columns, so one transformer serves the test grid and every
    # training set of this degree.
    poly = PolynomialFeatures(degree)
    X_test_poly = poly.fit_transform(x_test_reshaped)

    predictions_on_test = []
    for _ in range(num_datasets):  # one freshly sampled training set per iteration
        x_train = np.random.choice(x_pool, train_size, replace=False)
        epsilon_train = np.random.normal(0, noise_std, train_size)
        y_train = true_function(x_train) + epsilon_train

        X_train_poly = poly.transform(x_train.reshape(-1, 1))

        # Fit model and make predictions on the fixed test set
        model = LinearRegression()
        model.fit(X_train_poly, y_train)
        predictions_on_test.append(model.predict(X_test_poly))

    predictions_on_test = np.array(predictions_on_test)  # (num_datasets, test_size)
    mean_prediction = np.mean(predictions_on_test, axis=0)

    # Bias is measured against the noise-free target f(x). Comparing with the
    # noisy observations would fold one noise realisation into the bias and
    # count the noise twice once noise_std**2 is added below.
    squared_bias = np.mean((mean_prediction - f_test_fixed) ** 2)
    var = np.mean(np.var(predictions_on_test, axis=0))
    mse = squared_bias + var + noise_std ** 2  # expected error = bias^2 + variance + noise

    bias_squared.append(squared_bias)
    variance.append(var)
    errors.append(mse)

# Plot
plt.figure(figsize=(12, 6))
plt.plot(degree_range, bias_squared, label='Squared Bias', color='blue', marker='o')
plt.plot(degree_range, variance, label='Variance', color='green', marker='o')
plt.plot(degree_range, errors, label='MSE Error', color='red', marker='o')
plt.yscale('log')  # the axis label and title promise a log scale
plt.xlabel('Model Complexity gi(x)')
plt.ylabel('Value (Log Scale)')
plt.title('Bias-Variance Tradeoff with Model Complexity (Log Scale)')
plt.legend()
plt.show()

## 1.5

In [None]:
from sklearn.linear_model import Ridge

# Compare an unregularized degree-10 polynomial fit with an L2-regularized
# (Ridge) fit in terms of squared bias, variance and expected error.
# Uses num_datasets, dataset_size, train_size and noise_std from the
# bias/variance cell above.
regularization_rate = 1.0  # Ridge penalty strength (alpha)
degree = 10

# The inputs form a fixed grid; only the noise is redrawn for each dataset.
x_values = np.linspace(0, 10, dataset_size)

# Fixed train/test split with test points spread over the whole input range.
# Slicing the sorted grid (first train_size points for training, the rest for
# testing) would put every test point beyond the largest training input, so
# both models would be scored purely on extrapolation. A private RandomState
# keeps the split reproducible without consuming the global random stream.
split_order = np.random.RandomState(0).permutation(dataset_size)
train_idx = np.sort(split_order[:train_size])
test_idx = np.sort(split_order[train_size:])

x_train, x_test = x_values[train_idx], x_values[test_idx]
x_train_reshaped = x_train.reshape(-1, 1)
x_test_reshaped = x_test.reshape(-1, 1)

# The inputs never change, so the polynomial features are built once.
# NOTE(review): the features are not standardized, so the powers of x differ
# by many orders of magnitude and a single alpha penalizes their coefficients
# very unevenly. Consider scaling before Ridge.
poly = PolynomialFeatures(degree)
X_train_poly = poly.fit_transform(x_train_reshaped)
X_test_poly = poly.transform(x_test_reshaped)

predictions_on_test_unregularized = []
predictions_on_test_regularized = []

for _ in range(num_datasets):
    # New noise realisation on the fixed inputs
    epsilon = np.random.normal(0, noise_std, dataset_size)
    y_values = true_function(x_values) + epsilon
    y_train = y_values[train_idx]

    model_unregularized = LinearRegression()
    model_unregularized.fit(X_train_poly, y_train)
    y_pred_unregularized = model_unregularized.predict(X_test_poly)
    predictions_on_test_unregularized.append(y_pred_unregularized)

    model_regularized = Ridge(alpha=regularization_rate)
    model_regularized.fit(X_train_poly, y_train)
    y_pred_regularized = model_regularized.predict(X_test_poly)
    predictions_on_test_regularized.append(y_pred_regularized)

# Shape of each array: (num_datasets, number of test points)
predictions_on_test_unregularized = np.array(predictions_on_test_unregularized)
predictions_on_test_regularized = np.array(predictions_on_test_regularized)

mean_prediction_unregularized = np.mean(predictions_on_test_unregularized, axis=0)
mean_prediction_regularized = np.mean(predictions_on_test_regularized, axis=0)

# Noise-free targets at the test inputs; bias is measured against these.
actual_y_test = true_function(x_test)

# Squared Bias
squared_bias_unregularized = np.mean((mean_prediction_unregularized - actual_y_test) ** 2)
squared_bias_regularized = np.mean((mean_prediction_regularized - actual_y_test) ** 2)

# Variance of the predictions across datasets, averaged over the test points
variance_unregularized = np.mean(np.var(predictions_on_test_unregularized, axis=0))
variance_regularized = np.mean(np.var(predictions_on_test_regularized, axis=0))

# Expected error = bias^2 + variance + irreducible noise
mse_unregularized = squared_bias_unregularized + variance_unregularized + noise_std ** 2
mse_regularized = squared_bias_regularized + variance_regularized + noise_std ** 2

# Display results
print(f"Unregularized Model (Degree {degree}):")
print(f"Squared Bias: {squared_bias_unregularized:.4f}")
print(f"Variance: {variance_unregularized:.4f}")
print(f"MSE: {mse_unregularized:.4f}")
print(f"\nRegularized Model (Degree {degree} with L2):")
print(f"Squared Bias: {squared_bias_regularized:.4f}")
print(f"Variance: {variance_regularized:.4f}")
print(f"MSE: {mse_regularized:.4f}")

## 2.2


In [None]:
#2-2
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.datasets import fetch_openml

# Train AdaBoost and logistic regression on the OpenML dataset with id 31
# (target labels 'good' / 'bad') and draw their ROC and precision-recall
# curves together with the point of an "always predict positive" classifier.
data = fetch_openml(data_id=31, as_frame=True)
X = data.data
y = data.target

# Preprocess
# Convert target to binary labels. astype(int) turns the mapped (possibly
# categorical) column into plain integers and fails loudly on an unmapped label.
y = y.map({'good': 1, 'bad': 0}).astype(int)

# fetch_openml(as_frame=True) returns nominal columns with the pandas
# 'category' dtype, so selecting only 'object' would leave nothing to encode.
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numeric_features = X.select_dtypes(include=['number']).columns

# The encoder's `sparse` keyword was renamed `sparse_output` in scikit-learn
# 1.2 and removed in 1.4. Keeping the default sparse output and densifying it
# explicitly works on every version.
encoder = OneHotEncoder(handle_unknown='ignore')
X_categorical_encoded = encoder.fit_transform(X[categorical_features]).toarray()
X_numeric = X[numeric_features].values
X_encoded = np.hstack((X_categorical_encoded, X_numeric))


X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

# Use Adaboost and Logistic Regression
clf_adaboost = AdaBoostClassifier(random_state=42)
clf_logreg = LogisticRegression(max_iter=1000, random_state=42)
clf_adaboost.fit(X_train, y_train)
clf_logreg.fit(X_train, y_train)

# Scores for the positive class (label 1, i.e. 'good')
y_score_adaboost = clf_adaboost.predict_proba(X_test)[:, 1]
y_score_logreg = clf_logreg.predict_proba(X_test)[:, 1]

# Calculate ROC curve and PR curve for each classifier
fpr_adaboost, tpr_adaboost, _ = roc_curve(y_test, y_score_adaboost)
fpr_logreg, tpr_logreg, _ = roc_curve(y_test, y_score_logreg)

precision_adaboost, recall_adaboost, _ = precision_recall_curve(y_test, y_score_adaboost)
precision_logreg, recall_logreg, _ = precision_recall_curve(y_test, y_score_logreg)

# A classifier that labels everything positive finds every positive (TPR = 1,
# recall = 1), flags every negative (FPR = 1), and has precision equal to the
# share of positives in the test set.
all_positive_tpr = 1
all_positive_fpr = 1
positive_proportion = np.sum(y_test.to_numpy().astype(int)) / len(y_test)  # Proportion of positives in test set
all_positive_precision = positive_proportion

# Plot ROC curves
plt.figure(figsize=(10, 5))
plt.plot(fpr_adaboost, tpr_adaboost, label='Adaboost')
plt.plot(fpr_logreg, tpr_logreg, label='Logistic Regression')
plt.scatter(all_positive_fpr, all_positive_tpr, color='red', label='All Positive Classifier', marker='x')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC Curves')
plt.legend()
plt.show()

# Plot PR curves
plt.figure(figsize=(10, 5))
plt.plot(recall_adaboost, precision_adaboost, label='Adaboost')
plt.plot(recall_logreg, precision_logreg, label='Logistic Regression')
plt.scatter(1, all_positive_precision, color='red', label='All Positive Classifier', marker='x')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curves')
plt.legend()
plt.show()


## 2.3

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

# Area under the ROC curve for each classifier's positive-class scores.
auroc_adaboost, auroc_logreg = (
    roc_auc_score(y_test, scores)
    for scores in (y_score_adaboost, y_score_logreg)
)

# Average precision, used here as the area under the precision-recall curve.
aupr_adaboost, aupr_logreg = (
    average_precision_score(y_test, scores)
    for scores in (y_score_adaboost, y_score_logreg)
)

def calculate_auprg(precision, recall, positive_proportion):
    """Area under the Precision-Recall-Gain curve.

    Precision and recall are mapped to gains on a harmonic scale relative to
    the always-positive baseline, whose precision equals the proportion of
    positives ``pi``:

        precision_gain = (precision - pi) / ((1 - pi) * precision)
        recall_gain    = (recall    - pi) / ((1 - pi) * recall)

    The curve is integrated over recall gain in [0, 1]; points with
    recall < pi (negative recall gain) lie outside that range. Where the
    curve crosses recall gain 0 between two points, the crossing is filled in
    by linear interpolation in gain space.

    Parameters
    ----------
    precision, recall : array-like of equal length
        Points of a precision-recall curve. Either ordering is accepted,
        including the decreasing-recall order produced by
        ``sklearn.metrics.precision_recall_curve``.
    positive_proportion : float
        Proportion of positive examples; must lie strictly between 0 and 1.

    Returns
    -------
    float
        The area; 0.0 when fewer than two usable points remain.

    Raises
    ------
    ValueError
        If ``positive_proportion`` is not in (0, 1) or the inputs differ in shape.
    """
    pi = float(positive_proportion)
    if not 0.0 < pi < 1.0:
        raise ValueError("positive_proportion must lie strictly between 0 and 1")

    precision = np.asarray(precision, dtype=float).ravel()
    recall = np.asarray(recall, dtype=float).ravel()
    if precision.shape != recall.shape:
        raise ValueError("precision and recall must have the same length")

    # Walk the curve in increasing-recall direction. Reversing (rather than
    # only sorting) keeps the order of points that share a recall value.
    if recall.size > 1 and recall[0] > recall[-1]:
        precision, recall = precision[::-1], recall[::-1]

    # recall == 0 maps to a recall gain of -inf, which is handled below.
    # precision == 0 has no finite gain and is dropped.
    with np.errstate(divide="ignore", invalid="ignore"):
        precision_gain = (precision - pi) / ((1.0 - pi) * precision)
        recall_gain = (recall - pi) / ((1.0 - pi) * recall)

    usable = np.isfinite(precision_gain) & ~np.isnan(recall_gain)
    precision_gain, recall_gain = precision_gain[usable], recall_gain[usable]
    if recall_gain.size == 0:
        return 0.0

    # Stable sort: a no-op for already ordered input, a safeguard otherwise.
    order = np.argsort(recall_gain, kind="stable")
    precision_gain, recall_gain = precision_gain[order], recall_gain[order]

    inside = recall_gain >= 0.0
    if not inside.any():
        return 0.0
    first = int(np.argmax(inside))  # first point with non-negative recall gain

    rg = recall_gain[first:]
    pg = precision_gain[first:]

    if first > 0 and rg[0] > 0.0:
        # Add the point where the curve enters the unit interval.
        x0, y0 = recall_gain[first - 1], precision_gain[first - 1]
        x1, y1 = rg[0], pg[0]
        if np.isinf(x0):
            # A neighbour infinitely far to the left: the interpolated value
            # at recall gain 0 is the value of the right-hand point.
            pg_at_zero = y1
        else:
            pg_at_zero = y0 + (y1 - y0) * (0.0 - x0) / (x1 - x0)
        rg = np.concatenate(([0.0], rg))
        pg = np.concatenate(([pg_at_zero], pg))

    # Trapezoidal rule written out, so the result does not depend on
    # np.trapz (deprecated in NumPy 2).
    return float(np.sum(np.diff(rg) * (pg[1:] + pg[:-1]) / 2.0))

# Share of positives in the test set, used as the baseline for the gain scores.
poss = positive_proportion

# AUPRG for both classifiers, computed from their precision-recall curves.
auprg_adaboost, auprg_logreg = (
    calculate_auprg(curve_precision, curve_recall, poss)
    for curve_precision, curve_recall in (
        (precision_adaboost, recall_adaboost),
        (precision_logreg, recall_logreg),
    )
)

# One metric per line, three decimals each.
print(
    f"AUROC (Adaboost): {auroc_adaboost:.3f}",
    f"AUROC (Logistic Regression): {auroc_logreg:.3f}",
    f"AUPR (Adaboost): {aupr_adaboost:.3f}",
    f"AUPR (Logistic Regression): {aupr_logreg:.3f}",
    f"AUPRG (Adaboost): {auprg_adaboost:.3f}",
    f"AUPRG (Logistic Regression): {auprg_logreg:.3f}",
    sep="\n",
)
