### Gradient Descent Algorithm for Linear Regression (10 points)

The gradient descent algorithm is a powerful optimization technique used to iteratively refine model parameters for a better fit. In the context of linear regression, it aims to find optimal values of the slope $ w $ (often written $ m $) and intercept $ b $ (often written $ c $) that minimize the mean squared error between the predicted and actual values.

The steps for each iteration (or epoch) are:

1. **Prediction:**
   Use the current values of $ w $ and $ b $ to calculate the predicted values:
   $ y_{\text{pred}} = w \cdot x + b $

2. **Compute Gradients:**
   Determine the gradient of the loss with respect to each parameter:
   - Gradient with respect to the slope $ w $:
$$
D_w = \frac{-2}{n} \sum (x \cdot (y_{\text{noisy}} - y_{\text{pred}}))
$$



   - Gradient with respect to the intercept $ b $:
$$
D_b = \frac{-2}{n} \sum (y_{\text{noisy}} - y_{\text{pred}})
$$

3. **Update Parameters:**
   Adjust $ w $ and $ b $ based on the gradients and the learning rate $ \alpha $:
   $$ w = w - \alpha \cdot D_w $$
   $$ b = b - \alpha \cdot D_b $$


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import os
import imageio

# Seed both Python's and NumPy's global RNGs so every run produces the same data.
random.seed(42)
np.random.seed(42)

# Inputs: 100 evenly spaced points on [0, 10].
x = np.linspace(0, 10, num=100)

# Targets: the line y = 3x + 2 plus standard-normal noise scaled by 5.
noise = 5 * np.random.randn(x.size)
y_linear_noisy = 3 * x + 2 + noise

# Model parameters: slope w and intercept b both start at zero.
w = 0
b = 0

# Optimisation settings.
learning_rate = 0.01
epochs = 15  # Limit to 15 epochs

# Filenames of the per-epoch plots, in order, collected for the visualization.
lines = []

# Gradient descent: one parameter update (and one saved plot) per epoch
for epoch in range(epochs):
    # Step 1 - prediction: implement the prediction formula from the text above,
    # using the current values of w and b
    y_pred = # Your code here

    # Step 2 - gradients of the loss with respect to w and b
    # (D_w is provided as a worked example; implement D_b from the formula above)
    D_w = (-2/len(x)) * sum(x * (y_linear_noisy - y_pred))
    D_b = # Your code here

    # Step 3 - update w and b from the gradients and learning_rate,
    # following the update formulas in the text above
    w = # Your code here
    b = # Your code here

    # Visualization: plot the noisy data together with the line given by the
    # freshly updated w and b, labelled with the 1-based epoch number
    plt.figure(figsize=(10,5))
    plt.scatter(x, y_linear_noisy, c='blue', label='Noisy Linear Data')
    plt.plot(x, w*x+b, '-r', label=f'Epoch {epoch+1}')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.title(f'Gradient Descent Epoch {epoch+1}')
    plt.legend()
    plt.grid(True)
    # Save the frame to disk (file names use the 0-based epoch index), record
    # its name in `lines`, and close the figure
    filename = f'epoch_{epoch}.png'
    plt.savefig(filename)
    lines.append(filename)
    plt.close()

# Assemble the saved frames, in epoch order, into one animated GIF that loops
# forever (loop=0).
# NOTE(review): the unit of `duration` depends on the installed imageio version
# (milliseconds in recent releases, seconds in older ones) - confirm that 700
# gives the intended frame time.
gif_path = 'gradient_descent.gif'
with imageio.get_writer(gif_path, mode='I', duration=700, loop=0) as writer:
    for frame_path in lines:
        writer.append_data(imageio.imread(frame_path))

# The individual frame images are not needed once the GIF has been written.
for frame_path in lines:
    os.remove(frame_path)

# Report the parameters reached after the final epoch.
print(f"Optimized Parameters: Slope (m) = {w}, Intercept (c) = {b}")

# Show the GIF inline; this must stay the last expression of the cell so that
# Jupyter renders it.
from IPython.display import Image
Image(filename=gif_path)


### Linear regression using sklearn (10 points)

Load the dataset and perform basic data exploration.

In [None]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Fetch the California housing dataset as pandas objects
california_housing = fetch_california_housing(as_frame=True)

# Build a single DataFrame: the feature columns first, then the target
# appended as the 'MedHouseVal' column
feature_columns = california_housing.feature_names
df = pd.DataFrame(california_housing.data, columns=feature_columns)
df['MedHouseVal'] = california_housing.target

# Quick look at the first few rows
print(df.head())


Split the data into training and testing sets.



In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns
target_column = 'MedHouseVal'
y = df[target_column]
X = df.drop(columns=target_column)

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


Fit a linear regression model to the training data and evaluate its performance on the testing set.

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import random
# Seed Python's and NumPy's global random number generators
random.seed(42)
np.random.seed(42)

# Create the regressor object (SGDRegressor is imported above); it becomes the
# final step of a pipeline that first applies StandardScaler
reg = #Your code here
model = make_pipeline(StandardScaler(),
                      reg
                      )
# In case of any difficulties check: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html#sklearn.linear_model.SGDRegressor

# Fit the model (the whole pipeline) to the training data
# Your code here


# Make predictions on the test set
y_pred = # Your code here

# Calculate and display the mean squared error between the actual and predicted values on the test set
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


# Scatter plot of actual (x-axis) vs predicted (y-axis) target values
plt.scatter(y_test, y_pred)
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title("Actual Prices vs Predicted Prices")
plt.show()


Tip: SGDRegressor has fit and predict methods

(10 points)

### Changing hyperparameters

What happens to the model's accuracy (test MSE) if you change max_iter to 3? Modify the code and place it in the cell below.


How can you explain this behavior? (5 points)

In [None]:
# Place modified code here

### Regularization Parameter Search for Ridge Regression (5 points)



Manually search for the best regularization parameter alpha in Ridge regression.

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transformation to both the training and the test features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


Find optimal value of alpha for Ridge Regression

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Candidate values for the Ridge regularization parameter alpha
alphas = [1e-5, 1e-3, 1e-2, 100, 500, 1000, 10000]


alpha = # Your code here, you can choose any value from the list above

# Fit a Ridge model with the chosen alpha on the scaled training features
ridge = Ridge(alpha=alpha)
ridge.fit(X_train_scaled, y_train)
# In case of any difficulties check: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge

# Predict on the test data (the error below is computed against y_test)
y_pred = # Your code here

# Calculate the mean squared error on the test set
mse = mean_squared_error(y_test, y_pred)

print(f"Alpha Value: {alpha}")
print(f"Mean Squared Error: {mse}")


In [None]:
# Echo the training targets (as the last expression of the cell, Jupyter displays its value)
y_train

What happens when alpha is too high or too low?

Your answer here

## Classification with the Iris Dataset Using Logistic Regression (8 points)
Load the dataset and perform basic data exploration.



Split the data into training and testing sets.


Train a Logistic Regression classifier and evaluate its performance.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the iris data through seaborn
iris = sns.load_dataset("iris")

# Pair plot of the columns, coloured by species
sns.pairplot(iris, hue="species")
plt.show()

# Labels are the "species" column; features are every other column
label_column = "species"
y = iris[label_column]
X = iris.drop(columns=label_column)

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [None]:

# Standardize the features: fit the scaler on the training split only and
# reuse it for the test split. X_train / X_test are rebound to the scaler output.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

from sklearn.linear_model import LogisticRegression

# Create a logistic regression model with the OvR (one-vs-rest) multi-class strategy; pick a suitable value for max_iter
clf = # Your code here

# In case of any difficulties check: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
# Train the model on the standardized training data
# Your code here


from sklearn.metrics import accuracy_score, classification_report

# Predict the species for the test set
y_pred = # Your code here

# Calculate the accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Display a classification report for the test-set predictions
report = classification_report(y_test, y_pred)
print(report)





import numpy as np

from sklearn.preprocessing import LabelEncoder

# For simplicity, visualize using only the first two columns of the
# standardized training features (sepal length and sepal width).
X_train_2d = X_train[:, :2]

# Encode the species names as integers so that the predicted grid is numeric
# and can be passed to contourf.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Train the classifier once on the 2D data with the encoded labels. (A fit on
# the raw string labels followed by a grid prediction would only be discarded,
# so it is not performed.)
clf.fit(X_train_2d, y_train_encoded)

# Dense grid covering the training points with a margin of 1 on every side.
x_min, x_max = X_train_2d[:, 0].min() - 1, X_train_2d[:, 0].max() + 1
y_min, y_max = X_train_2d[:, 1].min() - 1, X_train_2d[:, 1].max() + 1

xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
                     np.arange(y_min, y_max, 0.01))

# Predict the class of every grid point. No need to decode the labels since
# only the boundaries are plotted.
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

# Filled contours show the decision regions; the training points are drawn on
# top, coloured by their encoded class.
plt.contourf(xx, yy, Z, alpha=0.3)
sns.scatterplot(x=X_train_2d[:, 0], y=X_train_2d[:, 1], hue=y_train_encoded)
plt.title('Decision Boundaries with Sepal Length and Sepal Width')
plt.xlabel('Sepal Length (Standardized)')
plt.ylabel('Sepal Width (Standardized)')
# Legend entries come from the encoder so that they follow the same class
# order as the integer codes used for `hue`.
plt.legend(title='Legend', loc='best', labels=list(label_encoder.classes_))
plt.show()

# Understanding Overfitting and Underfitting (10 points)

One of the most important steps in evaluating a machine learning model is to understand whether the model overfits or underfits the data.

In [None]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# set seed
# Seed NumPy's global RNG so the synthetic dataset is reproducible.
np.random.seed(132)


def true_fun(X):
    """Return the noise-free target cos(2 * pi * X) for the input(s) X."""
    angle = 2 * np.pi * X
    return np.cos(angle)


n_samples = 50

# Draw order matters for reproducibility: the inputs first, then the noise.
X = np.random.rand(n_samples)
X.sort()
noise = 0.2 * np.random.randn(n_samples)

# Observed targets: the true function plus the scaled noise.
y = true_fun(X) + noise


The aim is to fit a polynomial function of degree n to the data. n should be chosen so as to neither overfit nor underfit the underlying data. This is often a trial-and-error process, using cross-validation and visualization to estimate the "best" fit to the training data.

The first step is to fit the data.

In [None]:

# Ask the user which polynomial degree to try.
degrees = int(input("Please Enter The Degree of Polynomial Between 1-20:"))

# Two-step model: polynomial feature expansion (no bias column) followed by
# ordinary linear regression.
polynomial_features = PolynomialFeatures(include_bias=False, degree=degrees)
linear_regression = LinearRegression()
model_steps = [
    ("polynomial_features", polynomial_features),
    ("linear_regression", linear_regression),
]
pipeline = Pipeline(model_steps)

# Fit on the samples reshaped into a single-feature column. Kept as the last
# expression of the cell so that the notebook still echoes the fitted pipeline.
pipeline.fit(X.reshape(-1, 1), y)



Then to evaluate the model using cross-validation and visualization.

In [None]:
# 10-fold cross-validation. The scorer returns negated MSE values, so the mean
# is negated again to report a positive MSE.
sample_column = X[:, np.newaxis]
scores = cross_val_score(pipeline, sample_column, y,
                         cv=10, scoring="neg_mean_squared_error")
cv_mse_mean = -scores.mean()
cv_mse_std = scores.std()

# Evaluate the fitted model and the true function on an even grid over [0, 1].
X_test = np.linspace(0, 1, 100)
model_curve = pipeline.predict(X_test[:, np.newaxis])
true_curve = true_fun(X_test)

# Plot both curves together with the noisy samples.
plt.figure(figsize=(15, 6))
plt.plot(X_test, model_curve, label="Model")
plt.plot(X_test, true_curve, label="True function")
plt.scatter(X, y, edgecolor='r', s=20, label="Samples")
plt.xlabel("x")
plt.ylabel("y")
plt.xlim((0, 1))
plt.ylim((-2, 2))
plt.legend(loc="best")
plt.title("Degree {}\nMSE = {:.2e}(+/- {:.2e})".format(
    degrees, cv_mse_mean, cv_mse_std))
plt.show()

Optimal degree in terms of MSE is:

*FILL THE VALUE HERE*

Explain what happens when the value is higher:

*YOUR EXPLANATION HERE*

Explain what happens when the value is lower:

*YOUR EXPLANATION HERE*

(10 points)


# Bonus: Implement kNN (10 points)

In [None]:
import numpy as np

# Write your own implementation of KNN. Your code should pass tests in the cell below.
class KNN:
    """
    Skeleton of a k-nearest-neighbours classifier (exercise to be completed).

    The accompanying tests construct the model with ``KNN(k=...)``, call
    ``fit(X_train, y_train)`` with NumPy arrays and then ``predict(X_test)``,
    expecting one predicted label per row of ``X_test``.
    """

    def __init__(self, k=3):
        """
        Method is called when an object is created.

        Stores ``k`` and initialises the training data attributes to None.
        """
        self.k = k
        # Placeholders for the training data; presumably to be filled in by fit()
        self.X_train = None
        self.y_train = None

    def fit(self, X_train, y_train):
        """
        Method is called to train (fit) the model.

        The tests pass a 2-D NumPy array of samples as ``X_train`` and a 1-D
        NumPy array of labels as ``y_train``. As written, the stub does nothing
        and returns None.
        """
        # Your code here
        return

    def predict(self, X_test) -> np.ndarray:
        """
        Method is called to predict labels for new data.

        The tests pass a 2-D NumPy array as ``X_test`` and compare the result
        with ``np.array_equal`` / index it with ``predictions[0]``, so the
        return value must be a NumPy array of labels.
        """
        # Your code here
        # NOTE: `predictions` is undefined until the placeholder above is implemented
        return predictions # predictions should be in format of Numpy array


Run the following cell to test your code.

Note: This action is required for grading purposes (10 points)

In [None]:
def test_knn():
    """
    Check a KNN implementation against a handful of tiny, hand-made datasets.

    Covers a basic two-cluster prediction, identical (equidistant) training
    points, several values of k, and a distance tie. Raises AssertionError on
    the first failing check and prints a success message otherwise.
    """

    def two_cluster_data():
        # Three points labelled 0 followed by two points labelled 1
        features = np.array([[1, 2], [2, 3], [3, 4], [5, 6], [6, 7]])
        labels = np.array([0, 0, 0, 1, 1])
        return features, labels

    def fit_predict(knn, features, labels, queries):
        # Train the given model, then return its predictions for the queries
        knn.fit(features, labels)
        return knn.predict(queries)

    # Basic Test
    model = KNN(k=3)
    X_train, y_train = two_cluster_data()
    X_test = np.array([[4, 5], [7, 8]])
    predictions = fit_predict(model, X_train, y_train, X_test)
    assert np.array_equal(predictions, np.array([0, 1])), f"Expected [0, 1] but got {predictions}"

    # Edge Case Test: All points are equidistant from test point
    # (the same model instance is deliberately re-fitted here)
    X_train = np.array([[1, 1], [1, 1], [1, 1]])
    y_train = np.array([0, 1, 2])
    X_test = np.array([[1, 1]])
    predictions = fit_predict(model, X_train, y_train, X_test)
    assert predictions[0] in y_train, f"Prediction should be one of the training labels, got {predictions[0]}"

    # Different k-values Test: a fresh model per k, same data for both
    X_train, y_train = two_cluster_data()
    X_test = np.array([[4, 5]])
    for k in (1, 5):
        predictions = fit_predict(KNN(k=k), X_train, y_train, X_test)
        assert predictions[0] == 0, f"Expected 0 but got {predictions[0]}"

    # Tie Case Test
    X_train = np.array([[1, 2], [2, 3], [5, 6], [6, 7]])
    y_train = np.array([0, 0, 1, 1])
    X_test = np.array([[4, 5]])
    predictions = fit_predict(KNN(k=3), X_train, y_train, X_test)
    assert predictions[0] == 0 or predictions[0] == 1, f"Expected 0 or 1 but got {predictions[0]}"

    print("All tests passed! Your code works as expected")

test_knn()
