# Part A: Data Setup

In [1]:
import pandas as pd
import numpy as np

# A1. Load dataset
# Remote CSV location and the column labels to attach to it.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data"
column_names = [
    "Sex",
    "Length",
    "Diameter",
    "Height",
    "Whole_weight",
    "Shucked_weight",
    "Viscera_weight",
    "Shell_weight",
    "Rings",
]

# Column labels are supplied here via `names=` rather than read from the file.
df = pd.read_csv(url, names=column_names)

# Sanity check: row count, column labels, and a peek at the first records.
print(f"Number of rows: {len(df)}")
print(f"Column names: {list(df.columns)}")
print("First 5 rows:\n", df.head())

Number of rows: 4177
Column names: ['Sex', 'Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight', 'Rings']
First 5 rows:
   Sex  Length  Diameter  Height  Whole_weight  Shucked_weight  Viscera_weight  \
0   M   0.455     0.365   0.095        0.5140          0.2245          0.1010   
1   M   0.350     0.265   0.090        0.2255          0.0995          0.0485   
2   F   0.530     0.420   0.135        0.6770          0.2565          0.1415   
3   M   0.440     0.365   0.125        0.5160          0.2155          0.1140   
4   I   0.330     0.255   0.080        0.2050          0.0895          0.0395   

   Shell_weight  Rings  
0         0.150     15  
1         0.070      7  
2         0.210      9  
3         0.155     10  
4         0.055      7  


In [2]:
# Checkpoint A1:
# What is input: The physical measurements of the abalone (e.g., Length, Diameter, Weight).
# What is output: The age of the abalone (derived from Rings).
# Why output is numeric: Age is a continuous, measurable quantity, making this a regression problem.

In [3]:
# A2. Convert target
# Regression target: ring count shifted by 1.5.
# NOTE(review): presumably the rings-to-age offset from the dataset description — confirm.
y = df["Rings"].to_numpy() + 1.5

In [4]:
# A3. Choose exactly 3 numeric features
# The three input columns, pulled out as a plain (rows, 3) NumPy array.
feature_columns = ["Length", "Diameter", "Whole_weight"]
X_raw = df.loc[:, feature_columns].to_numpy()

In [5]:
# Justification:
# Feature 1 (Length): The longest shell measurement directly relates to the organism's growth duration.
# Feature 2 (Diameter): Perpendicular to length, providing a 2D understanding of size.
# Feature 3 (Whole_weight): Overall mass is a strong indicator of how long the abalone has been growing and feeding.

In [6]:
# A4. Train-test split (80/20 without sklearn)
# Seed the global NumPy RNG so the shuffle, and therefore the split, is repeatable.
np.random.seed(42)
indices = np.random.permutation(len(X_raw))
split_idx = int(0.8 * len(X_raw))

# The first 80% of the shuffled row positions go to training, the rest to test.
train_idx, test_idx = np.split(indices, [split_idx])

# Targets are reshaped to (N, 1) columns so they line up with the model output.
X_train_raw, y_train = X_raw[train_idx], y[train_idx].reshape(-1, 1)
X_test_raw, y_test = X_raw[test_idx], y[test_idx].reshape(-1, 1)

print(f"X_train shape: {X_train_raw.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test_raw.shape}, y_test shape: {y_test.shape}")

X_train shape: (3341, 3), y_train shape: (3341, 1)
X_test shape: (836, 3), y_test shape: (836, 1)


In [7]:
# A5. Normalize inputs
# Per-feature statistics are computed on the training rows only, then reused
# for the test rows so no test information leaks into the scaling.
train_mean = X_train_raw.mean(axis=0)
train_std = X_train_raw.std(axis=0)

X_train, X_test = (
    (split - train_mean) / train_std for split in (X_train_raw, X_test_raw)
)

# Checkpoint A5:
# why normalization is needed for learning: It ensures all features are on the same scale, preventing features with larger numeric ranges from dominating the gradient updates, and helps gradient descent converge much faster.

# Part B: Define the model

In [8]:
# B. Define the model
def forward(X, w, b):
    """Linear model prediction: the product of X and w, shifted by the bias b."""
    weighted_sum = np.dot(X, w)
    return weighted_sum + b

# Initialize dummy parameters to print shapes once as required
# Number of input features = column count of the normalized training matrix.
d = X_train.shape[-1]
w_dummy = np.zeros(shape=(d, 1))
b_dummy = 0.0
# One forward pass with all-zero parameters, used only to inspect shapes.
y_hat_dummy = forward(X=X_train, w=w_dummy, b=b_dummy)

print(f"Shape of X: {X_train.shape}")
print(f"Shape of w: {w_dummy.shape}")
print(f"Shape of b: scalar")
print(f"Shape of y_hat: {y_hat_dummy.shape}")

# Checkpoint B:
# parameters are: w_1, w_2, w_3 (the weights) and b (the bias).
# number of parameters: 4 total parameters for d=3 (3 weights + 1 bias).

Shape of X: (3341, 3)
Shape of w: (3, 1)
Shape of b: scalar
Shape of y_hat: (3341, 1)


# Part C: Define Loss (MSE)

In [9]:
# C. Define Loss
def mse(y, y_hat):
    """Return the mean squared error between targets and predictions.

    Args:
        y: True target values (array-like).
        y_hat: Predicted values (array-like).

    Returns:
        The mean of the element-wise squared differences.

    Note:
        A 1-D vector paired with an (N, 1) column of the same length is
        aligned before subtracting. Without this, NumPy broadcasts the pair
        to an (N, N) matrix and the function silently returns a wrong loss.
    """
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)

    # Align a flat vector with a matching single-column array.
    if y.ndim == 1 and y_hat.shape == (y.size, 1):
        y = y.reshape(-1, 1)
    elif y_hat.ndim == 1 and y.shape == (y_hat.size, 1):
        y_hat = y_hat.reshape(-1, 1)

    return np.mean((y - y_hat) ** 2)

# Checkpoint C:
# why square: Squaring ensures negative and positive errors don't cancel each other out, and makes the loss function differentiable (a smooth bowl shape).
# what mistakes are expensive: Large mistakes become quadratically (not exponentially) more expensive because the error is squared — doubling an error quadruples its cost — so large errors and outliers pull strongly on the fit.

# Part D: The Learning Rule (Gradients)

In [10]:
# D. Implement gradients
def grad_w(X, y, y_hat):
    """Gradient of the MSE loss with respect to the weights.

    Computes (-2/N) * X^T (y - y_hat), where N is the number of samples.

    Args:
        X: Input matrix with one row per sample.
        y: True target values.
        y_hat: Model predictions for X.

    Returns:
        The gradient, shaped like the weights that produced ``y_hat``.

    Note:
        If ``y`` and ``y_hat`` are the same length but one is 1-D and the
        other is an (N, 1) column, ``y`` is reshaped to match ``y_hat``.
        Without this, the residual broadcasts to (N, N) and the result is a
        (d, N) matrix rather than a gradient shaped like the weights.
    """
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)

    # Follow y_hat's layout, since it mirrors the layout of the weights.
    if y.ndim == 1 and y_hat.shape == (y.size, 1):
        y = y.reshape(-1, 1)
    elif y_hat.ndim == 1 and y.shape == (y_hat.size, 1):
        y = y.reshape(-1)

    N = len(y)
    dW = (-2 / N) * np.dot(X.T, (y - y_hat))
    return dW

def grad_b(y, y_hat):
    """Gradient of the MSE loss with respect to the bias.

    Computes (-2/N) * sum(y - y_hat), where N is the number of samples.

    Args:
        y: True target values.
        y_hat: Model predictions.

    Returns:
        The scalar bias gradient.

    Note:
        A 1-D vector paired with an (N, 1) column of the same length is
        aligned before subtracting. Without this, the residual broadcasts to
        (N, N) and the summed gradient comes out N times too large.
    """
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)

    # Align a flat vector with a matching single-column array.
    if y.ndim == 1 and y_hat.shape == (y.size, 1):
        y = y.reshape(-1, 1)
    elif y_hat.ndim == 1 and y.shape == (y_hat.size, 1):
        y_hat = y_hat.reshape(-1, 1)

    N = len(y)
    db = (-2 / N) * np.sum(y - y_hat)
    return db

# Checkpoint D:
# what gradient means in words: The gradient points in the direction of the steepest slope (highest increase) of the loss function.
# why subtracting gradient reduces loss: Since the gradient points UP the slope, subtracting it moves us down the slope towards the minimum error.
# meaning of large gradient: A large gradient means the model's current predictions are very wrong, and it is on a steep part of the loss curve.
# effect of too-large learning rate: The updates will be too massive, causing the model to step over the minimum and potentially diverge (loss explodes to infinity).

# Part E: Training Loop

In [11]:
# E. Training Loop
# Re-seed so the small random weight initialisation is repeatable.
np.random.seed(42)
w = 0.01 * np.random.randn(d, 1)
b = 0.0
lr = 0.1
epochs = 200

for epoch in range(epochs):
    # Forward pass and loss at the current (pre-update) parameters.
    y_hat = forward(X_train, w, b)
    loss = mse(y_train, y_hat)

    # Report every 20th epoch; the value shown is the loss before this step's update.
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")

    # Gradient-descent step on both parameters.
    dW = grad_w(X_train, y_train, y_hat)
    db = grad_b(y_train, y_hat)
    w, b = w - lr * dW, b - lr * db

# Checkpoint E:
# Initial expectation: The loss should go down rapidly at first because the random initial weights will produce massive errors, yielding large gradients.
# Revised expectation after training: It did drop fast initially, but then plateaued as it approached the minimum, slowing down because the gradients become very small near the bottom of the error curve.

Epoch 0, Loss: 141.4974
Epoch 20, Loss: 7.2441
Epoch 40, Loss: 7.2129
Epoch 60, Loss: 7.2010
Epoch 80, Loss: 7.1905
Epoch 100, Loss: 7.1811
Epoch 120, Loss: 7.1728
Epoch 140, Loss: 7.1654
Epoch 160, Loss: 7.1587
Epoch 180, Loss: 7.1527


# Part F: Evaluation

In [13]:
# Score the trained parameters on the held-out rows.
y_test_hat = forward(X_test, w, b)
test_mse = mse(y_test, y_test_hat)
test_mae = np.abs(y_test - y_test_hat).mean()

print(f"\nTest MSE: {test_mse:.4f}")
print(f"Test MAE: {test_mae:.4f}")

# Show the first five test rows side by side with their predictions.
print("5 Example Predictions:")
for i in range(5):
    true_age, pred_age = y_test[i, 0], y_test_hat[i, 0]
    abs_err = abs(true_age - pred_age)
    print(f"Example {i+1}: True Age = {true_age:.2f} | Predicted = {pred_age:.2f} | Abs Error = {abs_err:.2f}")

# Checkpoint F:
# systematic errors: The model likely systematically underestimates the age of very old abalones, because linear regressions struggle with extreme non-linear tails.
# observed bias: There is a base prediction "bias" heavily anchored around the dataset's average age, struggling to perfectly capture variance just using 3 basic physical traits.


Test MSE: 6.1823
Test MAE: 1.8641
5 Example Predictions:
Example 1: True Age = 10.50 | Predicted = 9.99 | Abs Error = 0.51
Example 2: True Age = 11.50 | Predicted = 13.69 | Abs Error = 2.19
Example 3: True Age = 10.50 | Predicted = 12.49 | Abs Error = 1.99
Example 4: True Age = 11.50 | Predicted = 9.22 | Abs Error = 2.28
Example 5: True Age = 7.50 | Predicted = 8.41 | Abs Error = 0.91
