# Group 1 : Diksha Prasad, Nasrul Huda, Omkar Kondhalkar, Aqsa Mohsin

#Foundation models

#Prerequisites
Please install Python and the required libraries (PyTorch) for this exercise, or run it in Google Colab.


# 1.
Foundation models are pre-trained on large datasets to learn general representations. They can then be adapted to specific tasks through multiple methods: zero-shot/few-shot prompting, fine-tuning, or linear probing / feature extraction.

Implement a foundation model that:
- Pre-trains on a large general dataset to learn useful feature weights
- Uses a) linear probing and b) full fine-tuning to adapt the pre-trained model to a small task-specific dataset
- Compare performance with training from scratch (random initialization)

For simplicity, we use a very small model here: a neural network with one hidden layer and sigmoid activations (see `SimpleNN` below).

(15 points)

In [1]:
# These are the libraries we used
import torch
import torch.nn as nn
import torch.optim as optim

class SimpleNN(nn.Module):
    """
    Tiny binary classifier: one hidden layer followed by a single output unit.

    A sigmoid is applied after both layers, so the network emits a value
    strictly between 0 and 1 for every input row.
    """

    def __init__(self, input_size=3, hidden_size=4):
        """
        input_size: number of features per example
        hidden_size: number of units in the hidden layer
        """
        super().__init__()
        # Feature extractor (input -> hidden); created first so that
        # parameter initialisation order stays hidden-then-output.
        self.hidden = nn.Linear(input_size, hidden_size)
        # Classification head (hidden -> 1)
        self.output = nn.Linear(hidden_size, 1)
        # One activation module shared by both layers
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch [N, input_size] to outputs of shape [N, 1]."""
        features = self.sigmoid(self.hidden(x))
        return self.sigmoid(self.output(features))

In [4]:
# === Solution helper + training functions ===
import torch
import torch.nn as nn
import torch.optim as optim

# Helper to convert list-of-tuples data into tensors
def prepare_tensors(data):
    """
    Split a list of (input_list, label) pairs into two float32 tensors.

    data: list of (input_list, label) pairs
    returns: X [N, 3], y [N, 1]
    """
    feature_rows = []
    label_rows = []
    for inputs, label in data:
        feature_rows.append(inputs)
        # Wrap each label in its own list so y gets a trailing dimension of 1
        label_rows.append([label])

    X = torch.tensor(feature_rows, dtype=torch.float32)
    y = torch.tensor(label_rows, dtype=torch.float32)
    return X, y

def pretrain(model, data, epochs, lr=0.1):
    """
    Fit every parameter of the model to the large, general dataset.

    Runs `epochs` full-batch SGD steps with learning rate `lr`, minimising
    binary cross-entropy. The model is updated in place.

    data: list of (input_list, label) pairs
    returns: the same model object, after training
    """
    model.train()
    inputs, targets = prepare_tensors(data)

    loss_fn = nn.BCELoss()
    sgd = optim.SGD(model.parameters(), lr=lr)

    for _ in range(epochs):
        # Clear gradients left over from the previous step
        sgd.zero_grad()
        loss_fn(model(inputs), targets).backward()
        sgd.step()

    return model


def linear_probing(model, data, epochs, lr=0.1):
    """
    Linear probing: train only the output layer on the small, specific dataset.

    The hidden layer is frozen so the pretrained features are used as-is;
    it is still frozen when this function returns. Training runs `epochs`
    full-batch SGD steps with learning rate `lr` on binary cross-entropy.

    data: list of (input_list, label) pairs
    returns: the same model object, after training
    """
    # Pretrained features stay fixed; only the head may receive gradients.
    model.hidden.requires_grad_(False)
    model.output.requires_grad_(True)

    inputs, targets = prepare_tensors(data)
    loss_fn = nn.BCELoss()
    # The optimizer is handed the output layer's parameters only
    head_sgd = optim.SGD(model.output.parameters(), lr=lr)

    model.train()
    for _ in range(epochs):
        head_sgd.zero_grad()
        loss_fn(model(inputs), targets).backward()
        head_sgd.step()

    return model


def full_fine_tune(model, data, epochs, lr=0.1):
    """
    Full fine-tuning: re-initialise the output layer, then train all weights.

    Every parameter is made trainable and updated for `epochs` full-batch
    SGD steps with learning rate `lr` on binary cross-entropy.

    data: list of (input_list, label) pairs
    returns: the same model object, after training
    """
    # Start the head from fresh weights so it does not carry over
    # whatever the output layer learned during pretraining.
    model.output.reset_parameters()

    # Unfreeze everything (hidden and output layers alike)
    model.requires_grad_(True)

    inputs, targets = prepare_tensors(data)
    loss_fn = nn.BCELoss()
    sgd = optim.SGD(model.parameters(), lr=lr)

    model.train()
    for _ in range(epochs):
        sgd.zero_grad()
        loss_fn(model(inputs), targets).backward()
        sgd.step()

    return model


def train_from_scratch(data, epochs, lr=0.1):
    """
    Baseline without pretraining: build a new SimpleNN and train it on `data`.

    The network starts from its default random initialisation and is
    trained for `epochs` full-batch SGD steps with learning rate `lr` on
    binary cross-entropy.

    data: list of (input_list, label) pairs
    returns: the newly trained model
    """
    fresh_model = SimpleNN()
    inputs, targets = prepare_tensors(data)

    loss_fn = nn.BCELoss()
    sgd = optim.SGD(fresh_model.parameters(), lr=lr)

    fresh_model.train()
    for _ in range(epochs):
        sgd.zero_grad()
        loss_fn(fresh_model(inputs), targets).backward()
        sgd.step()

    return fresh_model



def evaluate(model, data):
    """
    Evaluate a model on a dataset: returns accuracy and raw probabilities.

    The model is switched to eval mode (and left in it) and run without
    gradient tracking. An output >= 0.5 counts as a prediction of 1.

    data: list of (input_list, label) pairs
    returns: (accuracy, probabilities) where accuracy is the fraction of
             examples predicted correctly and probabilities is a list with
             one float per example, including when there is only one example
    """
    model.eval()
    X, y = prepare_tensors(data)

    with torch.no_grad():
        outputs = model(X)
        preds = (outputs >= 0.5).float()
        accuracy = preds.eq(y).sum().item() / len(data)

    # Drop only the trailing size-1 dimension. A bare squeeze() would also
    # drop the batch dimension when N == 1 and return a float, not a list.
    return accuracy, outputs.squeeze(-1).tolist()



In [5]:
# Large "general" dataset used for pretraining.
# Each entry is (features, label); in every entry the label equals features[0].
general_data = [
    ([1, 1, 1], 1),
    ([1, 1, 0], 1),
    ([1, 0, 1], 1),
    ([0, 0, 0], 0),
    ([0, 0, 1], 0),
    ([0, 1, 0], 0),
    ([1, 1, 1], 1),
    ([0, 1, 1], 0),
    ([1, 0, 0], 1),
    ([0, 0, 0], 0),
    ([1, 1, 0], 1),
    ([0, 1, 1], 0),
]

# Small task-specific dataset used for linear probing, fine-tuning and the
# from-scratch baseline. Same (features, label) layout as above.
specific_data = [
    ([1, 1, 1], 1),
    ([0, 0, 0], 0),
    ([1, 1, 0], 1),
    ([0, 1, 0], 0),
]

# Pretrain the model on the general data and use both transfer learning techniques
# separately to retrain the model for the specified data only

In [6]:
# Train a new model from scratch using only the specified data and compare the results

In [7]:
# === Pretraining and transfer learning on general/specific data (appended) ===
import copy

# Fix the RNG seed so that weight initialisation, and therefore the
# accuracies printed below, are the same on every run.
torch.manual_seed(0)

# 1) Pretrain a base model on the general data
base_model = SimpleNN()
pretrain_epochs = 500
pretrain(base_model, general_data, epochs=pretrain_epochs)

# 2) Create two independent copies for the two transfer-learning strategies,
#    so that adapting one does not modify the other or the base model
linear_model = copy.deepcopy(base_model)      # for linear probing
finetune_model = copy.deepcopy(base_model)    # for full fine-tuning

# 3) Apply linear probing (only output layer trained on specific_data)
linear_epochs = 200
linear_probing(linear_model, specific_data, epochs=linear_epochs)

# 4) Apply full fine-tuning (all layers, last layer reset) on specific_data
finetune_epochs = 200
full_fine_tune(finetune_model, specific_data, epochs=finetune_epochs)

# 5) Evaluate both models on the specific_data (the same examples they were
#    adapted on); the returned probabilities are not needed here
acc_linear, _ = evaluate(linear_model, specific_data)
acc_finetune, _ = evaluate(finetune_model, specific_data)

print(f"Accuracy on specific data after linear probing:   {acc_linear:.2f}")
print(f"Accuracy on specific data after full fine-tuning: {acc_finetune:.2f}")


Accuracy on specific data after linear probing:   1.00
Accuracy on specific data after full fine-tuning: 1.00


In [8]:
# === Training from scratch on specific data and comparison (appended) ===
# Baseline: same architecture, random initialisation, task-specific data only
scratch_epochs = 200
scratch_model = train_from_scratch(specific_data, epochs=scratch_epochs)

# Accuracy of the baseline on the task-specific data (probabilities unused)
acc_scratch = evaluate(scratch_model, specific_data)[0]

print(f"Accuracy on specific data when training from scratch: {acc_scratch:.2f}")

# Side-by-side summary of the three approaches
comparison_rows = (
    ("  Linear probing (pretrained features):   ", acc_linear),
    ("  Full fine-tuning (pretrained model):    ", acc_finetune),
    ("  Training from scratch (no pretraining): ", acc_scratch),
)
print("\nComparison on task-specific data:")
for row_label, row_acc in comparison_rows:
    print(f"{row_label}{row_acc:.2f}")


Accuracy on specific data when training from scratch: 1.00

Comparison on task-specific data:
  Linear probing (pretrained features):   1.00
  Full fine-tuning (pretrained model):    1.00
  Training from scratch (no pretraining): 1.00


### Conclusion

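On this tiny, perfectly learnable dataset, all three approaches
(linear probing, full fine-tuning, and training from scratch) reach
100% accuracy. This is expected: in every example the label equals the
first input feature, so the model is expressive enough to fit the data
perfectly within the epochs we train for. Note that accuracy is measured
on the same four examples used for training, so it shows how well each
method fits the task data, not how well it generalizes.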

In a more realistic/high-dimensional setting, pretraining typically
helps when we have limited task-specific data: linear probing and full
fine-tuning often outperform training from scratch.