# Fetch ML Apprentice Assessment

## Task 1: Sentence Transformer Implementation

In [1]:
# Import the appropriate libraries

import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel, BertTokenizer

In [2]:
# Implement the sentence transformer model

class SentenceTransformerModel(torch.nn.Module):
    def __init__(self, model_name='bert-base-uncased', pooling='mean'):
        super(SentenceTransformerModel, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.pooling = pooling

    def forward(self, sentences):
        encoded_input = self.tokenizer(
            sentences, padding=True, truncation=True, return_tensors='pt'
        )
        with torch.no_grad():  # No training
            model_output = self.bert(**encoded_input)

        # Token embeddings (batch_size, seq_len, hidden_size)
        token_embeddings = model_output.last_hidden_state

        # Attention mask for ignoring padding tokens
        attention_mask = encoded_input['attention_mask'].unsqueeze(-1).expand(token_embeddings.size())

        if self.pooling == 'mean':
            # Mean Pooling: sum embeddings then divide by number of valid tokens
            summed = torch.sum(token_embeddings * attention_mask, dim=1)
            summed_mask = torch.clamp(attention_mask.sum(dim=1), min=1e-9)
            sentence_embeddings = summed / summed_mask
        elif self.pooling == 'cls':
            # CLS Pooling: use first token
            sentence_embeddings = token_embeddings[:, 0]
        else:
            raise ValueError("Unsupported pooling type")

        return sentence_embeddings

In [3]:
# Instantiate the model
model = SentenceTransformerModel(pooling='mean')

# Sample sentences
sentences = [
    "I love saving money with Fetch.",
    "I would be a great ML Apprentice.",
    "Fetch Rewards is a useful app.",
    "I enjoy creating ML models.",
    "My favorite color is blue."
]

# Encode and print embeddings
embeddings = model(sentences)
print(embeddings.shape)  # Should be [3, 768] for BERT base
print(embeddings)

torch.Size([5, 768])
tensor([[ 0.5121,  0.1326, -0.0188,  ..., -0.2387, -0.0629,  0.2001],
        [ 0.4206, -0.0512,  0.3857,  ..., -0.1994,  0.1497, -0.0953],
        [ 0.0431, -0.6125,  0.2400,  ..., -0.1788, -0.1408, -0.0080],
        [ 0.3266,  0.0353,  0.0498,  ..., -0.1261, -0.1274,  0.0138],
        [ 0.2340, -0.3160, -0.3238,  ..., -0.0248,  0.2804,  0.0624]])


### Explanation for the model architecture outside of the transformer backbone.

Mean pooling was used to encode input sentences into fixed-length embeddings. This is because mean pooling is simple yet effective. It works well for NLP as it reduces dimensionality and prevents overfitting. To keep things simple, no extra layers were added.

## Task 2: Multi-Task Learning Expansion

In [4]:
# Create the shared BERT encoder

class SharedBERTEncoder(nn.Module):
    def __init__(self, model_name='bert-base-uncased'):
        super(SharedBERTEncoder, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

    def forward(self, sentences):
        # Tokenize and encode sentences
        encoded_input = self.tokenizer(
            sentences, padding=True, truncation=True, return_tensors='pt'
        )
        
        # Get BERT model output (we don't fine-tune BERT in this case)
        with torch.no_grad():  # No training on BERT backbone
            model_output = self.bert(**encoded_input)
        
        return model_output.last_hidden_state, encoded_input['attention_mask']

In [5]:
# Task 2.A: Sentence Classification 

class TaskASentenceClassification(nn.Module):
    def __init__(self, bert_encoder, num_classes=3):
        super(TaskASentenceClassification, self).__init__()
        self.bert_encoder = bert_encoder
        self.task_a_fc = nn.Linear(self.bert_encoder.bert.config.hidden_size, num_classes)

    def forward(self, sentences):
        # Get sentence embeddings from shared encoder
        token_embeddings, attention_mask = self.bert_encoder(sentences)
        
        # We use mean pooling here
        summed = torch.sum(token_embeddings * attention_mask.unsqueeze(-1), dim=1)
        
        # Ensure summed_mask has the correct shape (batch_size, 1)
        summed_mask = torch.clamp(attention_mask.sum(dim=1), min=1e-9).unsqueeze(-1)  # (batch_size, 1)
        sentence_embeddings = summed / summed_mask  # Now the division works element-wise
        
        # Task A output: classification logits
        task_a_output = self.task_a_fc(sentence_embeddings)
        return task_a_output

In [6]:
# Example usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the shared encoder (BERT)
shared_bert_encoder = SharedBERTEncoder().to(device)

# Initialize Task A (Sentence Classification)
task_a_model = TaskASentenceClassification(bert_encoder=shared_bert_encoder, num_classes=3).to(device)

# Example sentences
sentences = [
    "I love saving money with Fetch.", 
    "I would be a great ML Apprentice.",
    "Fetch Rewards is a useful app.", 
    "I enjoy creating ML models.", 
    "My favorite color is blue."
]

# Get predictions for Task A (Sentence Classification)
task_a_output = task_a_model(sentences)

# Apply softmax to convert logits to probabilities
softmax = nn.Softmax(dim=1)
task_a_probs = softmax(task_a_output)

# Print the output probabilities for each class
print("Task A Output (Sentence Classification):")
for i, sentence in enumerate(sentences):
    print(f"Sentence: '{sentence}'")
    print(f"Predicted probabilities: {task_a_probs[i].detach().cpu().numpy()}")

Task A Output (Sentence Classification):
Sentence: 'I love saving money with Fetch.'
Predicted probabilities: [0.3767282  0.30354527 0.3197265 ]
Sentence: 'I would be a great ML Apprentice.'
Predicted probabilities: [0.40083712 0.3210542  0.27810866]
Sentence: 'Fetch Rewards is a useful app.'
Predicted probabilities: [0.30036512 0.34459034 0.35504457]
Sentence: 'I enjoy creating ML models.'
Predicted probabilities: [0.39686894 0.31306687 0.29006413]
Sentence: 'My favorite color is blue.'
Predicted probabilities: [0.31644395 0.27996066 0.40359542]


In [7]:
# Task 2.B: Sentiment Analysis

# Task B (Sentiment Analysis)
class TaskBSentimentAnalysis(nn.Module):
    def __init__(self, bert_encoder, num_classes=3):  # Positive, Neutral, Negative
        super(TaskBSentimentAnalysis, self).__init__()
        self.bert_encoder = bert_encoder
        self.task_b_fc = nn.Linear(self.bert_encoder.bert.config.hidden_size, num_classes)

    def forward(self, sentences):
        token_embeddings, attention_mask = self.bert_encoder(sentences)
        summed = torch.sum(token_embeddings * attention_mask.unsqueeze(-1), dim=1)
        summed_mask = torch.clamp(attention_mask.sum(dim=1), min=1e-9).unsqueeze(-1)
        sentence_embeddings = summed / summed_mask
        task_b_output = self.task_b_fc(sentence_embeddings)
        return task_b_output

In [8]:
# Example Usage

# Initialize Task B
task_b_model = TaskBSentimentAnalysis(bert_encoder=shared_bert_encoder, num_classes=3).to(device)

# Get predictions for Task B (Sentiment Analysis)
task_b_output = task_b_model(sentences)
task_b_probs = softmax(task_b_output)

# Define sentiment class names
sentiment_classes = ['Positive', 'Neutral', 'Negative']

# Print results
print("Task B Output (Sentiment Analysis):")
for i, sentence in enumerate(sentences):
    predicted_class = sentiment_classes[task_b_probs[i].argmax(dim=0).item()]
    print(f"Sentence: '{sentence}'")
    print(f"Predicted sentiment: {predicted_class}")
    print(f"Probabilities: {task_b_probs[i].detach().cpu().numpy()}")
    print('-' * 50)

Task B Output (Sentiment Analysis):
Sentence: 'I love saving money with Fetch.'
Predicted sentiment: Neutral
Probabilities: [0.29477343 0.44812414 0.2571024 ]
--------------------------------------------------
Sentence: 'I would be a great ML Apprentice.'
Predicted sentiment: Neutral
Probabilities: [0.35747185 0.39837918 0.24414898]
--------------------------------------------------
Sentence: 'Fetch Rewards is a useful app.'
Predicted sentiment: Neutral
Probabilities: [0.32055002 0.4254784  0.25397152]
--------------------------------------------------
Sentence: 'I enjoy creating ML models.'
Predicted sentiment: Neutral
Probabilities: [0.36590302 0.39256594 0.24153106]
--------------------------------------------------
Sentence: 'My favorite color is blue.'
Predicted sentiment: Neutral
Probabilities: [0.3419062  0.4364615  0.22163229]
--------------------------------------------------


### Changes made to the architecture to support multi-task learning.

The shared BERT encoder allows me to complete both tasks without recreating the same model. It also allows me to be able to add more tasks later if need be. 

In Task A I created a fully connected linear layer that uses the BERT vector to output the probability score for predefined sentence classes. These classes include personal statements, general facts, and app-related sentences.

In Task B I used the same BERT vector and created another fully connected linear layer that outputs the probability score for sentiment classes. These classes include positive, negative, and neutral sentences. 

## Task 3: Training Considerations
### Implications and advantages of scenarios and the rationale as to how the model should be trained
1. The entire network is frozen:
   - Implications and Advantages: All parameters like the transformer and task-specific heads are frozen. The model is then used as a static encoder. This is useful for feature extraction as training is fast and no gradient updates are required. Although it is rare, it would be useful if the pre-trained model performs significantly well on the tasks. This also would be less computationally expensive.
   - How the model should be trained: Use the transformer to extract embeddings (I used mean pooling). Pass the embeddings through a separate classifier and train it independently.
2. Only the transformer backbone should be frozen:
   - Implications and Advantages: The transformer stays stable which reduces overfitting on small datasets. Only task-specific classification heads are trainable; however, they are fast and easy to train. This would also preserve the linguistic knowledge that is captured during pre-training. 
   - How the model should be trained: Freeze the transformer parameters then use a loss function (I used cross-entropy) to define and train task-specific heads. Update the head parameters with a standard training loop.
3. Only one of the task-specific heads (either for Task A or Task B) should be frozen
   - Implications and Advantages: One task (Task A: Classification) is stable while the other task (Task B: Sentiment) is trained. This is useful for preventing catastrophic forgetting as it allows one task to improve without degrading the performance of the other task that performs well. The shared backbone is also updated. 
   - How the model should be trained: Freeze the well-performing task then update the head of the other task. Calculate the loss for only the updated task.

### Approaching the transfer learning process
The choice for the pre-trained model is the BERT-base transformer. This is because it is pre-trained on massive corpora which reduces the computational price and the need for large labeled datasets. It also provides a strong general language understanding and generates sufficient sentence embeddings. It even transfers well across tasks as it is widely supported and well-documented.
Initially, The embedding and transformer encoder layers are frozen. Train the task-specific heads. As training goes on, specifically mid-training, gradually unfreeze a few of the top layers of BERT so they may adapt to task-specific signals and improve performance. This also avoids catastrophic forgetting and reduces training time. Lastly, the entire model should be tuned with a low learning rate. During tuning, all BERT layers can be unfrozen. This allows the model to learn domain-specific context. 

## Task 4: Training Loop Implentation (BONUS)

In [9]:
# Hypothetical batch of input sentences
sentences = [
    "I love saving money with Fetch.",
    "I would be a great ML Apprentice.",
    "Fetch Rewards is a useful app.",
    "I enjoy creating ML models.",
    "My favorite color is blue."
]

# Dummy labels
task_a_labels = torch.tensor([0, 1, 0, 1, 2])  # Sentence classification (App, Statement, Fact)
task_b_labels = torch.tensor([2, 2, 2, 2, 1])  # Sentiment (Positive, Positive, Positive, Positive, Neutral)

# Move labels to device
task_a_labels = task_a_labels.to(device)
task_b_labels = task_b_labels.to(device)

# Initialize models
shared_encoder = SharedBERTEncoder().to(device)
task_a_model = TaskASentenceClassification(shared_encoder, num_classes=3).to(device)
task_b_model = TaskBSentimentAnalysis(shared_encoder, num_classes=3).to(device)

# Loss functions and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(list(task_a_model.parameters()) + list(task_b_model.parameters()), lr=2e-5)

# === Training Step (Single Batch Example) ===
task_a_model.train()
task_b_model.train()

# Zero gradients
optimizer.zero_grad()

# Forward passes
task_a_logits = task_a_model(sentences)           # (batch_size, num_classes)
task_b_logits = task_b_model(sentences)           # (batch_size, num_classes)

# Compute losses
task_a_loss = criterion(task_a_logits, task_a_labels)
task_b_loss = criterion(task_b_logits, task_b_labels)

# Combine losses (equal weight)
total_loss = task_a_loss + task_b_loss

# Backpropagation
total_loss.backward()
optimizer.step()

# Compute accuracy
task_a_preds = torch.argmax(task_a_logits, dim=1)
task_b_preds = torch.argmax(task_b_logits, dim=1)

task_a_acc = (task_a_preds == task_a_labels).float().mean()
task_b_acc = (task_b_preds == task_b_labels).float().mean()

# Print results
print(f"Task A Loss: {task_a_loss.item():.4f}, Accuracy: {task_a_acc.item():.2f}")
print(f"Task B Loss: {task_b_loss.item():.4f}, Accuracy: {task_b_acc.item():.2f}")

Task A Loss: 1.1074, Accuracy: 0.40
Task B Loss: 1.1614, Accuracy: 0.20


  return disable_fn(*args, **kwargs)


### Assumptions and Decisions

The sentences are embedded through a shared BERT encoder to reduce model size, training time, and computational cost. Since each task has different output requirements, I created two separate task-specific heads. Dummy labels were created as hypothetical data to focus on the architecture of the model. Assuming the losses for each task are comparable in class, I used cross-entropy loss as the loss function for both tasks. Accuracy is used as an evaluation metric as this simple metric works well at tracking classification performance. Only one batch was used for simplicity; however, in practice, multiple batches would be used.