# Sentiment aware embeddings
The focus of this repo is to investigate the possibility to create sentiment aware embeddings. What I mean by this is a system that produces strong positive embeddings for a pair of query term and sentence where the sentence is positive towards the query term, and negative if it expresses a negative sentiment. For example:

positive (cosine close to 1):
query term: Coca Cola
sentence: Love coca cola - best drink ever!

negative (cosine close to -1):
query term: Coca Cola
sentence: Hate coca cola - tastes like pepsi, just way worse!

One potential use case for such encoders would be for vector search within brand management. With this system you can vectorise and upload all sentences that mentions either your brand or a competitor to a vector database and then search for sentences that are postive or negative towards your brand or one of your competitors.

In [15]:
pip install -r requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
import torch
import torch.nn as nn
from torch.nn.functional import cosine_similarity
from datasets import load_dataset
from tqdm import tqdm
from model import TargetedSentimentEncoder

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
data = load_dataset('fhamborg/news_sentiment_newsmtsc')

In [9]:
class TargetedSentimentEncoder(nn.Module):
    def __init__(self, base_model: str):
        super(TargetedSentimentEncoder, self).__init__()
        
        # Separate BERT encoders for query and sentence
        self.tokenizer = AutoTokenizer.from_pretrained(base_model)
        self.query_encoder = AutoModel.from_pretrained(base_model)
        self.text_encoder = AutoModel.from_pretrained(base_model)
        
        # Freeze all layers except the last encoder stack for both networks
        #self._freeze_encoder(self.query_encoder)
        #self._freeze_encoder(self.text_encoder)
        
        self.dropout = nn.Dropout(0.3)

    def _freeze_encoder(self, encoder):
        """
        Freeze all layers except the last encoder stack.
        """
        for name, param in encoder.named_parameters():
            # Check if the layer belongs to the last encoder stack
            if 'layer' in name and 'layer.5' not in name:  # DistilBERT has 6 layers, we freeze up to layer 4
                param.requires_grad = False
    
    def forward(self, query_text, sentence_text):
        # Tokenize input
        query_inputs = self.tokenizer(query_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        sentence_inputs = self.tokenizer(sentence_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        
        # Get last hidden states for both query and sentence inputs
        query_hidden_states = self.query_encoder(**query_inputs).last_hidden_state  # Shape: [batch_size, seq_len, hidden_dim]
        sentence_hidden_states = self.text_encoder(**sentence_inputs).last_hidden_state  # Shape: [batch_size, seq_len, hidden_dim]

        # Apply max pooling across the sequence (dim=1)
        query_embeds, _ = torch.max(query_hidden_states, dim=1)  # Shape: [batch_size, hidden_dim]
        sentence_embeds, _ = torch.max(sentence_hidden_states, dim=1)  # Shape: [batch_size, hidden_dim]
        
        # Dropout layer
        query_embeds = self.dropout(query_embeds)
        sentence_embeds = self.dropout(sentence_embeds)
        
        return query_embeds, sentence_embeds

In [10]:
import torch.nn.functional as F

class TanhLoss(nn.Module):
    def __init__(self):
        super(TanhLoss, self).__init__()
        
    def forward(self, query_embeds, sentence_embeds, target):
        # Compute cosine similarity between query and sentence embeddings
        cos_sim = F.cosine_similarity(query_embeds, sentence_embeds)
        
        # Optionally apply tanh to the cosine similarity
        pred_tanh = torch.tanh(cos_sim)
        
        # Compute the difference between tanh of the prediction and the target
        loss = torch.mean((pred_tanh - target) ** 2)  # Mean squared error

        return loss

In [14]:
base_model = 'cross-encoder/nli-roberta-base'
model = TargetedSentimentEncoder(base_model=base_model, device=device).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
#criterion = nn.CosineEmbeddingLoss()
criterion = nn.MSELoss()
#criterion = TanhLoss()

Some weights of RobertaModel were not initialized from the model checkpoint at cross-encoder/nli-roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at cross-encoder/nli-roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomTextPairDataset(Dataset):
    def __init__(self, hf_dataset):
        """
        Args:
            hf_dataset: Hugging Face dataset, e.g., train, test, or validation split.
        """
        self.dataset = hf_dataset
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        # Extract the 'mention' (query_sentence), 'sentence' (text_sentence), and 'polarity' (label)
        query_sentence = self.dataset[idx]['mention']
        text_sentence = self.dataset[idx]['sentence']
        label = torch.tensor(self.dataset[idx]['polarity'], dtype=torch.float)  # Assuming polarity is already -1, 0, or 1
        return query_sentence, text_sentence, label

In [10]:
# Create Dataset objects for train, test, and validation splits
train_dataset = CustomTextPairDataset(data['train'])
test_dataset = CustomTextPairDataset(data['test'])
validation_dataset = CustomTextPairDataset(data['validation'])

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)
validation_dataloader = DataLoader(validation_dataset, batch_size=16, shuffle=False)

In [15]:
# Set the model to training mode
model.train()

# Number of epochs
num_epochs = 10

# Training loop for 10 epochs
for epoch in range(num_epochs):
    total_loss = 0.0  # Initialize the total loss for the epoch
    
    # Training phase
    model.train()  # Ensure the model is in training mode
    for batch in tqdm(train_dataloader):
        query_text, sentence_text, label = batch  # Get the inputs from the dataloader
        label = label.to(device)

        # Forward pass
        query_embeds, sentence_embeds = model(query_text, sentence_text)

        # Compute similarity and loss
        similarity = cosine_similarity(query_embeds, sentence_embeds)
        loss = criterion(label, similarity)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Accumulate loss
        total_loss += loss.item()
    
    # Average training loss for the epoch
    avg_loss = total_loss / len(train_dataloader)
    
    # Print loss for each epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_loss:.4f}")

    # Validation phase
    model.eval()  # Set model to evaluation mode
    val_loss = 0.0  # Initialize the validation loss
    
    with torch.no_grad():  # Disable gradient calculation for validation
        for batch in test_dataloader:  # Iterate over validation data
            query_text, sentence_text, label = batch  # Get the inputs
            label = label.to(device)

            # Forward pass
            query_embeds, sentence_embeds = model(query_text, sentence_text)

            # Compute similarity and loss
            similarity = cosine_similarity(query_embeds, sentence_embeds)
            loss = criterion(label, similarity)

            # Accumulate validation loss
            val_loss += loss.item()

    # Average validation loss for the epoch
    avg_val_loss = val_loss / len(test_dataloader)
    
    # Print validation loss for each epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Test Loss: {avg_val_loss:.4f}")


100%|██████████| 547/547 [01:22<00:00,  6.66it/s]


Epoch [1/10], Training Loss: 0.3942
Epoch [1/10], Test Loss: 0.5318


100%|██████████| 547/547 [01:21<00:00,  6.69it/s]


Epoch [2/10], Training Loss: 0.2718
Epoch [2/10], Test Loss: 0.3289


100%|██████████| 547/547 [01:22<00:00,  6.64it/s]


Epoch [3/10], Training Loss: 0.2332
Epoch [3/10], Test Loss: 0.3618


100%|██████████| 547/547 [01:22<00:00,  6.66it/s]


Epoch [4/10], Training Loss: 0.2119
Epoch [4/10], Test Loss: 0.4754


100%|██████████| 547/547 [01:22<00:00,  6.67it/s]


Epoch [5/10], Training Loss: 0.1955
Epoch [5/10], Test Loss: 0.3463


100%|██████████| 547/547 [01:22<00:00,  6.66it/s]


Epoch [6/10], Training Loss: 0.1843
Epoch [6/10], Test Loss: 0.4007


100%|██████████| 547/547 [01:22<00:00,  6.62it/s]


Epoch [7/10], Training Loss: 0.1710
Epoch [7/10], Test Loss: 0.3389


100%|██████████| 547/547 [01:22<00:00,  6.63it/s]


Epoch [8/10], Training Loss: 0.1700
Epoch [8/10], Test Loss: 0.3744


100%|██████████| 547/547 [01:22<00:00,  6.65it/s]


Epoch [9/10], Training Loss: 0.1620
Epoch [9/10], Test Loss: 0.3509


100%|██████████| 547/547 [01:21<00:00,  6.68it/s]


Epoch [10/10], Training Loss: 0.1555
Epoch [10/10], Test Loss: 0.3404


In [16]:
torch.save(model.state_dict(), './model/model.pth')

In [16]:
# Set the model to evaluation mode
model.eval()

# List to store predictions and actual labels
predictions = []
true_labels = []

# No gradient computation needed for evaluation
with torch.no_grad():
    for batch in test_dataloader:
        query_text, sentence_text, label = batch  # Get the inputs from the dataloader
        
        # Forward pass to get cosine similarity scores
        query_embeds, sentence_embeds = model(query_text, sentence_text)
        
        # Compute cosine similarity between query and sentence embeddings
        cos_sim = torch.nn.functional.cosine_similarity(query_embeds, sentence_embeds)

        # Convert cosine similarity to discrete values (-1, 0, 1)
        pred_labels = torch.where(cos_sim > 0.5, 1, torch.where(cos_sim < -0.5, -1, 0))

        # Store predictions and true labels
        predictions.extend(pred_labels.cpu().numpy())  # Move to CPU for easy handling
        true_labels.extend(label.cpu().numpy())        # Also move labels to CPU

In [17]:
from sklearn.metrics import classification_report

In [18]:
print(classification_report(true_labels, predictions))

              precision    recall  f1-score   support

        -1.0       0.73      0.82      0.77       295
         0.0       0.72      0.70      0.71       319
         1.0       0.78      0.66      0.72       189

    accuracy                           0.74       803
   macro avg       0.74      0.73      0.73       803
weighted avg       0.74      0.74      0.73       803

