In [1]:
!python -m pip install -U sentence-transformers



In [1]:
%cd ..
import math
import numpy as np

import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

from model.builder import create_preprocessing_model

/Users/mihir/Desktop/Classes/CSCI-566/Project/csci566s-24


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class DataLoaderDUNES(Dataset):
    def __init__(self, data, preprocessing_model):
        self.data = data
        self.preprocessing_model = preprocessing_model
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        prev_tweet_embedding, prev_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector = self.preprocessing_model(
            item['prev_tweet'], item['prev_reddit']
        )

        prev_tweet_embedding, prev_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector = self.preprocessing_model(
            item['prev_tweet'], item['prev_reddit']
        )

        def to_tensor(obj):
            if not isinstance(obj, torch.Tensor):
                obj = torch.tensor(obj, dtype=torch.float)
            return obj

        prev_tweet_embedding = to_tensor(prev_tweet_embedding)
        prev_tweet_sentiment = to_tensor(prev_tweet_sentiment)
        prev_reddit_sentiment = to_tensor(prev_reddit_sentiment)
        prev_tweet_sector = to_tensor(prev_tweet_sector)

        return {
            'prev_tweet_embedding': prev_tweet_embedding,
            'prev_tweet_sentiment': prev_tweet_sentiment,
            'prev_reddit_sentiment': prev_reddit_sentiment,
            'prev_tweet_sector': prev_tweet_sector,
            'likes': torch.tensor(item['likes'], dtype=torch.float),
            'retweets': torch.tensor(item['retweets'], dtype=torch.float),
            'comments': torch.tensor(item['comments'], dtype=torch.float)
        }

In [3]:
data = [
    {
        'prev_tweet': "@WholeMarsBlog Headline is misleading. Starlink can obviously offer far more robust positioning than GPS, as it will have ~1000X more satellites over time. Not all will have line of sight to users, but still >10X GPS & far stronger signal. Just not today’s problem.",
        'curr_tweet': "@spideycyp_155 @BillyM2k If Russia faced calamitous defeat in conventional warfare for something as strategically critical as Crimea, the probability of using nuclear weapons is high",
        'prev_reddit': "We know who controls the media. The same corporations who have wreaked havoc on the globe for decades, if not centuries, the big banks who financed them, and the governments who turned a blind eye to the destruction. The same entities who have brought us to the precipice of destruction - quite possibly condemning us, and our progeny to an unlivable climate They have tried to stop you at every turn, and yet you persist for the good of humanity. We love you, Elon! Keep up the good work! As you have said, we must never let the light of human consciousness fade - never!",
        'likes': 100,  
        'retweets': 50, 
        'comments': 25  
    }
]

In [4]:
preprocessing_model = create_preprocessing_model(
    "mixedbread-ai/mxbai-embed-large-v1",
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
    "bhadresh-savani/distilbert-base-uncased-emotion",
    "cardiffnlp/tweet-topic-latest-multi"
)

dataset = DataLoaderDUNES(data, preprocessing_model)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [11]:
class TransformerModel(nn.Module):
    def __init__(self, feature_sizes, d_model, nhead, num_encoder_layers, dim_feedforward, num_outputs):
        super(TransformerModel, self).__init__()
        self.positional_encoder = PositionalEncoding(d_model)
        
        # Ensure that the input sizes are correctly matched here
        self.projection_layers = nn.ModuleDict({
            'prev_tweet_embedding': nn.Linear(feature_sizes['tweet_embedding'], d_model),
            'prev_tweet_sentiment': nn.Linear(feature_sizes['tweet_sentiment'], d_model),
            'prev_reddit_sentiment': nn.Linear(feature_sizes['reddit_sentiment'], d_model),
            'prev_tweet_sector': nn.Linear(feature_sizes['tweet_sector'], d_model),
        })
        
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        self.output_linear = nn.Linear(d_model, num_outputs)

    def forward(self, features):
        projected_features = []
        for key, feature in features.items():
            # print(f"{key} shape before projection: {feature.shape}")
            feature = self.projection_layers[key](feature)
            if feature.dim() == 2:
                feature = feature.unsqueeze(1)  # Ensure it has a sequence dimension
            projected_features.append(feature)
        
        print(len(projected_features))
        # Concatenation and positional encoding remain the same
        src = torch.cat(projected_features, dim=1)
        print(src.shape)
        src = self.positional_encoder(src)
        
        # Transformer encoding and output processing remain unchanged
        output = self.transformer_encoder(src)
        output = output.mean(dim=0)
        output = self.output_linear(output)
        
        return output

In [12]:
model = TransformerModel(
    feature_sizes={
        'tweet_embedding': preprocessing_model.feature_size['twitter_embedding'],  # Size of the tweet embeddings
        'tweet_sentiment': preprocessing_model.feature_size['twitter_sentiment'],  # Size of the tweet sentiment vector
        'reddit_sentiment': preprocessing_model.feature_size['reddit_sentiment'],  # Size of the Reddit sentiment vector
        'tweet_sector': preprocessing_model.feature_size['twitter_sector'],  # Size of the tweet sector vector
    },
    d_model=512,  # Size of each projection layer
    nhead=8,  # Number of attention heads in the transformer encoder
    num_encoder_layers=3,  # Number of layers in the transformer encoder
    dim_feedforward=2048,  # Size of the feedforward network model in transformer encoder
    num_outputs=3  # Number of output values (e.g., predicting engagement metrics)
)



In [13]:
for name, param in model.named_parameters():
    print(f"{name}: {param.size()}")

projection_layers.prev_tweet_embedding.weight: torch.Size([512, 1024])
projection_layers.prev_tweet_embedding.bias: torch.Size([512])
projection_layers.prev_tweet_sentiment.weight: torch.Size([512, 3])
projection_layers.prev_tweet_sentiment.bias: torch.Size([512])
projection_layers.prev_reddit_sentiment.weight: torch.Size([512, 6])
projection_layers.prev_reddit_sentiment.bias: torch.Size([512])
projection_layers.prev_tweet_sector.weight: torch.Size([512, 19])
projection_layers.prev_tweet_sector.bias: torch.Size([512])
transformer_encoder.layers.0.self_attn.in_proj_weight: torch.Size([1536, 512])
transformer_encoder.layers.0.self_attn.in_proj_bias: torch.Size([1536])
transformer_encoder.layers.0.self_attn.out_proj.weight: torch.Size([512, 512])
transformer_encoder.layers.0.self_attn.out_proj.bias: torch.Size([512])
transformer_encoder.layers.0.linear1.weight: torch.Size([2048, 512])
transformer_encoder.layers.0.linear1.bias: torch.Size([2048])
transformer_encoder.layers.0.linear2.weight

In [14]:
criterion = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=0.001)
num_epochs = 25

In [15]:
for epoch in range(num_epochs):
    # Training phase
    model.train()  
    train_loss = 0.0
    for batch in dataloader:
        optimizer.zero_grad()
        print(batch)
        features = {
            'prev_tweet_embedding': batch['prev_tweet_embedding'],
            'prev_tweet_sentiment': batch['prev_tweet_sentiment'],
            'prev_reddit_sentiment': batch['prev_reddit_sentiment'],
            'prev_tweet_sector': batch['prev_tweet_sector'],
        }
        targets = torch.stack((batch['likes'], batch['retweets'], batch['comments']), dim=1)
        outputs = model(features)
        loss = criterion(outputs, targets)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = train_loss / len(dataloader)
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}')

{'prev_tweet_embedding': tensor([[[ 0.1027,  0.1121, -0.1494,  ..., -0.7572,  0.4243, -0.0119]]]), 'prev_tweet_sentiment': tensor([[-0.8701,  0.6669, -0.1567]]), 'prev_reddit_sentiment': tensor([[ 1.6429, -0.6358, -1.6813,  4.5306, -1.9786, -3.1855]]), 'prev_tweet_sector': tensor([[-4.9538, -2.0978, -4.0992, -3.5227, -4.9808, -5.4314, -3.7159, -3.9895,
         -4.8203, -4.2711, -3.6174, -4.7744, -1.1570, -3.2301, -4.5657,  3.7930,
         -4.5094, -4.4113, -4.7642]]), 'likes': tensor([100.]), 'retweets': tensor([50.]), 'comments': tensor([25.])}
4
torch.Size([1, 4, 512])
Epoch 1/25, Train Loss: 4426.8125


  return F.mse_loss(input, target, reduction=self.reduction)


{'prev_tweet_embedding': tensor([[[ 0.1027,  0.1121, -0.1494,  ..., -0.7572,  0.4243, -0.0119]]]), 'prev_tweet_sentiment': tensor([[-0.8701,  0.6669, -0.1567]]), 'prev_reddit_sentiment': tensor([[ 1.6429, -0.6358, -1.6813,  4.5306, -1.9786, -3.1855]]), 'prev_tweet_sector': tensor([[-4.9538, -2.0978, -4.0992, -3.5227, -4.9808, -5.4314, -3.7159, -3.9895,
         -4.8203, -4.2711, -3.6174, -4.7744, -1.1570, -3.2301, -4.5657,  3.7930,
         -4.5094, -4.4113, -4.7642]]), 'likes': tensor([100.]), 'retweets': tensor([50.]), 'comments': tensor([25.])}
4
torch.Size([1, 4, 512])
Epoch 2/25, Train Loss: 3638.9629
{'prev_tweet_embedding': tensor([[[ 0.1027,  0.1121, -0.1494,  ..., -0.7572,  0.4243, -0.0119]]]), 'prev_tweet_sentiment': tensor([[-0.8701,  0.6669, -0.1567]]), 'prev_reddit_sentiment': tensor([[ 1.6429, -0.6358, -1.6813,  4.5306, -1.9786, -3.1855]]), 'prev_tweet_sector': tensor([[-4.9538, -2.0978, -4.0992, -3.5227, -4.9808, -5.4314, -3.7159, -3.9895,
         -4.8203, -4.2711, -3.6

In [17]:
# Prediction phase
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient calculation
    total_predictions = []
    total_targets = []
    for batch in validation_dataloader:
        features = {
            'prev_tweet_embedding': batch['prev_tweet_embedding'],
            'prev_tweet_sentiment': batch['prev_tweet_sentiment'],
            'prev_reddit_sentiment': batch['prev_reddit_sentiment'],
            'prev_tweet_sector': batch['prev_tweet_sector'],
        }
        targets = torch.stack((batch['likes'], batch['retweets'], batch['comments']), dim=1)
        predictions = model(features)
        total_predictions.append(predictions)
        total_targets.append(targets)
    
    # Conversion to numpy for analysis (optional)
    predictions_np = torch.cat(total_predictions, dim=0).cpu().numpy()
    targets_np = torch.cat(total_targets, dim=0).cpu().numpy()
    # Use predictions_np and targets_np for further analysis
