### requirements

In [1]:
!python -m pip install -U sentence-transformers



In [2]:
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class PreDUNES(nn.Module):
    def __init__(
            self,
            twitter_embedding_model: nn.Module,
            twitter_sentiment_tokenizer,
            twitter_sentiment_model: nn.Module,
            reddit_sentiment_tokenizer,
            reddit_sentiment_model: nn.Module,
            twitter_sector_tokenizer,
            twitter_sector_model: nn.Module
        ):
        '''
        Initialize a DUNES model from a set of embeddings and sentiment models.
        Args:
            twitter_embedding_model: huggingface model for Twitter embeddings
            twitter_sentiment_model: huggingface model for Twitter sentiment
            reddit_sentiment_model: huggingface model for Reddit sentiment
            twitter_sector_model: huggingface model for Twitter sector
        '''
        super(PreDUNES, self).__init__()
        self.twitter_embedding_model = twitter_embedding_model
        self.twitter_sentiment_tokenizer = twitter_sentiment_tokenizer
        self.twitter_sentiment_model = twitter_sentiment_model
        self.reddit_sentiment_tokenizer = reddit_sentiment_tokenizer
        self.reddit_sentiment_model = reddit_sentiment_model
        self.twitter_sector_tokenizer = twitter_sector_tokenizer
        self.twitter_sector_model = twitter_sector_model

    def forward(self, prev_tweet, curr_tweet, prev_reddit):
        '''
        Forward pass for the DUNES model.
        Args:
            prev_tweet: previous tweet
            curr_tweet: current tweet
            prev_reddit: previous Reddit post
            curr_reddit: current Reddit post
        Returns:
            sentiment: sentiment of the current tweet
            sector: sector of the current tweet
        '''
        # Get the embeddings
        prev_tweet_embedding = self.twitter_embedding_model.encode([prev_tweet])
        curr_tweet_embedding = self.twitter_embedding_model.encode([curr_tweet])

        # Get the sentiment
        prev_tweet_tokens = self.twitter_sentiment_tokenizer(prev_tweet, return_tensors='pt')
        prev_tweet_sentiment = self.twitter_sentiment_model(**prev_tweet_tokens)[0][0].detach().numpy()
        curr_tweet_tokens = self.twitter_sentiment_tokenizer(curr_tweet, return_tensors='pt')
        curr_tweet_sentiment = self.twitter_sentiment_model(**curr_tweet_tokens)[0][0].detach().numpy()

        prev_reddit_tokens = self.reddit_sentiment_tokenizer(prev_reddit, return_tensors='pt')
        prev_reddit_sentiment = self.reddit_sentiment_model(**prev_reddit_tokens)[0][0].detach().numpy()

        # Get the sector
        prev_sector_tokens = self.twitter_sector_tokenizer(prev_tweet, return_tensors='pt')
        prev_tweet_sector = self.twitter_sector_model(**prev_sector_tokens)[0][0].detach().numpy()
        curr_sector_tokens = self.twitter_sector_tokenizer(curr_tweet, return_tensors='pt')
        curr_tweet_sector = self.twitter_sector_model(**curr_sector_tokens)[0][0].detach().numpy()


        return prev_tweet_embedding, curr_tweet_embedding, prev_tweet_sentiment, curr_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector, curr_tweet_sector


def create_preprocessing_model():
    twitter_embedding = "sentence-transformers/mixedbread-ai/mxbai-embed-large-v1"
    twitter_sentiment = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    reddit_sentiment = "SamLowe/roberta-base-go_emotions"
    twitter_sector = "cardiffnlp/tweet-topic-latest-multi"

    '''
    Initialize a DUNES model from a set of embeddings and sentiment models.
    Feeds the output of these models to a transformer for classification.
    Args:
        twitter_embedding: path to the Twitter embedding huggingface model
        twitter_sentiment: path to the Twitter sentiment huggingface model
        reddit_sentiment: path to the Reddit sentiment huggingface model
        twitter_sector: path to the Twitter sector huggingface model
    '''

    # Load the models
    twitter_embedding_model = SentenceTransformer(twitter_embedding)
    twitter_sentiment_tokenizer = AutoTokenizer.from_pretrained(twitter_sentiment)
    twitter_sentiment_model = AutoModelForSequenceClassification.from_pretrained(twitter_sentiment)
    reddit_sentiment_tokenizer = AutoTokenizer.from_pretrained(reddit_sentiment)
    reddit_sentiment_model = AutoModelForSequenceClassification.from_pretrained(reddit_sentiment)
    twitter_sector_tokenizer = AutoTokenizer.from_pretrained(twitter_sector)
    twitter_sector_model = AutoModelForSequenceClassification.from_pretrained(twitter_sector)

    # Freeze the models
    twitter_sentiment_model.eval()
    twitter_sentiment_model.requires_grad_(False)
    reddit_sentiment_model.eval()
    reddit_sentiment_model.requires_grad_(False)
    twitter_sector_model.eval()
    twitter_sector_model.requires_grad_(False)

    # Create the DUNES model
    model = PreDUNES(
        twitter_embedding_model,
        twitter_sentiment_tokenizer,
        twitter_sentiment_model,
        reddit_sentiment_tokenizer,
        reddit_sentiment_model,
        twitter_sector_tokenizer,
        twitter_sector_model
        )

    return model

In [11]:
# Example data list
data = [
    {
        'prev_tweet': "@WholeMarsBlog Headline is misleading. Starlink can obviously offer far more robust positioning than GPS, as it will have ~1000X more satellites over time. Not all will have line of sight to users, but still >10X GPS & far stronger signal. Just not today’s problem.",
        'curr_tweet': "@spideycyp_155 @BillyM2k If Russia faced calamitous defeat in conventional warfare for something as strategically critical as Crimea, the probability of using nuclear weapons is high",
        'prev_reddit': "We know who controls the media. The same corporations who have wreaked havoc on the globe for decades, if not centuries, the big banks who financed them, and the governments who turned a blind eye to the destruction. The same entities who have brought us to the precipice of destruction - quite possibly condemning us, and our progeny to an unlivable climate They have tried to stop you at every turn, and yet you persist for the good of humanity. We love you, Elon! Keep up the good work! As you have said, we must never let the light of human consciousness fade - never!",
        'likes': 100,  
        'retweets': 50, 
        'comments': 25  
    }
]

preprocessing_model = create_preprocessing_model()

ValueError: Path sentence-transformers/mixedbread-ai/mxbai-embed-large-v1 not found

In [8]:
class dataloaderDUNES(Dataset):
    def __init__(self, data, preprocessing_model):
        """
        Args:
            data (List[Dict]): Each dictionary contains raw text for 'prev_tweet', 'curr_tweet',
                               'prev_reddit', and engagement metrics ('likes', 'retweets', 'comments').
            preprocessing_model (PreDUNES): The model instance for preprocessing text data.
        """
        self.data = data
        self.preprocessing_model = preprocessing_model
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        # Process the raw texts through the PreDUNES model
        prev_tweet_embedding, curr_tweet_embedding, prev_tweet_sentiment, curr_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector, curr_tweet_sector = self.preprocessing_model(
            item['prev_tweet'], item['curr_tweet'], item['prev_reddit']
        )

        # Convert numpy arrays to tensors if they are not already
        def to_tensor(array):
            if isinstance(array, np.ndarray):
                return torch.tensor(array).float()
            return array
        
        prev_tweet_embedding = to_tensor(prev_tweet_embedding)
        curr_tweet_embedding = to_tensor(curr_tweet_embedding)
        prev_tweet_sentiment = to_tensor(prev_tweet_sentiment)
        curr_tweet_sentiment = to_tensor(curr_tweet_sentiment)
        prev_reddit_sentiment = to_tensor(prev_reddit_sentiment)
        prev_tweet_sector = to_tensor(prev_tweet_sector)
        curr_tweet_sector = to_tensor(curr_tweet_sector)
        
        # Engagement metrics
        likes = torch.tensor(item['likes'], dtype=torch.float)
        retweets = torch.tensor(item['retweets'], dtype=torch.float)
        comments = torch.tensor(item['comments'], dtype=torch.float)
        
        return {
            'prev_tweet_embedding': prev_tweet_embedding,
            'curr_tweet_embedding': curr_tweet_embedding,
            'prev_tweet_sentiment': prev_tweet_sentiment,
            'curr_tweet_sentiment': curr_tweet_sentiment,
            'prev_reddit_sentiment': prev_reddit_sentiment,
            'prev_tweet_sector': prev_tweet_sector,
            'curr_tweet_sector': curr_tweet_sector,
            'likes': likes,
            'retweets': retweets,
            'comments': comments
        }

In [10]:
dataset = DataLoaderDUNES(data, preprocessing_model)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0)

ValueError: Path sentence-transformers/mixedbread-ai/mxbai-embed-large-v1 not found

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [None]:
class TransformerModel(nn.Module):
    def __init__(self, feature_sizes, d_model, nhead, num_encoder_layers, dim_feedforward, num_outputs):
        super(TransformerModel, self).__init__()
        self.d_model = d_model
        self.positional_encoder = PositionalEncoding(d_model)
        
        self.projection_layers = nn.ModuleDict({
            'prev_tweet_embedding': nn.Linear(feature_sizes['tweet_embedding'], d_model),
            'curr_tweet_embedding': nn.Linear(feature_sizes['tweet_embedding'], d_model),
            'prev_tweet_sentiment': nn.Linear(feature_sizes['tweet_sentiment'], d_model),
            'curr_tweet_sentiment': nn.Linear(feature_sizes['tweet_sentiment'], d_model),
            'prev_reddit_sentiment': nn.Linear(feature_sizes['reddit_sentiment'], d_model),
            'prev_tweet_sector': nn.Linear(feature_sizes['tweet_sector'], d_model),
            'curr_tweet_sector': nn.Linear(feature_sizes['tweet_sector'], d_model),
        })
        
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        self.output_linear = nn.Linear(d_model, num_outputs)

    def forward(self, features):
        projected_features = []
        for key, feature in features.items():
            # Project and reshape each feature
            feature = self.projection_layers[key](feature)
            if feature.dim() == 2:
                feature = feature.unsqueeze(1)  # Add sequence dimension if missing
            projected_features.append(feature)
        
        # Concatenate all features along the sequence dimension
        src = torch.cat(projected_features, dim=1)
        
        # Apply positional encoding
        src = self.positional_encoder(src)
        
        # Transformer encoder
        output = self.transformer_encoder(src)
        
        # Aggregate and predict
        output = output.mean(dim=0)
        output = self.output_linear(output)
        
        return output

In [None]:
# Instantiate the model
model = TransformerModel(
    feature_sizes={
        'tweet_embedding': 1024,
        'tweet_sentiment': 3,
        'reddit_sentiment': 28,
        'tweet_sector': 19,
    },
    d_model=512, 
    nhead=8, 
    num_encoder_layers=3, 
    dim_feedforward=2048, 
    num_outputs=3
)

In [None]:
# Print the model's parameters
for name, param in model.named_parameters():
    print(f"{name}: {param.size()}")

In [None]:
# Define the loss function
criterion = nn.MSELoss()

# Define the optimizer
optimizer = Adam(model.parameters(), lr=0.001)

# Your training loop follows here
num_epochs = 25

In [None]:
for epoch in range(num_epochs):
    # Training phase
    model.train()  
    train_loss = 0.0
    for batch in dataloader:
        optimizer.zero_grad() 
        features = {
            'prev_tweet_embedding': batch['prev_tweet_embedding'],
            'curr_tweet_embedding': batch['curr_tweet_embedding'],
            'prev_tweet_sentiment': batch['prev_tweet_sentiment'],
            'curr_tweet_sentiment': batch['curr_tweet_sentiment'],
            'prev_reddit_sentiment': batch['prev_reddit_sentiment'],
            'prev_tweet_sector': batch['prev_tweet_sector'],
            'curr_tweet_sector': batch['curr_tweet_sector'],
        }
        targets = torch.stack((batch['likes'], batch['retweets'], batch['comments']), dim=1)
        outputs = model(features)
        loss = criterion(outputs, targets)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = train_loss / len(dataloader)
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}')

    # Prediction phase
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient calculation
        total_predictions = []
        total_targets = []
        for batch in validation_dataloader:
            features = {
                'prev_tweet_embedding': batch['prev_tweet_embedding'],
                'curr_tweet_embedding': batch['curr_tweet_embedding'],
                'prev_tweet_sentiment': batch['prev_tweet_sentiment'],
                'curr_tweet_sentiment': batch['curr_tweet_sentiment'],
                'prev_reddit_sentiment': batch['prev_reddit_sentiment'],
                'prev_tweet_sector': batch['prev_tweet_sector'],
                'curr_tweet_sector': batch['curr_tweet_sector'],
            }
            targets = torch.stack((batch['likes'], batch['retweets'], batch['comments']), dim=1)
            predictions = model(features)
            total_predictions.append(predictions)
            total_targets.append(targets)
        
        # Optionally, convert predictions and targets to a convenient format for analysis, e.g., numpy arrays
        predictions_np = torch.cat(total_predictions, dim=0).cpu().numpy()
        targets_np = torch.cat(total_targets, dim=0).cpu().numpy()
        # Now, predictions_np and targets_np can be used for further analysis, e.g., calculating metrics