In [1]:
!python -m pip install -U sentence-transformers



In [2]:
import math
import numpy as np
from typing import Dict
from scipy.special import expit, softmax


import torch
import torch.nn as nn
from torch.optim import Adam
import torch.nn.functional as F
from torch.nn.functional import softmax
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class PreDUNES(nn.Module):
    def __init__(self, twitter_embedding_model, twitter_sentiment_tokenizer, twitter_sentiment_model, reddit_sentiment_tokenizer, reddit_sentiment_model, twitter_sector_tokenizer, twitter_sector_model):
        super(PreDUNES, self).__init__()
        self.twitter_embedding_model = twitter_embedding_model
        self.twitter_sentiment_tokenizer = twitter_sentiment_tokenizer
        self.twitter_sentiment_model = twitter_sentiment_model
        self.reddit_sentiment_tokenizer = reddit_sentiment_tokenizer
        self.reddit_sentiment_model = reddit_sentiment_model
        self.twitter_sector_tokenizer = twitter_sector_tokenizer
        self.twitter_sector_model = twitter_sector_model

    def forward(self, prev_tweet, prev_reddit):
        # Corrected to use convert_to_tensor=True
        prev_tweet_embedding = self.twitter_embedding_model.encode([prev_tweet], convert_to_tensor=True)
        prev_tweet_tokens = self.twitter_sentiment_tokenizer(prev_tweet, return_tensors='pt')
        prev_tweet_sentiment = self.twitter_sentiment_model(**prev_tweet_tokens).logits
        prev_reddit_tokens = self.reddit_sentiment_tokenizer(prev_reddit, return_tensors='pt')
        prev_reddit_sentiment = self.reddit_sentiment_model(**prev_reddit_tokens).logits
        prev_sector_tokens = self.twitter_sector_tokenizer(prev_tweet, return_tensors='pt')
        prev_tweet_sector = self.twitter_sector_model(**prev_sector_tokens).logits

        return prev_tweet_embedding, prev_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector

def create_preprocessing_model(twitter_embedding, twitter_sentiment, reddit_sentiment, twitter_sector):
    twitter_embedding_model = SentenceTransformer(twitter_embedding)
    twitter_sentiment_tokenizer = AutoTokenizer.from_pretrained(twitter_sentiment)
    twitter_sentiment_model = AutoModelForSequenceClassification.from_pretrained(twitter_sentiment)
    reddit_sentiment_tokenizer = AutoTokenizer.from_pretrained(reddit_sentiment)
    reddit_sentiment_model = AutoModelForSequenceClassification.from_pretrained(reddit_sentiment)
    twitter_sector_tokenizer = AutoTokenizer.from_pretrained(twitter_sector)
    twitter_sector_model = AutoModelForSequenceClassification.from_pretrained(twitter_sector)

    # Set models to evaluation mode
    twitter_sentiment_model.eval()
    reddit_sentiment_model.eval()
    twitter_sector_model.eval()

    # Disable gradients for models
    for model in [twitter_sentiment_model, reddit_sentiment_model, twitter_sector_model]:
        for param in model.parameters():
            param.requires_grad = False

    model = PreDUNES(
        twitter_embedding_model,
        twitter_sentiment_tokenizer,
        twitter_sentiment_model,
        reddit_sentiment_tokenizer,
        reddit_sentiment_model,
        twitter_sector_tokenizer,
        twitter_sector_model
    )

    return model

In [16]:
preprocessing_model = create_preprocessing_model(
    "mixedbread-ai/mxbai-embed-large-v1",
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
    "bhadresh-savani/distilbert-base-uncased-emotion",
    "cardiffnlp/tweet-topic-latest-multi"
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
class DataLoaderDUNES(Dataset):
    def __init__(self, data, preprocessing_model, seq_len=5, stride=1):
        self.data = data
        self.preprocessing_model = preprocessing_model
        self.seq_len = seq_len
        self.stride = stride
        self.vector_size = self.calculate_vector_size() + 4 #could be 2, not sure

    def calculate_vector_size(self):
        sample = self.data[0]
        outputs = self.preprocessing_model(sample['prev_tweet'], sample['prev_reddit'])
        total_size = sum(output.size(-1) for output in outputs) 
        return total_size
    
    def __len__(self):
        return max(0, ((len(self.data) - self.seq_len) // self.stride) + 1)
    
    def __getitem__(self, idx):
        start_idx = idx * self.stride
        end_idx = start_idx + self.seq_len
        seq_features = torch.zeros((self.seq_len, self.vector_size))

        for i in range(self.seq_len):
            data_idx = start_idx + i
            if data_idx < len(self.data):
                item = self.data[data_idx]
                prev_tweet_embedding, prev_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector = self.preprocessing_model(
                    item['prev_tweet'], item['prev_reddit']
                )
                additional_data = torch.tensor([item['likes'], item['retweets'], item['comments'], item['reddit_score'], item['rcomments']], dtype=torch.float)

                all_features = torch.cat([
                    prev_tweet_embedding.flatten(), 
                    prev_tweet_sentiment.flatten(),
                    prev_reddit_sentiment.flatten(),
                    prev_tweet_sector.flatten(),
                    additional_data
                ])

                seq_features[i, :all_features.size(0)] = all_features

        return seq_features.unsqueeze(0)

In [25]:
data = [
    {
        'prev_tweet': "@WholeMarsBlog Headline is misleading. Starlink can obviously offer far more robust positioning than GPS, as it will have ~1000X more satellites over time. Not all will have line of sight to users, but still >10X GPS & far stronger signal. Just not today’s problem.",
        'curr_tweet': "@spideycyp_155 @BillyM2k If Russia faced calamitous defeat in conventional warfare for something as strategically critical as Crimea, the probability of using nuclear weapons is high",
        'prev_reddit': "We know who controls the media. The same corporations who have wreaked havoc on the globe for decades, if not centuries, the big banks who financed them, and the governments who turned a blind eye to the destruction. The same entities who have brought us to the precipice of destruction - quite possibly condemning us, and our progeny to an unlivable climate They have tried to stop you at every turn, and yet you persist for the good of humanity. We love you, Elon! Keep up the good work! As you have said, we must never let the light of human consciousness fade - never!",
        'likes': 100,  
        'retweets': 50, 
        'comments': 25,
        'reddit_score': 0.8,
        'rcomments': 45          
    }
]

In [26]:
dataset = DataLoaderDUNES(data, preprocessing_model, seq_len=1, stride=1)  #for immediate testing of first data point
print("Dataset length:", len(dataset))
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

Dataset length: 1


In [27]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [28]:
class TransformerModel(nn.Module):
    def __init__(self, feature_sizes, d_model, nhead, num_encoder_layers, dim_feedforward, num_outputs):
        super(TransformerModel, self).__init__()
        self.positional_encoder = PositionalEncoding(d_model)
        
        # Ensure that the input sizes are correctly matched here
        self.projection_layers = nn.ModuleDict({
            'prev_tweet_embedding': nn.Linear(feature_sizes['tweet_embedding'], d_model),
            'prev_tweet_sentiment': nn.Linear(feature_sizes['tweet_sentiment'], d_model),
            'prev_reddit_sentiment': nn.Linear(feature_sizes['reddit_sentiment'], d_model),
            'prev_tweet_sector': nn.Linear(feature_sizes['tweet_sector'], d_model),
        })
        
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        self.output_linear = nn.Linear(d_model, num_outputs)

    def forward(self, features):
        projected_features = []
        for key, feature in features.items():
            # print(f"{key} shape before projection: {feature.shape}")
            feature = self.projection_layers[key](feature)
            if feature.dim() == 2:
                feature = feature.unsqueeze(1)  # Ensure it has a sequence dimension
            projected_features.append(feature)
        
        # Concatenation and positional encoding remain the same
        src = torch.cat(projected_features, dim=1)
        src = self.positional_encoder(src)
        
        # Transformer encoding and output processing remain unchanged
        output = self.transformer_encoder(src)
        output = output.mean(dim=0)
        output = self.output_linear(output)
        
        return output

In [29]:
model = TransformerModel(
    feature_sizes={
        'tweet_embedding': 1024,  # Size of the tweet embeddings
        'tweet_sentiment': 3,  # Size of the tweet sentiment vector
        'reddit_sentiment': 6,  # Size of the Reddit sentiment vector
        'tweet_sector': 19,  # Size of the tweet sector vector
    },
    d_model=512,  # Size of each projection layer
    nhead=8,  # Number of attention heads in the transformer encoder
    num_encoder_layers=3,  # Number of layers in the transformer encoder
    dim_feedforward=2048,  # Size of the feedforward network model in transformer encoder
    num_outputs=3  # Number of output values (e.g., predicting engagement metrics)
)



In [30]:
for name, param in model.named_parameters():
    print(f"{name}: {param.size()}")

projection_layers.prev_tweet_embedding.weight: torch.Size([512, 1024])
projection_layers.prev_tweet_embedding.bias: torch.Size([512])
projection_layers.prev_tweet_sentiment.weight: torch.Size([512, 3])
projection_layers.prev_tweet_sentiment.bias: torch.Size([512])
projection_layers.prev_reddit_sentiment.weight: torch.Size([512, 6])
projection_layers.prev_reddit_sentiment.bias: torch.Size([512])
projection_layers.prev_tweet_sector.weight: torch.Size([512, 19])
projection_layers.prev_tweet_sector.bias: torch.Size([512])
transformer_encoder.layers.0.self_attn.in_proj_weight: torch.Size([1536, 512])
transformer_encoder.layers.0.self_attn.in_proj_bias: torch.Size([1536])
transformer_encoder.layers.0.self_attn.out_proj.weight: torch.Size([512, 512])
transformer_encoder.layers.0.self_attn.out_proj.bias: torch.Size([512])
transformer_encoder.layers.0.linear1.weight: torch.Size([2048, 512])
transformer_encoder.layers.0.linear1.bias: torch.Size([2048])
transformer_encoder.layers.0.linear2.weight

In [31]:
criterion = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=0.001)
num_epochs = 25

In [32]:
for epoch in range(num_epochs):
    # Training phase
    model.train()  
    train_loss = 0.0
    for batch in dataloader:
        print(batch.shape) # Expected shape: (batch_size, seq_len, vector_size)
        break
        optimizer.zero_grad()
        features = {
            'prev_tweet_embedding': batch['prev_tweet_embedding'],
            'prev_tweet_sentiment': batch['prev_tweet_sentiment'],
            'prev_reddit_sentiment': batch['prev_reddit_sentiment'],
            'prev_tweet_sector': batch['prev_tweet_sector'],
        }
        targets = torch.stack((batch['likes'], batch['retweets'], batch['comments']), dim=1)
        outputs = model(features)
        loss = criterion(outputs, targets)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = train_loss / len(dataloader)
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}')

RuntimeError: The expanded size of the tensor (1056) must match the existing size (1057) at non-singleton dimension 0.  Target sizes: [1056].  Tensor sizes: [1057]

In [None]:
# Prediction phase
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient calculation
    total_predictions = []
    total_targets = []
    for batch in validation_dataloader:
        features = {
            'prev_tweet_embedding': batch['prev_tweet_embedding'],
            'prev_tweet_sentiment': batch['prev_tweet_sentiment'],
            'prev_reddit_sentiment': batch['prev_reddit_sentiment'],
            'prev_tweet_sector': batch['prev_tweet_sector'],
        }
        targets = torch.stack((batch['likes'], batch['retweets'], batch['comments']), dim=1)
        predictions = model(features)
        total_predictions.append(predictions)
        total_targets.append(targets)
    
    # Conversion to numpy for analysis (optional)
    predictions_np = torch.cat(total_predictions, dim=0).cpu().numpy()
    targets_np = torch.cat(total_targets, dim=0).cpu().numpy()
    # Use predictions_np and targets_np for further analysis
