## preprocessing

### download transformers & librariees

In [3]:
!python -m pip install -U sentence-transformers



In [4]:
import numpy as np
from typing import Dict
from scipy.special import expit, softmax

import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


### preprocessing experimentation
The PreDUNES class serves as a PyTorch module for preprocessing textual data from Twitter and Reddit. Upon initialization, it receives various components including models and tokenizers for generating embeddings, predicting sentiment, and classifying sector information from Twitter and Reddit text. Its forward method executes the preprocessing steps, involving the extraction of embeddings, sentiment analysis, and sector classification for both the previous and current tweets, along with sentiment analysis for the previous Reddit post. The create_preprocessing_model function facilitates the initialization of the preprocessing module by loading the required models and setting them to evaluation mode, ensuring their parameters are frozen for training. This modular approach streamlines the setup of the preprocessing pipeline for downstream tasks such as sentiment analysis and sector classification on textual data.

In [5]:
class PreDUNES(nn.Module):
    def __init__(
            self,
            twitter_embedding_model: nn.Module,
            twitter_sentiment_tokenizer,
            twitter_sentiment_model: nn.Module,
            reddit_sentiment_tokenizer,
            reddit_sentiment_model: nn.Module,
            twitter_sector_tokenizer,
            twitter_sector_model: nn.Module
        ):
        '''
        Initialize a DUNES model from a set of embeddings and sentiment models.
        Args:
            twitter_embedding_model: huggingface model for Twitter embeddings
            twitter_sentiment_model: huggingface model for Twitter sentiment
            reddit_sentiment_model: huggingface model for Reddit sentiment
            twitter_sector_model: huggingface model for Twitter sector
        '''
        super(PreDUNES, self).__init__()
        self.twitter_embedding_model = twitter_embedding_model
        self.twitter_sentiment_tokenizer = twitter_sentiment_tokenizer
        self.twitter_sentiment_model = twitter_sentiment_model
        self.reddit_sentiment_tokenizer = reddit_sentiment_tokenizer
        self.reddit_sentiment_model = reddit_sentiment_model
        self.twitter_sector_tokenizer = twitter_sector_tokenizer
        self.twitter_sector_model = twitter_sector_model

    def forward(self, prev_tweet, curr_tweet, prev_reddit):
        '''
        Forward pass for the DUNES model.
        Args:
            prev_tweet: previous tweet
            curr_tweet: current tweet
            prev_reddit: previous Reddit post
            curr_reddit: current Reddit post
        Returns:
            sentiment: sentiment of the current tweet
            sector: sector of the current tweet
        '''
        # Get the embeddings
        prev_tweet_embedding = self.twitter_embedding_model.encode([prev_tweet])
        curr_tweet_embedding = self.twitter_embedding_model.encode([curr_tweet])

        # Get the sentiment
        prev_tweet_tokens = self.twitter_sentiment_tokenizer(prev_tweet, return_tensors='pt')
        prev_tweet_sentiment = self.twitter_sentiment_model(**prev_tweet_tokens)[0][0].detach().numpy()
        curr_tweet_tokens = self.twitter_sentiment_tokenizer(curr_tweet, return_tensors='pt')
        curr_tweet_sentiment = self.twitter_sentiment_model(**curr_tweet_tokens)[0][0].detach().numpy()

        prev_reddit_tokens = self.reddit_sentiment_tokenizer(prev_reddit, return_tensors='pt')
        prev_reddit_sentiment = self.reddit_sentiment_model(**prev_reddit_tokens)[0][0].detach().numpy()

        # Get the sector
        prev_sector_tokens = self.twitter_sector_tokenizer(prev_tweet, return_tensors='pt')
        prev_tweet_sector = self.twitter_sector_model(**prev_sector_tokens)[0][0].detach().numpy()
        curr_sector_tokens = self.twitter_sector_tokenizer(curr_tweet, return_tensors='pt')
        curr_tweet_sector = self.twitter_sector_model(**curr_sector_tokens)[0][0].detach().numpy()


        return prev_tweet_embedding, curr_tweet_embedding, prev_tweet_sentiment, curr_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector, curr_tweet_sector


def create_preprocessing_model(
        twitter_embedding: str,
        twitter_sentiment: str,
        reddit_sentiment: str,
        twitter_sector: str
):
    '''
    Initialize a DUNES model from a set of embeddings and sentiment models.
    Feeds the output of these models to a transformer for classification.
    Args:
        twitter_embedding: path to the Twitter embedding huggingface model
        twitter_sentiment: path to the Twitter sentiment huggingface model
        reddit_sentiment: path to the Reddit sentiment huggingface model
        twitter_sector: path to the Twitter sector huggingface model
    '''

    # Load the models
    twitter_embedding_model = SentenceTransformer(twitter_embedding)
    twitter_sentiment_tokenizer = AutoTokenizer.from_pretrained(twitter_sentiment)
    twitter_sentiment_model = AutoModelForSequenceClassification.from_pretrained(twitter_sentiment)
    reddit_sentiment_tokenizer = AutoTokenizer.from_pretrained(reddit_sentiment)
    reddit_sentiment_model = AutoModelForSequenceClassification.from_pretrained(reddit_sentiment)
    twitter_sector_tokenizer = AutoTokenizer.from_pretrained(twitter_sector)
    twitter_sector_model = AutoModelForSequenceClassification.from_pretrained(twitter_sector)

    # Freeze the models
    twitter_sentiment_model.eval()
    twitter_sentiment_model.requires_grad_(False)
    reddit_sentiment_model.eval()
    reddit_sentiment_model.requires_grad_(False)
    twitter_sector_model.eval()
    twitter_sector_model.requires_grad_(False)

    # Create the DUNES model
    model = PreDUNES(
        twitter_embedding_model,
        twitter_sentiment_tokenizer,
        twitter_sentiment_model,
        reddit_sentiment_tokenizer,
        reddit_sentiment_model,
        twitter_sector_tokenizer,
        twitter_sector_model
        )

    return model

In [6]:
class PreDUNES(nn.Module):
    def __init__(
            self,
            twitter_embedding_model: nn.Module,
            twitter_sentiment_tokenizer,
            twitter_sentiment_model: nn.Module,
            reddit_sentiment_tokenizer,
            reddit_sentiment_model: nn.Module,
            twitter_sector_tokenizer,
            twitter_sector_model: nn.Module
        ):
        '''
        Initialize a DUNES model from a set of embeddings and sentiment models.
        Args:
            twitter_embedding_model: huggingface model for Twitter embeddings
            twitter_sentiment_model: huggingface model for Twitter sentiment
            reddit_sentiment_model: huggingface model for Reddit sentiment
            twitter_sector_model: huggingface model for Twitter sector
        '''
        super(PreDUNES, self).__init__()
        self.twitter_embedding_model = twitter_embedding_model
        self.twitter_sentiment_tokenizer = twitter_sentiment_tokenizer
        self.twitter_sentiment_model = twitter_sentiment_model
        self.reddit_sentiment_tokenizer = reddit_sentiment_tokenizer
        self.reddit_sentiment_model = reddit_sentiment_model
        self.twitter_sector_tokenizer = twitter_sector_tokenizer
        self.twitter_sector_model = twitter_sector_model

    def forward(self, prev_tweet, curr_tweet, prev_reddit):
        '''
        Forward pass for the DUNES model.
        Args:
            prev_tweet: previous tweet
            curr_tweet: current tweet
            prev_reddit: previous Reddit post
            curr_reddit: current Reddit post
        Returns:
            sentiment: sentiment of the current tweet
            sector: sector of the current tweet
        '''
        # Get the embeddings
        prev_tweet_embedding = self.twitter_embedding_model.encode([prev_tweet])
        curr_tweet_embedding = self.twitter_embedding_model.encode([curr_tweet])

        # Get the sentiment
        prev_tweet_tokens = self.twitter_sentiment_tokenizer(prev_tweet, return_tensors='pt')
        prev_tweet_sentiment = self.twitter_sentiment_model(**prev_tweet_tokens)[0][0].detach().numpy()
        curr_tweet_tokens = self.twitter_sentiment_tokenizer(curr_tweet, return_tensors='pt')
        curr_tweet_sentiment = self.twitter_sentiment_model(**curr_tweet_tokens)[0][0].detach().numpy()

        prev_reddit_tokens = self.reddit_sentiment_tokenizer(prev_reddit, return_tensors='pt')
        prev_reddit_sentiment = self.reddit_sentiment_model(**prev_reddit_tokens)[0][0].detach().numpy()

        # Get the sector
        prev_sector_tokens = self.twitter_sector_tokenizer(prev_tweet, return_tensors='pt')
        prev_tweet_sector = self.twitter_sector_model(**prev_sector_tokens)[0][0].detach().numpy()
        curr_sector_tokens = self.twitter_sector_tokenizer(curr_tweet, return_tensors='pt')
        curr_tweet_sector = self.twitter_sector_model(**curr_sector_tokens)[0][0].detach().numpy()


        return prev_tweet_embedding, curr_tweet_embedding, prev_tweet_sentiment, curr_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector, curr_tweet_sector


def create_preprocessing_model(
        twitter_embedding: str,
        twitter_sentiment: str,
        reddit_sentiment: str,
        twitter_sector: str
):
    '''
    Initialize a DUNES model from a set of embeddings and sentiment models.
    Feeds the output of these models to a transformer for classification.
    Args:
        twitter_embedding: path to the Twitter embedding huggingface model
        twitter_sentiment: path to the Twitter sentiment huggingface model
        reddit_sentiment: path to the Reddit sentiment huggingface model
        twitter_sector: path to the Twitter sector huggingface model
    '''

    # Load the models
    twitter_embedding_model = SentenceTransformer(twitter_embedding)
    twitter_sentiment_tokenizer = AutoTokenizer.from_pretrained(twitter_sentiment)
    twitter_sentiment_model = AutoModelForSequenceClassification.from_pretrained(twitter_sentiment)
    reddit_sentiment_tokenizer = AutoTokenizer.from_pretrained(reddit_sentiment)
    reddit_sentiment_model = AutoModelForSequenceClassification.from_pretrained(reddit_sentiment)
    twitter_sector_tokenizer = AutoTokenizer.from_pretrained(twitter_sector)
    twitter_sector_model = AutoModelForSequenceClassification.from_pretrained(twitter_sector)

    # Freeze the models
    twitter_sentiment_model.eval()
    twitter_sentiment_model.requires_grad_(False)
    reddit_sentiment_model.eval()
    reddit_sentiment_model.requires_grad_(False)
    twitter_sector_model.eval()
    twitter_sector_model.requires_grad_(False)

    # Create the DUNES model
    model = PreDUNES(
        twitter_embedding_model,
        twitter_sentiment_tokenizer,
        twitter_sentiment_model,
        reddit_sentiment_tokenizer,
        reddit_sentiment_model,
        twitter_sector_tokenizer,
        twitter_sector_model
        )

    return model

In [7]:
preprocessing_model = create_preprocessing_model(
    "mixedbread-ai/mxbai-embed-large-v1",
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
    "SamLowe/roberta-base-go_emotions",
    "cardiffnlp/tweet-topic-latest-multi"
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
prev_tweet_embedding, curr_tweet_embedding, prev_tweet_sentiment, curr_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector, curr_tweet_sector = preprocessing_model(
    "@WholeMarsBlog Headline is misleading. Starlink can obviously offer far more robust positioning than GPS, as it will have ~1000X more satellites over time. Not all will have line of sight to users, but still &gt;10X GPS &amp; far stronger signal. Just not today’s problem.",
    "@spideycyp_155 @BillyM2k If Russia faced calamitous defeat in conventional warfare for something as strategically critical as Crimea, the probability of using nuclear weapons is high",
    "We know who controls the media. The same corporations who have wreaked havoc on the globe for decades, if not centuries, the big banks who financed them, and the governments who turned a blind eye to the destruction. The same entities who have brought us to the precipice of destruction - quite possibly condemning us, and our progeny to an unlivable climate They have tried to stop you at every turn, and yet you persist for the good of humanity. We love you, Elon! Keep up the good work! As you have said, we must never let the light of human consciousness fade - never!"
    )

In [9]:
predictions = softmax(prev_reddit_sentiment)

In [10]:
class_mapping = AutoModel.from_pretrained("SamLowe/roberta-base-go_emotions").config.id2label

ranking = np.argsort(predictions)
ranking = ranking[::-1]
for i in range(predictions.shape[0]):
    l = class_mapping[ranking[i]]
    s = predictions[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

Some weights of RobertaModel were not initialized from the model checkpoint at SamLowe/roberta-base-go_emotions and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1) admiration 0.5544
2) love 0.2888
3) approval 0.0619
4) caring 0.023
5) neutral 0.0127
6) optimism 0.0125
7) disapproval 0.0091
8) annoyance 0.0063
9) anger 0.0055
10) disappointment 0.004
11) gratitude 0.0036
12) desire 0.0036
13) realization 0.0026
14) pride 0.0022
15) sadness 0.0021
16) joy 0.0013
17) excitement 0.0011
18) disgust 0.001
19) fear 0.0007
20) relief 0.0006
21) confusion 0.0006
22) grief 0.0005
23) remorse 0.0005
24) curiosity 0.0004
25) surprise 0.0003
26) nervousness 0.0003
27) embarrassment 0.0002
28) amusement 0.0001


### real pre-processing

In [11]:
class PreDUNES(nn.Module):
    def __init__(
            self,
            twitter_embedding_model: nn.Module,
            twitter_sentiment_tokenizer,
            twitter_sentiment_model: nn.Module,
            reddit_sentiment_tokenizer,
            reddit_sentiment_model: nn.Module,
            twitter_sector_tokenizer,
            twitter_sector_model: nn.Module
        ):
        '''
        Initialize a DUNES model from a set of embeddings and sentiment models.
        Args:
            twitter_embedding_model: huggingface model for Twitter embeddings
            twitter_sentiment_model: huggingface model for Twitter sentiment
            reddit_sentiment_model: huggingface model for Reddit sentiment
            twitter_sector_model: huggingface model for Twitter sector
        '''
        super(PreDUNES, self).__init__()
        self.twitter_embedding_model = twitter_embedding_model
        self.twitter_sentiment_tokenizer = twitter_sentiment_tokenizer
        self.twitter_sentiment_model = twitter_sentiment_model
        self.reddit_sentiment_tokenizer = reddit_sentiment_tokenizer
        self.reddit_sentiment_model = reddit_sentiment_model
        self.twitter_sector_tokenizer = twitter_sector_tokenizer
        self.twitter_sector_model = twitter_sector_model

    def forward(self, prev_tweet, curr_tweet, prev_reddit):
        '''
        Forward pass for the DUNES model.
        Args:
            prev_tweet: previous tweet
            curr_tweet: current tweet
            prev_reddit: previous Reddit post
            curr_reddit: current Reddit post
        Returns:
            sentiment: sentiment of the current tweet
            sector: sector of the current tweet
        '''
        # Get the embeddings
        prev_tweet_embedding = self.twitter_embedding_model.encode([prev_tweet])
        curr_tweet_embedding = self.twitter_embedding_model.encode([curr_tweet])

        # Get the sentiment
        prev_tweet_tokens = self.twitter_sentiment_tokenizer(prev_tweet, return_tensors='pt')
        prev_tweet_sentiment = self.twitter_sentiment_model(**prev_tweet_tokens)[0][0].detach().numpy()
        curr_tweet_tokens = self.twitter_sentiment_tokenizer(curr_tweet, return_tensors='pt')
        curr_tweet_sentiment = self.twitter_sentiment_model(**curr_tweet_tokens)[0][0].detach().numpy()

        prev_reddit_tokens = self.reddit_sentiment_tokenizer(prev_reddit, return_tensors='pt')
        prev_reddit_sentiment = self.reddit_sentiment_model(**prev_reddit_tokens)[0][0].detach().numpy()

        # Get the sector
        prev_sector_tokens = self.twitter_sector_tokenizer(prev_tweet, return_tensors='pt')
        prev_tweet_sector = self.twitter_sector_model(**prev_sector_tokens)[0][0].detach().numpy()
        curr_sector_tokens = self.twitter_sector_tokenizer(curr_tweet, return_tensors='pt')
        curr_tweet_sector = self.twitter_sector_model(**curr_sector_tokens)[0][0].detach().numpy()


        return prev_tweet_embedding, curr_tweet_embedding, prev_tweet_sentiment, curr_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector, curr_tweet_sector


def create_preprocessing_model(
        twitter_embedding: str,
        twitter_sentiment: str,
        reddit_sentiment: str,
        twitter_sector: str
):
    '''
    Initialize a DUNES model from a set of embeddings and sentiment models.
    Feeds the output of these models to a transformer for classification.
    Args:
        twitter_embedding: path to the Twitter embedding huggingface model
        twitter_sentiment: path to the Twitter sentiment huggingface model
        reddit_sentiment: path to the Reddit sentiment huggingface model
        twitter_sector: path to the Twitter sector huggingface model
    '''

    # Load the models
    twitter_embedding_model = SentenceTransformer(twitter_embedding)
    twitter_sentiment_tokenizer = AutoTokenizer.from_pretrained(twitter_sentiment)
    twitter_sentiment_model = AutoModelForSequenceClassification.from_pretrained(twitter_sentiment)
    reddit_sentiment_tokenizer = AutoTokenizer.from_pretrained(reddit_sentiment)
    reddit_sentiment_model = AutoModelForSequenceClassification.from_pretrained(reddit_sentiment)
    twitter_sector_tokenizer = AutoTokenizer.from_pretrained(twitter_sector)
    twitter_sector_model = AutoModelForSequenceClassification.from_pretrained(twitter_sector)

    # Freeze the models
    twitter_sentiment_model.eval()
    twitter_sentiment_model.requires_grad_(False)
    reddit_sentiment_model.eval()
    reddit_sentiment_model.requires_grad_(False)
    twitter_sector_model.eval()
    twitter_sector_model.requires_grad_(False)

    # Create the DUNES model
    model = PreDUNES(
        twitter_embedding_model,
        twitter_sentiment_tokenizer,
        twitter_sentiment_model,
        reddit_sentiment_tokenizer,
        reddit_sentiment_model,
        twitter_sector_tokenizer,
        twitter_sector_model
        )

    return model

def printClassMappings(model, predictions):
    class_mapping = AutoModel.from_pretrained(model).config.id2label
    predictions = softmax(predictions)
    ranking = np.argsort(predictions)
    ranking = ranking[::-1]
    for i in range(predictions.shape[0]):
        l = class_mapping[ranking[i]]
        s = predictions[ranking[i]]
        print(f"{i+1}) {l} {np.round(float(s), 4)}")

def test():
    preprocessing_model = create_preprocessing_model(
        "mixedbread-ai/mxbai-embed-large-v1",
        "cardiffnlp/twitter-roberta-base-sentiment-latest",
        "SamLowe/roberta-base-go_emotions",
        "cardiffnlp/tweet-topic-latest-multi"
    )

    prev_tweet_embedding, curr_tweet_embedding, prev_tweet_sentiment, curr_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector, curr_tweet_sector = preprocessing_model(
        "@WholeMarsBlog Headline is misleading. Starlink can obviously offer far more robust positioning than GPS, as it will have ~1000X more satellites over time. Not all will have line of sight to users, but still &gt;10X GPS &amp; far stronger signal. Just not today’s problem.",
        "@spideycyp_155 @BillyM2k If Russia faced calamitous defeat in conventional warfare for something as strategically critical as Crimea, the probability of using nuclear weapons is high",
        "We know who controls the media. The same corporations who have wreaked havoc on the globe for decades, if not centuries, the big banks who financed them, and the governments who turned a blind eye to the destruction. The same entities who have brought us to the precipice of destruction - quite possibly condemning us, and our progeny to an unlivable climate They have tried to stop you at every turn, and yet you persist for the good of humanity. We love you, Elon! Keep up the good work! As you have said, we must never let the light of human consciousness fade - never!"
    )

    print("prev_tweet_embedding:", prev_tweet_embedding)
    print("curr_tweet_embedding:", curr_tweet_embedding)
    print("prev_tweet_sentiment:", softmax(prev_tweet_sentiment))
    print("curr_tweet_sentiment:", softmax(curr_tweet_sentiment))

    print("prev_reddit_sentiment:")
    printClassMappings("SamLowe/roberta-base-go_emotions", prev_reddit_sentiment)
    print("prev_tweet_sector:")
    printClassMappings("cardiffnlp/tweet-topic-latest-multi", prev_tweet_sector)
    print("curr_tweet_sector:")
    printClassMappings("cardiffnlp/tweet-topic-latest-multi", curr_tweet_sector)


In [12]:
test()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


prev_tweet_embedding: [[ 0.34489498  0.13846147 -0.32645515 ... -0.7176758   0.28787854
  -0.14000645]]
curr_tweet_embedding: [[ 0.66635376 -0.49894774  0.04492863 ... -0.9171951   0.18333018
  -0.4950185 ]]
prev_tweet_sentiment: [0.14040028 0.57469916 0.28490052]
curr_tweet_sentiment: [0.6610266  0.322822   0.01615136]
prev_reddit_sentiment:


Some weights of RobertaModel were not initialized from the model checkpoint at SamLowe/roberta-base-go_emotions and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1) admiration 0.5544
2) love 0.2888
3) approval 0.0619
4) caring 0.023
5) neutral 0.0127
6) optimism 0.0125
7) disapproval 0.0091
8) annoyance 0.0063
9) anger 0.0055
10) disappointment 0.004
11) gratitude 0.0036
12) desire 0.0036
13) realization 0.0026
14) pride 0.0022
15) sadness 0.0021
16) joy 0.0013
17) excitement 0.0011
18) disgust 0.001
19) fear 0.0007
20) relief 0.0006
21) confusion 0.0006
22) grief 0.0005
23) remorse 0.0005
24) curiosity 0.0004
25) surprise 0.0003
26) nervousness 0.0003
27) embarrassment 0.0002
28) amusement 0.0001
prev_tweet_sector:


Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/tweet-topic-latest-multi and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1) science_&_technology 0.9853
2) news_&_social_concern 0.0066
3) business_&_entrepreneurs 0.0027
4) other_hobbies 0.0008
5) diaries_&_daily_life 0.0007
6) learning_&_educational 0.0006
7) film_tv_&_video 0.0005
8) fitness_&_health 0.0004
9) celebrity_&_pop_culture 0.0004
10) gaming 0.0003
11) travel_&_adventure 0.0003
12) sports 0.0002
13) relationships 0.0002
14) youth_&_student_life 0.0002
15) music 0.0002
16) food_&_dining 0.0002
17) arts_&_culture 0.0002
18) family 0.0002
19) fashion_&_style 0.0001
curr_tweet_sector:


Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/tweet-topic-latest-multi and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1) news_&_social_concern 0.9976
2) science_&_technology 0.001
3) celebrity_&_pop_culture 0.0003
4) film_tv_&_video 0.0003
5) other_hobbies 0.0002
6) gaming 0.0001
7) diaries_&_daily_life 0.0001
8) business_&_entrepreneurs 0.0001
9) learning_&_educational 0.0001
10) sports 0.0001
11) arts_&_culture 0.0
12) music 0.0
13) youth_&_student_life 0.0
14) fitness_&_health 0.0
15) travel_&_adventure 0.0
16) family 0.0
17) relationships 0.0
18) food_&_dining 0.0
19) fashion_&_style 0.0


## prediction

In [13]:
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np 
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.optim import Adam

### dataset loader

In [14]:
class dataloaderDUNEs(Dataset):
    def __init__(self, data, preprocessing_model):
        """
        Args:
            data (List[Dict]): Each dictionary contains raw text for 'prev_tweet', 'curr_tweet',
                               'prev_reddit', and engagement metrics ('likes', 'retweets', 'comments').
            preprocessing_model (PreDUNES): The model instance for preprocessing text data.
        """
        self.data = data
        self.preprocessing_model = preprocessing_model
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        # Process the raw texts through the PreDUNES model
        # Note: Ensure PreDUNES outputs PyTorch tensors; if it outputs numpy arrays, convert them to tensors
        prev_tweet_embedding, curr_tweet_embedding, prev_tweet_sentiment, curr_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector, curr_tweet_sector = self.preprocessing_model(
            item['prev_tweet'], item['curr_tweet'], item['prev_reddit']
        )

        # Convert numpy arrays to tensors if they are not already
        def to_tensor(array):
            if isinstance(array, np.ndarray):
                return torch.tensor(array).float()
            return array
        
        prev_tweet_embedding = to_tensor(prev_tweet_embedding)
        curr_tweet_embedding = to_tensor(curr_tweet_embedding)
        prev_tweet_sentiment = to_tensor(prev_tweet_sentiment)
        curr_tweet_sentiment = to_tensor(curr_tweet_sentiment)
        prev_reddit_sentiment = to_tensor(prev_reddit_sentiment)
        prev_tweet_sector = to_tensor(prev_tweet_sector)
        curr_tweet_sector = to_tensor(curr_tweet_sector)
        
        # Engagement metrics
        likes = torch.tensor(item['likes'], dtype=torch.float)
        retweets = torch.tensor(item['retweets'], dtype=torch.float)
        comments = torch.tensor(item['comments'], dtype=torch.float)
        
        return {
            'prev_tweet_embedding': prev_tweet_embedding,
            'curr_tweet_embedding': curr_tweet_embedding,
            'prev_tweet_sentiment': prev_tweet_sentiment,
            'curr_tweet_sentiment': curr_tweet_sentiment,
            'prev_reddit_sentiment': prev_reddit_sentiment,
            'prev_tweet_sector': prev_tweet_sector,
            'curr_tweet_sector': curr_tweet_sector,
            'likes': likes,
            'retweets': retweets,
            'comments': comments
        }

In [15]:
# Initialize PreDUNES model here
preprocessing_model = create_preprocessing_model(
    "mixedbread-ai/mxbai-embed-large-v1",
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
    "SamLowe/roberta-base-go_emotions",
    "cardiffnlp/tweet-topic-latest-multi"
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Example data list
data = [
    {
        'prev_tweet': "@WholeMarsBlog Headline is misleading. Starlink can obviously offer far more robust positioning than GPS, as it will have ~1000X more satellites over time. Not all will have line of sight to users, but still >10X GPS & far stronger signal. Just not today’s problem.",
        'curr_tweet': "@spideycyp_155 @BillyM2k If Russia faced calamitous defeat in conventional warfare for something as strategically critical as Crimea, the probability of using nuclear weapons is high",
        'prev_reddit': "We know who controls the media. The same corporations who have wreaked havoc on the globe for decades, if not centuries, the big banks who financed them, and the governments who turned a blind eye to the destruction. The same entities who have brought us to the precipice of destruction - quite possibly condemning us, and our progeny to an unlivable climate They have tried to stop you at every turn, and yet you persist for the good of humanity. We love you, Elon! Keep up the good work! As you have said, we must never let the light of human consciousness fade - never!",
        'likes': 100,  
        'retweets': 50, 
        'comments': 25  
    }
]

# Create dataset and dataloader
dataset = dataloaderDUNEs(data, preprocessing_model)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0)

In [29]:
# class PositionalEncoding(nn.Module):
#     def __init__(self, d_model, dropout=0.1, max_len=5000):
#         super(PositionalEncoding, self).__init__()
#         self.dropout = nn.Dropout(p=dropout)

#         position = torch.arange(0, max_len).unsqueeze(1)
#         div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
#         pe = torch.zeros(max_len, d_model)
#         pe[:, 0::2] = torch.sin(position * div_term)
#         pe[:, 1::2] = torch.cos(position * div_term)
#         pe = pe.unsqueeze(0).transpose(0, 1)
#         self.register_buffer('pe', pe)

#     def forward(self, x):
#         x = x + self.pe[:x.size(0), :]
#         return self.dropout(x)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [30]:
class TransformerRegressionModel(nn.Module):
    def __init__(self, feature_sizes, d_model, nhead, num_encoder_layers, dim_feedforward, num_outputs):
        super(TransformerRegressionModel, self).__init__()
        self.d_model = d_model
        self.positional_encoder = PositionalEncoding(d_model)
        
        self.projection_layers = nn.ModuleDict({
            'prev_tweet_embedding': nn.Linear(feature_sizes['tweet_embedding'], d_model),
            'curr_tweet_embedding': nn.Linear(feature_sizes['tweet_embedding'], d_model),
            'prev_tweet_sentiment': nn.Linear(feature_sizes['tweet_sentiment'], d_model),
            'curr_tweet_sentiment': nn.Linear(feature_sizes['tweet_sentiment'], d_model),
            'prev_reddit_sentiment': nn.Linear(feature_sizes['reddit_sentiment'], d_model),
            'prev_tweet_sector': nn.Linear(feature_sizes['tweet_sector'], d_model),
            'curr_tweet_sector': nn.Linear(feature_sizes['tweet_sector'], d_model),
        })
        
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        self.output_linear = nn.Linear(d_model, num_outputs)

    def forward(self, features):
        projected_features = []
        for key, feature in features.items():
            # Project and reshape each feature
            feature = self.projection_layers[key](feature)
            if feature.dim() == 2:
                feature = feature.unsqueeze(1)  # Add sequence dimension if missing
            projected_features.append(feature)
        
        # Concatenate all features along the sequence dimension
        src = torch.cat(projected_features, dim=1)
        
        # Apply positional encoding
        src = self.positional_encoder(src)
        
        # Transformer encoder
        output = self.transformer_encoder(src)
        
        # Aggregate and predict
        output = output.mean(dim=0)
        output = self.output_linear(output)
        
        return output

In [31]:
# Instantiate the model
model = TransformerRegressionModel(
    feature_sizes={
        'tweet_embedding': 1024,
        'tweet_sentiment': 3,
        'reddit_sentiment': 28,
        'tweet_sector': 19,
    },
    d_model=512, 
    nhead=8, 
    num_encoder_layers=3, 
    dim_feedforward=2048, 
    num_outputs=3
)

In [32]:
# Print the model's parameters to diagnose the issue
for name, param in model.named_parameters():
    print(f"{name}: {param.size()}")

projection_layers.prev_tweet_embedding.weight: torch.Size([512, 1024])
projection_layers.prev_tweet_embedding.bias: torch.Size([512])
projection_layers.curr_tweet_embedding.weight: torch.Size([512, 1024])
projection_layers.curr_tweet_embedding.bias: torch.Size([512])
projection_layers.prev_tweet_sentiment.weight: torch.Size([512, 3])
projection_layers.prev_tweet_sentiment.bias: torch.Size([512])
projection_layers.curr_tweet_sentiment.weight: torch.Size([512, 3])
projection_layers.curr_tweet_sentiment.bias: torch.Size([512])
projection_layers.prev_reddit_sentiment.weight: torch.Size([512, 28])
projection_layers.prev_reddit_sentiment.bias: torch.Size([512])
projection_layers.prev_tweet_sector.weight: torch.Size([512, 19])
projection_layers.prev_tweet_sector.bias: torch.Size([512])
projection_layers.curr_tweet_sector.weight: torch.Size([512, 19])
projection_layers.curr_tweet_sector.bias: torch.Size([512])
transformer_encoder.layers.0.self_attn.in_proj_weight: torch.Size([1536, 512])
trans

In [33]:
# Define the loss function
criterion = nn.MSELoss()

# Define the optimizer
optimizer = Adam(model.parameters(), lr=0.001)

# Your training loop follows here
num_epochs = 25

for epoch in range(num_epochs):
    model.train()  
    total_loss = 0.0

    for batch in dataloader:
        optimizer.zero_grad() 

        # Prepare the features as a dictionary for the model's forward pass
        features = {
            'prev_tweet_embedding': batch['prev_tweet_embedding'],
            'curr_tweet_embedding': batch['curr_tweet_embedding'],
            'prev_tweet_sentiment': batch['prev_tweet_sentiment'],
            'curr_tweet_sentiment': batch['curr_tweet_sentiment'],
            'prev_reddit_sentiment': batch['prev_reddit_sentiment'],
            'prev_tweet_sector': batch['prev_tweet_sector'],
            'curr_tweet_sector': batch['curr_tweet_sector'],
        }

        # Ensure the target tensor is correctly shaped: [batch_size, num_outputs]
        targets = torch.stack((batch['likes'], batch['retweets'], batch['comments']), dim=1)

        # Forward pass - the model should now internally handle the feature transformation
        outputs = model(features)

        # Compute loss
        loss = criterion(outputs, targets)
        total_loss += loss.item()

        # Backpropagation
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(dataloader)  # Calculate the average loss for the epoch
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1/25, Loss: 4398.6167
Epoch 2/25, Loss: 3671.6160
Epoch 3/25, Loss: 3487.2786
Epoch 4/25, Loss: 3437.3472
Epoch 5/25, Loss: 3400.0674
Epoch 6/25, Loss: 3357.5425
Epoch 7/25, Loss: 3315.9133
Epoch 8/25, Loss: 3273.5718
Epoch 9/25, Loss: 3225.9192
Epoch 10/25, Loss: 3190.2664
Epoch 11/25, Loss: 3141.5908
Epoch 12/25, Loss: 3104.4949
Epoch 13/25, Loss: 3057.1873
Epoch 14/25, Loss: 3008.4387
Epoch 15/25, Loss: 2968.7317
Epoch 16/25, Loss: 2922.1860
Epoch 17/25, Loss: 2875.3191
Epoch 18/25, Loss: 2829.0881
Epoch 19/25, Loss: 2783.7620
Epoch 20/25, Loss: 2738.0256
Epoch 21/25, Loss: 2691.6980
Epoch 22/25, Loss: 2647.4285
Epoch 23/25, Loss: 2604.8518
Epoch 24/25, Loss: 2559.6072
Epoch 25/25, Loss: 2515.9277
