## preprocessing

### download transformers & librariees

In [3]:
!python -m pip install -U sentence-transformers



In [4]:
import numpy as np
from typing import Dict
from scipy.special import expit, softmax

import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


### preprocessing experimentation
The PreDUNES class serves as a PyTorch module for preprocessing textual data from Twitter and Reddit. Upon initialization, it receives various components including models and tokenizers for generating embeddings, predicting sentiment, and classifying sector information from Twitter and Reddit text. Its forward method executes the preprocessing steps, involving the extraction of embeddings, sentiment analysis, and sector classification for both the previous and current tweets, along with sentiment analysis for the previous Reddit post. The create_preprocessing_model function facilitates the initialization of the preprocessing module by loading the required models and setting them to evaluation mode, ensuring their parameters are frozen for training. This modular approach streamlines the setup of the preprocessing pipeline for downstream tasks such as sentiment analysis and sector classification on textual data.

In [5]:
class PreDUNES(nn.Module):
    def __init__(
            self,
            twitter_embedding_model: nn.Module,
            twitter_sentiment_tokenizer,
            twitter_sentiment_model: nn.Module,
            reddit_sentiment_tokenizer,
            reddit_sentiment_model: nn.Module,
            twitter_sector_tokenizer,
            twitter_sector_model: nn.Module
        ):
        '''
        Initialize a DUNES model from a set of embeddings and sentiment models.
        Args:
            twitter_embedding_model: huggingface model for Twitter embeddings
            twitter_sentiment_model: huggingface model for Twitter sentiment
            reddit_sentiment_model: huggingface model for Reddit sentiment
            twitter_sector_model: huggingface model for Twitter sector
        '''
        super(PreDUNES, self).__init__()
        self.twitter_embedding_model = twitter_embedding_model
        self.twitter_sentiment_tokenizer = twitter_sentiment_tokenizer
        self.twitter_sentiment_model = twitter_sentiment_model
        self.reddit_sentiment_tokenizer = reddit_sentiment_tokenizer
        self.reddit_sentiment_model = reddit_sentiment_model
        self.twitter_sector_tokenizer = twitter_sector_tokenizer
        self.twitter_sector_model = twitter_sector_model

    def forward(self, prev_tweet, curr_tweet, prev_reddit):
        '''
        Forward pass for the DUNES model.
        Args:
            prev_tweet: previous tweet
            curr_tweet: current tweet
            prev_reddit: previous Reddit post
            curr_reddit: current Reddit post
        Returns:
            sentiment: sentiment of the current tweet
            sector: sector of the current tweet
        '''
        # Get the embeddings
        prev_tweet_embedding = self.twitter_embedding_model.encode([prev_tweet])
        curr_tweet_embedding = self.twitter_embedding_model.encode([curr_tweet])

        # Get the sentiment
        prev_tweet_tokens = self.twitter_sentiment_tokenizer(prev_tweet, return_tensors='pt')
        prev_tweet_sentiment = self.twitter_sentiment_model(**prev_tweet_tokens)[0][0].detach().numpy()
        curr_tweet_tokens = self.twitter_sentiment_tokenizer(curr_tweet, return_tensors='pt')
        curr_tweet_sentiment = self.twitter_sentiment_model(**curr_tweet_tokens)[0][0].detach().numpy()

        prev_reddit_tokens = self.reddit_sentiment_tokenizer(prev_reddit, return_tensors='pt')
        prev_reddit_sentiment = self.reddit_sentiment_model(**prev_reddit_tokens)[0][0].detach().numpy()

        # Get the sector
        prev_sector_tokens = self.twitter_sector_tokenizer(prev_tweet, return_tensors='pt')
        prev_tweet_sector = self.twitter_sector_model(**prev_sector_tokens)[0][0].detach().numpy()
        curr_sector_tokens = self.twitter_sector_tokenizer(curr_tweet, return_tensors='pt')
        curr_tweet_sector = self.twitter_sector_model(**curr_sector_tokens)[0][0].detach().numpy()


        return prev_tweet_embedding, curr_tweet_embedding, prev_tweet_sentiment, curr_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector, curr_tweet_sector


def create_preprocessing_model(
        twitter_embedding: str,
        twitter_sentiment: str,
        reddit_sentiment: str,
        twitter_sector: str
):
    '''
    Initialize a DUNES model from a set of embeddings and sentiment models.
    Feeds the output of these models to a transformer for classification.
    Args:
        twitter_embedding: path to the Twitter embedding huggingface model
        twitter_sentiment: path to the Twitter sentiment huggingface model
        reddit_sentiment: path to the Reddit sentiment huggingface model
        twitter_sector: path to the Twitter sector huggingface model
    '''

    # Load the models
    twitter_embedding_model = SentenceTransformer(twitter_embedding)
    twitter_sentiment_tokenizer = AutoTokenizer.from_pretrained(twitter_sentiment)
    twitter_sentiment_model = AutoModelForSequenceClassification.from_pretrained(twitter_sentiment)
    reddit_sentiment_tokenizer = AutoTokenizer.from_pretrained(reddit_sentiment)
    reddit_sentiment_model = AutoModelForSequenceClassification.from_pretrained(reddit_sentiment)
    twitter_sector_tokenizer = AutoTokenizer.from_pretrained(twitter_sector)
    twitter_sector_model = AutoModelForSequenceClassification.from_pretrained(twitter_sector)

    # Freeze the models
    twitter_sentiment_model.eval()
    twitter_sentiment_model.requires_grad_(False)
    reddit_sentiment_model.eval()
    reddit_sentiment_model.requires_grad_(False)
    twitter_sector_model.eval()
    twitter_sector_model.requires_grad_(False)

    # Create the DUNES model
    model = PreDUNES(
        twitter_embedding_model,
        twitter_sentiment_tokenizer,
        twitter_sentiment_model,
        reddit_sentiment_tokenizer,
        reddit_sentiment_model,
        twitter_sector_tokenizer,
        twitter_sector_model
        )

    return model

In [6]:
class PreDUNES(nn.Module):
    def __init__(
            self,
            twitter_embedding_model: nn.Module,
            twitter_sentiment_tokenizer,
            twitter_sentiment_model: nn.Module,
            reddit_sentiment_tokenizer,
            reddit_sentiment_model: nn.Module,
            twitter_sector_tokenizer,
            twitter_sector_model: nn.Module
        ):
        '''
        Initialize a DUNES model from a set of embeddings and sentiment models.
        Args:
            twitter_embedding_model: huggingface model for Twitter embeddings
            twitter_sentiment_model: huggingface model for Twitter sentiment
            reddit_sentiment_model: huggingface model for Reddit sentiment
            twitter_sector_model: huggingface model for Twitter sector
        '''
        super(PreDUNES, self).__init__()
        self.twitter_embedding_model = twitter_embedding_model
        self.twitter_sentiment_tokenizer = twitter_sentiment_tokenizer
        self.twitter_sentiment_model = twitter_sentiment_model
        self.reddit_sentiment_tokenizer = reddit_sentiment_tokenizer
        self.reddit_sentiment_model = reddit_sentiment_model
        self.twitter_sector_tokenizer = twitter_sector_tokenizer
        self.twitter_sector_model = twitter_sector_model

    def forward(self, prev_tweet, curr_tweet, prev_reddit):
        '''
        Forward pass for the DUNES model.
        Args:
            prev_tweet: previous tweet
            curr_tweet: current tweet
            prev_reddit: previous Reddit post
            curr_reddit: current Reddit post
        Returns:
            sentiment: sentiment of the current tweet
            sector: sector of the current tweet
        '''
        # Get the embeddings
        prev_tweet_embedding = self.twitter_embedding_model.encode([prev_tweet])
        curr_tweet_embedding = self.twitter_embedding_model.encode([curr_tweet])

        # Get the sentiment
        prev_tweet_tokens = self.twitter_sentiment_tokenizer(prev_tweet, return_tensors='pt')
        prev_tweet_sentiment = self.twitter_sentiment_model(**prev_tweet_tokens)[0][0].detach().numpy()
        curr_tweet_tokens = self.twitter_sentiment_tokenizer(curr_tweet, return_tensors='pt')
        curr_tweet_sentiment = self.twitter_sentiment_model(**curr_tweet_tokens)[0][0].detach().numpy()

        prev_reddit_tokens = self.reddit_sentiment_tokenizer(prev_reddit, return_tensors='pt')
        prev_reddit_sentiment = self.reddit_sentiment_model(**prev_reddit_tokens)[0][0].detach().numpy()

        # Get the sector
        prev_sector_tokens = self.twitter_sector_tokenizer(prev_tweet, return_tensors='pt')
        prev_tweet_sector = self.twitter_sector_model(**prev_sector_tokens)[0][0].detach().numpy()
        curr_sector_tokens = self.twitter_sector_tokenizer(curr_tweet, return_tensors='pt')
        curr_tweet_sector = self.twitter_sector_model(**curr_sector_tokens)[0][0].detach().numpy()


        return prev_tweet_embedding, curr_tweet_embedding, prev_tweet_sentiment, curr_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector, curr_tweet_sector


def create_preprocessing_model(
        twitter_embedding: str,
        twitter_sentiment: str,
        reddit_sentiment: str,
        twitter_sector: str
):
    '''
    Initialize a DUNES model from a set of embeddings and sentiment models.
    Feeds the output of these models to a transformer for classification.
    Args:
        twitter_embedding: path to the Twitter embedding huggingface model
        twitter_sentiment: path to the Twitter sentiment huggingface model
        reddit_sentiment: path to the Reddit sentiment huggingface model
        twitter_sector: path to the Twitter sector huggingface model
    '''

    # Load the models
    twitter_embedding_model = SentenceTransformer(twitter_embedding)
    twitter_sentiment_tokenizer = AutoTokenizer.from_pretrained(twitter_sentiment)
    twitter_sentiment_model = AutoModelForSequenceClassification.from_pretrained(twitter_sentiment)
    reddit_sentiment_tokenizer = AutoTokenizer.from_pretrained(reddit_sentiment)
    reddit_sentiment_model = AutoModelForSequenceClassification.from_pretrained(reddit_sentiment)
    twitter_sector_tokenizer = AutoTokenizer.from_pretrained(twitter_sector)
    twitter_sector_model = AutoModelForSequenceClassification.from_pretrained(twitter_sector)

    # Freeze the models
    twitter_sentiment_model.eval()
    twitter_sentiment_model.requires_grad_(False)
    reddit_sentiment_model.eval()
    reddit_sentiment_model.requires_grad_(False)
    twitter_sector_model.eval()
    twitter_sector_model.requires_grad_(False)

    # Create the DUNES model
    model = PreDUNES(
        twitter_embedding_model,
        twitter_sentiment_tokenizer,
        twitter_sentiment_model,
        reddit_sentiment_tokenizer,
        reddit_sentiment_model,
        twitter_sector_tokenizer,
        twitter_sector_model
        )

    return model

In [7]:
preprocessing_model = create_preprocessing_model(
    "mixedbread-ai/mxbai-embed-large-v1",
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
    "SamLowe/roberta-base-go_emotions",
    "cardiffnlp/tweet-topic-latest-multi"
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
prev_tweet_embedding, curr_tweet_embedding, prev_tweet_sentiment, curr_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector, curr_tweet_sector = preprocessing_model(
    "@WholeMarsBlog Headline is misleading. Starlink can obviously offer far more robust positioning than GPS, as it will have ~1000X more satellites over time. Not all will have line of sight to users, but still &gt;10X GPS &amp; far stronger signal. Just not today’s problem.",
    "@spideycyp_155 @BillyM2k If Russia faced calamitous defeat in conventional warfare for something as strategically critical as Crimea, the probability of using nuclear weapons is high",
    "We know who controls the media. The same corporations who have wreaked havoc on the globe for decades, if not centuries, the big banks who financed them, and the governments who turned a blind eye to the destruction. The same entities who have brought us to the precipice of destruction - quite possibly condemning us, and our progeny to an unlivable climate They have tried to stop you at every turn, and yet you persist for the good of humanity. We love you, Elon! Keep up the good work! As you have said, we must never let the light of human consciousness fade - never!"
    )

In [9]:
predictions = softmax(prev_reddit_sentiment)

In [10]:
class_mapping = AutoModel.from_pretrained("SamLowe/roberta-base-go_emotions").config.id2label

ranking = np.argsort(predictions)
ranking = ranking[::-1]
for i in range(predictions.shape[0]):
    l = class_mapping[ranking[i]]
    s = predictions[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

Some weights of RobertaModel were not initialized from the model checkpoint at SamLowe/roberta-base-go_emotions and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1) admiration 0.5544
2) love 0.2888
3) approval 0.0619
4) caring 0.023
5) neutral 0.0127
6) optimism 0.0125
7) disapproval 0.0091
8) annoyance 0.0063
9) anger 0.0055
10) disappointment 0.004
11) gratitude 0.0036
12) desire 0.0036
13) realization 0.0026
14) pride 0.0022
15) sadness 0.0021
16) joy 0.0013
17) excitement 0.0011
18) disgust 0.001
19) fear 0.0007
20) relief 0.0006
21) confusion 0.0006
22) grief 0.0005
23) remorse 0.0005
24) curiosity 0.0004
25) surprise 0.0003
26) nervousness 0.0003
27) embarrassment 0.0002
28) amusement 0.0001


### real pre-processing

In [11]:
class PreDUNES(nn.Module):
    def __init__(
            self,
            twitter_embedding_model: nn.Module,
            twitter_sentiment_tokenizer,
            twitter_sentiment_model: nn.Module,
            reddit_sentiment_tokenizer,
            reddit_sentiment_model: nn.Module,
            twitter_sector_tokenizer,
            twitter_sector_model: nn.Module
        ):
        '''
        Initialize a DUNES model from a set of embeddings and sentiment models.
        Args:
            twitter_embedding_model: huggingface model for Twitter embeddings
            twitter_sentiment_model: huggingface model for Twitter sentiment
            reddit_sentiment_model: huggingface model for Reddit sentiment
            twitter_sector_model: huggingface model for Twitter sector
        '''
        super(PreDUNES, self).__init__()
        self.twitter_embedding_model = twitter_embedding_model
        self.twitter_sentiment_tokenizer = twitter_sentiment_tokenizer
        self.twitter_sentiment_model = twitter_sentiment_model
        self.reddit_sentiment_tokenizer = reddit_sentiment_tokenizer
        self.reddit_sentiment_model = reddit_sentiment_model
        self.twitter_sector_tokenizer = twitter_sector_tokenizer
        self.twitter_sector_model = twitter_sector_model

    def forward(self, prev_tweet, curr_tweet, prev_reddit):
        '''
        Forward pass for the DUNES model.
        Args:
            prev_tweet: previous tweet
            curr_tweet: current tweet
            prev_reddit: previous Reddit post
            curr_reddit: current Reddit post
        Returns:
            sentiment: sentiment of the current tweet
            sector: sector of the current tweet
        '''
        # Get the embeddings
        prev_tweet_embedding = self.twitter_embedding_model.encode([prev_tweet])
        curr_tweet_embedding = self.twitter_embedding_model.encode([curr_tweet])

        # Get the sentiment
        prev_tweet_tokens = self.twitter_sentiment_tokenizer(prev_tweet, return_tensors='pt')
        prev_tweet_sentiment = self.twitter_sentiment_model(**prev_tweet_tokens)[0][0].detach().numpy()
        curr_tweet_tokens = self.twitter_sentiment_tokenizer(curr_tweet, return_tensors='pt')
        curr_tweet_sentiment = self.twitter_sentiment_model(**curr_tweet_tokens)[0][0].detach().numpy()

        prev_reddit_tokens = self.reddit_sentiment_tokenizer(prev_reddit, return_tensors='pt')
        prev_reddit_sentiment = self.reddit_sentiment_model(**prev_reddit_tokens)[0][0].detach().numpy()

        # Get the sector
        prev_sector_tokens = self.twitter_sector_tokenizer(prev_tweet, return_tensors='pt')
        prev_tweet_sector = self.twitter_sector_model(**prev_sector_tokens)[0][0].detach().numpy()
        curr_sector_tokens = self.twitter_sector_tokenizer(curr_tweet, return_tensors='pt')
        curr_tweet_sector = self.twitter_sector_model(**curr_sector_tokens)[0][0].detach().numpy()


        return prev_tweet_embedding, curr_tweet_embedding, prev_tweet_sentiment, curr_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector, curr_tweet_sector


def create_preprocessing_model(
        twitter_embedding: str,
        twitter_sentiment: str,
        reddit_sentiment: str,
        twitter_sector: str
):
    '''
    Initialize a DUNES model from a set of embeddings and sentiment models.
    Feeds the output of these models to a transformer for classification.
    Args:
        twitter_embedding: path to the Twitter embedding huggingface model
        twitter_sentiment: path to the Twitter sentiment huggingface model
        reddit_sentiment: path to the Reddit sentiment huggingface model
        twitter_sector: path to the Twitter sector huggingface model
    '''

    # Load the models
    twitter_embedding_model = SentenceTransformer(twitter_embedding)
    twitter_sentiment_tokenizer = AutoTokenizer.from_pretrained(twitter_sentiment)
    twitter_sentiment_model = AutoModelForSequenceClassification.from_pretrained(twitter_sentiment)
    reddit_sentiment_tokenizer = AutoTokenizer.from_pretrained(reddit_sentiment)
    reddit_sentiment_model = AutoModelForSequenceClassification.from_pretrained(reddit_sentiment)
    twitter_sector_tokenizer = AutoTokenizer.from_pretrained(twitter_sector)
    twitter_sector_model = AutoModelForSequenceClassification.from_pretrained(twitter_sector)

    # Freeze the models
    twitter_sentiment_model.eval()
    twitter_sentiment_model.requires_grad_(False)
    reddit_sentiment_model.eval()
    reddit_sentiment_model.requires_grad_(False)
    twitter_sector_model.eval()
    twitter_sector_model.requires_grad_(False)

    # Create the DUNES model
    model = PreDUNES(
        twitter_embedding_model,
        twitter_sentiment_tokenizer,
        twitter_sentiment_model,
        reddit_sentiment_tokenizer,
        reddit_sentiment_model,
        twitter_sector_tokenizer,
        twitter_sector_model
        )

    return model

def printClassMappings(model, predictions):
    class_mapping = AutoModel.from_pretrained(model).config.id2label
    predictions = softmax(predictions)
    ranking = np.argsort(predictions)
    ranking = ranking[::-1]
    for i in range(predictions.shape[0]):
        l = class_mapping[ranking[i]]
        s = predictions[ranking[i]]
        print(f"{i+1}) {l} {np.round(float(s), 4)}")

def test():
    preprocessing_model = create_preprocessing_model(
        "mixedbread-ai/mxbai-embed-large-v1",
        "cardiffnlp/twitter-roberta-base-sentiment-latest",
        "SamLowe/roberta-base-go_emotions",
        "cardiffnlp/tweet-topic-latest-multi"
    )

    prev_tweet_embedding, curr_tweet_embedding, prev_tweet_sentiment, curr_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector, curr_tweet_sector = preprocessing_model(
        "@WholeMarsBlog Headline is misleading. Starlink can obviously offer far more robust positioning than GPS, as it will have ~1000X more satellites over time. Not all will have line of sight to users, but still &gt;10X GPS &amp; far stronger signal. Just not today’s problem.",
        "@spideycyp_155 @BillyM2k If Russia faced calamitous defeat in conventional warfare for something as strategically critical as Crimea, the probability of using nuclear weapons is high",
        "We know who controls the media. The same corporations who have wreaked havoc on the globe for decades, if not centuries, the big banks who financed them, and the governments who turned a blind eye to the destruction. The same entities who have brought us to the precipice of destruction - quite possibly condemning us, and our progeny to an unlivable climate They have tried to stop you at every turn, and yet you persist for the good of humanity. We love you, Elon! Keep up the good work! As you have said, we must never let the light of human consciousness fade - never!"
    )

    print("prev_tweet_embedding:", prev_tweet_embedding)
    print("curr_tweet_embedding:", curr_tweet_embedding)
    print("prev_tweet_sentiment:", softmax(prev_tweet_sentiment))
    print("curr_tweet_sentiment:", softmax(curr_tweet_sentiment))

    print("prev_reddit_sentiment:")
    printClassMappings("SamLowe/roberta-base-go_emotions", prev_reddit_sentiment)
    print("prev_tweet_sector:")
    printClassMappings("cardiffnlp/tweet-topic-latest-multi", prev_tweet_sector)
    print("curr_tweet_sector:")
    printClassMappings("cardiffnlp/tweet-topic-latest-multi", curr_tweet_sector)


In [12]:
test()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


prev_tweet_embedding: [[ 0.34489498  0.13846147 -0.32645515 ... -0.7176758   0.28787854
  -0.14000645]]
curr_tweet_embedding: [[ 0.66635376 -0.49894774  0.04492863 ... -0.9171951   0.18333018
  -0.4950185 ]]
prev_tweet_sentiment: [0.14040028 0.57469916 0.28490052]
curr_tweet_sentiment: [0.6610266  0.322822   0.01615136]
prev_reddit_sentiment:


Some weights of RobertaModel were not initialized from the model checkpoint at SamLowe/roberta-base-go_emotions and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1) admiration 0.5544
2) love 0.2888
3) approval 0.0619
4) caring 0.023
5) neutral 0.0127
6) optimism 0.0125
7) disapproval 0.0091
8) annoyance 0.0063
9) anger 0.0055
10) disappointment 0.004
11) gratitude 0.0036
12) desire 0.0036
13) realization 0.0026
14) pride 0.0022
15) sadness 0.0021
16) joy 0.0013
17) excitement 0.0011
18) disgust 0.001
19) fear 0.0007
20) relief 0.0006
21) confusion 0.0006
22) grief 0.0005
23) remorse 0.0005
24) curiosity 0.0004
25) surprise 0.0003
26) nervousness 0.0003
27) embarrassment 0.0002
28) amusement 0.0001
prev_tweet_sector:


Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/tweet-topic-latest-multi and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1) science_&_technology 0.9853
2) news_&_social_concern 0.0066
3) business_&_entrepreneurs 0.0027
4) other_hobbies 0.0008
5) diaries_&_daily_life 0.0007
6) learning_&_educational 0.0006
7) film_tv_&_video 0.0005
8) fitness_&_health 0.0004
9) celebrity_&_pop_culture 0.0004
10) gaming 0.0003
11) travel_&_adventure 0.0003
12) sports 0.0002
13) relationships 0.0002
14) youth_&_student_life 0.0002
15) music 0.0002
16) food_&_dining 0.0002
17) arts_&_culture 0.0002
18) family 0.0002
19) fashion_&_style 0.0001
curr_tweet_sector:


Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/tweet-topic-latest-multi and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1) news_&_social_concern 0.9976
2) science_&_technology 0.001
3) celebrity_&_pop_culture 0.0003
4) film_tv_&_video 0.0003
5) other_hobbies 0.0002
6) gaming 0.0001
7) diaries_&_daily_life 0.0001
8) business_&_entrepreneurs 0.0001
9) learning_&_educational 0.0001
10) sports 0.0001
11) arts_&_culture 0.0
12) music 0.0
13) youth_&_student_life 0.0
14) fitness_&_health 0.0
15) travel_&_adventure 0.0
16) family 0.0
17) relationships 0.0
18) food_&_dining 0.0
19) fashion_&_style 0.0


## prediction

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np 

### dataset loader

In [13]:

class dataloaderDUNEs(Dataset):
    def __init__(self, data, preprocessing_model):
        """
        Args:
            data (List[Dict]): Each dictionary contains raw text for 'prev_tweet', 'curr_tweet',
                               'prev_reddit', and engagement metrics ('likes', 'retweets', 'comments').
            preprocessing_model (PreDUNES): The model instance for preprocessing text data.
        """
        self.data = data
        self.preprocessing_model = preprocessing_model
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        # Process the raw texts through the PreDUNES model
        # Note: Ensure PreDUNES outputs PyTorch tensors; if it outputs numpy arrays, convert them to tensors
        prev_tweet_embedding, curr_tweet_embedding, prev_tweet_sentiment, curr_tweet_sentiment, prev_reddit_sentiment, prev_tweet_sector, curr_tweet_sector = self.preprocessing_model(
            item['prev_tweet'], item['curr_tweet'], item['prev_reddit']
        )

        # Convert numpy arrays to tensors if they are not already
        def to_tensor(array):
            if isinstance(array, np.ndarray):
                return torch.tensor(array).float()
            return array
        
        prev_tweet_embedding = to_tensor(prev_tweet_embedding)
        curr_tweet_embedding = to_tensor(curr_tweet_embedding)
        prev_tweet_sentiment = to_tensor(prev_tweet_sentiment)
        curr_tweet_sentiment = to_tensor(curr_tweet_sentiment)
        prev_reddit_sentiment = to_tensor(prev_reddit_sentiment)
        prev_tweet_sector = to_tensor(prev_tweet_sector)
        curr_tweet_sector = to_tensor(curr_tweet_sector)
        
        # Engagement metrics
        likes = torch.tensor(item['likes'], dtype=torch.float)
        retweets = torch.tensor(item['retweets'], dtype=torch.float)
        comments = torch.tensor(item['comments'], dtype=torch.float)
        
        return {
            'prev_tweet_embedding': prev_tweet_embedding,
            'curr_tweet_embedding': curr_tweet_embedding,
            'prev_tweet_sentiment': prev_tweet_sentiment,
            'curr_tweet_sentiment': curr_tweet_sentiment,
            'prev_reddit_sentiment': prev_reddit_sentiment,
            'prev_tweet_sector': prev_tweet_sector,
            'curr_tweet_sector': curr_tweet_sector,
            'likes': likes,
            'retweets': retweets,
            'comments': comments
        }

# Initialize PreDUNES model here
preprocessing_model = create_preprocessing_model(
    "mixedbread-ai/mxbai-embed-large-v1",
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
    "SamLowe/roberta-base-go_emotions",
    "cardiffnlp/tweet-topic-latest-multi"
)

# Example data list
data = [
    {
        'prev_tweet': "@WholeMarsBlog Headline is misleading. Starlink can obviously offer far more robust positioning than GPS, as it will have ~1000X more satellites over time. Not all will have line of sight to users, but still >10X GPS & far stronger signal. Just not today’s problem.",
        'curr_tweet': "@spideycyp_155 @BillyM2k If Russia faced calamitous defeat in conventional warfare for something as strategically critical as Crimea, the probability of using nuclear weapons is high",
        'prev_reddit': "We know who controls the media. The same corporations who have wreaked havoc on the globe for decades, if not centuries, the big banks who financed them, and the governments who turned a blind eye to the destruction. The same entities who have brought us to the precipice of destruction - quite possibly condemning us, and our progeny to an unlivable climate They have tried to stop you at every turn, and yet you persist for the good of humanity. We love you, Elon! Keep up the good work! As you have said, we must never let the light of human consciousness fade - never!",
        'likes': 100,  
        'retweets': 50, 
        'comments': 25  
    }
]

# Create dataset and dataloader
dataset = dataloaderDUNEs(data, preprocessing_model)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [37]:
class EngagementPredictionModel(nn.Module):
    def __init__(self):
        super(EngagementPredictionModel, self).__init__()
        self.fc1 = nn.Linear(2120, 512)  
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 3) 

    def forward(self, concatenated_features):
        x = F.relu(self.fc1(concatenated_features))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

model = EngagementPredictionModel()

In [40]:
# Loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 25
for epoch in range(num_epochs):
    for batch in dataloader:
        def process_tensor(tensor):
            tensor = tensor.squeeze()
            # Ensure tensor is at least 2D (batch_size, features)
            if tensor.dim() == 1:
                tensor = tensor.unsqueeze(0)
            return tensor
        
        # Apply this processing to all tensors
        prev_tweet_embedding = process_tensor(batch['prev_tweet_embedding'])
        curr_tweet_embedding = process_tensor(batch['curr_tweet_embedding'])
        prev_tweet_sentiment = process_tensor(batch['prev_tweet_sentiment'])
        curr_tweet_sentiment = process_tensor(batch['curr_tweet_sentiment'])
        prev_reddit_sentiment = process_tensor(batch['prev_reddit_sentiment'])
        prev_tweet_sector = process_tensor(batch['prev_tweet_sector'])
        curr_tweet_sector = process_tensor(batch['curr_tweet_sector'])
        
        # Now, concatenate along dim=1 as all tensors are guaranteed to be at least 2D
        concatenated_features = torch.cat((
            prev_tweet_embedding, curr_tweet_embedding,
            prev_tweet_sentiment, curr_tweet_sentiment,
            prev_reddit_sentiment,
            prev_tweet_sector, curr_tweet_sector
        ), dim=1)

        # Prepare targets
        targets = torch.stack((batch['likes'], batch['retweets'], batch['comments']), dim=1)

        # Forward pass
        outputs = model(concatenated_features)

        # Compute loss
        loss = criterion(outputs, targets)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

Epoch [1/25], Loss: 3185.853271484375
Epoch [2/25], Loss: 2744.172607421875
Epoch [3/25], Loss: 2252.432861328125
Epoch [4/25], Loss: 1730.6173095703125
Epoch [5/25], Loss: 1209.2371826171875
Epoch [6/25], Loss: 730.9028930664062
Epoch [7/25], Loss: 351.5458679199219
Epoch [8/25], Loss: 133.73817443847656
Epoch [9/25], Loss: 116.4188003540039
Epoch [10/25], Loss: 256.1673889160156
Epoch [11/25], Loss: 427.4223937988281
Epoch [12/25], Loss: 510.9857177734375
Epoch [13/25], Loss: 472.5160217285156
Epoch [14/25], Loss: 351.9346618652344
Epoch [15/25], Loss: 211.14027404785156
Epoch [16/25], Loss: 98.5987777709961
Epoch [17/25], Loss: 37.24009323120117
Epoch [18/25], Loss: 26.705459594726562
Epoch [19/25], Loss: 52.2209358215332
Epoch [20/25], Loss: 94.04297637939453
Epoch [21/25], Loss: 135.4239501953125
Epoch [22/25], Loss: 164.58758544921875
Epoch [23/25], Loss: 175.559326171875
Epoch [24/25], Loss: 166.2635040283203
Epoch [25/25], Loss: 140.596435546875


In [None]:
import torch

def compute_accuracy(targets, predictions, threshold=10):
    """
    Computes a simple accuracy metric based on whether the predicted values
    are within a certain range (threshold) of the actual values.
    """
    correct = (torch.abs(targets - predictions) <= threshold).all(dim=1)
    accuracy = torch.mean(correct.float()) * 100
    return accuracy

# Training loop
num_epochs = 25
for epoch in range(num_epochs):
    epoch_losses = []
    epoch_accuracies = []
    for batch in dataloader:
        # Concatenate features, prepare targets, forward pass, compute loss, backward pass, and optimize
        # Similar to the provided snippet above
        
        # After loss.backward() and optimizer.step(), compute accuracy
        with torch.no_grad():  # Ensure no computation is recorded for gradient purposes
            # Assuming your model outputs predictions directly comparable to 'targets'
            accuracy = compute_accuracy(targets, outputs)
            epoch_accuracies.append(accuracy.item())
        
        epoch_losses.append(loss.item())
    
    # Compute the average loss and accuracy for the epoch
    epoch_loss = sum(epoch_losses) / len(epoch_losses)
    epoch_accuracy = sum(epoch_accuracies) / len(epoch_accuracies)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%')

    # Optionally, you can print predicted engagement metrics for the last batch
    if epoch == num_epochs - 1:  # Or choose any other epoch or condition for displaying
        print("Actual Engagement Metrics (Last Batch):", targets[-1].tolist())  # Last item in the batch
        print("Predicted Engagement Metrics (Last Batch):", outputs[-1].tolist())
