In [1]:
import pandas as pd
import numpy as np
import os
import re
import torch
import torch.nn.functional as F
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification
)

In [0]:
# Folder the Top 100 tweet dataset CSV is read from.
DATA_DIR = '/data/workspace_files/Twitter Datasets'
# Folder that receives every sentiment-score CSV written by this notebook.
OUTPUT_DIR = '/data/workspace_files/RoBERTa Sentiment Scores'

### RoBERTa Sentiment Scoring

In this notebook, we determine the sentiment score for each tweet using the pretrained RoBERTa model. Running a forward pass through the model for more than 5.6 million tweets (approximately 800 thousand from the Top 100 dataset and approximately 4.8 million from the Random Tweet dataset) takes about 8 hours even with a GPU. 

In [2]:
# Run on the first CUDA GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

device(type='cpu')

In [3]:
def chunks(lst, n):
    """Lazily yield consecutive slices of ``lst`` holding at most ``n`` items.

    Every slice has length ``n`` except possibly the last one, which holds
    whatever remains. ``lst`` only needs to support ``len()`` and slicing.
    """
    starts = range(0, len(lst), n)
    yield from (lst[begin:begin + n] for begin in starts)

The `PreTrainedTextModel` class below is a wrapper class for the Transformers `AutoTokenizer` and `AutoModelForSequenceClassification` classes, and handles the calculation of a single sentiment score from the softmax output from the model. 

In [4]:
class PreTrainedTextModel():
    """Wrapper pairing a Transformers sequence classifier with its tokenizer.

    ``predict`` runs the texts through the model in batches, returns the
    softmax class probabilities, and collapses them into one sentiment score
    per text.
    """

    def __init__(self, model_name, **kwargs):
        """Load the pretrained classifier and tokenizer named ``model_name``.

        Extra keyword arguments are forwarded to
        ``AutoModelForSequenceClassification.from_pretrained``.
        """
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                                        **kwargs)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def predict(self, text, batch_size=32, device=None):
        """Compute class probabilities and a sentiment score for each text.

        Args:
            text: Sliceable sequence (e.g. a list) of strings to score.
            batch_size: Number of texts tokenized and sent through the model
                per forward pass.
            device: Optional device to run on (anything ``Module.to``
                accepts). ``None`` leaves the model and inputs where they are.

        Returns:
            A tuple ``(predictions, score)``. ``predictions`` has one row per
            text and one column per label, holding softmax probabilities.
            ``score`` has one value per text: the probabilities weighted by
            -1 for the first label, +1 for the last label and 0 for any label
            in between. For empty ``text`` both arrays have zero rows.
        """
        # set model up for prediction
        self.model.eval()
        # compare against None so that a device index of 0 is not skipped
        if device is not None:
            self.model.to(device)

        # predicting large amounts of text at once leads to memory errors
        # we slice the text and predict in batches
        # additionally, torch.no_grad() stops gradient accumulation which can
        # take up a lot of memory
        predictions = []
        with torch.no_grad():
            for begin in range(0, len(text), batch_size):
                batch = text[begin:begin + batch_size]
                encodings = self.tokenizer(batch, padding=True, truncation=True,
                                           max_length=512, return_tensors='pt')

                # move model inputs to device
                if device is not None:
                    encodings = encodings.to(device)

                output = self.model(**encodings)
                pred = F.softmax(output.logits, dim=-1)

                # keep the full probability row; the score is derived below
                predictions.append(pred.cpu().numpy())

        # np.concatenate rejects an empty list, so answer empty input directly
        if not predictions:
            num_labels = self.model.config.num_labels
            return (np.empty((0, num_labels), dtype=np.float32),
                    np.empty(0, dtype=np.float64))

        predictions = np.concatenate(predictions, axis=0)

        # score is a weighted sum using the output probabilities:
        # first column is negative (-1), last column is positive (+1)
        weights = np.zeros(predictions.shape[1])
        weights[0] = -1.0
        weights[-1] = 1.0

        score = (predictions * weights).sum(axis=1)

        return predictions, score

First, we load the model and present its architecture. The model is a 12-layer transformer with a softmax output for predicting negative, neutral, and positive sentiment.

In [8]:
# model from: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment?text=I+like+you.+I+love+you
# Load the pretrained classifier and its tokenizer through the wrapper class.
model_name = 'cardiffnlp/twitter-roberta-base-sentiment'
model = PreTrainedTextModel(model_name)
# Bare expression so the notebook displays the wrapped network's architecture.
model.model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

First we predict the sentiment score for the Top 100 dataset.

In [3]:
# Force these columns to be read as plain strings rather than inferred types.
dtypes = dict.fromkeys(['id', 'author_id', 'text', 'clean_text', 'dataset'], str)

top100_path = os.path.join(DATA_DIR, 'twitter_dataset_top100.csv')
data = pd.read_csv(top100_path, encoding='utf-8', dtype=dtypes,
                   parse_dates=['created_at'])
# Rebind to a frame with a fresh 0..n-1 index.
data = data.reset_index(drop=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 847540 entries, 0 to 847539
Data columns (total 19 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   id             847540 non-null  object        
 1   author_id      847540 non-null  object        
 2   created_at     847540 non-null  datetime64[ns]
 3   clean_text     847540 non-null  object        
 4   btc_price      814694 non-null  float64       
 5   eth_price      789475 non-null  float64       
 6   btc_ret_+0.5h  814263 non-null  float64       
 7   eth_ret_+0.5h  789035 non-null  float64       
 8   btc_ret_+1h    813823 non-null  float64       
 9   eth_ret_+1h    788593 non-null  float64       
 10  btc_ret_+3h    810518 non-null  float64       
 11  eth_ret_+3h    785292 non-null  float64       
 12  btc_ret_+8h    805402 non-null  float64       
 13  eth_ret_+8h    780172 non-null  float64       
 14  btc_ret_+12h   801252 non-null  float64       
 15  

In [0]:
cols = ['id', 'created_at', 'clean_text']
# Take an explicit copy (as the Random Tweet loop does) so that adding the
# score column below never writes into, or warns about, a slice of `data`.
output_df = data.loc[:, cols].copy()

# takes ~1 hour for ~850K tweets
text = data['clean_text'].to_list()
# `pred` holds the per-label probabilities; only the combined score is stored.
pred, score = model.predict(text, device=device)

output_df['score'] = score

In [9]:
# Summarize the scored frame: columns, non-null counts and dtypes.
output_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 847540 entries, 0 to 847539
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   id          847540 non-null  object        
 1   created_at  847540 non-null  datetime64[ns]
 2   clean_text  847540 non-null  object        
 3   score       847540 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 25.9+ MB


In [10]:
# The file name must be joined onto OUTPUT_DIR inside os.path.join; passed as
# to_csv's second positional argument it would be taken as the separator.
# index=False matches the other CSV exports in this notebook.
output_file = 'scores_twitter_roberta_pretrained_top100.csv'
output_df.to_csv(os.path.join(OUTPUT_DIR, output_file), index=False)

Next, we predict sentiment scores for the Random Tweet dataset. Because this is a much larger dataset and running the model for all 4.8 million tweets takes approximately 7 hours, we do predictions in batches of 500 thousand, saving the batches as we go. Once all tweets have a sentiment score, we combine the batch files into a single dataset. 

In [5]:
# Force these columns to be read as plain strings rather than inferred types.
dtypes = dict.fromkeys(['id', 'author_id', 'text', 'clean_text', 'dataset'], str)

# NOTE(review): this path is hard-coded and sits outside DATA_DIR — confirm
# that is where the Random Tweet file really lives.
random_path = '/data/workspace_files/twitter_dataset_random.csv'
data = pd.read_csv(random_path, encoding='utf-8', dtype=dtypes,
                   parse_dates=['created_at'])
# Rebind to a frame with a fresh 0..n-1 index.
data = data.reset_index(drop=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4826627 entries, 0 to 4826626
Data columns (total 4 columns):
 #   Column      Dtype         
---  ------      -----         
 0   id          object        
 1   author_id   object        
 2   created_at  datetime64[ns]
 3   clean_text  object        
dtypes: datetime64[ns](1), object(3)
memory usage: 147.3+ MB


In [8]:
# we run this in chunks of 500k as predicting takes a very long time
# so saving periodically is really important
# start_pos selects the first row to process; presumably it is raised to
# resume after an interrupted run (each file name carries the next row index).
start_pos = 0
chunk_size = 500000
cols = ['id', 'created_at', 'clean_text']
use_data = data.iloc[start_pos:, :]

for chunk_df in chunks(use_data, chunk_size):
    output_df = chunk_df.loc[:, cols].copy()
    start = output_df.index.min()
    end = output_df.index.max()

    # the slow step: one batched forward pass over every tweet in the chunk
    text = output_df['clean_text'].to_list()
    pred, score = model.predict(text, device=device)

    output_df['score'] = score

    print('\nChunk {:,} to {:,}'.format(start, end))
    # info() prints its own report and returns None, so it is not wrapped in
    # print(), which would add a stray "None" line after every chunk
    output_df.info()

    # file suffix is the index label following this chunk's last row
    output_file = 'scores_twitter_roberta_pretrained_random_{}.csv'.format(end + 1)
    output_df.to_csv(os.path.join(OUTPUT_DIR, output_file), index=False)


Chunk 0 to 499,999
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   id          500000 non-null  object        
 1   created_at  500000 non-null  datetime64[ns]
 2   clean_text  500000 non-null  object        
 3   score       500000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 15.3+ MB
None

Chunk 500,000 to 999,999
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 500000 to 999999
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   id          500000 non-null  object        
 1   created_at  500000 non-null  datetime64[ns]
 2   clean_text  500000 non-null  object        
 3   score       500000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 1

In [6]:
# collect and save the different chunks into a single file
files = os.listdir(OUTPUT_DIR)

def sorter(x):
    try:
        return int(re.findall('_(\d+)[.]', x)[0])
    except IndexError:
        return np.inf
files.sort(key=sorter)

data = pd.DataFrame()
for f in files:
    if not re.search('_random_[0-9]+', f):
        continue

    df = pd.read_csv(os.path.join(OUTPUT_DIR, f), 
                     parse_dates=['created_at'], 
                     dtype={'id': str})
    data = data.append(df, ignore_index=True)

In [7]:
# Summarize the combined frame: row count, columns and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4826627 entries, 0 to 4826626
Data columns (total 4 columns):
 #   Column      Dtype         
---  ------      -----         
 0   id          object        
 1   created_at  datetime64[ns]
 2   clean_text  object        
 3   score       float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 147.3+ MB


In [10]:
# Write the combined scores into OUTPUT_DIR, leaving the index column out.
output_file = 'scores_twitter_roberta_pretrained_random_all.csv'
output_path = os.path.join(OUTPUT_DIR, output_file)
data.to_csv(output_path, index=False)