In [1]:
#import necessary libraries 
import torch
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import os
from google.cloud import storage
import io
from tqdm import tqdm

In [2]:
# Notebook-wide configuration

# --- Google Cloud Storage ---
GCP_KEY = "/home/jupyter/secrets/ac215.json"  # path to the service-account key file
GCP_DATA_BUCKET = "data-lnt"  # bucket used for both the download and the upload
GCP_SOURCE_FILENAME = "raw/unlabeled.csv"  # object path of the unlabeled CSV
OUTPUT_FILEPATH = "processed/labeled_initial.csv"  # object path intended for the labeled CSV

# --- Model / inference ---
MODEL_SPECIFICATION = "cardiffnlp/twitter-xlm-roberta-base-sentiment"  # checkpoint identifier
BATCH_SIZE = 64  # number of rows intended per forward pass

In [3]:
# Create the GCP storage client and download the unlabeled dataset as text.
# The credentials env var is set BEFORE the client is constructed
# (presumably so storage.Client() authenticates with this key file — confirm against library docs).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GCP_KEY
storage_client = storage.Client()
# `bucket` is reused later by save_dataset() for the upload
bucket = storage_client.bucket(GCP_DATA_BUCKET)
source_filename = GCP_SOURCE_FILENAME
blob = bucket.blob(source_filename)
# raw CSV text; parsed into a DataFrame in a later cell
content = blob.download_as_text()

In [4]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Report which device was selected
print(f"Using device: {device}")

Using device: cuda:0


In [5]:
def get_model(model_name):
    """
    Load the tokenizer and the sequence-classification model for a checkpoint.

    Input: model_name (checkpoint identifier handed to `from_pretrained`,
           e.g. "cardiffnlp/twitter-xlm-roberta-base-sentiment")
    Output: (tokenizer, model) tuple; the model has been moved to the
            notebook-level `device`
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)

    # load the weights first, then place the model on the selected device
    loaded_model = AutoModelForSequenceClassification.from_pretrained(model_name)
    loaded_model = loaded_model.to(device)

    return loaded_tokenizer, loaded_model

In [6]:
def tokenize(dataframe, tokenizer=None, max_length=512):
    """
    Tokenize the 'text' column of a dataframe into padded PyTorch tensors.

    Input:
        dataframe: Pandas dataframe (assumes text column = 'text')
        tokenizer: callable tokenizer to use; when None, the notebook-level
                   `tokenizer` variable is used (the original behaviour)
        max_length: maximum number of tokens kept per text; longer texts are
                    truncated so they cannot exceed the model's position limit
                    (512 is assumed to match the XLM-RoBERTa base checkpoint
                    used in this notebook — adjust for other models)
    Output: tokenized text (whatever the tokenizer returns for
            return_tensors='pt')
    Raises: NameError if no tokenizer is passed and no notebook-level
            `tokenizer` has been defined
    """
    if tokenizer is None:
        # fall back to the global so existing `tokenize(df)` calls keep working
        tokenizer = globals().get('tokenizer')
        if tokenizer is None:
            raise NameError("name 'tokenizer' is not defined; pass tokenizer= or run get_model() first")

    text_samples = dataframe['text'].tolist()

    # explicit max_length: truncation=True alone is a no-op when the tokenizer
    # config carries no predefined maximum length
    tokenized_texts = tokenizer(
        text_samples,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

    return tokenized_texts

In [7]:
def label(tokenized_texts, model, device, dataframe, batch_size=64):
    """
    Uses the pre-trained model to evaluate the unlabeled dataset. Sentiment scores and labels are added
    to the dataframe (in place) based on the class probabilities produced by the model.

    Columns written: 'negative_score', 'neutral_score', 'positive_score' (softmax
    probabilities of output classes 0, 1, 2) and 'initial_sentiment' (index of the
    largest of those three scores; the first index wins on ties).

    Input:
        tokenized_texts: mapping with 'input_ids' and 'attention_mask' tensors,
                         one row per dataframe row
        model: sequence-classification model whose output exposes `.logits`
        device: torch device the model lives on; each batch is moved there
        dataframe: Pandas dataframe to annotate (mutated in place)
        batch_size: number of rows per forward pass
    Output: None
    Raises: ValueError if the number of tokenized rows differs from len(dataframe)
    """
    input_ids = tokenized_texts['input_ids']
    attention_mask = tokenized_texts['attention_mask']

    # fail before the (long) inference run rather than at column assignment
    if input_ids.shape[0] != len(dataframe):
        raise ValueError(
            f"tokenized_texts has {input_ids.shape[0]} rows but dataframe has {len(dataframe)}"
        )

    # the full tensors stay where they are; only the active batch is moved to
    # the device, so device memory scales with batch_size, not dataset size
    dataset = TensorDataset(input_ids, attention_mask)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    #set model to evaluation mode
    model.eval()

    batch_probabilities = []

    # context manager closes the progress bar even if a batch raises
    with tqdm(total=len(dataloader), desc="Labeling") as progress_bar:
        for batch_input_ids, batch_attention_mask in dataloader:
            batch_input_ids = batch_input_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)

            with torch.no_grad():
                outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
                #convert output logits to per-class confidence
                probabilities = torch.softmax(outputs.logits, dim=1)

            # move results off the device right away so they do not pile up there
            batch_probabilities.append(probabilities.cpu())
            progress_bar.update(1)

    if batch_probabilities:
        all_probabilities = torch.cat(batch_probabilities, dim=0)
    else:
        # empty input: torch.cat([]) would raise, so build an empty 3-class table
        all_probabilities = torch.empty((0, 3))

    # float64 matches the Python floats the per-row .item() calls used to produce
    scores = all_probabilities.double().numpy()
    class_scores = scores[:, :3]

    #append the scores and predicted labels to the DataFrame
    dataframe['negative_score'] = class_scores[:, 0]
    dataframe['neutral_score'] = class_scores[:, 1]
    dataframe['positive_score'] = class_scores[:, 2]
    # numpy argmax returns the first maximal index, same as list.index(max(...))
    dataframe['initial_sentiment'] = class_scores.argmax(axis=1)
        

In [8]:
def save_dataset(df, outfilepath):
    """
    Serialize the dataframe as CSV (without the index) and upload it to the
    notebook-level GCP `bucket`.

    Input: Pandas dataframe, GCP object path inside the bucket
    Output: None
    """
    # render the frame to CSV text in memory; the index is not written
    csv_payload = df.to_csv(index=False)

    # write the text to the target object in the bucket
    bucket.blob(outfilepath).upload_from_string(csv_payload)

In [9]:
# Parse the downloaded CSV text and drop every row that has a missing value in any column
df = pd.read_csv(io.StringIO(content)).dropna()
# Sanity check: preview the first five rows
df.head(5)

Unnamed: 0.1,Unnamed: 0,first_name,last_name,party,network,date,text
0,0,Marianne,Williamson,D,FOXNEWSW,20230611,and . this despite a new poll from rasmussen t...
1,1,Marianne,Williamson,D,FBC,20230622,yesterday i spoke with democrat the presidenti...
2,2,Marianne,Williamson,D,CSPAN,20230823,this time he is doing the same think by senten...
3,3,Marianne,Williamson,D,CSPAN,20230731,"there is our little friend, her name is . she ..."
4,4,Marianne,Williamson,D,CSPAN,20230813,and speaking at the des moines register soapbo...


In [10]:
# Load the tokenizer/model for MODEL_SPECIFICATION (an XLM-RoBERTa sentiment checkpoint),
# then tokenize the 'text' column. tokenize() reads the notebook-level `tokenizer`
# assigned on the next line, so this assignment must run first.
tokenizer, model = get_model(MODEL_SPECIFICATION)
tokenized_texts = tokenize(df)

In [11]:
# Run inference over the tokenized texts; label() writes the score and
# sentiment columns onto df in place. The batch size comes from the config
# cell instead of a duplicated literal.
label(tokenized_texts, model, device, df, BATCH_SIZE)

Labeling: 100%|██████████| 670/670 [07:30<00:00,  1.49it/s]


In [12]:
# Preview the labeled frame: every row except the last one
df.iloc[:-1]

Unnamed: 0.1,Unnamed: 0,first_name,last_name,party,network,date,text,negative_score,neutral_score,positive_score,initial_sentiment
0,0,Marianne,Williamson,D,FOXNEWSW,20230611,and . this despite a new poll from rasmussen t...,0.114825,0.215496,0.669679,2
1,1,Marianne,Williamson,D,FBC,20230622,yesterday i spoke with democrat the presidenti...,0.481302,0.351283,0.167416,0
2,2,Marianne,Williamson,D,CSPAN,20230823,this time he is doing the same think by senten...,0.847483,0.112738,0.039779,0
3,3,Marianne,Williamson,D,CSPAN,20230731,"there is our little friend, her name is . she ...",0.638722,0.265266,0.096012,0
4,4,Marianne,Williamson,D,CSPAN,20230813,and speaking at the des moines register soapbo...,0.183730,0.549078,0.267192,1
...,...,...,...,...,...,...,...,...,...,...,...
42859,42859,Robert,Kennedy,D,GBN,20231010,"in america, see this story and i worry at the ...",0.486088,0.393858,0.120054,0
42860,42860,Robert,Kennedy,D,GBN,20230702,"and j . edgar hoover, believe it or j. edgar h...",0.266653,0.537621,0.195726,1
42861,42861,Robert,Kennedy,D,GBN,20230702,"and j . edgar hoover, believe it or j. edgar h...",0.266653,0.537621,0.195726,1
42862,42862,Robert,Kennedy,D,CSPAN,20230720,he is more popular -- i hate to say this becau...,0.816158,0.138110,0.045732,0


In [13]:
# Class balance of the predicted labels (0 = negative, 1 = neutral, 2 = positive)
df.loc[:, 'initial_sentiment'].value_counts()

initial_sentiment
0    31147
1     8085
2     3632
Name: count, dtype: int64

In [14]:
# Export the labeled dataframe as CSV to the GCP bucket, using the output
# path from the config cell instead of a duplicated literal
save_dataset(df, OUTPUT_FILEPATH)