In [16]:
#import necessary libraries 
import torch
from torch.nn.functional import softmax
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os
from google.cloud import storage
import io
from tqdm import tqdm

In [None]:
# Declare global variables
GCP_KEY = '/home/jupyter/secrets/ac215.json'
GCP_DATA_BUCKET = 'data-lnt'
GCP_SOURCE_FILENAME = 'raw/unlabeled.csv'
BATCH_SIZE = 64
MODEL_SPECIFICATION = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
OUTPUT_FILEPATH = 'processed/labeled_initial.csv'

In [3]:
#create GCP Client
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GCP_KEY
storage_client = storage.Client()
bucket = storage_client.bucket(GCP_DATA_BUCKET)
source_filename = GCP_SOURCE_FILENAME
blob = bucket.blob(source_filename)
content = blob.download_as_text()

In [4]:
# Check if a GPU is available
if torch.cuda.is_available():
    # Set the device to the first available GPU
    device = torch.device("cuda:0")
else:
    # If no GPU is available, use the CPU
    device = torch.device("cpu")

# Print the device being used
print(f"Using device: {device}")

Using device: cuda:0


In [11]:
def get_model(model_name):
    """
    Input: model_name (name of desired BERT model)
    Output: tokenizer, model

    >>> get_model("cardiffnlp/twitter-xlm-roberta-base-sentiment")
    tokenizer(model_name), model(model_name)
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

    return tokenizer, model

In [6]:
def tokenize(dataframe):
    """
    Input: Pandas dataframe (assumes text column = 'text')
    Output: tokenized text

    >>> tokenize(df)
    tokenized_texts
    """
    text_samples = dataframe['text'].tolist()
    tokenized_texts = tokenizer(text_samples, padding=True, truncation=True, return_tensors="pt")

    return tokenized_texts

In [50]:
def label(tokenized_texts, model, device, dataframe, batch_size=64):
    """
    Uses the pre-trained model to evaluate the unlabeled dataset. Sentiment scores and labels are added 
    to the dataframe based on the label provided by the model.

    Input: tokenized_texts, model, device, dataframe, batch_size
    Output: None
    """  
    #get input IDs and attention mask from tokenized text
    input_ids = tokenized_texts['input_ids'].to(device)
    attention_mask = tokenized_texts['attention_mask'].to(device)
    
    #define dataset from input IDs and attention mask
    dataset = TensorDataset(input_ids, attention_mask)

    #define batch size and create DataLoader
    batch_size = batch_size
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    #set model to evaluation mode
    model.eval()

    #create empty list to store labels for entire dataset
    labels = []
    
    #create a progress bar to track labeling process
    progress_bar = tqdm(total=len(dataloader), desc="Labeling")
    
    for batch_input_ids, batch_attention_mask in dataloader:
        with torch.no_grad():
            outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        
        #get output logits and convert to label confidence
        logits = outputs.logits
        
        #append batch labels to dataset label list
        batch_labels = torch.softmax(logits, dim=1)
        labels.append(batch_labels)
        
        #update progress bar
        progress_bar.update(1)
        
    #concatenate all labels
    labels = torch.cat(labels, dim=0)
    
    #move labels to CPU to append to dataframe
    labels = labels.cpu()
    
    #extract the raw scores for each sentiment class
    negative_scores = [score[0].item() for score in labels]
    neutral_scores = [score[1].item() for score in labels]
    positive_scores = [score[2].item() for score in labels]
    
    #define final sentiment label my max of sentiment scores
    initial_sentiment = []
    for neg, neut, pos in zip(negative_scores, neutral_scores, positive_scores):
        initial_sentiment.append([neg, neut, pos].index(max([neg, neut, pos])))

    #append the scores and predicted labels to the DataFrame
    dataframe['negative_score'] = negative_scores
    dataframe['neutral_score'] = neutral_scores
    dataframe['positive_score'] = positive_scores
    dataframe['initial_sentiment'] = initial_sentiment


In [48]:
def save_dataset(df, outfilepath):
    """
    Saves the labeled dataframe to GCP data bucket
    
    Input: Pandas dataframe, GCP file path
    Output: None

    >>> save_dataset(dataframe, 'filepath'):
    returns None
    """
    #convert DataFrame to a CSV string
    csv_string = df.to_csv(index=False)

    #upload the CSV string to GCP
    blob = bucket.blob(outfilepath)
    blob.upload_from_string(csv_string)

In [38]:
#import unlabeled dataset into dataframe
df = pd.read_csv(io.StringIO(content), names= ['first', 'last', 'party', 'network', 'date', 'text'])
#Sanity check
df.head(5)

Unnamed: 0,first,last,party,network,date,text
0,Donald,Trump,R,BBCNEWS,20230928,a second donald trump term would be donald tru...
1,Donald,Trump,R,CNNW,20230928,by name this time. donald trump is missing in...
2,Donald,Trump,R,MSNBCW,20230926,that cuts at who donald trump is. when he flew...
3,Donald,Trump,R,BBCNEWS,20230928,donald trump is a part _ through donald trump?...
4,Donald,Trump,R,CNNW,20230927,when you think of donald trump being the front...


In [12]:
#define BERT model and tokenized text
tokenizer, model = get_model("cardiffnlp/twitter-xlm-roberta-base-sentiment")
tokenized_texts = tokenize(df)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [45]:
#get labels of tokenized texts and append to dataframe
label(tokenized_texts, model, device, df, 64)




Labeling:   0%|          | 0/58 [00:00<?, ?it/s][A[A[A


Labeling:   7%|▋         | 4/58 [00:00<00:02, 23.37it/s][A[A[A


Labeling:  12%|█▏        | 7/58 [00:03<00:31,  1.62it/s][A[A[A


Labeling:  16%|█▌        | 9/58 [00:05<00:39,  1.25it/s][A[A[A


Labeling:  17%|█▋        | 10/58 [00:07<00:41,  1.15it/s][A[A[A


Labeling:  19%|█▉        | 11/58 [00:08<00:43,  1.07it/s][A[A[A


Labeling:  21%|██        | 12/58 [00:09<00:45,  1.01it/s][A[A[A


Labeling:  22%|██▏       | 13/58 [00:10<00:46,  1.04s/it][A[A[A


Labeling:  24%|██▍       | 14/58 [00:11<00:47,  1.07s/it][A[A[A


Labeling:  26%|██▌       | 15/58 [00:12<00:47,  1.09s/it][A[A[A


Labeling:  28%|██▊       | 16/58 [00:14<00:46,  1.11s/it][A[A[A


Labeling:  29%|██▉       | 17/58 [00:15<00:46,  1.13s/it][A[A[A


Labeling:  31%|███       | 18/58 [00:16<00:45,  1.14s/it][A[A[A


Labeling:  33%|███▎      | 19/58 [00:17<00:44,  1.15s/it][A[A[A


Labeling:  34%|███▍      | 20/58 [00:18<00:

In [None]:
df.head(10)

Unnamed: 0,first,last,party,network,date,text,negative_score,neutral_score,positive_score,initial_sentiment
0,Donald,Trump,R,BBCNEWS,20230928,a second donald trump term would be donald tru...,0.356361,0.399088,0.244551,1
1,Donald,Trump,R,CNNW,20230928,by name this time. donald trump is missing in...,0.526962,0.329641,0.143397,0
2,Donald,Trump,R,MSNBCW,20230926,that cuts at who donald trump is. when he flew...,0.457405,0.336223,0.206371,0
3,Donald,Trump,R,BBCNEWS,20230928,donald trump is a part _ through donald trump?...,0.456128,0.344558,0.199313,0
4,Donald,Trump,R,CNNW,20230927,when you think of donald trump being the front...,0.52397,0.346868,0.129162,0
5,Donald,Trump,R,CNNW,20230928,donald trump is missing in action. he should b...,0.434991,0.321693,0.243316,0
6,Donald,Trump,R,BBCNEWS,20230928,- which is donald trump? what the re which is ...,0.796101,0.147924,0.055975,0
7,Donald,Trump,R,MSNBCW,20230929,"you take away donald trump, you dont get him a...",0.561329,0.289433,0.149237,0
8,Donald,Trump,R,MSNBCW,20230927,work for donald trump. you have a different pe...,0.845434,0.112779,0.041787,0
9,Donald,Trump,R,CNNW,20230928,donald trump is missing in action. he should b...,0.783439,0.165321,0.05124,0


In [49]:
#export dataframe to csv on GCP
save_dataset(df, 'processed/labeled_initial.csv')