In [None]:
!pip install --no-cache-dir transformers sentencepiece

In [1]:
#import necessary libraries 
import torch
from torch.nn.functional import softmax
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os
from google.cloud import storage
import io

In [None]:
# Declare global variables (everything a reader may need to tune lives here)
# Service-account key file. An already-exported GOOGLE_APPLICATION_CREDENTIALS wins,
# so the notebook can run on another machine without editing this cell;
# otherwise fall back to the original key location.
GCP_KEY = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") or '/home/jupyter/secrets/ac215.json'
# Bucket that holds both the raw input and the processed output
GCP_DATA_BUCKET = 'data-lnt'
# Blob (inside the bucket) containing the unlabeled CSV
GCP_SOURCE_FILENAME = 'raw/unlabeled.csv'
# Number of samples per model forward pass
BATCH_SIZE = 64
# Model identifier passed to get_model
MODEL_SPECIFICATION = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
# Destination path passed to save_dataset
OUTPUT_FILEPATH = 'processed/labeled_initial.csv'

In [2]:
# Point the Google client libraries at the service-account key, then
# download the unlabeled CSV from the data bucket as a single text string.
source_filename = GCP_SOURCE_FILENAME
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GCP_KEY

# Client -> bucket -> blob -> text content
storage_client = storage.Client()
bucket = storage_client.bucket(GCP_DATA_BUCKET)
blob = bucket.blob(source_filename)
content = blob.download_as_text()

In [3]:
def get_model(model_name):
    """
    Load the pretrained tokenizer and sequence-classification model for one checkpoint.

    Input: model_name (identifier handed to both from_pretrained calls)
    Output: (tokenizer, model) tuple, in that order

    >>> tokenizer, model = get_model("cardiffnlp/twitter-xlm-roberta-base-sentiment")
    """
    # The tokenizer is loaded first, then the model, both from the same name
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSequenceClassification.from_pretrained(model_name),
    )

In [4]:
def tokenize(dataframe):
    """
    Tokenize the 'text' column of a dataframe in a single call.

    Relies on the notebook-level `tokenizer` variable, so get_model must
    have been run (and its result bound to `tokenizer`) beforehand.

    Input: Pandas dataframe (assumes text column = 'text')
    Output: tokenizer output for all rows (padding and truncation enabled,
            requested as PyTorch tensors via return_tensors="pt")

    >>> tokenized_texts = tokenize(df)
    """
    return tokenizer(
        dataframe['text'].tolist(),
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

In [15]:
def label(tokenized_texts, model, dataframe, batch_size=BATCH_SIZE):
    """
    Run the model over the tokenized texts and write the scores onto the dataframe.

    Input:
        tokenized_texts: tokenizer output providing 'input_ids' and
            'attention_mask' tensors, one row per dataframe row, same order
        model: classification model whose output exposes `.logits`
        dataframe: Pandas dataframe to annotate (modified in place)
        batch_size: number of samples per forward pass
    Output: None; adds the columns 'negative_score', 'neutral_score',
        'positive_score' and 'predicted_sentiment' to dataframe

    >>> label(tokenized_texts, model, df, 64)
    """
    #class names for output indices 0, 1, 2 (same mapping as the score columns below)
    sentiment_labels = ["Negative", "Neutral", "Positive"]

    #get input IDs and attention mask from tokenized text
    input_ids = tokenized_texts['input_ids']
    attention_mask = tokenized_texts['attention_mask']

    #define dataset and DataLoader; shuffle must stay off because the results
    #are written back to the dataframe by row position
    dataset = TensorDataset(input_ids, attention_mask)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    #set model to evaluation mode
    model.eval()

    #collect the class probabilities of every batch
    batch_probabilities = []
    num_batches = -(-len(input_ids) // batch_size)  # ceiling division
    with torch.no_grad():
        for batch, (batch_input_ids, batch_attention_mask) in enumerate(dataloader, start=1):
            print(f"Batch {batch} of {num_batches}")
            outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
            #convert output logits to per-class confidence
            batch_probabilities.append(torch.softmax(outputs.logits, dim=1))

    if batch_probabilities:
        probabilities = torch.cat(batch_probabilities, dim=0)
    else:
        #no rows to score: keep a (0, n_classes) layout so the assignments below still work
        probabilities = torch.empty((0, len(sentiment_labels)))

    #predicted class is taken over ALL rows, not only the last batch
    predicted_indices = probabilities.argmax(dim=1).tolist()

    #add the scores (as plain Python floats, not 0-d tensors) and predicted labels
    dataframe['negative_score'] = probabilities[:, 0].tolist()
    dataframe['neutral_score'] = probabilities[:, 1].tolist()
    dataframe['positive_score'] = probabilities[:, 2].tolist()
    dataframe['predicted_sentiment'] = [sentiment_labels[index] for index in predicted_indices]


In [17]:
def save_dataset(df, outfilepath):
    """
    Saves the labeled dataframe to GCP data bucket as CSV (index column omitted)

    Uses the notebook-level `bucket` object created in the GCP client cell.

    Input: Pandas dataframe, GCP file path (blob name inside the bucket)
    Output: None

    >>> save_dataset(dataframe, 'filepath')
    """
    out_file = bucket.blob(outfilepath)
    #serialize in memory and upload the text directly: to_csv cannot write to a
    #Blob object, and no local file exists for upload_from_filename to read
    csv_text = df.to_csv(index=False)
    out_file.upload_from_string(csv_text, content_type='text/csv')

In [None]:
# Parse the downloaded CSV text into a dataframe, supplying the column names explicitly
df = pd.read_csv(
    io.StringIO(content),
    names=['first', 'last', 'party', 'network', 'date', 'text'],
)
# Sanity check: preview the first five rows
df.head()

In [None]:
# Load the tokenizer/model pair named in the config cell, then tokenize every row of df
tokenizer, model = get_model(model_name=MODEL_SPECIFICATION)
tokenized_texts = tokenize(dataframe=df)

In [None]:
# Score the tokenized texts; label() adds the score and sentiment columns to df in place
label(
    tokenized_texts=tokenized_texts,
    model=model,
    dataframe=df,
    batch_size=BATCH_SIZE,
)

In [None]:
# Write the labeled dataframe to the GCP bucket as CSV
save_dataset(df=df, outfilepath=OUTPUT_FILEPATH)