In [37]:
#import necessary libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, BertForSequenceClassification, BertTokenizer
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.functional import softmax
from google.cloud import storage 
import io
import tempfile
import shutil
from tqdm import tqdm

In [15]:
# Declare global variables
GCP_KEY = '/home/jupyter/secrets/ac215.json'
GCP_DATA_BUCKET = 'data-lnt'
GCP_SOURCE_FILENAME = 'processed/labeled_initial.csv'
GCP_MODELS_BUCKET = 'models-lnt'

HIGH_CONFIDENCE_THRESHOLD = 0.7
TEST_SIZE = 0.2
NUMBER_EPOCHS = 3
RANDOM_STATE = 215
ADAM_LEARNING_RATE = 1e-5
ADAM_BATCH_SIZE = 32
LABEL_BATCH_SIZE = 32

MODEL_DIR_FINETUNE = 'fine_tune_label'
MODEL_DIR_BERT = 'cardiffnlp/twitter-xlm-roberta-base-sentiment'

GCP_OUTPUT_PATH = 'processed/labeled_final.csv'

In [3]:
##create GCP Client
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GCP_KEY
source_filename = GCP_SOURCE_FILENAME
storage_client = storage.Client()
data_bucket = storage_client.bucket(GCP_DATA_BUCKET)
models_bucket = storage_client.bucket(GCP_MODELS_BUCKET)
blob = data_bucket.blob(GCP_SOURCE_FILENAME)
content = blob.download_as_text()

In [4]:
# Check if a GPU is available
if torch.cuda.is_available():
    # Set the device to the first available GPU
    device = torch.device("cuda:0")
else:
    # If no GPU is available, use the CPU
    device = torch.device("cpu")

# Print the device being used
print(f"Using device: {device}")

Using device: cuda:0


In [5]:
def get_model(model_directory):
    """
    Input: model_name (name of desired BERT model)
    Output: tokenizer, model

    >>> get_model("cardiffnlp/twitter-xlm-roberta-base-sentiment")
    tokenizer(model_name), model(model_name)
    """
    tokenizer = AutoTokenizer.from_pretrained(model_directory)
    model = AutoModelForSequenceClassification.from_pretrained(model_directory).to(device)

    return tokenizer, model

In [6]:
def get_high_confidence_df(df, threshold=HIGH_CONFIDENCE_THRESHOLD):
    """
    Filters an input dataframe to only include samples with label confidence above a defined threshold

    Input: dataframe with initial labels, desired confidence threshold
    Output: dataframe including only high confidence examples above specified threshold

    >>> get_high_confidence_df(df, 0.9)
    high_confidence_df
    """
    return df[(df['negative_score'] > threshold) |
    (df['neutral_score'] > threshold) |
    (df['positive_score'] > threshold)]

In [7]:
def tokenize(dataframe):
    """
    Returns tokenized text given a dataframe

    Input: Pandas dataframe (assumes text column = 'text')
    Output: tokenized text

    >>> tokenize(df)
    tokenized_texts
    """
    text_samples = dataframe['text'].tolist()
    tokenized_texts = tokenizer(text_samples, padding=True, truncation=True, return_tensors="pt")

    return tokenized_texts

In [8]:
def get_datasets(df, tokenizer, test_size=TEST_SIZE):
    """
    Returns training and validation datasets given a dataframe and tokenizer

    Input: panadas dataframe, tokenizer, 
    Output: tokenized text

    >>> tokenize(df)
    tokenized_texts
    """
    # Define training and valid dataframes
    train_df, valid_df = train_test_split(df, test_size=test_size, random_state=RANDOM_STATE)

    # Tokenize the training data
    train_encodings = tokenizer(train_df['text'].tolist(), truncation=True, padding=True, return_tensors='pt')
    train_labels = torch.tensor(train_df['initial_sentiment'].tolist())

    # Tokenize the validation data
    valid_encodings = tokenizer(valid_df['text'].tolist(), truncation=True, padding=True, return_tensors='pt')
    valid_labels = torch.tensor(valid_df['initial_sentiment'].tolist())

    # Create DataLoader objects
    train_dataset = TensorDataset(train_encodings.input_ids, train_encodings.attention_mask, train_labels)
    valid_dataset = TensorDataset(valid_encodings.input_ids, valid_encodings.attention_mask, valid_labels)

    return train_dataset, valid_dataset

In [9]:
def train_bert(model, train_dataset, valid_dataset, device, epochs=NUMBER_EPOCHS):
    """
    Fine tunes the pretrained BERT model based on the high confidence samples

    Input: BERT model, training dataset, validation dataset, number of epochs
    Output: None (Prints epoch progress)

    >>> train_bert(high_confidence_df, train_data, valid_data, epochs=4)
    Epoch 2/4: Validation Loss: 12.3452, Validation Accuracy: 0.8362
    """
    
    # Train loop
    optimizer = AdamW(model.parameters(), lr=ADAM_LEARNING_RATE)
    train_loader = DataLoader(train_dataset, batch_size=ADAM_BATCH_SIZE, shuffle=True)
    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

    # Validation loop
    valid_loader = DataLoader(valid_dataset, batch_size=ADAM_BATCH_SIZE)
    model.eval()
    with torch.no_grad():
        total_loss = 0.0
        correct = 0
        total = 0
        for batch in valid_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = torch.nn.functional.cross_entropy(logits, labels)
            total_loss += loss.item()

            _, predicted = torch.max(logits, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

        accuracy = correct / total
        average_loss = total_loss / len(valid_loader)

        print(f'Epoch {epoch + 1}/{epochs}: Validation Loss: {average_loss:.4f}, Validation Accuracy: {accuracy:.4f}')

In [29]:
def save_model(output_directory, model, tokenizer):
    """
    Saves the final fine tuned model and tokenizer to GCP models bucket

    Input: GCP output directory, model, tokenizer
    Output: None
    """
    # Create a temporary directory
    with tempfile.TemporaryDirectory() as temp_dir:
        
        # Serialize and save the model in the temporary directory
        model_path = os.path.join(temp_dir, 'model.pth')
        torch.save(model.state_dict(), model_path)

        # Save the tokenizer in the temporary directory
        tokenizer.save_pretrained(temp_dir)

        # Upload the serialized model to the GCS bucket
        model_blob = models_bucket.blob(f'{output_directory}/model.pth')
        model_blob.upload_from_filename(model_path)

        # Upload the contents of the temporary directory to the GCS bucket
        for root, dirs, files in os.walk(temp_dir):
            for file in files:
                file_path = os.path.join(root, file)
                gcs_path = f'{output_directory}/{os.path.relpath(file_path, start=temp_dir)}'
                blob = models_bucket.blob(gcs_path)
                blob.upload_from_filename(file_path)

In [38]:
  def label(tokenized_texts, model, device, dataframe, batch_size=64):
    """
    Uses the fine-tuned model to evaluate the labeled dataset. Sentiment scores and labels are added 
    to the dataframe based on the label provided by the model.

    Input: tokenized_texts, model, device, dataframe, batch_size
    Output: None
    """  
    #get input IDs and attention mask from tokenized text
    input_ids = tokenized_texts['input_ids'].to(device)
    attention_mask = tokenized_texts['attention_mask'].to(device)
    
    #define dataset from input IDs and attention mask
    dataset = TensorDataset(input_ids, attention_mask)

    #define batch size and create DataLoader
    batch_size = batch_size
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    #set model to evaluation mode
    model.eval()

    #create empty list to store labels for entire dataset
    labels = []
    
    #create a progress bar to track labeling process
    progress_bar = tqdm(total=len(dataloader), desc="Labeling")
    
    for batch_input_ids, batch_attention_mask in dataloader:
        with torch.no_grad():
            outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        
        #get output logits and convert to label confidence
        logits = outputs.logits
        
        #append batch labels to dataset label list
        batch_labels = torch.softmax(logits, dim=1)
        labels.append(batch_labels)
        
        #update progress bar
        progress_bar.update(1)
        
    #concatenate all labels
    labels = torch.cat(labels, dim=0)
    
    #move labels to CPU to append to dataframe
    labels = labels.cpu()
    
    #extract the raw scores for each sentiment class
    negative_scores = [score[0].item() for score in labels]
    neutral_scores = [score[1].item() for score in labels]
    positive_scores = [score[2].item() for score in labels]
    
    #define final sentiment label my max of sentiment scores
    initial_sentiment = []
    for neg, neut, pos in zip(negative_scores, neutral_scores, positive_scores):
        initial_sentiment.append([neg, neut, pos].index(max([neg, neut, pos])))

    #append the scores and predicted labels to the DataFrame
    dataframe['negative_score'] = negative_scores
    dataframe['neutral_score'] = neutral_scores
    dataframe['positive_score'] = positive_scores
    dataframe['initial_sentiment'] = initial_sentiment


In [42]:
def save_dataset(df, outfilepath):
    """
    Saves the labeled dataframe to GCP data bucket
    
    Input: Pandas dataframe, GCP file path
    Output: None

    >>> save_dataset(dataframe, 'filepath'):
    returns None
    """
    #convert DataFrame to a CSV string
    csv_string = df.to_csv(index=False)

    #upload the CSV string to GCP
    blob = data_bucket.blob(outfilepath)
    blob.upload_from_string(csv_string)

In [13]:
# Load the pre-trained model and tokenizer
try: 
    tokenizer, model = get_model(models_bucket.blob(MODEL_DIR_FINETUNE))
except:
    tokenizer, model = get_model(MODEL_DIR_BERT)

In [16]:
# Import labeled dataset 
initial_df = pd.read_csv(io.StringIO(content))

# Filter high-confidence examples based on predicted sentiment scores
high_confidence_df = get_high_confidence_df(initial_df, HIGH_CONFIDENCE_THRESHOLD)
high_confidence_df.head(5)

Unnamed: 0,first,last,party,network,date,text,negative_score,neutral_score,positive_score,initial_sentiment
6,Donald,Trump,R,BBCNEWS,20230928,- which is donald trump? what the re which is ...,0.796101,0.147924,0.055975,0
8,Donald,Trump,R,MSNBCW,20230927,work for donald trump. you have a different pe...,0.845434,0.112779,0.041787,0
9,Donald,Trump,R,CNNW,20230928,donald trump is missing in action. he should b...,0.783439,0.165321,0.05124,0
15,Donald,Trump,R,CNNW,20230927,this isnt about donald trump. donald trump is ...,0.82146,0.130095,0.048446,0
33,Donald,Trump,R,CNNW,20230926,"hutchinson tells me donald trump, quote, donal...",0.717126,0.193324,0.08955,0


In [17]:
# Get training and validation datasets 
train_data, valid_data = get_datasets(high_confidence_df, tokenizer, TEST_SIZE)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [18]:
# Fine-tune the BERT model
train_bert(model, train_data, valid_data, device, epochs=NUMBER_EPOCHS)



Epoch 3/3: Validation Loss: 0.0681, Validation Accuracy: 0.9880


In [30]:
# Save the fine-tuned model to GCP
save_model(MODEL_DIR_FINETUNE, model, tokenizer)

In [39]:
# Tokenize text from initial dataframe
tokenized_texts = tokenize(initial_df)

# Label the full dataset using the fine-tuned model
label(tokenized_texts, model, device, initial_df, batch_size=LABEL_BATCH_SIZE)

Labeling: 100%|██████████| 115/115 [01:09<00:00,  1.67it/s]


In [40]:
#sanity check
initial_df.head(5)

Unnamed: 0,first,last,party,network,date,text,negative_score,neutral_score,positive_score,initial_sentiment
0,Donald,Trump,R,BBCNEWS,20230928,a second donald trump term would be donald tru...,0.996725,0.001229,0.002046,0
1,Donald,Trump,R,CNNW,20230928,by name this time. donald trump is missing in...,0.996772,0.001228,0.002,0
2,Donald,Trump,R,MSNBCW,20230926,that cuts at who donald trump is. when he flew...,0.996843,0.00119,0.001966,0
3,Donald,Trump,R,BBCNEWS,20230928,donald trump is a part _ through donald trump?...,0.996895,0.001167,0.001938,0
4,Donald,Trump,R,CNNW,20230927,when you think of donald trump being the front...,0.996897,0.001194,0.001909,0


In [43]:
# Export final dataframe to GCP
save_dataset(initial_df, GCP_OUTPUT_PATH)