In [None]:
!pip install -U sentence-transformers
!pip install transformers

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import time

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

from sentence_transformers import SentenceTransformer, util
from transformers import BertModel, BertTokenizer

# from google.colab import auth
# auth.authenticate_user()

# import gspread
# from oauth2client.client import GoogleCredentials

# Mount Google Drive at /drive; the dataset and model-output paths defined
# further down in this notebook are all rooted under that mount point.
from google.colab import drive
drive.mount('/drive')

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = 'cpu'

# Load Data

In [None]:
## Load the manually labeled claim pairs
# The directory layout mirrors colab-setup; update it here if that file changes.

parent_dir = '/drive/MyDrive/spotify-misinformation'

# Trained model checkpoints are written under this directory.
modeling_output_dir = parent_dir + "/modeling-output"
trained_models_output_dir = modeling_output_dir + "/trained-models"

# Location of the manually labeled CSV.
labeling_output_dir = parent_dir + "/labeling-output"
labeled_dataset = labeling_output_dir + "/manually-labeled-matched-pairs.csv"

labeled_data = pd.read_csv(labeled_dataset)

# 80/20 shuffled split; the fixed random_state makes the partition repeatable.
train_labeled_data, test_labeled_data = train_test_split(
    labeled_data,
    train_size=0.8,
    shuffle=True,
    random_state=10,
)

# Focal Loss Definition

In [None]:
class FocalLoss(nn.Module):
    """Multi-class focal loss computed from raw (un-normalised) class scores.

    Per sample the loss is ``w[y] * (1 - p_y) ** gamma * (-log p_y)``, where
    ``p_y`` is the softmax probability assigned to the true class ``y`` and
    ``w`` is the optional per-class weight.  The per-sample values are averaged
    with a plain mean.  With ``gamma = 0`` and no weight this equals
    ``F.cross_entropy``.
    """

    def __init__(self, weight=None, gamma=1):
        '''
        :param weight: optional per-class rescaling weights, passed to F.cross_entropy
        :param gamma: focusing exponent; must be >= 0
        '''
        super(FocalLoss, self).__init__()
        assert gamma >= 0
        self.gamma = gamma
        self.weight = weight

    def forward(self, input, target):
        '''
        :param input: raw class scores (logits); F.cross_entropy applies log-softmax
                      itself, so softmax must NOT already have been applied
        :param target: class-index labels
        :return: tensor of focal loss in scalar
        '''
        # reference: https://github.com/kaidic/LDAM-DRW/blob/3193f05c1e6e8c4798c5419e97c5a479d991e3e9/losses.py#L13
        # The focusing term needs the true-class probability p_y = exp(-CE), which
        # only holds for the *unweighted* cross-entropy.  Deriving it from the
        # weighted CE would give p_y ** w[y] instead and distort the modulation.
        ce_loss = F.cross_entropy(input=input, target=target, reduction='none')
        focus = (1 - torch.exp(-ce_loss)) ** self.gamma

        if self.weight is not None:
            # Class weights rescale the loss itself, not the probability estimate.
            ce_loss = F.cross_entropy(input=input, target=target, weight=self.weight, reduction='none')

        return (focus * ce_loss).mean()

# SentenceBERT Classifier

In [None]:
class SBERTAggregationClassifier(nn.Module):
  """MLP that classifies a pair of sentence embeddings.

  The two embeddings are concatenated, reduced back to ``input_size`` by the
  aggregator, then mapped to ``output_size`` class scores by the classifier.

  ``forward`` returns raw logits.  The losses used in this notebook
  (``FocalLoss`` / ``nn.CrossEntropyLoss``) are built on cross-entropy, which
  applies log-softmax internally, so feeding them already-softmaxed outputs
  would normalise twice and flatten the gradients.  Use ``predict_proba`` when
  probabilities are needed.
  """

  def __init__(self, input_size=384, output_size=6):
    """
    :param input_size: dimensionality of EACH of the two input embeddings
    :param output_size: number of classes
    """
    super().__init__()

    self.aggregator = nn.Sequential(
        nn.Linear(input_size * 2, input_size, bias=True),
        nn.ReLU(),
        nn.Dropout(0.5),
        )

    self.classifier = nn.Sequential(
          nn.Linear(input_size, input_size, bias=True),
          nn.ReLU(),
          nn.Dropout(0.5),
          nn.Linear(input_size, output_size, bias=True),
      )

    # Not applied in forward(); used by predict_proba().
    self.softmax = nn.Softmax(dim=1)

  def forward(self, fc_embed, pod_embed):
    """Return raw class logits for each (fact-check, podcast) embedding pair.

    The inputs are concatenated along dim 1, so both must share dim 0.
    """
    x = torch.cat((fc_embed, pod_embed), dim=1)
    x = self.aggregator(x)
    return self.classifier(x)

  def predict_proba(self, fc_embed, pod_embed):
    """Return softmax-normalised class probabilities (each row sums to 1)."""
    return self.softmax(self(fc_embed, pod_embed))

In [None]:
class SBERTDataset(Dataset):
    """Pre-computed sentence embeddings for (fact-checked claim, podcast claim) pairs.

    Every text is encoded once, at construction time, through
    ``sbert_model.encode(text, convert_to_tensor=True)``; indexing afterwards
    only looks the stored results up.
    """

    def __init__(self, labeled_data, sbert_model):
        """
        :param labeled_data: table with 'Fact Checked Claim', 'Podcast Claim'
                             and 'Stance Agreement' columns
        :param sbert_model: object exposing ``encode(text, convert_to_tensor=True)``
        """
        def embed_each(texts):
            encoded = []
            for text in texts:
                encoded.append(sbert_model.encode(text, convert_to_tensor=True))
            return encoded

        fact_checked = list(labeled_data['Fact Checked Claim'])

        # Podcast claims have surrounding whitespace stripped before encoding.
        podcast = []
        for raw_claim in list(labeled_data['Podcast Claim']):
            podcast.append(raw_claim.strip())

        self.claim_embeddings = embed_each(fact_checked)
        self.podcast_embeddings = embed_each(podcast)

        # Labels are stored as 1-6; shift to 0-5 so they line up with argmax indices.
        shifted = []
        for raw_label in list(labeled_data['Stance Agreement']):
            shifted.append(int(raw_label) - 1)
        self.labels = shifted

    def __len__(self):
        """Number of claim pairs."""
        return len(self.claim_embeddings)

    def __getitem__(self, idx):
        """Return (claim embedding, podcast embedding, zero-based label) for `idx`."""
        claim = self.claim_embeddings[idx]
        podcast = self.podcast_embeddings[idx]
        return claim, podcast, self.labels[idx]

In [None]:
# Load the pretrained Sentence-BERT encoder used to embed both claims
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Embed the train and test splits once, up front
train_dataset = SBERTDataset(labeled_data=train_labeled_data, sbert_model=model)
test_dataset = SBERTDataset(labeled_data=test_labeled_data, sbert_model=model)

# Reshuffle the training pairs every epoch; without shuffle=True the DataLoader
# yields the exact same mini-batches in the exact same order on every epoch.
# The evaluation loader keeps a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=4)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score

def compute_stats_sbert(model, dataloader, phase_str="", log=True):
  """Evaluate `model` on every batch of `dataloader`; return (accuracy, AUC).

  The model is switched to eval mode and left there.  Predictions are the
  argmax over dim 1 of whatever `model.forward` returns; the same per-class
  scores are passed to `roc_auc_score` against a one-hot encoding of the labels.

  :param model: classifier whose forward takes (claim_embeds, pod_embeds)
  :param dataloader: yields (claim_embeds, pod_embeds, labels) batches
  :param phase_str: prefix for the printed lines
  :param log: print the two metrics when True
  :return: (accuracy, weighted AUC)
  """
  model.eval()

  pred_chunks, truth_chunks, score_chunks = [], [], []

  for claim_embeds, pod_embeds, labels in dataloader:
    claim_embeds = claim_embeds.to(device)
    pod_embeds = pod_embeds.to(device)
    truth_chunks.append(labels.cpu().detach().numpy())

    with torch.no_grad():
      scores = model.forward(claim_embeds, pod_embeds)
      score_chunks.append(scores.cpu().detach().numpy())
      pred_chunks.append(scores.argmax(dim=1).cpu().detach().numpy())

  test_preds = np.hstack(pred_chunks)
  test_truth = np.hstack(truth_chunks)
  test_logits = np.vstack(score_chunks)

  # One-hot encode the labels: one row per sample, one column per score column.
  test_truth_one_hot = np.zeros(test_logits.shape)
  row_index = np.arange(len(test_truth))
  test_truth_one_hot[row_index, test_truth] = 1

  acc = accuracy_score(test_truth, test_preds)
  auc = roc_auc_score(test_truth_one_hot, test_logits, average='weighted', multi_class='ovr')

  if log:
    print(f"{phase_str}Accuracy: {acc}")
    print(f"{phase_str}AUC Score: {auc}")

  return acc, auc

In [None]:
#compute_stats_sbert(sb_classifier, test_dataloader, phase_str=f"Epoch {epoch} Testing ")

In [None]:
## Hyper-parameters and objects for the Sentence-BERT embedding classifier

# File name of the checkpoint written after training.
sbert_model_name = "sbert_linear.pth"

sb_lr = 1e-5
sb_epochs = 1001

sb_classifier = SBERTAggregationClassifier().to(device)
sb_optimizier = torch.optim.Adam(sb_classifier.parameters(), lr=sb_lr)

# Focal loss is the active objective; nn.CrossEntropyLoss() is the plain alternative.
sb_loss = FocalLoss(gamma=1)

In [None]:
# Train the classifier and keep the weights from the epoch with the best test AUC.
# NOTE(review): the checkpoint is selected on the same test split that is then
# reported, so the "best" numbers are optimistic; a held-out validation split
# would avoid that.
high_auc = 0
high_acc = 0
best_model_state_dict = None

accuracies = []
aucs = []

for epoch in range(sb_epochs):
  sb_classifier.train()

  for claim_embeds, pod_embeds, labels in train_dataloader:
    claim_embeds = claim_embeds.to(device)
    pod_embeds = pod_embeds.to(device)
    labels = labels.to(device)

    sb_classifier.zero_grad()

    # Call the module (not .forward) so registered hooks run.
    outputs = sb_classifier(claim_embeds, pod_embeds)

    loss = sb_loss(outputs, labels)
    loss.backward()
    sb_optimizier.step()

  acc, auc = compute_stats_sbert(sb_classifier, test_dataloader, log=False)

  if auc > high_auc:
    high_auc = auc
    high_acc = acc
    # state_dict() returns references to the live parameter tensors, which the
    # optimizer keeps updating in place.  Snapshot them (on the CPU) so the
    # saved checkpoint really is this epoch's model and not the final one.
    best_model_state_dict = {
        name: tensor.detach().cpu().clone()
        for name, tensor in sb_classifier.state_dict().items()
    }

  accuracies.append(acc)
  aucs.append(auc)

  # max(1, ...) keeps the modulo valid when sb_epochs < 10 (sb_epochs // 10 == 0).
  if epoch % max(1, sb_epochs // 10) == 0:
    _,_ = compute_stats_sbert(sb_classifier, train_dataloader, phase_str=f"Epoch {epoch} Training ")
    _,_ = compute_stats_sbert(sb_classifier, test_dataloader, phase_str=f"Epoch {epoch} Testing ")
    print(f"Epoch {epoch}, loss: {loss.item()}")

## Save best model after training
torch.save(best_model_state_dict, f"{trained_models_output_dir}/{sbert_model_name}")
print(f"Best Model ROC AUC: {high_auc}")
print(f"Best Model Accuracy: {high_acc}")

In [None]:
# Highest per-epoch test accuracy, then highest per-epoch test AUC.
# Each maximum is taken independently, so they may come from different epochs.
for history in (accuracies, aucs):
    print(max(history))

# Bert Embedding Classifier

In [None]:
class BERTAggregationClassifier(nn.Module):
  """Classification head on top of a frozen BERT encoder.

  All encoder parameters have ``requires_grad`` turned off; only the two-layer
  head is trainable.  ``forward`` returns raw logits.  The losses used in this
  notebook are built on cross-entropy, which applies log-softmax itself, so
  softmax must not be applied before the loss.  Use ``predict_proba`` when
  probabilities are needed.
  """

  def __init__(self, bert_model, output_size=6):
    """
    :param bert_model: encoder whose output at index 1 is the pooled sentence vector
    :param output_size: number of classes
    """
    super().__init__()

    self.bert_model = bert_model

    # Prefer the encoder's declared hidden size; fall back to the out_features
    # of its second-to-last sub-module for encoders that carry no config.
    input_size = getattr(getattr(bert_model, "config", None), "hidden_size", None)
    if input_size is None:
      input_size = list(bert_model.modules())[-2].out_features

    for param in self.bert_model.parameters():
      param.requires_grad = False

    self.classifier = nn.Sequential(
          nn.Linear(input_size, input_size // 2, bias=True),
          nn.ReLU(),
          nn.Dropout(0.5),
          nn.Linear(input_size // 2, output_size, bias=True),
      )

    # Not applied in forward(); used by predict_proba().
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input_ids, token_type_ids, attention_mask):
    """Return raw class logits for a batch of tokenised claim pairs."""
    # Pass by keyword: BertModel.forward's positional order is
    # (input_ids, attention_mask, token_type_ids), so a positional call in this
    # method's parameter order would swap the mask and the segment ids.
    encoded = self.bert_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    )
    return self.classifier(encoded[1])

  def predict_proba(self, input_ids, token_type_ids, attention_mask):
    """Return softmax-normalised class probabilities (each row sums to 1)."""
    return self.softmax(self(input_ids, token_type_ids, attention_mask))

In [None]:
class BERTDataset(Dataset):
    """Tokenised (fact-checked claim, podcast claim) sentence pairs with labels.

    Each pair is tokenised once at construction time, padded AND truncated to
    ``max_length`` so every item has the same length and batches can be stacked.
    """

    def __init__(self, labeled_data, bert_tokenizer, device='cuda', max_length=400):
        """
        :param labeled_data: table with 'Fact Checked Claim', 'Podcast Claim'
                             and 'Stance Agreement' columns
        :param bert_tokenizer: callable tokenizer applied to each claim pair
        :param device: unused; kept so existing call sites keep working
        :param max_length: token length every pair is padded / truncated to
        """
        fc_claims = list(labeled_data['Fact Checked Claim'])

        # strip podcast claims
        pod_claims = [x.strip() for x in list(labeled_data['Podcast Claim'])]

        # Use the tokenizer that was passed in (not a notebook-level global), and
        # truncate: padding='max_length' alone leaves over-long pairs longer than
        # max_length, which breaks batching.
        self.embeddings = [
            bert_tokenizer(
                x,
                y,
                return_tensors="pt",
                max_length=max_length,
                padding='max_length',
                truncation=True,
            )
            for x, y in zip(fc_claims, pod_claims)
        ]

        # cast labels to ints, subtract one (goes from 1-6 to 0-5, easier for argmax comparison)
        self.labels = [int(x) - 1 for x in list(labeled_data['Stance Agreement'])]

    def __len__(self):
        """Number of claim pairs."""
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return (input_ids, token_type_ids, attention_mask, zero-based label)."""
        encoded = self.embeddings[idx]
        # squeeze(0) drops only the leading batch dimension added by return_tensors="pt".
        return (
            encoded['input_ids'].squeeze(0),
            encoded['token_type_ids'].squeeze(0),
            encoded['attention_mask'].squeeze(0),
            self.labels[idx],
        )

In [None]:
# Load BERT model and tokenizer

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = BertModel.from_pretrained('bert-large-uncased')
model = model.to(device)

# Tokenise the train and test splits
train_dataset = BERTDataset(labeled_data=train_labeled_data, bert_tokenizer=tokenizer, device=device)
test_dataset = BERTDataset(labeled_data=test_labeled_data, bert_tokenizer=tokenizer, device=device)

# Reshuffle the training pairs every epoch; without shuffle=True the DataLoader
# yields the exact same mini-batches in the exact same order on every epoch.
# The evaluation loader keeps a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=4)

In [None]:
## Hyper-parameters and objects for the classifier on top of the BERT encoder

# File name of the checkpoint written after training.
bert_frozen_model_name = "bert_frozen_linear.pth"

bert_lr = 1e-4
bert_epochs = 11

bert_classifier = BERTAggregationClassifier(bert_model=model).to(device)
bert_optimizier = torch.optim.Adam(bert_classifier.parameters(), lr=bert_lr)

# Focal loss is the active objective; nn.CrossEntropyLoss() is the plain alternative.
bert_loss = FocalLoss(gamma=1)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score

def compute_stats_bert(test_model, dataloader, phase_str="", log=True):
  """Evaluate `test_model` on every batch of `dataloader`; return (accuracy, AUC).

  The model is switched to eval mode and left there.  Predictions are the
  argmax over dim 1 of whatever `test_model.forward` returns; the same
  per-class scores are passed to `roc_auc_score` against a one-hot encoding
  of the labels.

  :param test_model: classifier whose forward takes (input_ids, token_type_ids, attention_mask)
  :param dataloader: yields (input_ids, token_type_ids, attention_mask, labels) batches
  :param phase_str: prefix for the printed lines
  :param log: print the two metrics when True
  :return: (accuracy, weighted AUC)
  """
  test_model.eval()

  pred_chunks, truth_chunks, score_chunks = [], [], []

  for input_ids, token_type_ids, attention_mask, labels in dataloader:
    input_ids = input_ids.to(device)
    token_type_ids = token_type_ids.to(device)
    attention_mask = attention_mask.to(device)
    truth_chunks.append(labels.cpu().detach().numpy())

    with torch.no_grad():
      scores = test_model.forward(input_ids, token_type_ids, attention_mask)
      score_chunks.append(scores.cpu().detach().numpy())
      pred_chunks.append(scores.argmax(dim=1).cpu().detach().numpy())

  test_preds = np.hstack(pred_chunks)
  test_truth = np.hstack(truth_chunks)
  test_logits = np.vstack(score_chunks)

  # One-hot encode the labels: one row per sample, one column per score column.
  test_truth_one_hot = np.zeros(test_logits.shape)
  row_index = np.arange(len(test_truth))
  test_truth_one_hot[row_index, test_truth] = 1

  acc = accuracy_score(test_truth, test_preds)
  auc = roc_auc_score(test_truth_one_hot, test_logits, average='weighted', multi_class='ovr')

  if log:
    print(f"{phase_str}Accuracy: {acc}")
    print(f"{phase_str}AUC Score: {auc}")

  return acc, auc

In [None]:
# Train the classifier and keep the weights from the epoch with the best test AUC.
# NOTE(review): the checkpoint is selected on the same test split that is then
# reported, so the "best" numbers are optimistic; a held-out validation split
# would avoid that.
high_auc = 0
high_acc = 0
best_model_state_dict = None

accuracies = []
aucs = []

for epoch in range(bert_epochs):
  bert_classifier.train()

  for input_ids, token_type_ids, attention_mask, labels in train_dataloader:
    input_ids = input_ids.to(device)
    token_type_ids = token_type_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)

    bert_classifier.zero_grad()

    # Call the module (not .forward) so registered hooks run.
    outputs = bert_classifier(input_ids, token_type_ids, attention_mask)
    loss = bert_loss(outputs, labels)
    loss.backward()
    bert_optimizier.step()

  acc, auc = compute_stats_bert(bert_classifier, test_dataloader, log=False)

  if auc > high_auc:
    high_auc = auc
    high_acc = acc
    # state_dict() returns references to the live parameter tensors, which the
    # optimizer keeps updating in place.  Snapshot them (on the CPU, to spare
    # GPU memory) so the saved checkpoint really is this epoch's model.
    best_model_state_dict = {
        name: tensor.detach().cpu().clone()
        for name, tensor in bert_classifier.state_dict().items()
    }

  accuracies.append(acc)
  aucs.append(auc)

  # max(1, ...) keeps the modulo valid when bert_epochs < 10 (bert_epochs // 10 == 0).
  if epoch % max(1, bert_epochs // 10) == 0:
    _,_ = compute_stats_bert(bert_classifier, train_dataloader, phase_str=f"Epoch {epoch} Training ")
    _,_ = compute_stats_bert(bert_classifier, test_dataloader, phase_str=f"Epoch {epoch} Testing ")
    print(f"Epoch {epoch}, loss: {loss.item()}")

## Save best model after training
torch.save(best_model_state_dict, f"{trained_models_output_dir}/{bert_frozen_model_name}")
print(f"Best Model ROC AUC: {high_auc}")
print(f"Best Model Accuracy: {high_acc}")

In [None]:
# Highest per-epoch test accuracy, then highest per-epoch test AUC.
# Each maximum is taken independently, so they may come from different epochs.
for history in (accuracies, aucs):
    print(max(history))
# compute_stats_bert(bert_classifier, test_dataloader, phase_str=f"Epoch {epoch} Testing")

# Fine Tuning BERT Classifier

In [None]:
class BERTFTAggregationClassifier(nn.Module):
  """Classification head on top of a BERT encoder that is fine-tuned end to end.

  Unlike a frozen-encoder setup, the encoder's parameters are left trainable
  here.  ``forward`` returns raw logits.  The losses used in this notebook are
  built on cross-entropy, which applies log-softmax itself, so softmax must
  not be applied before the loss.  Use ``predict_proba`` when probabilities
  are needed.
  """

  def __init__(self, bert_model, output_size=6):
    """
    :param bert_model: encoder whose output at index 1 is the pooled sentence vector
    :param output_size: number of classes
    """
    super().__init__()

    self.bert_model = bert_model

    # Prefer the encoder's declared hidden size; fall back to the out_features
    # of its second-to-last sub-module for encoders that carry no config.
    input_size = getattr(getattr(bert_model, "config", None), "hidden_size", None)
    if input_size is None:
      input_size = list(bert_model.modules())[-2].out_features

    self.classifier = nn.Sequential(
          nn.Linear(input_size, input_size // 2, bias=True),
          nn.ReLU(),
          nn.Dropout(0.5),
          nn.Linear(input_size // 2, output_size, bias=True),
      )

    # Not applied in forward(); used by predict_proba().
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input_ids, token_type_ids, attention_mask):
    """Return raw class logits for a batch of tokenised claim pairs."""
    # Pass by keyword: BertModel.forward's positional order is
    # (input_ids, attention_mask, token_type_ids), so a positional call in this
    # method's parameter order would swap the mask and the segment ids.
    encoded = self.bert_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    )
    return self.classifier(encoded[1])

  def predict_proba(self, input_ids, token_type_ids, attention_mask):
    """Return softmax-normalised class probabilities (each row sums to 1)."""
    return self.softmax(self(input_ids, token_type_ids, attention_mask))

In [None]:
class BERTDataset(Dataset):
    """Tokenised (fact-checked claim, podcast claim) sentence pairs with labels.

    Each pair is tokenised once at construction time, padded AND truncated to
    ``max_length`` so every item has the same length and batches can be stacked.
    """

    def __init__(self, labeled_data, bert_tokenizer, device='cuda', max_length=400):
        """
        :param labeled_data: table with 'Fact Checked Claim', 'Podcast Claim'
                             and 'Stance Agreement' columns
        :param bert_tokenizer: callable tokenizer applied to each claim pair
        :param device: unused; kept so existing call sites keep working
        :param max_length: token length every pair is padded / truncated to
        """
        fc_claims = list(labeled_data['Fact Checked Claim'])

        # strip podcast claims
        pod_claims = [x.strip() for x in list(labeled_data['Podcast Claim'])]

        # Use the tokenizer that was passed in (not a notebook-level global), and
        # truncate: padding='max_length' alone leaves over-long pairs longer than
        # max_length, which breaks batching.
        self.embeddings = [
            bert_tokenizer(
                x,
                y,
                return_tensors="pt",
                max_length=max_length,
                padding='max_length',
                truncation=True,
            )
            for x, y in zip(fc_claims, pod_claims)
        ]

        # cast labels to ints, subtract one (goes from 1-6 to 0-5, easier for argmax comparison)
        self.labels = [int(x) - 1 for x in list(labeled_data['Stance Agreement'])]

    def __len__(self):
        """Number of claim pairs."""
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return (input_ids, token_type_ids, attention_mask, zero-based label)."""
        encoded = self.embeddings[idx]
        # squeeze(0) drops only the leading batch dimension added by return_tensors="pt".
        return (
            encoded['input_ids'].squeeze(0),
            encoded['token_type_ids'].squeeze(0),
            encoded['attention_mask'].squeeze(0),
            self.labels[idx],
        )

In [None]:
# Load a fresh BERT model and tokenizer for the fine-tuning run

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = BertModel.from_pretrained('bert-large-uncased')
model = model.to(device)

# Tokenise the train and test splits
train_dataset = BERTDataset(labeled_data=train_labeled_data, bert_tokenizer=tokenizer, device=device)
test_dataset = BERTDataset(labeled_data=test_labeled_data, bert_tokenizer=tokenizer, device=device)

# Reshuffle the training pairs every epoch; without shuffle=True the DataLoader
# yields the exact same mini-batches in the exact same order on every epoch.
# The evaluation loader keeps a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=4)

In [None]:
## Hyper-parameters and objects for the fine-tuned BERT classifier

# File name of the checkpoint written after training.
bert_ft_frozen_model_name = "bert_finetuned_linear.pth"

bert_ft_lr = 1e-4
bert_ft_epochs = 11

bert_ft_classifier = BERTFTAggregationClassifier(bert_model=model).to(device)
bert_ft_optimizier = torch.optim.Adam(bert_ft_classifier.parameters(), lr=bert_ft_lr)

# Focal loss is the active objective; nn.CrossEntropyLoss() is the plain alternative.
bert_ft_loss = FocalLoss(gamma=1)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score

def compute_stats_bert(test_model, dataloader, phase_str="", log=True):
  """Evaluate `test_model` on every batch of `dataloader`; return (accuracy, AUC).

  NOTE: a function with this same name is also defined earlier in the
  notebook; running this cell re-binds the name to this definition.

  The model is switched to eval mode and left there.  Predictions are the
  argmax over dim 1 of whatever `test_model.forward` returns; the same
  per-class scores are passed to `roc_auc_score` against a one-hot encoding
  of the labels.

  :param test_model: classifier whose forward takes (input_ids, token_type_ids, attention_mask)
  :param dataloader: yields (input_ids, token_type_ids, attention_mask, labels) batches
  :param phase_str: prefix for the printed lines
  :param log: print the two metrics when True
  :return: (accuracy, weighted AUC)
  """
  test_model.eval()

  collected_preds = []
  collected_truth = []
  collected_scores = []

  for input_ids, token_type_ids, attention_mask, labels in dataloader:
    input_ids = input_ids.to(device)
    token_type_ids = token_type_ids.to(device)
    attention_mask = attention_mask.to(device)
    collected_truth.append(labels.cpu().detach().numpy())

    with torch.no_grad():
      batch_scores = test_model.forward(input_ids, token_type_ids, attention_mask)
      collected_scores.append(batch_scores.cpu().detach().numpy())
      batch_preds = batch_scores.argmax(dim=1)
      collected_preds.append(batch_preds.cpu().detach().numpy())

  test_preds = np.hstack(collected_preds)
  test_truth = np.hstack(collected_truth)
  test_logits = np.vstack(collected_scores)

  # One-hot encode the labels: one row per sample, one column per score column.
  test_truth_one_hot = np.zeros(test_logits.shape)
  test_truth_one_hot[np.arange(len(test_truth)), test_truth] = 1

  acc = accuracy_score(test_truth, test_preds)
  auc = roc_auc_score(test_truth_one_hot, test_logits, average='weighted', multi_class='ovr')

  if log:
    print(f"{phase_str}Accuracy: {acc}")
    print(f"{phase_str}AUC Score: {auc}")

  return acc, auc

In [None]:
# Fine-tune the classifier and keep the weights from the epoch with the best test AUC.
# NOTE(review): the checkpoint is selected on the same test split that is then
# reported, so the "best" numbers are optimistic; a held-out validation split
# would avoid that.
high_auc = 0
high_acc = 0
best_model_state_dict = None

accuracies = []
aucs = []

for epoch in range(bert_ft_epochs):
  bert_ft_classifier.train()

  for input_ids, token_type_ids, attention_mask, labels in train_dataloader:
    input_ids = input_ids.to(device)
    token_type_ids = token_type_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)

    bert_ft_classifier.zero_grad()

    # Call the module (not .forward) so registered hooks run.
    outputs = bert_ft_classifier(input_ids, token_type_ids, attention_mask)
    loss = bert_ft_loss(outputs, labels)
    loss.backward()
    bert_ft_optimizier.step()

  acc, auc = compute_stats_bert(bert_ft_classifier, test_dataloader, log=False)

  if auc > high_auc:
    high_auc = auc
    high_acc = acc
    # state_dict() returns references to the live parameter tensors, which the
    # optimizer keeps updating in place.  Snapshot them (on the CPU, to spare
    # GPU memory) so the saved checkpoint really is this epoch's model.
    best_model_state_dict = {
        name: tensor.detach().cpu().clone()
        for name, tensor in bert_ft_classifier.state_dict().items()
    }

  accuracies.append(acc)
  aucs.append(auc)

  # max(1, ...) keeps the modulo valid when bert_ft_epochs < 10 (bert_ft_epochs // 10 == 0).
  if epoch % max(1, bert_ft_epochs // 10) == 0:
    _,_ = compute_stats_bert(bert_ft_classifier, train_dataloader, phase_str=f"Epoch {epoch} Training ")
    _,_ = compute_stats_bert(bert_ft_classifier, test_dataloader, phase_str=f"Epoch {epoch} Testing ")
    print(f"Epoch {epoch}, loss: {loss.item()}")

## Save best model after training
torch.save(best_model_state_dict, f"{trained_models_output_dir}/{bert_ft_frozen_model_name}")
print(f"Best Model ROC AUC: {high_auc}")
print(f"Best Model Accuracy: {high_acc}")

In [None]:
# Highest per-epoch test accuracy, then highest per-epoch test AUC
# (each maximum is taken independently, so they may come from different epochs).
for history in (accuracies, aucs):
    print(max(history))

# Re-evaluate the classifier as it stands after the last epoch; the returned
# (accuracy, AUC) tuple is the cell's displayed value.
compute_stats_bert(bert_ft_classifier, test_dataloader, phase_str=f"Epoch {epoch} Testing")