In [None]:
!pip install -U sentence-transformers
# !pip install transformers

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import time
import datetime

from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer, util

from google.colab import drive
# Mount Google Drive so the project folders under /drive/MyDrive are readable.
drive.mount('/drive')

# Run on the GPU when one is attached to the runtime, otherwise fall back to CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = 'cpu'

# Load Data

In [None]:
class PodcastClaims:
	"""A window of consecutive transcript claims taken from one podcast episode.

	Attributes (all set in __init__):
		show_uri / episode_uri: identifiers passed in by the caller, stored as-is.
		center_claim: the element at position len(transcript_claims) // 2.
		context_claim: every claim in the window joined with ".".
		index: caller-supplied index, stored as-is.
	"""

	def __init__(self, show_uri, episode_uri, transcript_claims, index):
		# For an even-length window this picks the upper of the two middle claims.
		middle_position = len(transcript_claims) // 2

		self.show_uri = show_uri
		self.episode_uri = episode_uri
		self.center_claim = transcript_claims[middle_position]
		self.context_claim = ".".join(transcript_claims)
		self.index = index

In [None]:
## Load manually labeled data
# Path configuration copied from colab-setup; update this cell if that setup changes.

# Project root on the mounted Drive
parent_dir = '/drive/MyDrive/spotify-misinformation'

# --- Directories -------------------------------------------------------------
preprocessing_output_dir = f"{parent_dir}/preprocessing-output"
matched_claims_output_dir = f"{parent_dir}/matched-claims-output"
modeling_output_dir = f"{parent_dir}/modeling-output"
trained_models_output_dir = f"{modeling_output_dir}/trained-models"

# --- Inputs: fact-checked claims, podcast transcript claims, matched pairs ----
fact_checked_claims_fp = f"{preprocessing_output_dir}/politifact_filtered.csv"
transcript_claims_fp = f"{preprocessing_output_dir}/podcast_claims_context_2.tsv"
matched_claims_fp = f"{matched_claims_output_dir}/matched_claims_context_2.txt"

# --- Outputs: raw predicted labels, then predictions joined with veracity -----
predicted_mc_fp = f"{modeling_output_dir}/only_predicted_label_predicted_mc_context_2.txt"
predicted_mc_veracity_fp = f"{modeling_output_dir}/predicted_mc_context_2.txt"

# utility functions to read data

# Fact-checked claims; the 'Statement', 'Date' and 'Label' columns are read below.
claims_df = pd.read_csv(fact_checked_claims_fp)

# One entry per TSV line, each a list of its tab-separated fields.
# get_pc_claim treats fields[2:] as the claim text, so the first two fields
# are metadata columns.
# The encoding is explicit so the transcript text is decoded the same way on
# every platform instead of depending on the locale default.
with open(transcript_claims_fp, 'r', encoding='utf-8') as all_transcripts:
  pc_claims = [line.strip().split("\t") for line in all_transcripts]

def get_kb_claim(kb_idx):
  """Return the 'Statement' text of the fact-checked claim labelled `kb_idx`.

  `kb_idx` may be an int, a float or a numeric string; it is coerced with int()
  and used as an index label on claims_df.
  """
  row_label = int(kb_idx)
  statements = claims_df['Statement']
  return statements[row_label]

def get_pc_claim(pc_idx):
  """Return the podcast claim text stored at position `pc_idx` of pc_claims.

  The first two fields of the row are skipped and the remaining fields are
  joined with "." into a single string.
  """
  row_fields = pc_claims[int(pc_idx)]
  claim_parts = row_fields[2:]
  return ".".join(claim_parts)

def get_kb_claim_date(kb_idx):
  """Return the 'Date' value of the fact-checked claim labelled `kb_idx`.

  `kb_idx` is coerced with int() and used as an index label on claims_df.
  """
  row_label = int(kb_idx)
  dates = claims_df['Date']
  return dates[row_label]

In [None]:
# The matched-claims file is read without a header row; these names are applied
# to its columns in order.
columns = [
    'Fact Checked Claim Index',
    'Podcast Claim Index',
    'Cosine Similarity Score',
]

# Most similar pairs first.
# (Append [3000:] to the chain to skip the 3000 highest-scoring pairs.)
mc_df = (
    pd.read_csv(matched_claims_fp, names=columns)
    .sort_values(by='Cosine Similarity Score', ascending=False)
)

# SentenceBERT

In [None]:
class SBERTAggregationClassifier(nn.Module):
  """Feed-forward classifier over a pair of sentence embeddings.

  The two embeddings are concatenated, reduced back to `input_size` by the
  aggregator, and passed through a two-layer classifier head. The forward pass
  returns softmax probabilities over `output_size` classes.

  Args:
    input_size: width of each individual embedding.
    output_size: number of classes.
  """

  def __init__(self, input_size=384, output_size=6):
    super().__init__()

    paired_width = input_size * 2

    # Module names and layer positions are part of the saved state_dict keys,
    # so they must not change.
    self.aggregator = nn.Sequential(
        nn.Linear(paired_width, input_size, bias=True),
        nn.ReLU(),
        nn.Dropout(0.5),
    )

    self.classifier = nn.Sequential(
        nn.Linear(input_size, input_size, bias=True),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(input_size, output_size, bias=True),
    )

    self.softmax = nn.Softmax(dim=1)

  def forward(self, fc_embed, pod_embed):
    """Return per-class probabilities for each (fact-check, podcast) embedding pair.

    Both inputs are joined along dim 1, so they need the same size in dim 0.
    """
    paired = torch.cat((fc_embed, pod_embed), dim=1)
    hidden = self.aggregator(paired)
    class_scores = self.classifier(hidden)
    # Despite the upstream naming, what leaves this method is already normalized.
    return self.softmax(class_scores)

In [None]:
class SBERTPredictionDataset(Dataset):
    """Dataset of pre-computed embedding pairs: (fact-checked claim, podcast claim).

    Both text lists are encoded once at construction time with
    `sbert_model.encode(..., convert_to_tensor=True)`; item `i` is the pair of
    embeddings at position `i` of each list. The length is taken from the
    fact-checked claim embeddings.
    """

    def __init__(self, kb_claims, podcast_claims, sbert_model):
        encode = sbert_model.encode
        self.claim_embeddings = encode(kb_claims, convert_to_tensor=True)
        self.podcast_embeddings = encode(podcast_claims, convert_to_tensor=True)

    def __len__(self):
        pair_count = len(self.claim_embeddings)
        return pair_count

    def __getitem__(self, idx):
        claim_vector = self.claim_embeddings[idx]
        podcast_vector = self.podcast_embeddings[idx]
        return claim_vector, podcast_vector

In [None]:
def get_predictions(model, dataloader):
  """Run `model` over every batch of `dataloader` and return the predicted class ids.

  Args:
    model: an nn.Module called as model(claim_embeds, pod_embeds) that returns a
      2-D score tensor (one row per pair, one column per class).
    dataloader: iterable yielding (claim_embeds, pod_embeds) tensor pairs.

  Returns:
    A flat list with one predicted class index per input row (argmax over
    dim 1), in dataloader order. An empty list when the dataloader yields no
    batches.
  """
  model.eval()

  # Send inputs to wherever the model's weights live instead of relying on a
  # module-level `device`, so model and inputs can never end up on different
  # devices. A parameter-free model leaves the inputs where they are.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else None

  batch_preds = []
  with torch.no_grad():
    for claim_embeds, pod_embeds in dataloader:
      if target_device is not None:
        claim_embeds = claim_embeds.to(target_device)
        pod_embeds = pod_embeds.to(target_device)

      # Call the module itself (not .forward) so registered hooks still run.
      scores = model(claim_embeds, pod_embeds)
      batch_preds.append(scores.argmax(dim=1).cpu().numpy())

  # np.hstack raises ValueError on an empty sequence.
  if not batch_preds:
    return []

  return list(np.hstack(batch_preds))

In [None]:
# Sentence encoder used to embed both the fact-checked and the podcast claims.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

sbert_model_name = "sbert_linear.pth"

sb_classifier = SBERTAggregationClassifier()
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU still loads on a CPU-only runtime.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
sb_classifier.load_state_dict(
    torch.load(f"{trained_models_output_dir}/{sbert_model_name}", map_location=device)
)
sb_classifier = sb_classifier.to(device)
# Inference only: eval() disables the Dropout layers.
sb_classifier.eval()

In [None]:
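## Uncomment after a runtime disconnect: counts the lines already written to predicted_mc_fp, to find where the prediction loop should resume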

# count = 0

# with open(predicted_mc_fp, 'r') as matched_claims_predicted:
#   for line in matched_claims_predicted:
#     count += 1

# print(count)


In [None]:
# Keep a random half of the matched pairs; the fixed seed makes the draw repeatable.
# NOTE: mc_df is overwritten, so running this cell a second time halves it again.
mc_df = mc_df.sample(random_state=11, frac=0.5)
print(mc_df.shape[0])

In [None]:
batch_size = 40000
# Number of leading rows of mc_df to skip. After a runtime disconnect, set this
# to the number of lines already present in predicted_mc_fp.
resume_from = 0
start_time = time.time()


def _write_batch_predictions(batch_rows, out_file):
  """Embed, classify and append one batch of matched claims to `out_file`.

  Args:
    batch_rows: list of (fact-checked claim index, podcast claim index,
      cosine similarity score) tuples.
    out_file: open text file; one "fc_idx, pc_idx, score, pred" line is
      written per row.

  Nothing is encoded or written for an empty batch.
  """
  if not batch_rows:
    # Pushing zero rows through the encoder / np.hstack fails downstream.
    return

  kb_claims = [get_kb_claim(fc_idx) for fc_idx, _, _ in batch_rows]
  # get_pc_claim drops the first two TSV fields, so only claim text is embedded.
  # The previous inline join used the whole row, including those metadata fields.
  # NOTE(review): if the classifier was trained on the full joined row, restore
  # ".".join(pc_claims[int(pc_idx)]) here.
  podcast_claims = [get_pc_claim(pc_idx) for _, pc_idx, _ in batch_rows]

  pred_dataset = SBERTPredictionDataset(kb_claims, podcast_claims, sbert_model=model)
  pred_dataloader = DataLoader(pred_dataset, batch_size=4096)
  preds = get_predictions(sb_classifier, pred_dataloader)

  for (fc_idx, pc_idx, score), pred in zip(batch_rows, preds):
    out_file.write(f"{int(fc_idx)}, {int(pc_idx)}, {score}, {pred}\n")

  # Push the batch to disk so a disconnect loses at most the batch in flight.
  out_file.flush()


# Append mode keeps earlier output when resuming. Starting over without deleting
# the file first will therefore duplicate rows.
with open(predicted_mc_fp, 'a') as matched_claims_predicted:

  mc_rows = []
  idx = -1  # stays -1 when mc_df is empty, so the final progress print still works

  # Zipping the columns by name avoids building a Series per row.
  row_iter = zip(
      mc_df['Fact Checked Claim Index'],
      mc_df['Podcast Claim Index'],
      mc_df['Cosine Similarity Score'],
  )

  for idx, row in enumerate(row_iter):
    if idx < resume_from:
      continue

    mc_rows.append(row)

    if len(mc_rows) >= batch_size:
      _write_batch_predictions(mc_rows, matched_claims_predicted)
      mc_rows = []
      print(idx, time.time() - start_time)

  ## Run for the last incomplete batch (no-op when nothing is left over)
  _write_batch_predictions(mc_rows, matched_claims_predicted)
  mc_rows = []

  print(idx, time.time() - start_time)

## Adding ground truth label and misinformation label

In [None]:
# FINE-GRAINED MAPPING
#
# Each list is indexed by the classifier's predicted class (0-5) and gives the
# final misinformation label for a fact-checked claim with that PolitiFact verdict.

# Verdicts on the "true" side
_true_side = ['True', 'Potentially True', 'Misinformation',
              'Potential Misinformation', 'Unrelated', 'Inconclusive']
# HALF-TRUE LABELS ARE TRICKY
_half_true_side = ['Misinformation', 'Potential Misinformation', 'Potential Misinformation',
                   'Potential Misinformation', 'Unrelated', 'Inconclusive']
# Verdicts on the "false" side (barely true and mostly false can be the same)
_false_side = ['Misinformation', 'Potential Misinformation', 'True',
               'Potentially True', 'Unrelated', 'Inconclusive']

# Every verdict gets its own list object (a copy), so editing one never affects another.
true_mapping = list(_true_side)
mostly_true_mapping = list(_true_side)
half_true_mapping = list(_half_true_side)
barely_true_mapping = list(_false_side)
mostly_false_mapping = list(_false_side)
false_mapping = list(_false_side)
pants_fire_mapping = list(_false_side)

# Final lookup table; the position of each list must match claim_indexing below
mappings = [
    true_mapping,
    mostly_true_mapping,
    half_true_mapping,
    barely_true_mapping,
    mostly_false_mapping,
    false_mapping,
    pants_fire_mapping,
]

# PolitiFact label -> position of its list in `mappings`
claim_indexing = {
    label: position
    for position, label in enumerate([
        'true',
        'mostly-true',
        'half-true',
        'barely-true',
        'mostly-false',
        'false',
        'pants-fire',
    ])
}

In [None]:
# Show how many fact-checked claims carry each 'Label' value. Every value listed
# here needs a key in claim_indexing, otherwise the claim_indexing[kb_label]
# lookup in the next cell raises KeyError.
claims_df['Label'].value_counts()

In [None]:
# Join every prediction with the PolitiFact label of its fact-checked claim and
# translate the (label, predicted class) pair into a misinformation label.
with open(predicted_mc_fp, 'r') as matched_claims_predicted, \
     open(predicted_mc_veracity_fp, 'w') as mc_veracity:

  for line in matched_claims_predicted:
    if not line.strip():
      # Skip blank lines (e.g. a trailing newline) instead of failing on t[3].
      continue

    # Predictions were written as "a, b, c, d". Strip each field so the space
    # after every comma is not carried into the output, where it would double up.
    fc_idx, pc_idx, score, pred = [field.strip() for field in line.split(',')][:4]

    prediction = int(float(pred))
    kb_label = claims_df['Label'][int(float(fc_idx))]
    misinformation_label = mappings[claim_indexing[kb_label]][prediction]

    mc_veracity.write(f"{fc_idx}, {pc_idx}, {score}, {pred}, {kb_label}, {misinformation_label}\n")