In [None]:
%%capture
!pip install -U pip
!pip install transformers
!pip install transformers-domain-adaptation

In [None]:
import itertools as it
import random
from pathlib import Path
from typing import Sequence, Union, Generator

import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers_domain_adaptation import DataSelector, VocabAugmentor

# Colab / Google API helpers.
import gspread
from google.auth import default
from google.colab import auth, drive
from oauth2client.client import GoogleCredentials

# Authenticate this Colab session with the user's Google account.
auth.authenticate_user()

# Mount Google Drive under /drive.
drive.mount('/drive')

# Build the gspread client from the default Google credentials; `gc` is the
# client the sheet-loading helper relies on.
creds, _ = default()
gc = gspread.authorize(creds)

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [None]:
def df_from_google_sheets(path, columns):

  """
  Load the first worksheet of a Google Sheets document into a pandas DataFrame.

  Args:
    path: Spreadsheet identifier handed to `gc.open`, where `gc` is the
      module-level authorized gspread client.
    columns: Column names for the resulting frame. Only the first
      `len(columns)` cells of each sheet row are kept.

  Returns:
    A DataFrame built from every sheet row except the first one (which is
    skipped as the header row), labelled with `columns`.
  """

  worksheet = gc.open(path).sheet1
  rows = worksheet.get_all_values()
  # Keep as many cells per row as there are column names (this used to be a
  # hard-coded 5), so the helper also works for other column counts.
  width = len(columns)
  records = [row[:width] for row in rows[1:]]

  return pd.DataFrame.from_records(records, columns=columns)

def get_pc_claims(podcast_claims):
  """
  Read a tab-separated podcast-claims file into a list of rows.

  Args:
    podcast_claims: Path of the TSV file to read.

  Returns:
    A list with one entry per line of the file; each entry is the list of
    fields obtained by stripping surrounding whitespace from the line and
    splitting it on tab characters.
  """
  # Decode explicitly as UTF-8 rather than relying on the platform default.
  with open(podcast_claims, 'r', encoding='utf-8') as all_transcripts:
    return [line.strip().split("\t") for line in all_transcripts]

def get_train_data(pc_claims, group_size=5):

  """
  Collapse repeated claims from the TSV rows into one training text per group.

  The rows are treated as consecutive groups of `group_size` entries that hold
  the same claim with different context; only the first row of each complete
  group is used. A trailing incomplete group is ignored.

  Args:
    pc_claims: List of rows, each a list of fields (as returned by
      `get_pc_claims`).
    group_size: Number of consecutive rows forming one group. Defaults to 5,
      the layout of the original TSV file.

  Returns:
    A list of strings, one per complete group, made by joining the fields
    from index 2 onward of the group's first row with '.'.

  Raises:
    ValueError: If `group_size` is smaller than 1.
  """

  if group_size < 1:
    raise ValueError("group_size must be a positive integer")

  # Integer division avoids the float round-trip of int(len(...) / 5).
  n_groups = len(pc_claims) // group_size

  return [
    '.'.join(str(field) for field in pc_claims[group * group_size][2:])
    for group in range(n_groups)
  ]

# Importing Labeled Data

In [None]:
# Column schema shared by every labeling sheet loaded below.
column_names = ['Fact Checked Claim Index', 'Podcast Claim Index', 'Fact Checked Claim', 'Podcast Claim', 'Stance Agreement']

december_labeled_data_path = '/drive/My Drive/spotify-misinformation/labeling-output/manually-labeled-matched-pairs.csv'
december_labeled_data = pd.read_csv(december_labeled_data_path)
omar_individual = df_from_google_sheets('top_3000_context_2_single_sentence_omar_individual', columns=column_names)
omar_individual_data = omar_individual[200:800].copy()
jon_individual = df_from_google_sheets('top_3000_context_2_single_sentence_jon_individual', columns=column_names)
jon_individual_data = jon_individual[200:800].copy()
omar_predictions_data = df_from_google_sheets('spotifact-predictions-labeling-omar', columns=column_names)
omar_predictions_data_2 = df_from_google_sheets('spotifact-predictions-labeling-omar-2', columns=column_names)
omar_predictions_data_3 = df_from_google_sheets('spotifact-predictions-labeling-omar-3', columns=column_names)
omar_predictions_data_retrain = df_from_google_sheets('spotifact-predictions-labeling-omar-retrain', columns=column_names)
omar_predictions_partials = df_from_google_sheets('spotifact-predictions-labeling-omar-partials-3k-13k', columns=column_names)

# NOTE(review): `omar_predictions_data_3` is loaded above but is not part of
# `frames` -- confirm whether it is excluded on purpose.
frames = [december_labeled_data, omar_individual_data, jon_individual_data, omar_predictions_data, 
          omar_predictions_data_2, omar_predictions_data_retrain, omar_predictions_partials]

total_labeled_data = pd.concat(frames) # Concatenated df with overlap and recent Jon (Omar) and Omar's 2022 labels
fine_tuning_texts_spotifact = total_labeled_data['Podcast Claim'].tolist()

# Write the fine-tuning corpus one claim per line. The context manager closes
# the file even if a write fails part-way through.
with open("/drive/My Drive/pretraining/data/ft_corpus_train.txt", "w") as textfile:
    for line in fine_tuning_texts_spotifact:
        textfile.write(line + "\n")

In [None]:
# Input: raw podcast claims (TSV).
podcast_claims = '/drive/MyDrive/spotify-misinformation/preprocessing-output/podcast_claims_context_2_reupload.tsv'

# Fine-tuning corpus, one claim per line.
ft_corpus_train = '/drive/My Drive/pretraining/data/ft_corpus_train.txt'

# Outputs: train / validation text files for domain pre-training.
dpt_corpus_train_data_selected_spotify = '/drive/My Drive/pretraining/data/spotify_subset_train_data_selected.txt'
dpt_corpus_val_spotify = '/drive/My Drive/pretraining/data/spotify_subset_val.txt'

'\npc_claims = []\n\nwith open(podcast_claims, \'r\') as all_transcripts:\n  for idx, line in enumerate(all_transcripts):\n    pc_claims.append(line.strip().split("\t"))\n'

In [None]:
# Parse the podcast-claims TSV into a list of per-line field lists.
pc_claims = get_pc_claims(podcast_claims)

In [None]:
# Collapse the repeated claim rows into one training text per group.
training_texts_spotify = get_train_data(pc_claims)

In [None]:
# Base checkpoint to adapt.
model_card = 'bert-base-uncased'
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU instead
# of failing on `.to('cuda')`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForMaskedLM.from_pretrained(model_card)
tokenizer = AutoTokenizer.from_pretrained(model_card)
model = model.to(device)

# Data Selection

In [None]:
selector = DataSelector(
    keep=0.5, 
    tokenizer=tokenizer,
    similarity_metrics=['euclidean'],
    diversity_metrics=[
        "type_token_ratio",
        "entropy",
    ],
)

# Fit on fine-tuning corpus
selector.fit(fine_tuning_texts_spotifact)

# NOTE(review): the fitted selector is never applied to the corpus in this
# cell, so `keep=0.5` and the metrics above do not filter anything; the whole
# in-domain corpus is used as-is. Confirm whether that is intentional.

# Split the in-domain corpus 80/20 into disjoint train and validation parts.
# The split index is computed once from the full corpus. Previously the
# validation slice was taken from the already-truncated training list, which
# made the validation set a subset of the training set.
train_fraction = 0.8
split_index = int(train_fraction * len(training_texts_spotify))
selected_corpus = training_texts_spotify[:split_index]
selected_corpus_val = training_texts_spotify[split_index:]

# Save both splits to disk, one document per line
Path(dpt_corpus_train_data_selected_spotify).write_text('\n'.join(selected_corpus));
Path(dpt_corpus_val_spotify).write_text('\n'.join(selected_corpus_val));

Token indices sequence length is longer than the specified maximum sequence length for this model (347881 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Compare the size of the full podcast corpus with the training split.
len(training_texts_spotify), len(selected_corpus)

(2497241, 1997792)

In [None]:
# Spot-check one document from the training split.
selected_corpus[10]

" You can make money from your podcast with no minimum listenership. It's everything you need to make a podcast in one place and makes everything super easy for you guys. So guys download the free anchor app or go to Anchor dot. F m-- to get started. I want to give a huge huge disclaimer to everyone that's listening"

# Vocabulary Augmentation

In [None]:
# Vocabulary size requested from the augmentor.
target_vocab_size = 31_000

augmentor = VocabAugmentor(tokenizer=tokenizer, cased=False, target_vocab_size=target_vocab_size)

# Extract new domain-specific tokens from the fine-tuning corpus file
new_tokens = augmentor.get_new_tokens(ft_corpus_train)

Below are some examples of extracted domain-specific terminology based on the fine-tuning corpus. 

In [None]:
# Preview the first 20 proposed tokens.
print(new_tokens[:20])

['?.', 'coronavirus', 'parenthood', 'vaccinated', 'abortions', 'mmr', 'instagram', 'millennials', 'bitcoin', 'vaccinations', 'opioid', 'cbd', 'ebola', 'obamacare', 'shootings', 'measles', 'biden', ',.', 'vaccinate', 'earthers']


In [None]:
# Register the new tokens with the tokenizer, then resize the model's token
# embeddings to the tokenizer's new length.
tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))

Embedding(31000, 768)

# Domain Adaptation (Pre-Training)

In [None]:
def _tokenize_batch(examples):
    """Tokenize a batch of text lines, truncated to the model's maximum position count."""
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )


# Load the train / validation text files written earlier, one example per line.
corpus_files = {
    "train": dpt_corpus_train_data_selected_spotify,
    "val": dpt_corpus_val_spotify,
}
datasets = load_dataset('text', data_files=corpus_files)

tokenized_datasets = datasets.map(_tokenize_batch, batched=True)

# Masked-language-modeling collator that masks 15% of the tokens.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

training_args = TrainingArguments(
    output_dir="/drive/My Drive/pretraining/domain-pretraining",
    overwrite_output_dir=True,
    seed=42,
    max_steps=1000,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy="steps",
    logging_steps=50,
    save_steps=50,
    save_total_limit=2,
    # fp16=True,
    dataloader_num_workers=2,
    disable_tqdm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
    tokenizer=tokenizer,  # This tokenizer has new tokens
)

Using custom data configuration default


Downloading and preparing dataset text/default-64682f2c81e01b20 (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/text/default-64682f2c81e01b20/0.0.0/daf90a707a433ac193b369c8cc1772139bb6cca21a9c7fe83bdd16aad9b9b6ab...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-64682f2c81e01b20/0.0.0/daf90a707a433ac193b369c8cc1772139bb6cca21a9c7fe83bdd16aad9b9b6ab. Subsequent calls will reuse this data.


HBox(children=(FloatProgress(value=0.0, max=1998.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=400.0), HTML(value='')))




In [None]:
# Launch training with the `trainer` built in the previous cell.
trainer.train()

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
50,3.106,2.861948,6825.4778,58.539
100,2.9396,2.796834,6827.9339,58.518
150,2.9156,2.738439,6723.4428,59.428
200,2.7519,2.716185,6744.4081,59.243
250,2.8871,2.693067,6640.2264,60.172
300,2.7343,2.686414,6711.2622,59.536
350,2.7061,2.670737,6838.1475,58.431
400,2.7819,2.661411,6826.8294,58.528


In [None]:
# Disabled snippet, kept inside a string literal so it does not execute:
# it reloads the adapted model and tokenizer from a saved checkpoint.
"""
checkpoint = '/drive/My Drive/pretraining/domain-pretraining/checkpoint-400'

model_adapted = AutoModelForMaskedLM.from_pretrained(checkpoint)
tokenizer_adapted = AutoTokenizer.from_pretrained(checkpoint)
model_adapted = model_adapted.to(device)
"""