In [1]:
!pip install torch
!pip install transformers
!pip install numpy
!pip install pandas



In [2]:
from transformers import BertForMaskedLM, BertTokenizer
import torch
import numpy as np

In [3]:
# set up functions
def load_model_and_tokenizer(version):
    """Load a masked-LM BERT model and its tokenizer for the same checkpoint.

    Args:
        version: checkpoint identifier passed unchanged to both
            ``from_pretrained`` calls (e.g. "bert-base-uncased").

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # The model is loaded first, then the tokenizer (tuple elements are
    # evaluated left to right).
    return (
        BertForMaskedLM.from_pretrained(version),
        BertTokenizer.from_pretrained(version),
    )

def set_device():
    """Choose the compute device: CUDA when available, otherwise the CPU.

    Returns:
        The selected ``torch.device``. Earlier versions computed the device
        but never returned it, so the choice was lost; callers that ignore
        the return value are unaffected.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return device

In [4]:
from random import randint, choice
from math import ceil
import pandas as pd
import tqdm

CHARS = "abcdefghijklmnopqrstuvwxyz1234567890"
FILE_PREFIX = ""
#FILE_PREFIX = "../"
MASK = '[MASK]'
PAD = '[PAD]'
CLS = '[CLS]'
SEP = '[SEP]'

#CLS_TENSOR = torch.Tensor([CLS])
#SEP_TENSOR = torch.Tensor([SEP])

def encode_txt(tokenizer, fname=None, sent_list=None, max_len=512):
    """Encode every line of a text file, or every sentence of a list.

    Each line/sentence is passed to ``encode_line`` and the resulting dict is
    stored as one row (indexed 0, 1, 2, ...) of the returned DataFrame.

    Args:
        tokenizer: tokenizer forwarded to ``encode_line``.
        fname: base name of a file read from
            ``COMP_550_Source_Files/<fname>.txt``. Takes precedence over
            ``sent_list`` when both are given.
        sent_list: iterable of sentences, used only when ``fname`` is None.
        max_len: length argument forwarded to ``encode_line``.

    Returns:
        A DataFrame with columns ``input_ids``, ``attention_mask``,
        ``token_type_ids`` and ``labels``.

    Raises:
        ValueError: if neither ``fname`` nor ``sent_list`` is provided.
    """
    columns = ['input_ids', 'attention_mask', 'token_type_ids', 'labels']
    data_dict = {col: {} for col in columns}

    if fname is None and sent_list is None:
        raise ValueError("Must provide text file or list of sentences to encode")

    print("Beginning encoding...")
    if fname is not None:
        # The context manager closes the file even if encoding fails midway.
        with open("COMP_550_Source_Files/" + fname + ".txt") as f:
            for line_num, line in enumerate(f):
                line_input = encode_line(line, tokenizer, max_len)

                for k in line_input:
                    data_dict[k][line_num] = line_input[k]

                if line_num % 2500 == 0:
                    print("%%Completed encoding " + str(line_num) + " lines%%")
    else:
        # enumerate supplies the row number; this branch previously used
        # line_num without ever assigning or advancing it.
        for line_num, sent in enumerate(sent_list):
            line_input = encode_line(sent, tokenizer, max_len)

            print(line_input)
            for k in line_input:
                data_dict[k][line_num] = line_input[k]

    inputs_df = pd.DataFrame(data=data_dict, columns=columns)

    return inputs_df
    
def encode_line(line, tokenizer, max_len):
    """Tokenize one line, fit it to a fixed length, and randomly mask tokens.

    The token sequence is laid out as ``[CLS] tokens [PAD]... [SEP]`` (padding
    is appended before the closing ``[SEP]``). Between 1 and roughly 15% of
    the non-padding positions are selected for replacement with ``[MASK]``;
    only tokens longer than one character whose first character is in
    ``CHARS`` are eligible, so fewer tokens may end up masked.

    Args:
        line: raw text to encode.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        max_len: target length handed to ``trunc_or_pad``.

    Returns:
        dict of 1-D int64 tensors, all of the same length:
            ``input_ids``       ids of the masked sequence,
            ``attention_mask``  1 everywhere except ``[PAD]`` positions (0),
            ``token_type_ids``  all zeros,
            ``labels``          ids of the same sequence before masking.
    """
    tokens = tokenizer.tokenize(line)
    num_padded = 0

    to_do, num = trunc_or_pad(tokens, max_len)
    if to_do == 'truncate':
        # Drop exactly `num` tokens so a truncated line has the same length
        # as a padded one (the old slice kept one token too many).
        tokens = tokens[:len(tokens) - num]
    elif to_do == 'pad':
        tokens = tokens + [PAD] * num
        num_padded = num

    tokens = [CLS] + tokens + [SEP]

    # Labels are the ids of the unmasked sequence. Taking them directly from
    # the token list keeps them aligned position-for-position with input_ids,
    # instead of re-tokenizing a reconstructed string.
    label_ids = tokenizer.convert_tokens_to_ids(tokens)
    # Padding must not be attended to.
    attention = [0 if tok == PAD else 1 for tok in tokens]

    # up to 15% of words can be masked in any one sentence
    num_real = len(tokens) - num_padded
    to_mask = randint(1, ceil(num_real * .15))
    num_masked = 0
    to_check = list(range(1, num_real))
    while num_masked < to_mask and to_check:
        index = choice(to_check)
        to_check.remove(index)

        token = tokens[index]
        # Word-piece continuations ("##...") and special tokens ("[...]")
        # start with a character outside CHARS, so they are never masked.
        if token != MASK and token[0] in CHARS and len(token) > 1:
            tokens[index] = MASK
            num_masked += 1

    ids = torch.tensor(tokenizer.convert_tokens_to_ids(tokens), dtype=torch.int64)
    line_input = {
        "input_ids": ids,
        "attention_mask": torch.tensor(attention, dtype=torch.int64),
        "token_type_ids": torch.zeros_like(ids),
        "labels": torch.tensor(label_ids, dtype=torch.int64),
    }

    return line_input

def trunc_or_pad(tokens, max_len):
    """Decide whether a token list must be shortened or extended.

    The length compared against ``max_len`` is ``len(tokens) - 1``.

    Args:
        tokens: sequence of tokens (only its length is used).
        max_len: target length.

    Returns:
        ``("truncate", k)`` when the compared length exceeds ``max_len`` by k,
        ``("pad", k)`` when it falls short by k, and ``("none", 0)`` when it
        matches exactly.
    """
    excess = (len(tokens) - 1) - max_len

    if excess > 0:
        return "truncate", excess
    if excess < 0:
        return "pad", -excess
    return "none", 0

In [5]:
from torch.utils.data import TensorDataset, random_split
#from transformers import BatchEncoding

def get_tr_and_vld_sets(inputs, num_lines, split):
    """Split encoded rows into disjoint training and validation datasets.

    Args:
        inputs: DataFrame whose ``input_ids``, ``attention_mask`` and
            ``labels`` cells each hold a tensor.
        num_lines: total number of rows to use across both sets.
        split: fraction of ``num_lines`` assigned to training; the remainder
            goes to validation.

    Returns:
        ``(tr_set, vld_set)``: two ``TensorDataset`` objects with tensors in
        the order input_ids, attention_mask, labels.

    Raises:
        ValueError: if ``num_lines`` exceeds the number of available rows, in
            which case a disjoint split of that size is impossible.
    """
    print("Splitting into datasets...")
    if num_lines > len(inputs):
        raise ValueError(
            "num_lines (" + str(num_lines) + ") exceeds the number of available rows ("
            + str(len(inputs)) + ")"
        )

    tr_size = int(split * num_lines)
    vld_size = num_lines - tr_size

    # Shuffle once and carve both subsets out of the same permutation. Two
    # independent sample() calls could place the same row in both sets,
    # leaking training data into validation.
    shuffled = inputs.sample(frac=1)
    tr_df = shuffled.iloc[:tr_size]
    vld_df = shuffled.iloc[tr_size:tr_size + vld_size]

    def _to_dataset(frame):
        # NOTE(review): torch.cat joins the per-row tensors end to end, so for
        # 1-D cells the dataset is indexed per element rather than per row.
        # Kept as-is because downstream batching is written against it.
        return TensorDataset(
            torch.cat(list(frame.loc[:, "input_ids"])).to(torch.int64),
            torch.cat(list(frame.loc[:, "attention_mask"])).to(torch.int64),
            torch.cat(list(frame.loc[:, "labels"])).to(torch.int64),
        )

    tr_set = _to_dataset(tr_df)
    vld_set = _to_dataset(vld_df)

    return tr_set, vld_set
  

def _full_batches(dataset, batch_size):
    """Slice a dataset into consecutive full-size batches (remainder dropped)."""
    batches = []
    usable = len(dataset) - len(dataset) % batch_size
    for start in range(0, usable, batch_size):
        input_ids, attention_mask, labels = dataset[start:start + batch_size]
        batches.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        })
    return batches


def get_batches(tr_set, vld_set, batch_size):
    """Cut the training and validation datasets into lists of batch dicts.

    Args:
        tr_set: sliceable dataset yielding ``(input_ids, attention_mask,
            labels)`` for a slice, e.g. a ``TensorDataset``.
        vld_set: validation dataset of the same form.
        batch_size: number of dataset entries per batch.

    Returns:
        ``(tr_batches, vld_batches)``: each a list of dicts with keys
        ``input_ids``, ``attention_mask`` and ``labels``. Only full batches
        are produced; a trailing partial batch is dropped.

    Validation batches are built independently of the training loop. The
    earlier guard (``i + batch_size < len(vld_set)``) discarded the last full
    validation batch whenever the validation size was an exact multiple of
    ``batch_size``.
    """
    print("getting batches...")

    tr_batches = _full_batches(tr_set, batch_size)
    vld_batches = _full_batches(vld_set, batch_size)

    return tr_batches, vld_batches

In [6]:
# training functions
from transformers import Trainer, TrainingArguments
from torch.utils.data import RandomSampler, DataLoader

def get_data(tokenizer, max_len, fname=None, sent_list=None, save=False):
  """Encode a text file or a list of sentences, optionally saving the result.

  Args:
    tokenizer: tokenizer forwarded to ``encode_txt``.
    max_len: length argument forwarded to ``encode_txt``.
    fname: base name of the source text file; also names the output file
      ``encoded_text_files/<fname>_pp.csv`` when ``save`` is true.
    sent_list: sentences to encode when no file name is given.
    save: when true, write the encoded frame as a tab-separated file.

  Returns:
    The DataFrame produced by ``encode_txt``.

  Raises:
    ValueError: if ``save`` is true but ``fname`` is None (there is no name
      to build the output path from).
  """
  # Fail before the (potentially long) encoding rather than after it.
  if save and fname is None:
    raise ValueError("save=True requires fname to name the output file")

  # sent_list was previously accepted but never forwarded.
  inputs = encode_txt(tokenizer, fname=fname, sent_list=sent_list, max_len=max_len)
  if save:
    inputs.to_csv("encoded_text_files/" + fname + "_pp.csv", sep="\t", columns=["input_ids", "attention_mask", "token_type_ids", "labels"])

  return inputs


def train(model, tr_set, vld_set, epochs=3, bs=32, eps=1.5e-5, lr=5e-5, max_len=128):
    """Fine-tune ``model`` with the Hugging Face ``Trainer``.

    Args:
        model: model to train; the same object is returned.
        tr_set: training dataset handed to ``get_batches``.
        vld_set: validation dataset handed to ``get_batches``.
        epochs: number of training epochs.
        bs: batch size, used both for ``get_batches`` and as the per-device
            train/eval batch size.
        eps: Adam epsilon.
        lr: learning rate.
        max_len: accepted but not used by this function.

    Returns:
        The ``model`` that was passed in, after ``Trainer.train()`` has run.
    """
    train_batches, eval_batches = get_batches(tr_set, vld_set, bs)

    # Checkpoints and logs go under the "test_trainer" output directory.
    training_args = TrainingArguments(
        "test_trainer",
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        learning_rate=lr,
        num_train_epochs=epochs,
        adam_epsilon=eps,
    )

    # The pre-built batch lists are what the Trainer receives as datasets.
    Trainer(
        model=model,
        args=training_args,
        train_dataset=train_batches,
        eval_dataset=eval_batches,
    ).train()

    return model


In [7]:
'''
PUBLISHERS = ['ChicagoTribune', 'NPR', 'Politico', 'Reuters', 'WallStreetJournal']

HGGNG_SFX = '_model'
set_device()

model, tokenizer = load_model_and_tokenizer("bert-base-uncased")
for p in PUBLISHERS:
    print("Beginning to get data for "  + p)
    inputs = get_data(tokenizer, 132, fname=p, save=True)
'''

'\nPUBLISHERS = [\'ChicagoTribune\', \'NPR\', \'Politico\', \'Reuters\', \'WallStreetJournal\']\n\nHGGNG_SFX = \'_model\'\nset_device()\n\nmodel, tokenizer = load_model_and_tokenizer("bert-base-uncased")\nfor p in PUBLISHERS:\n    print("Beginning to get data for "  + p)\n    inputs = get_data(tokenizer, 132, fname=p, save=True)\n'

In [None]:
# Draw a 5000-row random sample from each publisher's encoded file and save it
# as that publisher's "v2" sample.
# Alternative publisher list: ['NPR', 'Politico', 'Reuters', 'WallStreetJournal']
PUBLISHERS = ['NewYorkTimes']
for p in PUBLISHERS:
    # The tensor columns are stored as text and are parsed back with
    # str_to_tensor while reading.
    data = pd.read_csv(
        f'encoded_text_files/{p}_pp.csv',
        delimiter="\t",
        converters={
            "input_ids": str_to_tensor,
            "attention_mask": str_to_tensor,
            "labels": str_to_tensor,
        },
    )

    sample = data.sample(n=5000)
    sample.to_csv(
        f'encoded_text_files/{p}_sample_v2.csv',
        sep="\t",
        columns=["input_ids", "attention_mask", "token_type_ids", "labels"],
    )
    

    
    

  return torch.tensor(init, dtype=torch.int64)


KeyError: 'Unnamed: 0'

In [None]:
def str_to_tensor(s):
  """Parse the text form of a 1-D tensor (e.g. "tensor([101., 2023.])").

  Everything between the first '[' and the last ']' is treated as a
  comma-separated list of numbers, so text after the bracket (such as a
  ``dtype=...`` suffix) and an empty list ("tensor([])") are both handled.

  Args:
    s: string containing one bracketed, comma-separated list of numbers.

  Returns:
    A 1-D ``torch.int64`` tensor; fractional parts are discarded by the
    integer conversion.

  Raises:
    ValueError: if no bracketed list is found or an entry is not a number.
  """
  start = s.find('[')
  end = s.rfind(']')
  if start == -1 or end < start:
    raise ValueError("no bracketed list found in: " + repr(s))

  # Blank fields (from an empty list or a trailing comma) are skipped;
  # float() tolerates the spaces and newlines a tensor repr may contain.
  fields = [field.strip() for field in s[start + 1:end].split(',')]
  init = [float(field) for field in fields if field]

  return torch.tensor(init, dtype=torch.int64)

PUBLISHERS = [
    'Breitbart', 'ChicagoTribune', 'CNN', 'FoxNews', 'HuffPost',
    'NPR', 'NewYorkTimes', 'Politico', 'Reuters', 'WallStreetJournal',
]
# Alternative: PUBLISHERS = ['NewYorkTimes']

# Configuration id -> [epochs, bs, eps, split, lr, max_len]
hyper_params = {
    2: [3, 16, 1.5e-5, 0.9, 5e-5, 132],
    3: [2, 32, 1.5e-5, 0.9, 5e-5, 132],
    4: [2, 16, 1.5e-5, 0.9, 5e-5, 132],
    5: [3, 32, 1.5e-5, 0.9, 5e-5, 256],
}

for p in PUBLISHERS:
  sample = pd.read_csv(
      f'encoded_text_files/{p}_sample_v2.csv',
      sep="\t",
      converters={
          "input_ids": str_to_tensor,
          "attention_mask": str_to_tensor,
          "labels": str_to_tensor,
      },
  )
  for k in hyper_params:
    # Only configuration 2 is trained in this cell.
    if k != 2:
      continue

    hp = hyper_params[k]
    # Start every run from a fresh pretrained checkpoint.
    model, tokenizer = load_model_and_tokenizer("bert-base-uncased")

    tr_set, vld_set = get_tr_and_vld_sets(sample, 1500, hp[3])
    model = train(
        model, tr_set, vld_set,
        epochs=hp[0], bs=hp[1], eps=hp[2], lr=hp[4], max_len=hp[5],
    )
    # Breitbart and ChicagoTribune are published as v7, all others as v6.
    model.push_to_hub(
        p + ("_model_v7" if p in ('Breitbart', 'ChicagoTribune') else "_model_v6")
    )

'''
  data = pd.read_csv('encoded_text_files/' + p + '_pp.csv', 
                     delimiter = "\t",
                     converters = {"input_ids": str_to_tensor,
                                   "attention_mask": str_to_tensor,
                                   "labels": str_to_tensor})

  sample = data.sample(n=1500)
  sample.to_csv('encoded_text_files/' + p + "_sample_v1.csv")
  '''

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/dgiltz/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin from

Splitting into datasets...


KeyError: 'input_ids'

In [31]:
# Train one model per hyper-parameter configuration on a 1500-row sample of
# the NewYorkTimes data and push each to the hub.
#
# The publisher is named explicitly here. The cell previously read `p` without
# defining it, so the hub repository name depended on whatever value an
# earlier-run loop had left behind.
p = 'NewYorkTimes'

data = pd.read_csv('encoded_text_files/' + p + '_pp.csv', delimiter = "\t", converters = {"input_ids": str_to_tensor, "attention_mask": str_to_tensor, "labels": str_to_tensor})
sample = data.sample(n=1500)

sample.to_csv('encoded_text_files/' + p + '_sample_v1.csv', sep="\t", columns=["input_ids", "attention_mask", "token_type_ids", "labels"])
for k in hyper_params:
    # hp layout: [epochs, bs, eps, split, lr, max_len]
    hp = hyper_params[k]
    # Each configuration starts from a fresh pretrained checkpoint.
    model, tokenizer = load_model_and_tokenizer("bert-base-uncased")

    tr_set, vld_set = get_tr_and_vld_sets(sample, 1500 , hp[3])
    model = train(model, tr_set, vld_set, epochs=hp[0], bs=hp[1], eps=hp[2], lr=hp[4], max_len=hp[5])
    # Version suffix follows the configuration id (offset by one for
    # Breitbart and ChicagoTribune, matching the naming used elsewhere).
    if p == 'Breitbart' or p == 'ChicagoTribune': 
      model.push_to_hub(p+ "_model_v" + str(k +1))
    else:
      model.push_to_hub(p + "_model_v" + str(k))

  return torch.tensor(init, dtype=torch.int64)
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/dgiltz/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-b

Splitting into datasets...
getting batches...


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 11390
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2136
 23%|██▎       | 500/2136 [23:21<1:13:27,  2.69s/it]Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json


{'loss': 0.4317, 'learning_rate': 3.829588014981274e-05, 'epoch': 0.7}


Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
 47%|████▋     | 1000/2136 [47:25<55:44,  2.94s/it]Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json


{'loss': 0.0723, 'learning_rate': 2.6591760299625466e-05, 'epoch': 1.4}


Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
 70%|███████   | 1500/2136 [1:20:08<38:04,  3.59s/it]Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json


{'loss': 0.0553, 'learning_rate': 1.4887640449438203e-05, 'epoch': 2.11}


Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
 94%|█████████▎| 2000/2136 [2:24:42<06:04,  2.68s/it]Saving model checkpoint to test_trainer/checkpoint-2000
Configuration saved in test_trainer/checkpoint-2000/config.json


{'loss': 0.0246, 'learning_rate': 3.1835205992509364e-06, 'epoch': 2.81}


Model weights saved in test_trainer/checkpoint-2000/pytorch_model.bin
100%|██████████| 2136/2136 [2:30:04<00:00,  2.12s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 2136/2136 [2:30:04<00:00,  4.22s/it]


{'train_runtime': 9004.846, 'train_samples_per_second': 3.795, 'train_steps_per_second': 0.237, 'train_loss': 0.13837697722492145, 'epoch': 3.0}


Cloning https://huggingface.co/Declan/NewYorkTimes_model_v2 into local empty directory.
Configuration saved in NewYorkTimes_model_v2/config.json
Model weights saved in NewYorkTimes_model_v2/pytorch_model.bin
Upload file pytorch_model.bin:  99%|█████████▉| 415M/418M [01:07<00:00, 6.89MB/s]To https://huggingface.co/Declan/NewYorkTimes_model_v2
   b64bca3..6a0f46f  main -> main

Upload file pytorch_model.bin: 100%|██████████| 418M/418M [01:08<00:00, 6.41MB/s]
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/dgiltz/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "

Splitting into datasets...
getting batches...


***** Running training *****
  Num examples = 5695
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 356
 71%|███████▏  | 254/356 [22:56<10:13,  6.01s/it]

KeyboardInterrupt: 