In [2]:
!pip install pytorch-transformers



In [3]:
import os
import sys
import zipfile
import requests
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML
from pytorch_transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
                                  BertConfig, BertForMaskedLM, BertTokenizer)

import random
# Seed every random number generator available in this notebook so that a
# fresh "Restart & Run All" draws the same random numbers each time.
manualSeed = 999
print("Random Seed: ", manualSeed)
random.seed(manualSeed)        # Python's built-in `random` module
np.random.seed(manualSeed)     # NumPy's global RNG (np.random.*)
torch.manual_seed(manualSeed)  # PyTorch RNG (torch.bernoulli, dropout, ...)

# Registry keyed by model type; each value is the
# (config class, model class, tokenizer class) triple for that type.
MODEL_CLASSES = dict(
    bert=(BertConfig, BertForMaskedLM, BertTokenizer),
)

Random Seed:  999


In [4]:
from urllib import request
import os
from zipfile import ZipFile
from typing import *

# Where the extracted yearly JSON dumps are stored (trailing slash is relied on
# by code that builds paths with `save_dir + name`).
save_dir = '/content/corpus/'
# One zipped JSON archive per year, 2009 through 2018 inclusive.
file_list: List[str] = [f'condensed_{year}.json.zip' for year in range(2009, 2019)]
url_root = 'https://github.com/bpb27/trump_tweet_data_archive/raw/master/'

# Download Trump tweets
os.makedirs(save_dir, exist_ok=True)
for file_name in file_list:
  print(f'Downloading {file_name}..')
  file_path = os.path.join(save_dir, file_name)
  request.urlretrieve(url_root + file_name, file_path)
  try:
    # Bind the archive to `archive`, not `zip`: at notebook scope the name would
    # otherwise shadow the builtin zip() for every later cell.
    with ZipFile(file_path, 'r') as archive:
      archive.extractall(save_dir)
  finally:
    # Always discard the downloaded archive, even if extraction failed, so a
    # corrupt partial download is not left behind.
    os.remove(file_path)

Downloading condensed_2009.json.zip..
Downloading condensed_2010.json.zip..
Downloading condensed_2011.json.zip..
Downloading condensed_2012.json.zip..
Downloading condensed_2013.json.zip..
Downloading condensed_2014.json.zip..
Downloading condensed_2015.json.zip..
Downloading condensed_2016.json.zip..
Downloading condensed_2017.json.zip..
Downloading condensed_2018.json.zip..


In [5]:
import json

# Fixed sequence length used for every example (in tokens, including padding).
MAX_TWEET_LENGTH = 150
tweets = []

# The archives were extracted in place, so the JSON files carry the archive
# names minus the '.zip' suffix.
file_list = [archive_name.replace('.zip', '') for archive_name in file_list]
for json_name in file_list:
  with open(save_dir + json_name, 'r', encoding='utf-8') as handle:
    for record in json.load(handle):
      body = record["text"]
      # Skip tweets whose character count leaves no room for the begin and
      # end tokens (hence the -2).
      if len(body) >= (MAX_TWEET_LENGTH - 2):
        continue
      tweets.append(body)

# Quick sanity check: corpus size plus the first few tweets.
print(f"{len(tweets)} tweets")
for sample in tweets[:5]:
  print(sample)
print("...")

33721 tweets
From Donald Trump: Wishing everyone a wonderful holiday & a happy, healthy, prosperous New Year. Let’s think like champions in 2010!
Trump International Tower in Chicago ranked 6th tallest building in world by Council on Tall Buildings & Urban Habitat http://bit.ly/sqvQq
Wishing you and yours a very Happy and Bountiful Thanksgiving!
Donald Trump Partners with TV1 on New Reality Series Entitled, Omarosa's Ultimate Merger: http://tinyurl.com/yk5m3lc
--Work has begun, ahead of schedule, to build the greatest golf course in history: Trump International – Scotland.
...


In [0]:
# Tokenize tweets
# Each tweet is wrapped in [CLS] ... [SEP], split into tokens, right-padded with
# the tokenizer's pad token up to MAX_TWEET_LENGTH, and stored as a list of ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
examples = []
for raw_text in tweets:
  pieces = tokenizer.tokenize("[CLS]" + raw_text + "[SEP]")
  n_missing = MAX_TWEET_LENGTH - len(pieces)
  if n_missing > 0:
    pieces.extend([tokenizer.pad_token] * n_missing)
  examples.append(tokenizer.convert_tokens_to_ids(pieces))

In [7]:
# Load model etc.
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
# Pretrained masked-language-model weights, moved onto the chosen device.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
# Kept as the cell's last expression so the module summary is displayed.
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
    

In [0]:
# Optimization parameters
# NOTE(review): 1e-3 is well above the learning rates usually quoted for BERT
# fine-tuning (order of 1e-5) — confirm this value is intentional.
lr = 1e-3
max_grad_norm = 1.0  # gradient-norm clipping threshold
num_total_steps, num_warmup_steps = 1000, 100
# Fraction of the schedule spent warming up (0.1).
# NOTE(review): not referenced by the optimizer/scheduler construction below.
warmup_proportion = num_warmup_steps / num_total_steps

# correct_bias=False reproduces the BertAdam-specific behaviour.
optimizerG = AdamW(model.parameters(), lr=lr, correct_bias=False)
# PyTorch LR scheduler driven by the warmup / total step counts above.
schedulerG = WarmupLinearSchedule(
    optimizerG,
    warmup_steps=num_warmup_steps,
    t_total=num_total_steps,
)

In [0]:
# mask parts of a tweet, adapted from https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_lm_finetuning.py
def mask_tokens(inputs, tokenizer, mlm_probability=0.5):
    """Prepare inputs/labels for masked language modeling.

    For every row, only the tokens before the first pad token are candidates.
    Each candidate is selected with probability ``mlm_probability``; of the
    selected tokens 80% become the mask token, 10% become a random vocabulary
    id and 10% are left unchanged.

    Args:
        inputs: 2-D integer tensor of token ids, one row per sequence.
            It is modified in place.
        tokenizer: object exposing ``pad_token``, ``mask_token``,
            ``convert_tokens_to_ids(token)`` and ``len(tokenizer)``.
        mlm_probability: chance that a non-padding token is selected for
            prediction. Defaults to 0.5, the rate this notebook has always
            used (the script this was adapted from defaults to 0.15).

    Returns:
        ``(inputs, labels)``: the same ``inputs`` tensor after corruption and a
        tensor of the same shape that holds the original id at selected
        positions and -1 everywhere else.
    """
    labels = inputs.clone()

    # Look the special ids up once instead of once per token.
    pad_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    mask_id = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    def draw(probability):
        """Boolean tensor shaped like `inputs`, True with the given probability."""
        weights = torch.full(inputs.shape, probability,
                             dtype=torch.float, device=inputs.device)
        return torch.bernoulli(weights).to(torch.bool)

    # Positions strictly before the first pad token of each row. A row without
    # any padding is eligible in full (previously such a row was treated as
    # entirely padding and never masked).
    eligible = (inputs == pad_id).long().cumsum(dim=1) == 0

    # Tokens the model must predict; everything else gets the -1 label.
    selected = draw(mlm_probability) & eligible
    labels[~selected] = -1

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    replaced = draw(0.8) & selected
    inputs[replaced] = mask_id

    # 10% of the time (half of the remaining 20%), we replace masked input
    # tokens with a random word. Drawn from torch so that torch.manual_seed
    # alone makes the corruption reproducible.
    randomized = draw(0.5) & selected & ~replaced
    random_ids = torch.randint(len(tokenizer), inputs.shape,
                               dtype=inputs.dtype, device=inputs.device)
    inputs[randomized] = random_ids[randomized]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels

In [11]:
# NOTE: each "epoch" below is one optimisation step on one random mini-batch,
# not a full pass over the corpus.
num_epochs = 50
batch_size = 25

print("Starting Training Loop...")
# Loop-invariant: nothing below switches the model out of training mode.
model.train()
for i in range(num_epochs):
  # Draw only the rows needed for this step rather than copying and shuffling
  # the entire corpus every iteration.
  batch = random.sample(examples, min(batch_size, len(examples)))
  inputs = torch.as_tensor(batch, dtype=torch.int64)
  inputs, labels = mask_tokens(inputs, tokenizer)
  inputs = inputs.to(device)
  labels = labels.to(device)
  outputs = model(inputs, masked_lm_labels=labels)
  # With labels supplied, the first two outputs are the loss and the
  # per-token vocabulary scores.
  loss, prediction_scores = outputs[0], outputs[1]
  print(str(i + 1) + "/" + str(num_epochs) + " epochs")
  print("loss: " + str(loss))
  if (i + 1) % 5 == 0:
    print("sample output:")
    # Keep the input token wherever the label is -1 (not selected for
    # prediction); elsewhere show the model's highest-scoring token.
    predicted_ids = prediction_scores.argmax(dim=-1)
    filled = torch.where(labels == -1, inputs, predicted_ids)
    # Slicing copes with batches smaller than five rows.
    for row in filled[:5].tolist():
      print(tokenizer.decode(row))
  loss.backward()
  torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
  optimizerG.step()
  schedulerG.step()
  optimizerG.zero_grad()

Starting Training Loop...
1/50 epochs
loss: tensor(6.3690, grad_fn=<NllLossBackward>)
2/50 epochs
loss: tensor(6.3834, grad_fn=<NllLossBackward>)
3/50 epochs
loss: tensor(6.0521, grad_fn=<NllLossBackward>)
4/50 epochs
loss: tensor(5.2978, grad_fn=<NllLossBackward>)
5/50 epochs
loss: tensor(5.7616, grad_fn=<NllLossBackward>)
sample output:
[' i got back in realville, california california, where they gave a massive rally of the spiritdontrump ', ' @trump @ ', ' ', ' mag ', ' ', ' [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 