In [1]:
!pip install pytorch-transformers



In [2]:
import os
import sys
import zipfile
import requests
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML
from pytorch_transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
                                  BertConfig, BertForMaskedLM, BertTokenizer)

import random
# Seed every random number generator this notebook draws from, so that
# Restart Kernel -> Run All gives repeatable batches, masks and generation order.
manualSeed = 999
print("Random Seed: ", manualSeed)
random.seed(manualSeed)
# Batch sampling, random-token replacement and the generation order all use np.random.
np.random.seed(manualSeed)
torch.manual_seed(manualSeed)

# Lookup table from a model-type key to the classes used for that model type.
MODEL_CLASSES = dict(
    bert=(BertConfig, BertForMaskedLM, BertTokenizer),
)

Random Seed:  999


In [3]:
from urllib import request
import os
from zipfile import ZipFile
from typing import *

# Where the corpus is stored, which yearly archives it consists of, and where they are fetched from.
save_dir = '/content/corpus/'
file_list: List[str] = [f'condensed_{year}.json.zip' for year in range(2009, 2019)]
url_root = 'https://github.com/bpb27/trump_tweet_data_archive/raw/master/'

# Download Trump tweets
os.makedirs(save_dir, exist_ok=True)
for file_name in file_list:
  file_path = os.path.join(save_dir, file_name)
  # The loading cell opens '<name>.json' next to the archive, so that is taken
  # to be the extracted file name; if it is already there the download is skipped.
  extracted_path = file_path[:-len('.zip')]
  if os.path.exists(extracted_path):
    print(f'{file_name} already extracted, skipping..')
    continue
  print(f'Downloading {file_name}..')
  request.urlretrieve(url_root + file_name, file_path)
  # Named `archive`, not `zip`: at notebook scope `zip` would shadow the builtin for all later cells.
  with ZipFile(file_path, 'r') as archive:
    archive.extractall(save_dir)
  # The archive itself is no longer needed once its contents are extracted.
  os.remove(file_path)

Downloading condensed_2009.json.zip..
Downloading condensed_2010.json.zip..
Downloading condensed_2011.json.zip..
Downloading condensed_2012.json.zip..
Downloading condensed_2013.json.zip..
Downloading condensed_2014.json.zip..
Downloading condensed_2015.json.zip..
Downloading condensed_2016.json.zip..
Downloading condensed_2017.json.zip..
Downloading condensed_2018.json.zip..


In [4]:
import json

# Collect the "text" field of every tweet from the extracted yearly JSON files.
tweets = []
# A separate name is used for the JSON file names so that `file_list` keeps the
# archive names and the download cell stays re-runnable after this cell has run.
json_file_list = [archive_name.replace('.zip', '') for archive_name in file_list]
for json_name in json_file_list:
  with open(save_dir + json_name, 'r', encoding='utf-8') as fp:
    raw_tweets = json.load(fp)
  tweets.extend(raw_tweet["text"] for raw_tweet in raw_tweets)

# Show the corpus size and a few samples as a sanity check.
print(str(len(tweets)) + " tweets")
for tweet in tweets[:5]:
  print(tweet)

36307 tweets
From Donald Trump: Wishing everyone a wonderful holiday & a happy, healthy, prosperous New Year. Let’s think like champions in 2010!
Trump International Tower in Chicago ranked 6th tallest building in world by Council on Tall Buildings & Urban Habitat http://bit.ly/sqvQq
Wishing you and yours a very Happy and Bountiful Thanksgiving!
Donald Trump Partners with TV1 on New Reality Series Entitled, Omarosa's Ultimate Merger: http://tinyurl.com/yk5m3lc
--Work has begun, ahead of schedule, to build the greatest golf course in history: Trump International – Scotland.


In [191]:
# Convert every tweet into a row of exactly MAX_TWEET_LENGTH token ids.
MAX_TWEET_LENGTH = 50

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
examples = []
for tweet in tweets:
  tokenized = tokenizer.tokenize(tweet)
  missing = MAX_TWEET_LENGTH - len(tokenized)
  if missing <= 0:
    # Tweets with MAX_TWEET_LENGTH tokens or more are left out of the examples.
    continue
  # Fill the remainder of the row with the tokenizer's padding token.
  tokenized.extend([tokenizer.pad_token] * missing)
  tokenized_ids = tokenizer.convert_tokens_to_ids(tokenized)
  examples.append(tokenized_ids)


# Report how many tweets were kept and show the first few encoded rows.
print(str(len(examples)) + " examples")
for example in examples[:5]:
  print(example)

33622 examples
[2013, 6221, 8398, 1024, 10261, 3071, 1037, 6919, 6209, 1004, 1037, 3407, 1010, 7965, 1010, 18241, 2047, 2095, 1012, 2292, 1521, 1055, 2228, 2066, 3966, 1999, 2230, 999, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[8398, 2248, 3578, 1999, 3190, 4396, 5351, 13747, 2311, 1999, 2088, 2011, 2473, 2006, 4206, 3121, 1004, 3923, 6552, 8299, 1024, 1013, 1013, 2978, 1012, 1048, 2100, 1013, 5490, 2615, 4160, 4160, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[10261, 2017, 1998, 6737, 1037, 2200, 3407, 1998, 8945, 16671, 18424, 15060, 999, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[6221, 8398, 5826, 2007, 2694, 2487, 2006, 2047, 4507, 2186, 4709, 1010, 13192, 8820, 1005, 1055, 7209, 7660, 1024, 8299, 1024, 1013, 1013, 4714, 3126, 2140, 1012, 4012, 1013, 1061, 2243, 2629, 2213, 2509, 15472, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1011, 1011, 2147, 2038, 5625, 1010, 3805, 1997, 6

In [0]:
# mask parts of a tweet, adapted from https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_lm_finetuning.py
def mask_tokens(inputs, tokenizer, mlm_probability=0.25):
    """Prepare inputs and labels for masked language modeling.

    In every row, each position before the first padding token is selected
    with probability ``mlm_probability``. Of the selected positions about 80%
    are overwritten with the mask token, about 10% with a random vocabulary id
    and about 10% keep their original id. Padding positions are never selected.
    A row that contains no padding token is maskable over its whole length.

    Args:
        inputs: tensor of token-id rows (indexed as ``inputs[row][position]``).
            It is modified in place.
        tokenizer: object providing ``pad_token``, ``mask_token``,
            ``convert_tokens_to_ids`` and ``len()`` (used as the upper bound
            for random replacement ids).
        mlm_probability: probability of selecting a non-padding position.

    Returns:
        ``(inputs, labels)`` where ``labels`` is a clone of the original ids
        with every non-selected position set to -1.
    """
    labels = inputs.clone()

    # These lookups do not depend on the row or position, so do them once.
    pad_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    mask_id = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    vocab_size = len(tokenizer)

    for i in range(len(inputs)):
        irow = inputs[i]
        lrow = labels[i]

        # Index of the first padding token; the full row length when there is none.
        row_ids = irow.tolist()
        pad_start = row_ids.index(pad_id) if pad_id in row_ids else len(row_ids)

        # Select the positions to predict; everything else gets the label -1.
        masked_indices = torch.bernoulli(torch.full((pad_start,), float(mlm_probability))).to(torch.bool)
        lrow[pad_start:] = -1
        lrow[:pad_start][~masked_indices] = -1

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full((pad_start,), 0.8)).to(torch.bool)
        irow[:pad_start][masked_indices & indices_replaced] = mask_id

        # 10% of the time (half of the remaining 20%), we replace masked input tokens with a random word
        indices_random = torch.bernoulli(torch.full((pad_start,), 0.5)).to(torch.bool)
        random_positions = masked_indices & ~indices_replaced & indices_random
        for j, chosen in enumerate(random_positions.tolist()):
            if chosen:
                irow[j] = np.random.randint(vocab_size)

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels

In [0]:
def predict(inputs, outputs, labels):
  """Combine the input ids with the model's predictions.

  A position whose label is -1 keeps the id found in ``inputs``; any other
  position takes the index of the largest value in ``outputs`` at that
  position. Returns one list of ints per row of ``inputs``.
  """
  def resolve(row, col):
    # Positions with a real label are the ones the model has to fill in.
    if labels[row][col] != -1:
      return torch.argmax(outputs[row][col]).item()
    return inputs[row][col].item()

  return [
      [resolve(row, col) for col in range(len(inputs[row]))]
      for row in range(len(inputs))
  ]

In [192]:
# Load model etc.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
# To continue from the weights written by the save cell instead, load from the 'models' directory:
# model = BertForMaskedLM.from_pretrained('models')
model = model.to(device)

# Optimization parameters
lr = 1e-4
max_grad_norm = 1.0
num_total_steps = 100
num_warmup_steps = 10

optimizerG = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
schedulerG = WarmupLinearSchedule(optimizerG, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler

batch_size = 100
print_interval = 5
print_size = 1

# Build the array of examples once; every step then only gathers the rows it sampled
# instead of permuting the whole corpus and converting it to a Python list.
examples_array = np.asarray(examples)
step_batch_size = min(batch_size, len(examples_array))

print("Starting Training Loop...")
model.train()  # nothing inside the loop leaves training mode, so this is set once
for i in range(num_total_steps):
  # Each step trains on one batch of distinct, randomly chosen examples.
  batch_indices = np.random.choice(len(examples_array), size=step_batch_size, replace=False)
  batch = examples_array[batch_indices]
  # torch.tensor copies the data, so the in-place masking below cannot alter `batch`,
  # which is printed further down as the "original" text.
  inputs = torch.tensor(batch, dtype=torch.int64)
  inputs, labels = mask_tokens(inputs, tokenizer)
  inputs = inputs.to(device)
  labels = labels.to(device)
  outputs = model(inputs, masked_lm_labels=labels)
  loss = outputs[0]
  loss.backward()
  # Clip before the optimizer step, then advance the learning-rate schedule and reset the gradients.
  torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
  optimizerG.step()
  schedulerG.step()
  optimizerG.zero_grad()
  print(str(i + 1) + "/" + str(num_total_steps) + " steps")
  print("loss: " + str(loss.item()))
  if (i + 1) % print_interval == 0:
    print("sample predictions:")
    # outputs[1] is passed to predict(), which takes the argmax over it at every labelled position.
    predictions = predict(inputs, outputs[1], labels)
    for j in range(min(len(batch), print_size)):
      print("original : " + tokenizer.decode(batch[j], skip_special_tokens=True))
      print("predicted: " + tokenizer.decode(predictions[j], skip_special_tokens=True))

Starting Training Loop...
1/100 steps
loss: 5.747446060180664
2/100 steps
loss: 5.8749518394470215
3/100 steps
loss: 5.5652055740356445
4/100 steps
loss: 5.305379867553711
5/100 steps
loss: 4.9442901611328125
sample predictions:
original : the benghazi terrorist is getting speedier care than our vets at the va. obama has his priorities.
predicted: the benwalazi terrorist is getting speedier care than our vets can...a has his rights.
6/100 steps
loss: 4.397333145141602
7/100 steps
loss: 5.479149341583252
8/100 steps
loss: 4.6567864418029785
9/100 steps
loss: 4.40280294418335
10/100 steps
loss: 4.603474140167236
sample predictions:
original : let the arab league take care of syria. why are these rich arab countries not paying us for the tremendous cost of such an attack?
predicted: let the arab countries take care of syria. why are sure the american countries countries paying us for the tremendous cost of such an attack?
11/100 steps
loss: 4.095590114593506
12/100 steps
loss: 4.340753555

In [0]:
# Write the model to the local "models" directory (created if it does not exist yet).
# The commented-out from_pretrained('models') calls in the training and generation
# cells read from this same directory.
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)
model.save_pretrained(model_dir)

In [349]:
# Generate with the model trained above. To use the saved copy instead:
# generating_model = BertForMaskedLM.from_pretrained('models')
generating_model = model
generating_model = generating_model.to(device)

# Bert as Text Generator: https://arxiv.org/pdf/1902.04094.pdf
def generate_tweet(model, length, sentence=None):
  """Rewrite a token sequence one position at a time with the masked LM.

  The positions are visited in random order. At each one the token is
  replaced by the mask token, the model is run on the whole sequence, and the
  highest-scoring vocabulary id at that position is written back. The decoded
  text is printed after every replacement.

  Args:
    model: masked-LM model whose first output is indexed as
      ``[batch][position][vocabulary]``.
    length: number of random token ids to start from when ``sentence`` is None.
    sentence: optional text to start from instead of random ids; when given,
      its tokenized length is used and ``length`` is ignored.

  Returns:
    The final sequence as a list of token ids.

  Uses the notebook-level ``tokenizer`` and ``device``.
  """
  if sentence is not None:
    tweet = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
  else:
    tweet = np.random.randint(len(tokenizer), size=length).tolist()

  mask_id = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
  model.eval()
  for i in map(int, np.random.permutation(len(tweet))):
    tweet[i] = mask_id
    inputs = torch.as_tensor([tweet], dtype=torch.int64).to(device)
    with torch.no_grad():
      outputs = model(inputs)
    # Only the masked position is re-predicted; all other tokens stay as they are.
    tweet[i] = torch.argmax(outputs[0][0][i]).item()
    print(tokenizer.decode(tweet))
  return tweet

# Generate a tweet
# Starts from a fixed sentence, so the length argument is not used here;
# drop `sentence` to start from 20 random tokens instead.
tweet = generate_tweet(
    generating_model, 20,
    sentence="Looking forward to the machine learning course at Goethe University Frankfurt! #ifi")

looking forward to the machine learning course at goethe university frankfurt! # ifi
looking forward to the machine learning course at goethe university frankfurt! # miti
looking forward to the machine learning course at goethe university frankfurt! # miti
looking forward to the machine learning course at goethe university frankfurt! # miti
looking forward to the machine learning course at goethe university frankfurt! # miti
looking forward to the machine golf course at goethe university frankfurt! # miti
looking forward to the machine golf course at goethe university frankfurt! # miti
looking forward to the machine golf course at goethe university frankfurt! # mitt
looking forward to the machine golf course at goethe university today! # mitt
looking forward to the great golf course at goethe university today! # mitt
i forward to the great golf course at goethe university today! # mitt
i welcome to the great golf course at goethe university today! # mitt
i welcome to the great golf cou