In [0]:
!pip install alchemy-catalyst
!pip install transformers
!pip install -U catalyst

In [0]:
!pip install --upgrade wandb
!wandb login

In [4]:
import wandb
import warnings
warnings.filterwarnings('ignore')

import pandas as pd 
import numpy as np

import torch
from torch.utils.data import DataLoader
from torchtext  import data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.vocab import Vectors

import catalyst.dl as dl
from collections import OrderedDict
from catalyst.dl.callbacks  import AccuracyCallback, EarlyStoppingCallback, WandbLogger

from transformers import BertTokenizer, BertModel, GPT2Model, GPT2Tokenizer
from tokenizers import SentencePieceBPETokenizer

import nltk
from nltk import tokenize
nltk.download('punkt')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


device(type='cuda')

In [0]:
# TODO shuffle final version of df

# Data

In [0]:
def open_file(file):
    with open(file, 'r', encoding='utf-8') as f:
        text_list = [line for line in f.readlines()]
    return text_list

In [71]:
# uncomment if google colab:
import os 
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/')
# !wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip
# !unzip 'wikitext-2-v1.zip'

df = pd.read_csv("dpl_dataset.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [72]:
df.head()

Unnamed: 0,text,label,sampling
0,"HAMBURG, Germany, June 3  As he left the socc...",fake,temperature
1,"WASHINGTON, Dec. 23 - The National Security Ag...",fake,temperature
2,IF outsized executive pay has indeed become a ...,fake,temperature
3,"BY A.J. Miller, Jr. The three men will make t...",fake,temperature
4,Spinach has terrorized generations of veggie-p...,fake,temperature


In [73]:
df.shape

(400000, 3)

## Tokenization and embeddings

### SentencePieceBPETokenizer and  no embeddings

In [8]:
tokenization = 'sentencepiece'
tokenizer = SentencePieceBPETokenizer()
tokenizer.train(['wikitext-2/wiki.test.tokens', 'wikitext-2/wiki.train.tokens', 'wikitext-2/wiki.valid.tokens'], special_tokens=['<eos>', '<unk>', '<start>'], vocab_size=30000)

def tokenize(text, tokenizer=tokenizer):
    return tokenizer.encode(text).tokens

print(tokenize(fake[0]))
print(tokenize(real[0]))

['▁Spin', 'ach', '▁has', '▁terror', 'ized', '▁generations', '▁of', '▁ve', 'gg', 'ie', '-', 'ph', 'obic', '▁kids', ',', '▁and', '▁many', '▁grown', 'up', 's', '▁don', "'", 't', '▁much', '▁like', '▁it', ',', '▁either', '..', '"', 'I', '▁think', '▁it', "'", 's', '▁a', '▁little', '▁bit', '▁of', '▁a', '▁shock', '▁to', '▁see', '▁that', '▁he', "'", 's', '▁been', '▁able', '▁to', '▁do', '▁this', ',', '"', '▁']
['▁Spin', 'ach', '▁has', '▁terror', 'ized', '▁generations', '▁of', '▁ve', 'gg', 'ie', '-', 'ph', 'obic', '▁kids', ',', '▁and', '▁many', '▁grown', 'up', 's', '▁don', "'", 't', '▁much', '▁like', '▁it', ',', '▁either', '.', '▁But', '▁when', '▁it', "'", 's', '▁combined', '▁with', '▁season', 'ings', '▁and', '▁fet', 'a', '▁cheese', '▁and', '▁wrapped', '▁in', '▁a', '▁golden', '▁cr', 'isp', '▁p', 'hy', 'llo', '▁d', 'ough', '▁crust', ',', '▁even', '▁those', '▁who', '▁desp', 'ise', '▁Pope', 'y', 'e', "'", 's', '▁favorite', '▁food', '▁ask', '▁for', '▁seconds', '.', '▁']


In [9]:
MAX_VOCAB_SIZE = 50000
classes={'fake': 0, 'real': 1}


TEXT = data.Field(sequential=True, include_lengths=False, batch_first=True, tokenize=tokenize, 
             pad_first=True, lower=True, eos_token='<eos>') 
LABEL = data.LabelField(dtype=torch.float, use_vocab=True, preprocessing=lambda x: classes[x])


dataset = data.TabularDataset('dpl_dataset.csv', 
                                format='csv', fields=[('text', TEXT), ('label',LABEL), (None, None)], 
                                skip_header=True)

TEXT.build_vocab(dataset,  max_size=MAX_VOCAB_SIZE, min_freq=2)
LABEL.build_vocab(dataset)
vocab = TEXT.vocab
print('Vocab size:', len(TEXT.vocab.itos))

train, test = dataset.split(0.8, stratified=True)
train, valid = train.split(0.8, stratified=True)

Vocab size: 20655


In [0]:
EMBEDDINGS_DIM = 100
VOCAB_SIZE = len(TEXT.vocab.itos)
EMB_PRETRAINED = False
embeddings_pretrained = None

### Bert Tokenizer and Embeddings

In [22]:
tokenization = 'bert'
pretrained_weights = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertModel.from_pretrained(pretrained_weights)

HBox(children=(IntProgress(value=0, description='Downloading', max=213450, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=435779157, style=ProgressStyle(description_…




In [23]:
embeddings_pretrained = model.get_input_embeddings()
embeddings_pretrained

Embedding(28996, 768, padding_idx=0)

In [0]:
def tokenize(text, tokenizer=tokenizer):
    return tokenizer.encode(text)[:512]

In [0]:
# os.chdir('/content/drive/My Drive/')

In [30]:
MAX_VOCAB_SIZE = 50000
classes={'fake': 0, 'real': 1}


TEXT = data.Field(sequential=True, include_lengths=False, batch_first=True, tokenize=tokenize, 
             pad_first=True, lower=False) 
LABEL = data.LabelField(dtype=torch.float, use_vocab=True, preprocessing=lambda x: classes[x])


dataset = data.TabularDataset('dpl_dataset.csv', 
                                format='csv', fields=[('text', TEXT), ('label',LABEL), (None, None)], 
                                skip_header=True)

TEXT.build_vocab(dataset,  max_size=MAX_VOCAB_SIZE, min_freq=2)
LABEL.build_vocab(dataset)
vocab = TEXT.vocab
print('Vocab size:', len(TEXT.vocab.itos))

train, test = dataset.split(0.8, stratified=True)
train, valid = train.split(0.8, stratified=True)

Token indices sequence length is longer than the specified maximum sequence length for this model (567 > 512). Running this sequence through the model will result in indexing errors


Vocab size: 25022


In [0]:
EMBEDDINGS_DIM = embeddings_pretrained.embedding_dim
VOCAB_SIZE = embeddings_pretrained.num_embeddings
EMB_PRETRAINED = True

### GPT Tokenizer and Embeddings

In [44]:
tokenization = 'gpt2'
pretrained_weights = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_weights)
model = GPT2Model.from_pretrained(pretrained_weights)

HBox(children=(IntProgress(value=0, description='Downloading', max=1042301, style=ProgressStyle(description_wi…




HBox(children=(IntProgress(value=0, description='Downloading', max=456318, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=224, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=548118077, style=ProgressStyle(description_…




In [45]:
embeddings_pretrained = model.get_input_embeddings()
embeddings_pretrained

Embedding(50257, 768)

In [0]:
def tokenize(text, tokenizer=tokenizer):
    return tokenizer.encode(text)[:1024]

In [0]:
# os.chdir('/content/drive/My Drive/')

In [74]:
MAX_VOCAB_SIZE = 50000
classes={'fake': 0, 'real': 1}


TEXT = data.Field(sequential=True, include_lengths=False, batch_first=True, tokenize=tokenize, 
             pad_first=True, lower=False) 
LABEL = data.LabelField(dtype=torch.float, use_vocab=True, preprocessing=lambda x: classes[x])


dataset = data.TabularDataset('dpl_dataset.csv', 
                                format='csv', fields=[('text', TEXT), ('label',LABEL), (None, None)], 
                                skip_header=True)

TEXT.build_vocab(dataset,  max_size=MAX_VOCAB_SIZE, min_freq=2)
LABEL.build_vocab(dataset)
vocab = TEXT.vocab
print('Vocab size:', len(TEXT.vocab.itos))

train, test = dataset.split(0.8, stratified=True)
train, valid = train.split(0.8, stratified=True)

Token indices sequence length is longer than the specified maximum sequence length for this model (1418 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2739 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1244 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1914 > 1024). Running this sequence through the model will result in indexing errors


Vocab size: 47922


In [0]:
EMBEDDINGS_DIM = embeddings_pretrained.embedding_dim
VOCAB_SIZE = embeddings_pretrained.num_embeddings
EMB_PRETRAINED = True

# Model

In [0]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, 
                 emb_pretrained, embeddings):
        super(MyModel, self).__init__()
        self.emb_pretrained = emb_pretrained
        self.embedding =  embeddings if self.emb_pretrained else nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True,
                          )
        
        self.fc = nn.Linear(hidden_size * 2 *2, 1)
    def forward(self, x):
        
        x = self.embedding(x)
           
        _, (hidden, cell) = self.rnn(x)
        
        hidden = hidden.transpose(0,1)
        cell = cell.transpose(0,1)
        hidden = hidden.contiguous().view(hidden.size(0),-1)
        cell = cell.contiguous().view(cell.size(0),-1)
        x = torch.cat([hidden, cell], dim=1).squeeze(1)
        x = self.fc(x)
        return x

In [0]:
class Batch:
    "Object for holding a batch of data during training."
    def __init__(self, text, label):
        self.text = text
        self.label = label


class BucketIteratorWrapper(DataLoader):
    __initialized = False

    def __init__(self, iterator: data.Iterator):
        self.batch_size = iterator.batch_size
        self.num_workers = 1
        self.collate_fn = None
        self.pin_memory = False
        self.drop_last = False
        self.timeout = 0
        self.worker_init_fn = None
        self.sampler = iterator
        self.batch_sampler = iterator
        self.__initialized = True

    def __iter__(self):
        return map(
            lambda batch: {'features': Batch(batch.text, batch.label).text,
                           'targets': Batch(batch.text, batch.label).label.unsqueeze(-1),
                          },
            self.batch_sampler.__iter__()
        )

    def __len__(self):
        return len(self.batch_sampler)

In [0]:
config = {'tokenization/embeddings': tokenization,
            'batch_size': 256,
          'hidden_size' : 128,
            'num_epochs': 10}

In [79]:
model = MyModel(VOCAB_SIZE,
                embed_size=EMBEDDINGS_DIM,
                hidden_size=config['hidden_size'],
                emb_pretrained = EMB_PRETRAINED,
                embeddings = embeddings_pretrained
               )
model.to(device)


train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(config['batch_size'], config['batch_size'], config['batch_size']),
    shuffle=True,
    device=device,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
)

train_iterator = BucketIteratorWrapper(train_iterator)
valid_iterator = BucketIteratorWrapper(valid_iterator)
test_iterator = BucketIteratorWrapper(test_iterator)


optimizer = optim.Adam(model.parameters(), weight_decay=1e-5)
criterion = nn.BCEWithLogitsLoss()
criterion.to(device)

BCEWithLogitsLoss()

#Train and test

In [0]:
os.chdir('/content/')
logdir = '/content/'
RUN_ID = 'TEST'
# !rm -rf {logdir}

In [81]:
runner = dl.SupervisedRunner(device=device)
loaders = OrderedDict(
    {'train': train_iterator,
    'valid': valid_iterator}
)

runner.train(
    model=model, 
    criterion=criterion,
    optimizer=optimizer, 
    loaders=loaders,
    logdir=logdir,
    num_epochs=config['num_epochs'],
    verbose=True,
    valid_loader="valid",
    callbacks=[AccuracyCallback(num_classes=2,
                                activation='Sigmoid',
                                threshold=0.5),
               EarlyStoppingCallback(patience=2),
               WandbLogger(project="dpl",
                           name=RUN_ID,
                           config=config,
                           id=RUN_ID)],
    monitoring_params={
                    "project": "dpl",
                    'tags': 'lstm',
                    'config': config,
    }
)

1/10 * Epoch (train): 100% 1000/1000 [00:36<00:00, 27.03it/s, accuracy01=0.996, loss=0.039]
1/10 * Epoch (valid): 100% 250/250 [00:03<00:00, 78.12it/s, accuracy01=0.980, loss=0.074]
[2020-04-15 18:08:06,182] 
1/10 * Epoch 1 (_base): lr=0.0010 | momentum=0.9000
1/10 * Epoch 1 (train): accuracy01=0.9670 | loss=0.1134
1/10 * Epoch 1 (valid): accuracy01=0.9800 | loss=0.0703
2/10 * Epoch (train): 100% 1000/1000 [00:37<00:00, 26.87it/s, accuracy01=0.992, loss=0.025]
2/10 * Epoch (valid): 100% 250/250 [00:03<00:00, 79.57it/s, accuracy01=0.984, loss=0.099]
[2020-04-15 18:08:53,502] 
2/10 * Epoch 2 (_base): lr=0.0010 | momentum=0.9000
2/10 * Epoch 2 (train): accuracy01=0.9862 | loss=0.0451
2/10 * Epoch 2 (valid): accuracy01=0.9880 | loss=0.0463
3/10 * Epoch (train): 100% 1000/1000 [00:36<00:00, 27.09it/s, accuracy01=0.988, loss=0.035]
3/10 * Epoch (valid): 100% 250/250 [00:03<00:00, 80.18it/s, accuracy01=0.984, loss=0.128]
[2020-04-15 18:09:50,701] 
3/10 * Epoch 3 (_base): lr=0.0010 | momentum=

In [82]:
results = torch.load('/content/checkpoints/train.2.pth', map_location=device)
model.load_state_dict(results['model_state_dict'])

<All keys matched successfully>

In [0]:
def accuracy_score(preds, y):
    preds = torch.round(torch.sigmoid(preds))
    preds = (preds == y).float()
    accuracy = preds.sum() / len(preds)
    return accuracy.item()

In [0]:
def test_model(model, test_iterator):
    test_acc = []
    with torch.no_grad():
        for item in test_iterator:
            x = item['features']
            y = item['targets']
            preds = model(x)
            test_acc.append(accuracy_score(preds, y))
    test_acc = np.mean(test_acc) 
    return np.mean(test_acc)

In [83]:
test_accuracy = test_model(model, test_iterator)
print('Test accuracy: {}'.format(np.mean(test_accuracy)))

Test accuracy: 0.9876198083067093


In [84]:
wandb.init(id=RUN_ID, config=config)
wandb.log({"Test accc" : test_accuracy})

Streaming file created twice in same run: /content/wandb/run-20200415_181120-TEST/wandb-events.jsonl
Streaming file created twice in same run: /content/wandb/run-20200415_181120-TEST/wandb-history.jsonl
