In [0]:
# Install the third-party packages this notebook relies on.
# NOTE(review): no versions are pinned, so a fresh run may pick up different releases.
!pip install alchemy-catalyst
!pip install transformers
!pip install -U catalyst

In [0]:
# Upgrade the Weights & Biases client and log in from the shell before any run is created.
!pip install --upgrade wandb
!wandb login 

In [4]:
import wandb
import warnings
warnings.filterwarnings('ignore')

import pandas as pd 
import numpy as np

from transformers import BertTokenizer, BertForSequenceClassification

import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext  import data


import catalyst.dl as dl
from collections import OrderedDict
from catalyst.dl.callbacks  import AccuracyCallback, EarlyStoppingCallback, WandbLogger

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Display the selected device as the cell output.
device

device(type='cuda')

# Data

In [0]:
# Google Colab only: mount Google Drive and make it the working directory.
# These lines are active as written; comment them out when running outside Colab.

import os 
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/')

# Load the dataset for inspection; the path is relative to the working directory set above.
df = pd.read_csv("data/dataset.csv")

In [6]:
# Show the (rows, columns) size of the loaded frame.
df.shape

(483202, 3)

In [7]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,text,label,sampling
0,"The police department in Green Mountain Falls,...",real,No sampling
1,"DHAKA, Bangladesh—Islamic State militants stor...",fake,nucleus
2,A few minutes into her visit with plastic surg...,real,No sampling
3,"Here is the second item from my ""Albany Inside...",real,No sampling
4,"Reversing a long and slow stock decline, share...",real,No sampling


# BERT

Choose one of the models below and run only that model's cell (each cell rebinds `tokenizer` and `bert`):

## Bert

In [8]:
# Load the tokenizer and the sequence-classification model for the checkpoint named below.
pretrained_weights = 'bert-base-cased'

tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
bert = BertForSequenceClassification.from_pretrained(pretrained_weights)

# Id of the tokenizer's padding token; passed as `pad_token` to the torchtext text field later.
pad_index = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
# Input embedding layer of the loaded model, displayed as the cell output for inspection.
embeddings_pretrained = bert.get_input_embeddings()
embeddings_pretrained

HBox(children=(IntProgress(value=0, description='Downloading', max=213450, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=435779157, style=ProgressStyle(description_…




Embedding(28996, 768, padding_idx=0)

## DistilBert

In [0]:
# Alternative backbone: DistilBERT. Running this cell rebinds `tokenizer`, `bert`, `pad_index`
# and `embeddings_pretrained`, replacing whatever an earlier model cell assigned.
# NOTE(review): the layer-freezing cell further down addresses `model.bert.bert...`;
# confirm that attribute path exists for this architecture before using it.
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
bert = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')

# Id of the tokenizer's padding token.
pad_index = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
# Input embedding layer of the loaded model, displayed as the cell output.
embeddings_pretrained = bert.get_input_embeddings()
embeddings_pretrained

## RoBERTa

In [0]:
# Alternative backbone: RoBERTa. Running this cell rebinds `tokenizer`, `bert`, `pad_index`
# and `embeddings_pretrained`, replacing whatever an earlier model cell assigned.
# NOTE(review): the layer-freezing cell further down addresses `model.bert.bert...`;
# confirm that attribute path exists for this architecture before using it.
from transformers import RobertaTokenizer, RobertaForSequenceClassification

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
bert = RobertaForSequenceClassification.from_pretrained('roberta-base')

# Id of the tokenizer's padding token.
pad_index = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
# Input embedding layer of the loaded model, displayed as the cell output.
embeddings_pretrained = bert.get_input_embeddings()
embeddings_pretrained

## ALBERT

In [0]:
# Alternative backbone: ALBERT. Running this cell rebinds `tokenizer`, `bert`, `pad_index`
# and `embeddings_pretrained`, replacing whatever an earlier model cell assigned.
# NOTE(review): the layer-freezing cell further down addresses `model.bert.bert...`;
# confirm that attribute path exists for this architecture before using it.
from transformers import AlbertTokenizer, AlbertForSequenceClassification

tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
bert = AlbertForSequenceClassification.from_pretrained('albert-base-v2')

# Id of the tokenizer's padding token.
pad_index = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
# Input embedding layer of the loaded model, displayed as the cell output.
embeddings_pretrained = bert.get_input_embeddings()
embeddings_pretrained

## Bart

In [0]:
# Alternative backbone: BART (large). Running this cell rebinds `tokenizer`, `bert`, `pad_index`
# and `embeddings_pretrained`, replacing whatever an earlier model cell assigned.
# NOTE(review): the layer-freezing cell further down addresses `model.bert.bert...`;
# confirm that attribute path exists for this architecture before using it.
from transformers import BartTokenizer, BartForSequenceClassification

tokenizer = BartTokenizer.from_pretrained('bart-large')
bert = BartForSequenceClassification.from_pretrained('bart-large')

# Id of the tokenizer's padding token.
pad_index = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
# Input embedding layer of the loaded model, displayed as the cell output.
embeddings_pretrained = bert.get_input_embeddings()
embeddings_pretrained

# End of choosing Bert

In [0]:
# Return to the Drive root so the relative dataset path used in the next cell resolves.
os.chdir('/content/drive/My Drive/')

In [101]:
def tokenize(text, tokenizer=tokenizer):
    """Encode ``text`` into token ids with ``tokenizer.encode``.

    ``max_length=512`` is forwarded to the tokenizer. The default ``tokenizer`` is the
    object bound to the global name ``tokenizer`` at the moment this definition runs,
    so re-run this cell after switching to a different model.
    """
    token_ids = tokenizer.encode(text, max_length=512)
    return token_ids

# Map the string labels stored in the CSV to integer class ids.
classes={'fake': 0, 'real': 1}


# Text field: `tokenize` already produces token ids, so no torchtext vocabulary is used
# (use_vocab=False). Batches are padded with `pad_index`; pad_first=True puts the padding
# in front of the tokens. `data.Pipeline(int)` is applied as preprocessing
# (presumably casting each id to int - confirm against the torchtext version in use).
TEXT = data.Field(sequential=True, 
                  include_lengths=False,
                  batch_first=True, 
                  tokenize=tokenize, 
                  pad_first=True,
                  lower=False,
                  use_vocab=False,
                  preprocessing=data.Pipeline(int),
                  pad_token=pad_index) 

# Label field: the raw label string is converted to its class id through `classes`.
LABEL = data.LabelField(dtype=torch.long,
                        use_vocab=False, 
                        sequential=False,
                        preprocessing=lambda x: classes[x])


# Read the CSV, skipping its header row. Fields are listed in column order:
# text, label, and a third column that is given no field via (None, None).
dataset = data.TabularDataset('data/dataset.csv', 
                                format='csv', fields=[('text', TEXT), ('label',LABEL), (None, None)], 
                                skip_header=True)

# Stratified 80/20 split into train/test, then a stratified 80/20 split of train into train/valid.
# NOTE(review): no random state is passed, so the splits may differ between runs.
train, test = dataset.split(0.8, stratified=True)
train, valid = train.split(0.8, stratified=True)

1/10 * Epoch (train):  12% 179/1547 [42:14<5:22:52, 14.16s/it, accuracy01=0.465, loss=0.746]
1/10 * Epoch (train):   0% 0/1547 [32:15<?, ?it/s]
1/10 * Epoch (train):   0% 0/1547 [28:45<?, ?it/s]
1/10 * Epoch (train):   0% 0/1547 [24:48<?, ?it/s]
1/10 * Epoch (train):   0% 0/1547 [23:19<?, ?it/s]


In [0]:
class Batch:
    """Plain container pairing a batch's inputs (``text``) with its targets (``label``)."""

    def __init__(self, text, label):
        # Both members are stored untouched; no copying or conversion happens here.
        self.text, self.label = text, label


class BucketIteratorWrapper(DataLoader):
    """Present a torchtext iterator through the ``DataLoader`` type.

    ``DataLoader.__init__`` is not called; only the attributes assigned in ``__init__``
    below are set on the instance. Iterating yields one dict per batch with the keys
    ``'features'`` (the batch's ``text``) and ``'targets'`` (the batch's ``label``).
    """
    __initialized = False

    def __init__(self, iterator: data.Iterator):
        # Batch size is taken from the wrapped iterator.
        self.batch_size = iterator.batch_size
        # Fixed loader settings assigned directly on the instance.
        self.num_workers = 1
        self.collate_fn = None
        self.pin_memory = False
        self.drop_last = False
        self.timeout = 0
        self.worker_init_fn = None
        # The wrapped iterator serves as both sampler and batch sampler.
        self.sampler = iterator
        self.batch_sampler = iterator
        self.__initialized = True

    @staticmethod
    def _as_runner_input(batch):
        """Repackage one wrapped-iterator batch as the dict yielded by ``__iter__``."""
        holder = Batch(batch.text, batch.label)
        return {'features': holder.text, 'targets': holder.label}

    def __iter__(self):
        # Lazily convert every batch produced by the wrapped iterator.
        return map(self._as_runner_input, self.batch_sampler.__iter__())

    def __len__(self):
        # Number of batches, as reported by the wrapped iterator.
        return len(self.batch_sampler)

In [0]:
# Run configuration: read when building the model and iterators, and passed to the
# W&B logger so the hyper-parameters are recorded with the run.
config = {
    'tokenization/embeddings': 'bert',
    'batch_size': 128,
    'hidden_size': 256,
    # Integer epoch count. The original line read `10.` with no separating comma,
    # which made the whole dict literal a SyntaxError.
    'num_epochs': 10,
    'bert_model': 'bert',
}

In [0]:
class MyModel(nn.Module):
    """Wrap a sequence-classification model and return only its first output.

    Args:
        bert: module called with a batch of token ids; element ``[0]`` of its
            output is returned by ``forward``.
        hidden_size: accepted for backward compatibility; not used.
        pad_index: optional id of the padding token. When given, an attention
            mask that is 0 at padding positions and 1 elsewhere is passed to the
            wrapped model as ``attention_mask``. When ``None`` (default) the
            wrapped model is called with the token ids only.
    """

    def __init__(self, bert, hidden_size, pad_index=None):
        super(MyModel, self).__init__()
        self.bert = bert
        self.pad_index = pad_index

    def forward(self, x):
        # Call the wrapped submodule rather than a notebook-global `bert`, so the
        # model stays correct when the global is rebound or does not exist.
        if self.pad_index is None:
            outputs = self.bert(x)
        else:
            attention_mask = (x != self.pad_index).long()
            outputs = self.bert(x, attention_mask=attention_mask)
        return outputs[0]

In [105]:
# Wrap the loaded backbone and move it to the selected device.
model = MyModel(bert=bert, hidden_size=config['hidden_size'])
model.to(device)

# Build one torchtext iterator per split, all with the configured batch size.
# Sorting is disabled (sort=False, sort_within_batch=False) even though a sort_key is supplied.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(config['batch_size'], config['batch_size'], config['batch_size']),
    shuffle=True,
    device=device,
    sort=False,
    sort_key=lambda x: len(x.text),
    sort_within_batch=False,
)

# Wrap each iterator so it yields {'features': ..., 'targets': ...} dicts.
train_iterator = BucketIteratorWrapper(train_iterator)
valid_iterator = BucketIteratorWrapper(valid_iterator)
test_iterator = BucketIteratorWrapper(test_iterator)


# Loss, optimizer (Adam with its default settings) and a plateau-based LR scheduler.
# NOTE(review): the optimizer receives all model parameters here, before the later cell
# that toggles `requires_grad` - confirm this ordering is intended.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, patience=2)
criterion.to(device)

CrossEntropyLoss()

In [106]:
# Sanity check on a single test batch: run a forward pass without gradients and print the
# label distribution, the tensor sizes and the loss. The `break` stops after the first batch.
for el in test_iterator:
    with torch.no_grad():
        x = el['features']
        y = el['targets']
        out = model(x)
        print(y.unique(return_counts=True))
        print(x.size())
        print(y.size())
        print(out.size())
        print(criterion(out, y.long()))
    break

(tensor([0, 1], device='cuda:0'), tensor([66, 62], device='cuda:0'))
torch.Size([128, 122])
torch.Size([128])
torch.Size([128, 2])
y torch.cuda.LongTensor
out torch.cuda.FloatTensor
tensor(0.6954, device='cuda:0')


In [107]:
# Count the parameters that currently have requires_grad=True.
params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(params)

7680002


In [0]:
# bert for sequence classification

for p in model.bert.bert.encoder.parameters(): 
    p.requires_grad = False 

for p in model.bert.bert.pooler.parameters():
    p.requires_grad = True

for p in model.bert.bert.embeddings.parameters(): 
    p.requires_grad = False 

for p in model.bert.bert.encoder.layer[-1].parameters():
    p.requires_grad = True

In [109]:
# Count the parameters with requires_grad=True again, after the freezing cell above.
params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(params)

7680002


# Train and Test

In [0]:
# Switch the working directory to /content/, which is also used as the training log directory.
os.chdir('/content/')
logdir = '/content/'
# Run name and run id passed to the W&B logger during training and to wandb.init afterwards.
RUN_NAME = 'bert_test'
RUN_ID = 's,bdbn'

In [111]:
from tqdm import tqdm
def clean_tqdm():
    """Deregister every progress bar that tqdm currently tracks.

    Works on a snapshot of ``tqdm._instances`` so the registry can shrink while it
    is being walked. Relies on tqdm's private ``_instances`` / ``_decr_instances``.
    """
    tracked_bars = list(tqdm._instances)
    for bar in tracked_bars:
        tqdm._decr_instances(bar)

# Run one throwaway progress bar over three items; the loop body does nothing.
# NOTE(review): presumably here so tqdm has created at least one bar before
# clean_tqdm() is called - confirm.
for e in tqdm([1,2,3]):
    pass


100%|██████████| 3/3 [00:00<00:00, 21076.90it/s]


In [0]:
# Catalyst runner plus the loaders it trains and validates on (insertion order: train, valid).
runner = dl.SupervisedRunner(device=device)
loaders = OrderedDict(
    {'train': train_iterator,
    'valid': valid_iterator}
)

# Drop progress bars left over from earlier cells before training starts its own.
clean_tqdm()
# Train for config['num_epochs'] epochs, validating on the 'valid' loader.
# Callbacks: accuracy metric, early stopping (patience=4) and W&B logging under RUN_NAME / RUN_ID.
# NOTE(review): monitoring_params carries the tag 'lstm' although the model wrapped here is
# `bert` - confirm the tag is intended.
runner.train(
    model=model, 
    criterion=criterion,
    optimizer=optimizer, 
    scheduler=scheduler,
    loaders=loaders,
    logdir=logdir,
    num_epochs=config['num_epochs'],
    verbose=True,
    valid_loader="valid",
    callbacks=[AccuracyCallback(num_classes=2,
                                activation='Softmax'
                                ),
               EarlyStoppingCallback(patience=4),
               WandbLogger(log_on_batch_end=True,
                           project="dpl",
                           name=RUN_NAME,
                           config=config,
                           id=RUN_ID
                           )],
    monitoring_params={
                    "project": "dpl",
                    'tags': 'lstm',
                    'config': config,
    }
)

In [113]:
# Load a saved training checkpoint and restore its weights into `model`.
# NOTE(review): the checkpoint file name (train.7.pth) is hard-coded; update it to the
# checkpoint you actually want to evaluate.
results = torch.load('/content/checkpoints/train.7.pth', map_location=device)
model.load_state_dict(results['model_state_dict'])

<All keys matched successfully>

In [0]:
# !cp "/content/checkpoints/train.2.pth" "/content/drive/My Drive/model_checkpoints/"

In [0]:
def accuracy_score(preds, y):
    """Return the fraction of rows in ``preds`` whose highest-scoring class equals ``y``.

    Args:
        preds: 2-D tensor of per-class scores, one row per example.
        y: 1-D tensor of class ids, one per example.

    Returns:
        The accuracy as a Python float.
    """
    probabilities = F.softmax(preds, dim=1)
    # torch.max over dim=1 yields (values, indices); the indices are the predicted classes.
    predicted_classes = torch.max(probabilities, dim=1)[1]
    hits = (predicted_classes == y).float()
    return (hits.sum() / len(hits)).item()

In [0]:
def test_model(model, test_iterator):
    test_acc = []
    with torch.no_grad():
        for item in test_iterator:
            x = item['features']
            y = item['targets']
            preds = model(x)
            test_acc.append(accuracy_score(preds, y))
    test_acc = np.mean(test_acc) 
    return np.mean(test_acc)

In [131]:
# Evaluate the restored model on the held-out test split and print the result.
# `test_model` already returns a scalar, so the extra np.mean leaves the value unchanged.
test_accuracy = test_model(model, test_iterator)
print('Test accuracy: {}'.format(np.mean(test_accuracy)))

Test accuracy: 0.8399834437086092


In [132]:
# Initialise W&B with the same run id that was given to the training logger and log the
# test accuracy to it.
# NOTE(review): the metric key is spelled "Test accc"; if it is ever renamed, rename it
# consistently so runs stay comparable.
wandb.init(id=RUN_ID, config=config)
wandb.log({"Test accc" : test_accuracy})

Streaming file created twice in same run: /content/wandb/run-20200418_160921-s,bdbn/wandb-history.jsonl
Streaming file created twice in same run: /content/wandb/run-20200418_160921-s,bdbn/wandb-events.jsonl
