In [0]:
!pip install alchemy-catalyst
!pip install transformers
!pip install -U catalyst

In [0]:
!pip install --upgrade wandb
!wandb login

In [4]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch.utils.data import DataLoader
from torchtext  import data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from transformers import BertTokenizer, BertModel, GPT2Model, GPT2Tokenizer

import catalyst.dl as dl
from collections import OrderedDict
from catalyst.dl.callbacks  import AccuracyCallback, EarlyStoppingCallback, WandbLogger

import nltk
from nltk import tokenize
nltk.download('punkt')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
# TODO Ошибку с NoneType в итераторах поправить

# Data (пропустить если есть domains.csv)

In [4]:
import os 
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/')

df = pd.read_csv("data/dataset.csv")

# fake = open_file("data/fake.txt")
# real = open_file("data/real.txt")
# df = pd.read_csv("data/dataset.csv")
import pickle
with open ('data/fake', 'rb') as fp:
    fake = pickle.load(fp)
with open ('data/real', 'rb') as fp:
    real = pickle.load(fp)

print(len(fake), len(real), df.shape)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
241601 241601 (483202, 3)


In [0]:
# Convert dataset into DF with 'Text_Part1' 'Text_Part2' 'Label'
def sent_first(text):
    return tokenize.sent_tokenize(text)

def split_texts(texts):
    """
    Формирует новый датафрейм для проверки на домены.
    texts:: list of str (реальные тексты)
    """
    
    domains_df = pd.DataFrame()
    text_first = []
    text_second = []
    labels = []

    for ind, text in enumerate(texts):        
        real_first = tokenize.sent_tokenize(text)[0]
        try:
            real_second = tokenize.sent_tokenize(text)[1]
            fake_second = fake[ind].split(real_first)[1]
        except:
            continue
        
        text_first.append(real_first)
        text_second.append(real_second)
        labels.append('real')

        fake_first = real_first
        fake_second = fake[ind].split(real_first)[1]
        text_first.append(fake_first)
        text_second.append(fake_second)
        labels.append('fake')        
        
    assert len(text_first) == len(text_second) == len(labels)  

    domains_df['first'] = text_first
    domains_df['second'] = text_second
    domains_df['label'] = labels 
    domains_df = domains_df.sample(frac=1).reset_index(drop=True)
    return domains_df

In [6]:
domains = split_texts(real)
domains.head()

Unnamed: 0,first,second,label
0,A Brisbane bid for the 2028 Olympic Games stil...,Gold Coast City and Logan City Councils had pu...,real
1,The 10-episode series is produced by American ...,"An early teaser for Good Behaviour, released i...",real
2,"Aldo Rossi, the Italian architect known for wo...",He was 66.,real
3,03/08/2016 AT 10:40 PM EST From one first lady...,It came to not take for fear either I might s...,fake
4,"In late May it appeared that Rush Limbaugh, th...","But starting June 29, Premiere said, Limbaugh ...",real


In [7]:
domains.tail()

Unnamed: 0,first,second,label
470241,Cadillac has staged one of the car world’s mos...,"If you don’t believe us, then go ahead and get...",real
470242,"If you live in Arizona, California or Nevada, ...","But if you live in Oregon, there are temperat...",fake
470243,The 2012 Auty Cup Tour is a four-day series of...,The four-day tour includes all of South Afric...,fake
470244,"updated 5:37 PM EDT, Tue September 23, 2014 Ed...",Blair attended his brother Gordon MP (2013-) ...,fake
470245,"Lawyers representing Dylann Roof, the white ma...",In the 34-page motion filed late Monday in Fed...,real


In [0]:
domains.to_csv('/content/drive/My Drive/data/domains.csv', index=False)

# GPT Tokenizer and Embeddings

In [0]:
# uncomment if google colab:

import os 
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/')

In [0]:
# 0.61 on test
# pretrained_weights = 'bert-base-uncased'
# tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
# model = BertModel.from_pretrained(pretrained_weights)

# embeddings_pretrained = model.get_input_embeddings()
# embeddings_pretrained

In [6]:
pretrained_weights = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_weights)
model = GPT2Model.from_pretrained(pretrained_weights)

pad_index = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
embeddings_pretrained = model.get_input_embeddings()
embeddings_pretrained

HBox(children=(IntProgress(value=0, description='Downloading', max=1042301, style=ProgressStyle(description_wi…




HBox(children=(IntProgress(value=0, description='Downloading', max=456318, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=224, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=548118077, style=ProgressStyle(description_…




Using pad_token, but it is not set yet.


Embedding(50257, 768)

In [0]:
def tokenize(text, tokenizer=tokenizer):
    return tokenizer.encode(text, max_length=512)

In [0]:
classes={'fake': 0, 'real': 1}


TEXT = data.Field(sequential=True, 
                  include_lengths=False,
                  batch_first=True, 
                  tokenize=tokenize, 
                  pad_first=True,
                  lower=False,
                  use_vocab=False,
                  preprocessing=data.Pipeline(int),
                  pad_token=pad_index) 

LABEL = data.LabelField(dtype=torch.float, 
                        use_vocab=False, 
                        sequential=False,
                        preprocessing=lambda x: classes[x])

dataset = data.TabularDataset('data/domains.csv', 
                                format='csv', fields=[('first', TEXT), ('second', TEXT), ('label',LABEL),], 
                                skip_header=True)

train, test = dataset.split(0.8, stratified=True)
train, valid = train.split(0.8, stratified=True)

In [0]:
EMBEDDINGS_DIM = embeddings_pretrained.embedding_dim
VOCAB_SIZE = embeddings_pretrained.num_embeddings
EMB_PRETRAINED = True

# Model

In [0]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, 
                 emb_pretrained, embeddings):
        super(MyModel, self).__init__()
        self.emb_pretrained = emb_pretrained
        self.embedding =  embeddings if self.emb_pretrained else nn.Embedding(vocab_size, embed_size)    
        self.fc = nn.Linear(embed_size, 1)
    
    def forward(self, first, second):
        
        first = self.embedding(first)
        second = self.embedding(second)

        first = torch.sum(first, dim=1)/first.size()[1]
        second = torch.sum(second, dim=1)/second.size()[1]
        
        difference = first-second      
        x = self.fc(difference)
        return x

In [0]:
class Batch:
    "Object for holding a batch of data during training."
    def __init__(self, first, second, label):
        self.first = first
        self.second = second
        self.label = label


class BucketIteratorWrapper(DataLoader):
    __initialized = False

    def __init__(self, iterator: data.Iterator):
        self.batch_size = iterator.batch_size
        self.num_workers = 1
        self.collate_fn = None
        self.pin_memory = False
        self.drop_last = False
        self.timeout = 0
        self.worker_init_fn = None
        self.sampler = iterator
        self.batch_sampler = iterator
        self.__initialized = True

    def __iter__(self):
        return map(
            lambda batch: {'first': Batch(batch.first, batch.second, batch.label).first, 
                           'second': Batch(batch.first, batch.second, batch.label).second,
                           'targets': Batch(batch.first, batch.second, batch.label).label.unsqueeze(-1),
                        # 'targets': Batch(batch.text, batch.label).label,
                          },
            self.batch_sampler.__iter__()
        )

    def __len__(self):
        return len(self.batch_sampler)

In [0]:
config = {'tokenization/embeddings': 'domains',
            'batch_size': 128,
          'hidden_size' : 256,
            'num_epochs': 10}

In [41]:
model = MyModel(VOCAB_SIZE,
                embed_size=EMBEDDINGS_DIM,
                hidden_size=config['hidden_size'],
                emb_pretrained = EMB_PRETRAINED,
                embeddings = embeddings_pretrained
               )

model.to(device)
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(config['batch_size'], config['batch_size'], config['batch_size']),
    shuffle=True,
    device=device,
    sort=False,
    sort_key=lambda x: len(x.first),
    sort_within_batch=False,
)

# train_iterator = BucketIteratorWrapper(train_iterator)
# valid_iterator = BucketIteratorWrapper(valid_iterator)
# test_iterator = BucketIteratorWrapper(test_iterator)

optimizer = optim.Adam(model.parameters(), weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, verbose=True, cooldown=5)
criterion = nn.BCEWithLogitsLoss()
criterion.to(device)

BCEWithLogitsLoss()

In [0]:
os.chdir('/content/')
logdir = '/content/'
RUN_NAME = 'domains'
RUN_ID = 's,bdbvcbbn'

In [0]:
from tqdm import tqdm
def clean_tqdm():
    for instance in list(tqdm._instances): 
        tqdm._decr_instances(instance)

for e in tqdm([1,2,3]):
    pass

In [0]:
runner = dl.SupervisedRunner(device=device)
loaders = OrderedDict(
    {'train': train_iterator,
    'valid': valid_iterator}
)

clean_tqdm()
runner.train(
    model=model, 
    criterion=criterion,
    optimizer=optimizer, 
    scheduler=scheduler,
    loaders=loaders,
    logdir=logdir,
    num_epochs=config['num_epochs'],
    verbose=True,
    valid_loader="valid",
    callbacks=[AccuracyCallback(num_classes=2,
                                activation='Sigmoid',
                                threshold=0.5
                                ),
               EarlyStoppingCallback(patience=4),
               WandbLogger(log_on_batch_end=True,
                           project="dpl",
                           name=RUN_NAME,
                           config=config,
                           id=RUN_ID
                           )],
    monitoring_params={
                    "project": "dpl",
                    'tags': 'lstm',
                    'config': config,
    }
)

In [0]:
results = torch.load('/content/checkpoints/train.2.pth', map_location=device)
model.load_state_dict(results['model_state_dict'])

In [0]:
!cp "/content/checkpoints/train.2.pth" "/content/drive/My Drive/model_checkpoints/"

In [0]:
def accuracy_score(preds, y):
    preds = torch.round(torch.sigmoid(preds))
    preds = (preds == y).float()
    accuracy = preds.sum() / len(preds)
    return accuracy.item()

In [0]:
def test_model(model, test_iterator):
    test_acc = []

    with torch.no_grad():
        for item in test_iterator:
            #x = item['features']
            #y = item['targets']
            first = item.first
            second = item.second
            y = item.label
            preds = model(first, second).squeeze(1)
            test_acc.append(accuracy_score(preds, y))
    #print(type(test_acc), type(test_acc[0]))
    test_acc = np.mean(test_acc) 
    return np.mean(test_acc)

In [0]:
test_accuracy = test_model(model, test_iterator)
print('Test accuracy: {}'.format(np.mean(test_accuracy)))

Test accuracy: 0.6202388583568105


In [0]:
wandb.init(id=RUN_ID, config=config)
wandb.log({"Test accc" : test_accuracy})