In [0]:
# !pip install eli5

In [4]:
from sklearn.base import BaseEstimator, ClassifierMixin
import eli5
from eli5.lime import TextExplainer

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from transformers import BertTokenizer, BertModel, GPT2Model, GPT2Tokenizer

In [None]:
# TODO Заменить на данные из итогового датасета

In [2]:
def open_file(file):
    with open(file, 'r', encoding='utf-8') as f:
        text_list = [line for line in f.readlines()]
    return text_list

In [3]:
fake = open_file("data/fake.txt")
real = open_file("data/real.txt")

In [7]:
pretrained_weights = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_weights)
model = GPT2Model.from_pretrained(pretrained_weights)

In [8]:
embeddings_pretrained = model.get_input_embeddings()
embeddings_pretrained

Embedding(50257, 768)

In [9]:
EMBEDDINGS_DIM = embeddings_pretrained.embedding_dim
VOCAB_SIZE = embeddings_pretrained.num_embeddings
EMB_PRETRAINED = True

In [10]:
def tokenize(text, tokenizer=tokenizer):
    return tokenizer.encode(text)

In [11]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, 
                 emb_pretrained, embeddings):
        super(MyModel, self).__init__()
        self.emb_pretrained = emb_pretrained
        self.embedding =  embeddings if self.emb_pretrained else nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True,
                          )
        
        self.fc = nn.Linear(hidden_size * 2 *2, 1)
    def forward(self, x):
        
        x = self.embedding(x)
           
        _, (hidden, cell) = self.rnn(x)
        
        hidden = hidden.transpose(0,1)
        cell = cell.transpose(0,1)
        hidden = hidden.contiguous().view(hidden.size(0),-1)
        cell = cell.contiguous().view(cell.size(0),-1)
        x = torch.cat([hidden, cell], dim=1).squeeze(1)
        x = self.fc(x)
        return x

In [14]:
config = {'tokenization/embeddings': 'gpt2',
            'batch_size': 256,
          'hidden_size' : 128,
            'num_epochs': 10}

In [56]:
model = MyModel(VOCAB_SIZE,
                embed_size=EMBEDDINGS_DIM,
                hidden_size=config['hidden_size'],
                emb_pretrained = EMB_PRETRAINED,
                embeddings = embeddings_pretrained
               )
# model.to(device)

In [66]:
results = torch.load('data/train.2.pth', map_location=torch.device('cpu'))
model.load_state_dict(results['model_state_dict'])

<All keys matched successfully>

In [83]:
class LSTMClassifier(BaseEstimator, ClassifierMixin):
    
    def __init__(self, model):
        super(LSTMClassifier, self).__init__() 
        self.model = model
        self.classes_ = (0,1)

    def fit(self, X=None, y=None, **kwargs):
        return self

    def predict_proba(self, texts):
        """
        texts: list of texts
        :return: ndarray n_texts x n_classes
        """
    
        ids = [tokenizer.encode(text) for text in texts]
        
        for ind, el in enumerate(ids):
            if len(el) < len(ids[0]):
                while len(el) < len(ids[0]):
                    el.append(1)
            if len(el) > len(ids[0]):
                ids[ind] = el[:len(ids[0])]            
        
        tensor = torch.tensor(ids)
        self.model.eval()
        with torch.no_grad():
            logits = self.model.forward(tensor)
            print('logits',logits)
        sigmoids = torch.softmax(logits, dim=-1)  # First predict the 'Real' prob
        print('sigmoids', sigmoids)
        print('round', torch.round(sigmoids))
        opposite_class_prob = 1 - sigmoids  # Then calculate the 'Fake' prob
        print('opposite', opposite_class_prob)
        probs = torch.cat((sigmoids, opposite_class_prob), dim=-1)
        
        return probs.detach().numpy()

    def predict(self, text):
        return int(torch.round(self.predict_proba(text)).item())

In [75]:
model_estimator = LSTMClassifier(model)
model_estimator.fit()

LSTMClassifier(model=MyModel(
  (embedding): Embedding(50257, 768)
  (rnn): LSTM(768, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
))

In [65]:
model_estimator.predict_proba([fake[0]])

logits tensor([[0.0498]])
sigmoids tensor([[0.5125]])
opposite tensor([[0.4875]])


array([[0.5124597, 0.4875403]], dtype=float32)

In [84]:
model_estimator = LSTMClassifier(model)
model_estimator.fit()

LSTMClassifier(model=MyModel(
  (embedding): Embedding(50257, 768)
  (rnn): LSTM(768, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
))

In [85]:
model_estimator.predict_proba([real[-11]])

logits tensor([[-16.5015]])
sigmoids tensor([[1.]])
round tensor([[1.]])
opposite tensor([[0.]])


array([[1., 0.]], dtype=float32)

In [46]:
# help(TextExplainer)

In [47]:
# help(te.fit)

In [61]:
# from IPython.display import display, HTML

# for i in range(1, 51, 10):
#     text_fake = fake[i]
#     text_real = real[i]
#     te = TextExplainer(random_state=42)
#     print(model_estimator.predict_proba([text_fake]))
#     te.fit(doc=text_fake, predict_proba=model_estimator.predict_proba)
#     print('True label: Fake')
#     display(te.show_prediction(target_names=['Fake','Real']))
    
#     print(model_estimator.predict_proba([text_real]))
#     te.fit(doc=text_real, predict_proba=model_estimator.predict_proba)
#     print('True label: Real')
#     display(te.show_prediction(target_names=['Fake','Real']))

In [None]:
text = real[44]
text
print(model_estimator.predict_proba([text]))
te = TextExplainer(random_state=42)
te.fit(doc=text, predict_proba=model_estimator.predict_proba)
te.show_prediction(target_names=['Fake','Real'])