In [0]:
# !pip install eli5

In [18]:
from IPython.display import display, HTML
from sklearn.base import BaseEstimator, ClassifierMixin
import eli5
from eli5.lime import TextExplainer

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from transformers import BertTokenizer, BertModel, GPT2Model, GPT2Tokenizer

In [19]:
# TODO Заменить на данные из итогового датасета

In [20]:
def open_file(file):
    with open(file, 'r', encoding='utf-8') as f:
        text_list = [line for line in f.readlines()]
    return text_list

In [21]:
fake = open_file("data/fake.txt")
real = open_file("data/real.txt")

In [22]:
pretrained_weights = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_weights)
model = GPT2Model.from_pretrained(pretrained_weights)

In [23]:
embeddings_pretrained = model.get_input_embeddings()
embeddings_pretrained

Embedding(50257, 768)

In [24]:
EMBEDDINGS_DIM = embeddings_pretrained.embedding_dim
VOCAB_SIZE = embeddings_pretrained.num_embeddings
EMB_PRETRAINED = True

In [25]:
def tokenize(text, tokenizer=tokenizer):
    return tokenizer.encode(text)

In [26]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, 
                 emb_pretrained, embeddings):
        super(MyModel, self).__init__()
        self.emb_pretrained = emb_pretrained
        self.embedding =  embeddings if self.emb_pretrained else nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True,
                          )
        
        self.fc = nn.Linear(hidden_size * 2 *2, 1)
    def forward(self, x):
        
        x = self.embedding(x)
           
        _, (hidden, cell) = self.rnn(x)
        
        hidden = hidden.transpose(0,1)
        cell = cell.transpose(0,1)
        hidden = hidden.contiguous().view(hidden.size(0),-1)
        cell = cell.contiguous().view(cell.size(0),-1)
        x = torch.cat([hidden, cell], dim=1).squeeze(1)
        x = self.fc(x)
        return x

In [27]:
config = {'tokenization/embeddings': 'gpt2',
            'batch_size': 256,
          'hidden_size' : 128,
            'num_epochs': 10}

In [28]:
model = MyModel(VOCAB_SIZE,
                embed_size=EMBEDDINGS_DIM,
                hidden_size=config['hidden_size'],
                emb_pretrained = EMB_PRETRAINED,
                embeddings = embeddings_pretrained
               )
# model.to(device)

In [29]:
results = torch.load('data/train.1.pth', map_location=torch.device('cpu'))
model.load_state_dict(results['model_state_dict'])

<All keys matched successfully>

In [33]:
class LSTMClassifier(BaseEstimator, ClassifierMixin):
    
    def __init__(self, model):
        super(LSTMClassifier, self).__init__() 
        self.model = model
        self.classes_ = (0,1)

    def fit(self, X=None, y=None, **kwargs):
        return self

    def predict_proba(self, texts):
        """
        texts: list of texts
        :return: ndarray n_texts x n_classes
        """
    
        ids = [tokenizer.encode(text) for text in texts]
        
        for ind, el in enumerate(ids):
            if len(el) < len(ids[0]):
                while len(el) < len(ids[0]):
                    el.append(1)
            if len(el) > len(ids[0]):
                ids[ind] = el[:len(ids[0])]            
        
#         print(ids)
        tensor = torch.tensor(ids)
        self.model.eval()
        with torch.no_grad():
            logits = self.model.forward(tensor)
#             print('logits',logits)
        sigmoids = torch.sigmoid(logits)  # First predict the 'Real' prob
#         print('sigmoids', sigmoids)
#         print('round', torch.round(sigmoids))
        opposite_class_prob = 1 - sigmoids  # Then calculate the 'Fake' prob
#         print('opposite', opposite_class_prob)
        probs = torch.cat((sigmoids, opposite_class_prob), dim=-1)
        
        return probs.detach().numpy()

    def predict(self, text):
        return int(torch.round(self.predict_proba(text)).item())

In [34]:
model_estimator = LSTMClassifier(model)
model_estimator.fit()

LSTMClassifier(model=MyModel(
  (embedding): Embedding(50257, 768)
  (rnn): LSTM(768, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
))

In [35]:
model_estimator.predict_proba([fake[0]])

array([[0.36928353, 0.63071644]], dtype=float32)

In [41]:
text = fake[23]
text
print(model_estimator.predict_proba([text]))
te = TextExplainer(random_state=42)
te.fit(doc=text, predict_proba=model_estimator.predict_proba)
te.show_prediction(target_names=['Real','Fake'])

[[0.05863371 0.9413663 ]]


Contribution?,Feature
5.472,Highlighted in text (sum)
0.55,<BIAS>


In [42]:
text = real[23]
text
print(model_estimator.predict_proba([text]))
te = TextExplainer(random_state=42)
te.fit(doc=text, predict_proba=model_estimator.predict_proba)
te.show_prediction(target_names=['Real','Fake'])

[[0.24789494 0.75210506]]


Contribution?,Feature
6.568,Highlighted in text (sum)
0.566,<BIAS>


In [44]:
for i in range(1, 51, 10):
    text_fake = fake[i]
    text_real = real[i]
    te = TextExplainer(random_state=42)
    print(model_estimator.predict_proba([text_fake]))
    te.fit(doc=text_fake, predict_proba=model_estimator.predict_proba)
    print('True label: Fake')
    display(te.show_prediction(target_names=['Real','Fake']))
    
    print(model_estimator.predict_proba([text_real]))
    te.fit(doc=text_real, predict_proba=model_estimator.predict_proba)
    print('True label: Real')
    display(te.show_prediction(target_names=['Real','Fake']))

[[0.5457914 0.4542086]]
True label: Fake


Contribution?,Feature
0.954,<BIAS>
0.689,Highlighted in text (sum)


[[0.9733819  0.02661812]]
True label: Real


Contribution?,Feature
2.957,Highlighted in text (sum)
0.57,<BIAS>


[[0.01277697 0.987223  ]]
True label: Fake


Contribution?,Feature
13.99,Highlighted in text (sum)
0.462,<BIAS>


[[0.7632493  0.23675072]]
True label: Real


Contribution?,Feature
3.719,Highlighted in text (sum)
0.629,<BIAS>


[[0.9972796  0.00272042]]
True label: Fake


Contribution?,Feature
0.615,<BIAS>
-0.425,Highlighted in text (sum)


[[0.96505564 0.03494436]]
True label: Real


Contribution?,Feature
0.897,Highlighted in text (sum)
0.663,<BIAS>


[[0.9852267  0.01477331]]
True label: Fake


Contribution?,Feature
1.442,Highlighted in text (sum)
-0.638,<BIAS>


[[0.95366627 0.04633373]]
True label: Real


Contribution?,Feature
3.521,Highlighted in text (sum)
0.544,<BIAS>


[[0.27970383 0.72029614]]
True label: Fake


Contribution?,Feature
3.65,Highlighted in text (sum)
0.438,<BIAS>


[[0.9732467  0.02675331]]
True label: Real


Contribution?,Feature
3.289,Highlighted in text (sum)
0.547,<BIAS>
