In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from gensim.models import Word2Vec
import gensim.downloader as api
from nltk.tokenize import word_tokenize
import nltk
from nltk.probability import FreqDist
from torchmetrics import F1Score


nltk.download('punkt')
import warnings
warnings.filterwarnings("ignore")


In [5]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


In [6]:
torch.manual_seed(2023)

<torch._C.Generator at 0x7fe0e4139670>

In [8]:
#загрузка данных и предобработка
fake_df = pd.read_csv('datasets/Fake.csv')
fake_df['class'] = 0
true_df = pd.read_csv('datasets/True.csv')
true_df['class'] = 1


true_df["text"] = true_df["title"] + " " + true_df["text"]
fake_df["text"] = fake_df["title"] + " " + fake_df["text"]

true_df = true_df.drop(["subject", "date", "title"], axis=1)
fake_df = fake_df.drop(["subject", "date", "title"], axis=1)

news_df = pd.concat([true_df, fake_df]).reset_index(drop=True)
news_df = news_df.sample(frac=1, random_state=2023).reset_index(drop=True)

In [9]:
news_df.head()

Unnamed: 0,text,class
0,AWESOME RANT By African-American Woman Who’s F...,0
1,Meghan McCain: Ted Cruz Is ‘The Thinking Man’...,0
2,China slams Indian minister's visit to dispute...,1
3,OBAMA AND VALERIE JARRETT Finalize Executive A...,0
4,Trump Gives Insane Warning On Religious Right...,0


**Получим эмбеддинги для текстов**

In [10]:
word2vec_model = api.load("word2vec-google-news-300")



In [11]:
def text_to_vector(text, model):
    words = text.split()
    vectorized_words = [model[word] for word in words if word in model]
    if vectorized_words:
        return np.mean(vectorized_words, axis=0)
    else:
        return np.zeros(model.vector_size)

def pad_batch(batch):
    data = [torch.Tensor(item[0]).long() for item in batch]
    data = pad_sequence(data)
    labels = torch.Tensor([item[1] for item in batch]).long()
    return data, labels

embeds = np.array([text_to_vector(text, word2vec_model) for text in tqdm(news_df['text'])])


100%|██████████| 44898/44898 [01:38<00:00, 453.79it/s]


In [12]:
X_train, X_test, y_train, y_test = train_test_split(news_df['text'].to_numpy(),
                                                    news_df['class'].to_numpy(),
                                                    test_size=0.2,
                                                    random_state=2023,
                                                    shuffle=True)

In [13]:
train_corpus = " ".join(news_df["text"])
train_corpus = train_corpus.lower()
train_corpus = list(news_df["text"])

tokens = []
for text in tqdm(train_corpus):
  tokens.extend(word_tokenize(text))
tokens_filtered = [word for word in tokens if word.isalnum()]

100%|██████████| 44898/44898 [04:25<00:00, 169.28it/s]


In [14]:
max_words = 5000
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [15]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}
len(vocabulary)

4999

In [16]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [17]:
import numpy as np

def load_pretrained_vectors(word2idx, model):
    # Инициализация случайных эмбеддингов
    embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), model.vector_size))
    embeddings[word2idx['<pad>']] = np.zeros((model.vector_size,))

    count = 0
    for word, index in word2idx.items():
        if word in model:
            embeddings[index] = model[word]
            count += 1

    print(f"There are {count} / {len(word2idx)} pretrained vectors found.")
    return embeddings

vocabulary['<pad>'] = 0
embeddings = load_pretrained_vectors(vocabulary, word2vec_model)


There are 4912 / 5000 pretrained vectors found.


In [18]:
embeddings.shape

(5000, 300)

In [None]:
class ConvTextClassifierPretrainedW2V(nn.Module):
    def __init__(self, vocab_size=5000, embedding_dim=300, out_channel=128, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embeddings, freeze=False)
        self.conv = nn.Conv1d(embedding_dim, out_channel, kernel_size=3)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool1d(kernel_size=2)
        self.batchnorm = nn.BatchNorm1d(out_channel)
        self.dropout = nn.Dropout(0.5)

        self.linear = nn.Linear(out_channel, num_classes)

    def forward(self, x):
        output = self.embedding(x)
        output = output.permute(0, 2, 1)  # bs, emb_dim, len
        output = self.conv(output)
        output = self.relu(output)
        output = self.maxpool(output)
        output = self.batchnorm(output)
        output = output.view(output.size(0), -1)
        output = self.dropout(output)
        output = self.linear(output)
        return output

In [19]:
class TextDataWrapper(Dataset):
    def __init__(self, data, target=None, transform=None):
        self.data = torch.from_numpy(data).long()
        if target is not None:
            self.target = torch.from_numpy(target).long()
        else:
          self.target = None
        self.transform = transform

    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index] if self.target is not None else -1

        if self.transform:
            x = self.transform(x)
        return x, y

    def __len__(self):
        return len(self.data)

In [20]:
max_len = 40

In [21]:
X_train_c = np.array([text_to_sequence(text, max_len) for text in tqdm(X_train)], dtype=np.float32)

100%|██████████| 35918/35918 [02:16<00:00, 263.71it/s]


In [22]:
X_test_c = np.array([text_to_sequence(text, max_len) for text in tqdm(X_test)], dtype=np.float32)

100%|██████████| 8980/8980 [00:33<00:00, 271.17it/s]


In [69]:
model = ConvTextClassifierPretrainedW2V()

In [None]:
batch_size = 256
epochs = 10

# Переносим модель на GPU, если доступен
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

print(model)
print("Parameters:", sum([param.nelement() for param in model.parameters()]))

model.train()
f1 = F1Score(task="binary").to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

train_dataset = TextDataWrapper(X_train_c, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

loss_history = []

for epoch in range(1, epochs + 1):
    print(f"Train epoch {epoch}/{epochs}")
    temp_loss = []
    temp_metrics = []
    for i, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()

        # Переносим данные на GPU
        data, target = data.to(device), target.to(device)

        output = model(data)

        loss = criterion(output, target)
        loss.backward()

        optimizer.step()
        temp_loss.append(loss.float().item())
        temp_metrics.append(f1(output.argmax(1), target).item())

    epoch_loss = np.array(temp_loss).mean()
    epoch_f1 = np.array(temp_metrics).mean()
    print(f'Loss: {epoch_loss}, f1 score: {epoch_f1}')


In [27]:
def predict(model, data_loader):
    model.eval()
    predictions = []

    with torch.no_grad():
        for data, _ in data_loader:
            data = data.to(device)
            output = model(data)
            predicted_classes = output.argmax(dim=1)
            predictions.extend(predicted_classes.cpu().numpy())

    model.train()
    return predictions


In [33]:
batch_size = 256
test_dataset = TextDataWrapper(X_test_c, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [55]:
predictions = predict(model, test_loader)

In [56]:
print('Classification report for CNN model')
print(classification_report(y_test, predictions))

Classification report for CNN model
              precision    recall  f1-score   support

           0       0.52      0.52      0.52      4695
           1       0.47      0.47      0.47      4285

    accuracy                           0.50      8980
   macro avg       0.50      0.50      0.50      8980
weighted avg       0.50      0.50      0.50      8980



In [37]:
class LSTM(nn.Module):
    def __init__(self, vocab_size=5000, embedding_dim=300, hidden_dim=128, num_classes=2):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embeddings, freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*4, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = self.dropout(lstm_out)
        avg_pool = torch.mean(lstm_out, 1)
        max_pool, _ = torch.max(lstm_out, 1)
        concatenated = torch.cat((avg_pool, max_pool), 1)
        output = self.fc(concatenated)
        return output

In [38]:
embeddings = torch.tensor(embeddings, dtype=torch.float32)
model = LSTM()

In [39]:
embeddings.shape

torch.Size([5000, 300])

In [40]:
train_dataset = TextDataWrapper(X_train_c, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [42]:
batch_size = 256
epochs = 10

# Переносим модель на GPU, если доступен
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

print(model)
print("Parameters:", sum([param.nelement() for param in model.parameters()]))

model.train()
f1 = F1Score(task="binary").to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

train_dataset = TextDataWrapper(X_train_c, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

loss_history = []

for epoch in range(1, epochs + 1):
    print(f"Train epoch {epoch}/{epochs}")
    temp_loss = []
    temp_metrics = []
    for i, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()

        # Переносим данные на GPU
        data, target = data.to(device), target.to(device)

        output = model(data)

        loss = criterion(output, target)
        loss.backward()

        optimizer.step()
        temp_loss.append(loss.float().item())
        temp_metrics.append(f1(output.argmax(1), target).item())

    epoch_loss = np.array(temp_loss).mean()
    epoch_f1 = np.array(temp_metrics).mean()
    print(f'Loss: {epoch_loss}, f1 score: {epoch_f1}')


LSTM(
  (embedding): Embedding(5000, 300)
  (lstm): LSTM(300, 128, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
Parameters: 2336610
Train epoch 1/10
Loss: 0.23607120942984913, f1 score: 0.8800093140585202
Train epoch 2/10
Loss: 0.11565913915211427, f1 score: 0.9544384192067681
Train epoch 3/10
Loss: 0.08457242138683796, f1 score: 0.968553051035455
Train epoch 4/10
Loss: 0.06392311851711983, f1 score: 0.9766937963506008
Train epoch 5/10
Loss: 0.04494608222698489, f1 score: 0.9846901111568965
Train epoch 6/10
Loss: 0.02563950370217786, f1 score: 0.9918507261479155
Train epoch 7/10
Loss: 0.01971150945264397, f1 score: 0.9930601885132756
Train epoch 8/10
Loss: 0.013000257683964116, f1 score: 0.9961216745647132
Train epoch 9/10
Loss: 0.008637847037447603, f1 score: 0.9975270168155643
Train epoch 10/10
Loss: 0.012514783503778331, f1 score: 0.9957092548093052


In [43]:
predictions = predict(model, test_loader)

In [44]:
print('Classification report for LSTM model')
print(classification_report(y_test, predictions))

Classification report for LSTM model
              precision    recall  f1-score   support

           0       0.52      0.53      0.52      4695
           1       0.48      0.47      0.48      4285

    accuracy                           0.50      8980
   macro avg       0.50      0.50      0.50      8980
weighted avg       0.50      0.50      0.50      8980

