In [None]:
!pip install -q kagglehub

In [None]:
import os
import re
from pathlib import Path
import html

from kaggle.api.kaggle_api_extended import KaggleApi

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel

from tqdm import tqdm

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device = {device}")

Device = cuda


# Парсим данные с Kaggle

In [None]:
def download_kaggle_dataset(dataset_id, target_dir):
    target_path = Path(target_dir)

    if not target_path.exists():
        try:
            target_path.mkdir(parents=True, exist_ok=True)
            print(f"Создана директория: {target_path}")
        except Exception as err:
            print(f"Не удалось создать директорию {target_path}: {err}")
            return

    try:
        api = KaggleApi()
        api.authenticate()
    except Exception as err:
        print(f"Ошибка аутентификации в Kaggle API: {err}")
        return

    try:
        print(f"Начинается загрузка набора данных '{dataset_id}' в {target_path} ...")
        api.dataset_download_files(dataset_id, path=str(target_path), unzip=True)
    except Exception as err:
        print(f"Ошибка при загрузке набора данных: {err}")
        return

    print(f"Набор данных '{dataset_id}' успешно загружен в {target_path}")

In [None]:
 download_kaggle_dataset(
    dataset_id="bhavikjikadara/fake-news-detection",
    target_dir="/content/"
)

Начинается загрузка набора данных 'bhavikjikadara/fake-news-detection' в /content ...
Dataset URL: https://www.kaggle.com/datasets/bhavikjikadara/fake-news-detection
Набор данных 'bhavikjikadara/fake-news-detection' успешно загружен в /content


# Анализ полученного датасета и очистка

In [None]:
df_fake = pd.read_csv('/content/fake.csv')
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [None]:
df_real = pd.read_csv('/content/true.csv')
df_real.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [None]:
df_fake['label'] = 'Fake'
df_real['label'] = 'Real'
df = pd.concat([df_fake,df_real])
df.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",Fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",Fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",Fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",Fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",Fake


In [None]:
def make_properly_columns(df):
    df['TEXT'] = df['title'] + df['text']
    del df['date']
    del df['subject']
    del df['text']
    del df['title']
    return df

def binarize_classes(df):
    classes = {
        'Fake': 0,
        'Real': 1
    }
    df['label'] = df['label'].replace(classes)
    return df

def clear_nans(df):
    nans_number = df.isnull().sum().sum()

    if nans_number:
        df.dropna()
    else:
        print("No NaNs in the dataframe")
    return df

def clean_sentence(sentence):
    sentence = str(sentence).lower()
    sentence = html.unescape(sentence)
    sentence = re.sub(r'<[^>]+>', '', sentence)
    sentence = re.sub(r'http\S+', '', sentence)
    sentence = re.sub(r'@\w+', '', sentence)
    sentence = re.sub(r'#\w+', '', sentence)
    sentence = re.sub(r'[^\w\s]', '', sentence)
    sentence = re.sub(r'\d+', '', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    sentence = re.sub(r'\b\w\b', '', sentence)
    return sentence.strip()

In [None]:
df = make_properly_columns(df)
df = binarize_classes(df)
df.head()

  df['label'] = df['label'].replace(classes)


Unnamed: 0,label,TEXT
0,0,Donald Trump Sends Out Embarrassing New Year’...
1,0,Drunk Bragging Trump Staffer Started Russian ...
2,0,Sheriff David Clarke Becomes An Internet Joke...
3,0,Trump Is So Obsessed He Even Has Obama’s Name...
4,0,Pope Francis Just Called Out Donald Trump Dur...


In [None]:
df['TEXT'] = df['TEXT'].apply(clean_sentence)
df.head()

Unnamed: 0,label,TEXT
0,0,donald trump sends out embarrassing new years ...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obamas name c...
4,0,pope francis just called out donald trump duri...


In [None]:
df = clear_nans(df)

No NaNs in the dataframe


In [None]:
df = df.sample(frac=1)

In [None]:
print(f"Общее число сэмплов: {len(df)}")
print(f"Число положительных меток класса: {sum(df['label'] == 1)}")
print(f"Число отрицательных меток класса: {sum(df['label'] == 0)}")

Общее число сэмплов: 44898
Число положительных меток класса: 21417
Число отрицательных меток класса: 23481


# Будем использовать уже обученные эмбеддинги Берта

In [None]:
def get_bert_embedding(text):
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        return model(**tokens).last_hidden_state[:, 0, :].cpu().numpy()

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
df["Embeddings"] = df["TEXT"].apply(get_bert_embedding)

In [None]:
df.head()

Unnamed: 0,label,TEXT,Embeddings
23398,0,boiler room ep what is real brussels yemen tru...,"[[-0.52300113, -0.29880503, -0.10092531, 0.090..."
4060,1,us congress may seek oneweek funding extension...,"[[-0.35638985, -0.15991126, 0.0751229, 0.53028..."
12527,1,tillerson set to meet trudeau for north korea ...,"[[-0.2338068, -0.22895521, 0.0683407, 0.257630..."
18656,0,former fbi asst director jim comey danced with...,"[[-0.35910344, -0.23442967, -0.21880776, -0.10..."
7405,0,antitrump protesters pepper sprayed again vide...,"[[-0.38558483, -0.42751813, 0.08084756, 0.3130..."


# Разбиваем на train val test и cтроим класс датасета, даталоадеры

In [None]:
X = df['Embeddings']
y = df['label']

In [None]:
X_train, X_tmp, y_train, y_tmp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_tmp, y_tmp, test_size=0.5, random_state=42, stratify=y_tmp)

print(f"В тренировочной выборке {len(X_train)} элементов")
print(f"В валидационной выборке {len(X_val)} элементов")
print(f"В тестовой выборке {len(X_test)} элементов")

В тренировочной выборке 35918 элементов
В валидационной выборке 4490 элементов
В тестовой выборке 4490 элементов


In [None]:
class ClassificationDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y.to_numpy()

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        X_tensor = torch.tensor(self.X.iloc[idx], dtype=torch.float32)
        y_tensor = torch.tensor(self.y[idx], dtype=torch.long)
        return X_tensor, y_tensor

In [None]:
train_dataset = ClassificationDataset(X_train, y_train)
val_dataset = ClassificationDataset(X_val, y_val)
test_dataset = ClassificationDataset(X_test, y_test)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False
)

# Модель: LSTM с линейной головой для классификации

In [None]:
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, num_lstm_layers, dropout):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_lstm_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.batch_norm = nn.BatchNorm1d(hidden_dim)
        self.fc1 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.relu = nn.Tanh()
        self.fc2 = nn.Linear(hidden_dim // 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x, (hidden, cell) = self.lstm(x)
        x = x[:, -1, :]
        x = self.batch_norm(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [38]:
model = LSTMClassifier(
    embedding_dim=768,
    hidden_dim=512,
    output_dim=2,
    num_lstm_layers=2,
    dropout=0.21
)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

model_parameters = sum(p.numel() for p in model.parameters())
print(f'Тренируемых параметров в модели: {model_parameters} штук')

Тренируемых параметров в модели: 4859650 штук


# Обучение модели

In [39]:
def run_epoch(model, data_loader, criterion, optimizer=None, is_training=True, device=None):
    model.train() if is_training else model.eval()
    epoch_losses = []
    correct_preds = 0
    total_preds = 0

    with torch.set_grad_enabled(is_training):
        for X_batch, y_batch in tqdm(data_loader, desc="Обработка батча"):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            if is_training:
                optimizer.zero_grad()

            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            epoch_losses.append(loss.item())

            if is_training:
                loss.backward()
                optimizer.step()

            _, predicted = torch.max(outputs, 1)
            correct_preds += (predicted == y_batch).sum().item()
            total_preds += y_batch.size(0)

    avg_loss = np.mean(epoch_losses)
    accuracy = correct_preds / total_preds
    return avg_loss, accuracy

def train(model, criterion, optimizer, num_epochs, params_dir, train_loader, val_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(1, num_epochs + 1):
        train_loss, train_acc = run_epoch(model, train_loader, criterion, optimizer, is_training=True, device=device)

        val_loss, val_acc = run_epoch(model, val_loader, criterion, is_training=False, device=device)

        print(f"Epoch {epoch}/{num_epochs}")
        print(f"  Train | Loss: {train_loss:.3f} | Acc: {train_acc:.3f}")
        print(f"  Val   | Loss: {val_loss:.3f}   | Acc: {val_acc:.3f}")

    torch.save(model.state_dict(), params_dir)
    print(f"Веса модели сохранены в {params_dir}")

In [40]:
train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=16,
    params_dir="/content/LSTMClassifier.pth",
    train_loader=train_dataloader,
    val_loader=val_dataloader
)

Обработка батча: 100%|██████████| 1123/1123 [00:06<00:00, 176.15it/s]
Обработка батча: 100%|██████████| 141/141 [00:00<00:00, 604.31it/s]


Epoch 1/16
  Train | Loss: 0.167 | Acc: 0.943
  Val   | Loss: 0.115   | Acc: 0.961


Обработка батча: 100%|██████████| 1123/1123 [00:05<00:00, 189.11it/s]
Обработка батча: 100%|██████████| 141/141 [00:00<00:00, 612.46it/s]


Epoch 2/16
  Train | Loss: 0.118 | Acc: 0.959
  Val   | Loss: 0.065   | Acc: 0.977


Обработка батча: 100%|██████████| 1123/1123 [00:06<00:00, 176.06it/s]
Обработка батча: 100%|██████████| 141/141 [00:00<00:00, 584.97it/s]


Epoch 3/16
  Train | Loss: 0.106 | Acc: 0.965
  Val   | Loss: 0.075   | Acc: 0.977


Обработка батча: 100%|██████████| 1123/1123 [00:05<00:00, 188.31it/s]
Обработка батча: 100%|██████████| 141/141 [00:00<00:00, 424.99it/s]


Epoch 4/16
  Train | Loss: 0.105 | Acc: 0.965
  Val   | Loss: 0.120   | Acc: 0.961


Обработка батча: 100%|██████████| 1123/1123 [00:06<00:00, 178.11it/s]
Обработка батча: 100%|██████████| 141/141 [00:00<00:00, 605.56it/s]


Epoch 5/16
  Train | Loss: 0.090 | Acc: 0.970
  Val   | Loss: 0.056   | Acc: 0.981


Обработка батча: 100%|██████████| 1123/1123 [00:06<00:00, 181.56it/s]
Обработка батча: 100%|██████████| 141/141 [00:00<00:00, 405.57it/s]


Epoch 6/16
  Train | Loss: 0.093 | Acc: 0.970
  Val   | Loss: 0.061   | Acc: 0.972


Обработка батча: 100%|██████████| 1123/1123 [00:06<00:00, 186.99it/s]
Обработка батча: 100%|██████████| 141/141 [00:00<00:00, 521.11it/s]


Epoch 7/16
  Train | Loss: 0.087 | Acc: 0.971
  Val   | Loss: 0.050   | Acc: 0.981


Обработка батча: 100%|██████████| 1123/1123 [00:06<00:00, 175.40it/s]
Обработка батча: 100%|██████████| 141/141 [00:00<00:00, 615.80it/s]


Epoch 8/16
  Train | Loss: 0.079 | Acc: 0.974
  Val   | Loss: 0.048   | Acc: 0.983


Обработка батча: 100%|██████████| 1123/1123 [00:05<00:00, 189.18it/s]
Обработка батча: 100%|██████████| 141/141 [00:00<00:00, 568.21it/s]


Epoch 9/16
  Train | Loss: 0.085 | Acc: 0.973
  Val   | Loss: 0.080   | Acc: 0.982


Обработка батча: 100%|██████████| 1123/1123 [00:06<00:00, 176.40it/s]
Обработка батча: 100%|██████████| 141/141 [00:00<00:00, 613.00it/s]


Epoch 10/16
  Train | Loss: 0.074 | Acc: 0.976
  Val   | Loss: 0.044   | Acc: 0.983


Обработка батча: 100%|██████████| 1123/1123 [00:05<00:00, 189.30it/s]
Обработка батча: 100%|██████████| 141/141 [00:00<00:00, 594.47it/s]


Epoch 11/16
  Train | Loss: 0.075 | Acc: 0.976
  Val   | Loss: 0.053   | Acc: 0.980


Обработка батча: 100%|██████████| 1123/1123 [00:06<00:00, 176.85it/s]
Обработка батча: 100%|██████████| 141/141 [00:00<00:00, 611.08it/s]


Epoch 12/16
  Train | Loss: 0.071 | Acc: 0.978
  Val   | Loss: 0.165   | Acc: 0.957


Обработка батча: 100%|██████████| 1123/1123 [00:05<00:00, 188.77it/s]
Обработка батча: 100%|██████████| 141/141 [00:00<00:00, 601.38it/s]


Epoch 13/16
  Train | Loss: 0.072 | Acc: 0.978
  Val   | Loss: 0.046   | Acc: 0.987


Обработка батча: 100%|██████████| 1123/1123 [00:06<00:00, 174.79it/s]
Обработка батча: 100%|██████████| 141/141 [00:00<00:00, 596.76it/s]


Epoch 14/16
  Train | Loss: 0.066 | Acc: 0.979
  Val   | Loss: 0.048   | Acc: 0.986


Обработка батча: 100%|██████████| 1123/1123 [00:05<00:00, 187.76it/s]
Обработка батча: 100%|██████████| 141/141 [00:00<00:00, 612.83it/s]


Epoch 15/16
  Train | Loss: 0.065 | Acc: 0.979
  Val   | Loss: 0.052   | Acc: 0.982


Обработка батча: 100%|██████████| 1123/1123 [00:06<00:00, 174.91it/s]
Обработка батча: 100%|██████████| 141/141 [00:00<00:00, 559.87it/s]

Epoch 16/16
  Train | Loss: 0.066 | Acc: 0.979
  Val   | Loss: 0.047   | Acc: 0.986
Веса модели сохранены в /content/LSTMClassifier.pth





# Тест модели

In [41]:
def evaluate(model, test_dataloader, device):
    model.eval()
    correct_preds = 0
    total_preds = 0

    with torch.no_grad():
        for X_batch, y_batch in tqdm(test_dataloader, desc="Тестирование"):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)

            correct_preds += (predicted == y_batch).sum().item()
            total_preds += y_batch.size(0)

    accuracy = (correct_preds / total_preds) * 100
    return accuracy

In [42]:
test_accuracy = evaluate(model, test_dataloader, device)
print(f"Test Accuracy: {test_accuracy:.3f}%")

Тестирование: 100%|██████████| 141/141 [00:00<00:00, 624.09it/s]

Test Accuracy: 98.241%



