In [6]:
!pip install torchtext --quiet

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import build_vocab_from_iterator, GloVe
from torchtext.data.utils import get_tokenizer
from tqdm import tqdm

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [2]:
# Re-load and Rebalance Dataset
df = pd.read_csv("Fake.csv")
df = df[['text', 'subject']].dropna()

# Re-encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['subject'])

# Print class counts
class_counts = df['label'].value_counts()
print("Before balancing:\n", class_counts)

# Balance: undersample to size of the smallest class
min_class_size = class_counts.min()

balanced_df = pd.concat([
    df[df['label'] == label].sample(min_class_size, random_state=42)
    for label in class_counts.index
])

balanced_df = balanced_df.sample(frac=1, random_state=42)  # Shuffle

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    balanced_df['text'], balanced_df['label'],
    test_size=0.2, stratify=balanced_df['label'], random_state=42
)

Before balancing:
 label
2    9050
5    6841
4    4459
0    1570
3     783
1     778
Name: count, dtype: int64


In [3]:
# Tokenizer and Vocab
tokenizer = get_tokenizer("basic_english")
def yield_tokens(data):
    for text in data:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(X_train), specials=["<pad>", "<unk>"])
vocab.set_default_index(vocab["<unk>"])
PAD_IDX = vocab["<pad>"]

In [4]:
# GloVe Embeddings
embedding_dim = 200
glove = GloVe(name='6B', dim=embedding_dim)
embedding_matrix = torch.zeros(len(vocab), embedding_dim)
for i, token in enumerate(vocab.get_itos()):
    embedding_matrix[i] = glove[token] if token in glove.stoi else torch.randn(embedding_dim)


In [6]:
# Dataset Class
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = [torch.tensor(vocab(tokenizer(t)), dtype=torch.long) for t in texts]
        self.labels = torch.tensor(labels, dtype=torch.long)
    def __len__(self): return len(self.labels)
    def __getitem__(self, idx): return self.texts[idx], self.labels[idx]

def collate_batch(batch):
    texts, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=PAD_IDX)
    return texts.to(device), torch.tensor(labels, dtype=torch.long).to(device)


In [8]:
train_dataset = TextDataset(X_train.tolist(), y_train.tolist())
test_dataset = TextDataset(X_test.tolist(), y_test.tolist())

label_counts = Counter(y_train.tolist())
weights = [1.0 / label_counts[label] for label in y_train.tolist()]
sampler = WeightedRandomSampler(weights, len(weights), replacement=True)

train_loader = DataLoader(train_dataset, batch_size=32, sampler=sampler, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)

In [9]:
# MLP Model
class MLPClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, embedding_weights):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_weights, freeze=False, padding_idx=PAD_IDX)
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.fc1 = nn.Linear(embed_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.drop1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.drop2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.embedding(x).permute(0, 2, 1)
        x = self.pool(x).squeeze(-1)
        x = torch.relu(self.bn1(self.fc1(x)))
        x = self.drop1(x)
        x = torch.relu(self.bn2(self.fc2(x)))
        x = self.drop2(x)
        return self.fc3(x)


In [10]:
def train_model(model, name, total_epochs=20):
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()
    best_f1 = 0

    for epoch in range(total_epochs):
        model.train()
        for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            optimizer.zero_grad()
            preds = model(X_batch)
            loss = criterion(preds, y_batch)
            loss.backward()
            optimizer.step()

        # Evaluation
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                preds = torch.argmax(model(X_batch), dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(y_batch.cpu().numpy())

        f1 = f1_score(all_labels, all_preds, average='weighted')
        print(f"Epoch {epoch+1} F1: {f1:.4f}")

        if f1 > best_f1:
            best_f1 = f1
            torch.save(model.state_dict(), f"{name}_best.pt")

        if f1 >= 0.8:
            print("Target F1 reached. Stopping.")
            break

    print(f"Best F1: {best_f1:.4f}")


In [13]:
model = MLPClassifier(len(vocab), embedding_dim, num_classes, embedding_matrix)
history_balanced = train_model(model, "mlp_balanced", total_epochs=20)

Epoch 1: 100%|████████████████████████████████| 117/117 [00:15<00:00,  7.36it/s]


Epoch 1 F1: 0.1389


Epoch 2: 100%|████████████████████████████████| 117/117 [00:14<00:00,  7.97it/s]


Epoch 2 F1: 0.0539


Epoch 3: 100%|████████████████████████████████| 117/117 [00:14<00:00,  8.35it/s]


Epoch 3 F1: 0.1917


Epoch 4: 100%|████████████████████████████████| 117/117 [00:06<00:00, 17.70it/s]


Epoch 4 F1: 0.2397


Epoch 5: 100%|████████████████████████████████| 117/117 [00:07<00:00, 16.64it/s]


Epoch 5 F1: 0.1798


Epoch 6: 100%|████████████████████████████████| 117/117 [00:06<00:00, 18.12it/s]


Epoch 6 F1: 0.3044


Epoch 7: 100%|████████████████████████████████| 117/117 [00:07<00:00, 16.16it/s]


Epoch 7 F1: 0.2928


Epoch 8: 100%|████████████████████████████████| 117/117 [00:05<00:00, 20.13it/s]


Epoch 8 F1: 0.3322


Epoch 9: 100%|████████████████████████████████| 117/117 [00:05<00:00, 20.99it/s]


Epoch 9 F1: 0.3296


Epoch 10: 100%|███████████████████████████████| 117/117 [00:06<00:00, 19.17it/s]


Epoch 10 F1: 0.2493


Epoch 11: 100%|███████████████████████████████| 117/117 [00:05<00:00, 19.76it/s]


Epoch 11 F1: 0.3178


Epoch 12: 100%|███████████████████████████████| 117/117 [00:06<00:00, 17.29it/s]


Epoch 12 F1: 0.4238


Epoch 13: 100%|███████████████████████████████| 117/117 [00:06<00:00, 18.46it/s]


Epoch 13 F1: 0.2197


Epoch 14: 100%|███████████████████████████████| 117/117 [00:06<00:00, 19.00it/s]


Epoch 14 F1: 0.3828


Epoch 15: 100%|███████████████████████████████| 117/117 [00:06<00:00, 17.02it/s]


Epoch 15 F1: 0.3033


Epoch 16: 100%|███████████████████████████████| 117/117 [00:07<00:00, 16.46it/s]


Epoch 16 F1: 0.3123


Epoch 17: 100%|███████████████████████████████| 117/117 [00:05<00:00, 19.80it/s]


Epoch 17 F1: 0.3415


Epoch 18: 100%|███████████████████████████████| 117/117 [00:06<00:00, 17.20it/s]


Epoch 18 F1: 0.1567


Epoch 19: 100%|███████████████████████████████| 117/117 [00:07<00:00, 15.95it/s]


Epoch 19 F1: 0.1603


Epoch 20: 100%|███████████████████████████████| 117/117 [00:06<00:00, 18.43it/s]


Epoch 20 F1: 0.2451
Best F1: 0.4238


In [16]:
from sklearn.metrics import classification_report, f1_score

def evaluate_model(model, name):
    model.load_state_dict(torch.load(f"{name}_best.pt"))
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            output = model(X_batch)
            preds = torch.argmax(output, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())

    print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))
    print("Weighted F1-score:", f1_score(all_labels, all_preds, average='weighted'))

In [19]:
evaluate_model(model, "mlp_balanced")

                 precision    recall  f1-score   support

Government News       0.46      0.23      0.30       155
    Middle-east       0.50      0.66      0.57       155
           News       0.92      0.58      0.71       156
        US_News       0.51      0.18      0.27       156
      left-news       0.30      0.47      0.36       156
       politics       0.27      0.43      0.33       156

       accuracy                           0.43       934
      macro avg       0.49      0.43      0.42       934
   weighted avg       0.49      0.43      0.42       934

Weighted F1-score: 0.4237896688394459


In [20]:
# CNN Classifier
class CNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, embedding_weights):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_weights, freeze=False, padding_idx=PAD_IDX)
        self.conv1 = nn.Conv1d(in_channels=embed_dim, out_channels=128, kernel_size=5, stride=1)
        self.bn1 = nn.BatchNorm1d(128)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.embedding(x)  # [B, T, E]
        x = x.permute(0, 2, 1)  # [B, E, T]
        x = torch.relu(self.bn1(self.conv1(x)))  # [B, C, T']
        x = self.pool(x).squeeze(-1)  # [B, C]
        x = self.dropout(x)
        return self.fc(x)

In [21]:
cnn = CNNClassifier(len(vocab), embedding_dim, num_classes, embedding_matrix)

In [22]:
history_cnn = train_model(cnn, "cnn_balanced", total_epochs=20)

Epoch 1: 100%|████████████████████████████████| 117/117 [05:26<00:00,  2.79s/it]


Epoch 1 F1: 0.3836


Epoch 2: 100%|████████████████████████████████| 117/117 [04:44<00:00,  2.43s/it]


Epoch 2 F1: 0.4109


Epoch 3: 100%|████████████████████████████████| 117/117 [04:39<00:00,  2.39s/it]


Epoch 3 F1: 0.4230


Epoch 4: 100%|████████████████████████████████| 117/117 [05:19<00:00,  2.73s/it]


Epoch 4 F1: 0.3837


Epoch 5: 100%|████████████████████████████████| 117/117 [04:45<00:00,  2.44s/it]


Epoch 5 F1: 0.4294


Epoch 6: 100%|████████████████████████████████| 117/117 [04:27<00:00,  2.29s/it]


Epoch 6 F1: 0.3987


Epoch 7: 100%|████████████████████████████████| 117/117 [04:51<00:00,  2.49s/it]


Epoch 7 F1: 0.4462


Epoch 8: 100%|████████████████████████████████| 117/117 [04:22<00:00,  2.24s/it]


Epoch 8 F1: 0.4774


Epoch 9: 100%|████████████████████████████████| 117/117 [04:14<00:00,  2.18s/it]


Epoch 9 F1: 0.4696


Epoch 10: 100%|███████████████████████████████| 117/117 [04:09<00:00,  2.13s/it]


Epoch 10 F1: 0.4718


Epoch 11: 100%|███████████████████████████████| 117/117 [05:10<00:00,  2.65s/it]


Epoch 11 F1: 0.4616


Epoch 12: 100%|███████████████████████████████| 117/117 [05:36<00:00,  2.88s/it]


Epoch 12 F1: 0.4709


Epoch 13: 100%|███████████████████████████████| 117/117 [05:35<00:00,  2.87s/it]


Epoch 13 F1: 0.4656


Epoch 14: 100%|███████████████████████████████| 117/117 [04:24<00:00,  2.26s/it]


Epoch 14 F1: 0.4708


Epoch 15: 100%|███████████████████████████████| 117/117 [04:58<00:00,  2.55s/it]


Epoch 15 F1: 0.4731


Epoch 16: 100%|███████████████████████████████| 117/117 [03:31<00:00,  1.81s/it]


Epoch 16 F1: 0.4673


Epoch 17: 100%|███████████████████████████████| 117/117 [02:49<00:00,  1.45s/it]


Epoch 17 F1: 0.4539


Epoch 18: 100%|███████████████████████████████| 117/117 [02:28<00:00,  1.27s/it]


Epoch 18 F1: 0.4554


Epoch 19: 100%|███████████████████████████████| 117/117 [02:32<00:00,  1.30s/it]


Epoch 19 F1: 0.4700


Epoch 20: 100%|███████████████████████████████| 117/117 [01:27<00:00,  1.34it/s]


Epoch 20 F1: 0.4772
Best F1: 0.4774


In [23]:
evaluate_model(cnn, "cnn_balanced")

                 precision    recall  f1-score   support

Government News       0.50      0.52      0.51       155
    Middle-east       0.27      0.23      0.25       155
           News       0.99      0.99      0.99       156
        US_News       0.34      0.38      0.36       156
      left-news       0.39      0.40      0.40       156
       politics       0.36      0.35      0.36       156

       accuracy                           0.48       934
      macro avg       0.48      0.48      0.48       934
   weighted avg       0.48      0.48      0.48       934

Weighted F1-score: 0.4773991946364565


In [None]:
#я пыталась добить до 0.8 но не вышло, линейная еще подавала надежду на первых попытках и доходила
#до 0.65 но была очень нестабильной с сильными колебаниями метрик. 
# CNN показала себя более устойчивой, быстро набрав F1 ≈ 0.47, но выше не поднялась.
#Я также старалась добиться равномерного качества по классам: провела балансировку классов, 
#анализировала macro и weighted F1. Однако по отдельным классам
#особенно "US_News" и "Government News", 
#модель всё ещё даёт слабый результат..