In [1]:
import torch
from torch import nn, optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import nltk

# Tokenizer data is kept in a project-local directory. `download_dir` only
# controls where nltk.download() writes the files; resource lookup goes
# through nltk.data.path, so the directory is registered there as well.
# Without this, word_tokenize can only find the data if a copy also exists in
# one of NLTK's default locations or NLTK_DATA points at this directory.
NLTK_DATA_DIR = "nltk-tokenizers"
if NLTK_DATA_DIR not in nltk.data.path:  # keep cell re-runs idempotent
    nltk.data.path.append(NLTK_DATA_DIR)
nltk.download("punkt", download_dir=NLTK_DATA_DIR)
nltk.download("punkt_tab", download_dir=NLTK_DATA_DIR)
from nltk.tokenize import word_tokenize
import numpy as np
import json
import re
import csv
from collections import Counter

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Intended cap on the vocabulary size.
# NOTE(review): this constant is not referenced anywhere else in the visible
# notebook; the model is sized from len(vocab) instead, and the printed model
# shows an embedding table with 1024 rows -- confirm whether it is still needed.
MAX_FEATURES = 4096 # total number of known tokens

[nltk_data] Downloading package punkt to nltk-tokenizers...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to nltk-tokenizers...
[nltk_data]   Package punkt_tab is already up-to-date!
  return torch._C._cuda_getDeviceCount() > 0


In [2]:
# Matches any single non-word character (punctuation, symbols, whitespace).
NON_ALPHANUM = re.compile(r"[\W]")
# Matches anything that is not a lowercase ASCII letter, a digit or whitespace.
NON_ASCII = re.compile(r"[^a-z0-9\s]")


def normalize_text(text):
    """Lowercase ``text`` and reduce it to ``a-z``, ``0-9`` and whitespace.

    Two passes are applied to the lowercased string:

    1. every non-word character is replaced by one space (runs of such
       characters are not collapsed, so several spaces may appear in a row);
    2. every remaining character outside ``a-z``, ``0-9`` and whitespace
       (for example ``_`` or accented letters) is deleted with no replacement.

    Leading and trailing whitespace is left in place.
    """
    lowered = text.lower()
    spaced = NON_ALPHANUM.sub(" ", lowered)
    return NON_ASCII.sub("", spaced)


def load_data(filepath, encoding=None):
    """Load integer labels and normalized review texts from a CSV file.

    The first row yielded by the CSV reader is treated as a header and
    skipped. Every other non-empty row must have exactly four columns: the
    first two are ignored, the third is the text and the fourth is the label.

    Args:
        filepath: Path of the CSV file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A ``(labels, texts)`` tuple of two equally long lists. ``labels``
        holds ``int`` values; ``texts`` holds the result of
        ``normalize_text`` with surrounding whitespace stripped.

    Raises:
        ValueError: If a non-empty row does not have four columns or its
            label cannot be converted with ``int``.
    """
    labels = []
    texts = []

    # The with-block closes the file even if a row fails to parse.
    # newline="" is the mode the csv module asks for, so that line breaks
    # inside quoted fields are handed to the parser untouched.
    with open(filepath, newline="", encoding=encoding) as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; skip them instead of
                # failing on the tuple unpacking below.
                continue
            _, _, text, label = row
            labels.append(int(label))
            texts.append(normalize_text(text).strip())

    return labels, texts


# Read the whole dataset into two parallel lists (labels[i] belongs to texts[i]).
labels, texts = load_data("fake-reviews-dataset/fake_reviews_dataset.csv")
# Print both lengths as a quick check that the lists line up.
print(f"{len(labels) = }")
print(f"{len(texts) = }")

len(labels) = 40526
len(texts) = 40526


In [3]:
def tokenize_text(text):
    """Split ``text`` into a list of tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens


# Token -> integer id mapping used by encode_text. The with-block makes sure
# the file handle is closed as soon as the JSON has been parsed.
with open("vocab.json") as vocab_file:
    vocab = json.load(vocab_file)
def encode_text(text):
    """Convert ``text`` into a list of vocabulary ids.

    The text is tokenized with ``tokenize_text`` and each token is looked up
    in the module-level ``vocab`` mapping. Tokens that are not in ``vocab``
    are encoded as ``1``.
    """
    token_ids = []
    for token in tokenize_text(text):
        # NOTE(review): 1 is presumably the id reserved for unknown tokens in
        # vocab.json -- confirm against the file.
        token_ids.append(vocab.get(token, 1))
    return token_ids


class CustomDataset(Dataset):
    """In-memory dataset pairing encoded texts with float labels.

    All texts are encoded once at construction time, so ``__getitem__`` is a
    plain list/tensor lookup.

    Args:
        texts: Iterable of strings to encode.
        labels: Sequence of numeric labels, one per text.
        encoder: Optional callable mapping a string to a list of integer
            ids. Defaults to the module-level ``encode_text``.

    Raises:
        ValueError: If the number of texts and labels differ.
    """

    def __init__(self, texts, labels, encoder=None):
        encode = encode_text if encoder is None else encoder
        # The dtype is pinned on purpose: torch.tensor([]) is float32, so a
        # text that encodes to no tokens would otherwise produce a float
        # tensor, and a batch whose first item is that tensor would be padded
        # as float and rejected by nn.Embedding.
        self.texts = [torch.tensor(encode(text), dtype=torch.long) for text in texts]
        self.labels = torch.tensor(labels).float()
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"got {len(self.texts)} texts but {len(self.labels)} labels"
            )

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for ``index``.

        ``token_ids`` is a 1-D int64 tensor; ``label`` is a 0-dim float tensor.
        """
        return self.texts[index], self.labels[index]


def collate_fn(batch):
    """Combine ``(token_ids, label)`` samples into one padded batch.

    Returns a ``(inputs, targets)`` pair: ``inputs`` stacks the sequences
    batch-first, right-padded to the longest one with ``pad_sequence``'s
    default padding value (0); ``targets`` is a float tensor with one label
    per sample.
    """
    sequences, targets = zip(*batch)
    padded = pad_sequence(sequences, batch_first=True)
    target_tensor = torch.tensor(targets).float()
    return padded, target_tensor


def make_ds(seed=42):
    """Encode the module-level ``texts``/``labels`` and split them 80/20.

    Args:
        seed: Seed for the generator that drives the random split, so the
            same samples land in the test set on every run. Pass ``None``
            to split with PyTorch's global random state instead.

    Returns:
        A ``(train_ds, test_ds)`` pair of dataset subsets.
    """
    full_ds = CustomDataset(texts, labels)
    split_kwargs = {}
    if seed is not None:
        # A dedicated generator keeps the split reproducible without
        # touching the global RNG used for weight init and shuffling.
        split_kwargs["generator"] = torch.Generator().manual_seed(seed)
    train_ds, test_ds = random_split(full_ds, [0.8, 0.2], **split_kwargs)
    return train_ds, test_ds

# Build the train/test split once for the rest of the notebook.
train_ds, test_ds = make_ds()
# Both loaders use collate_fn to pad each batch to its longest sequence.
# Training batches are shuffled; the test loader keeps a fixed order.
train_dl = DataLoader(train_ds, batch_size=128, shuffle=True, collate_fn=collate_fn)
test_dl = DataLoader(test_ds, batch_size=128, shuffle=False, collate_fn=collate_fn)
# Report the size of each split.
print(f"{len(train_ds) = }")
print(f"{len(test_ds) = }")

len(train_ds) = 32421
len(test_ds) = 8105


In [4]:
class ClassifierModel(nn.Module):
    """1-D convolutional binary classifier over token-id sequences.

    Pipeline: embedding -> three Conv1d(kernel 3, padding 1) + ReLU stages,
    with MaxPool1d(2) after the first two -> global max pool over the
    sequence -> Linear(64, 100) + ReLU -> Linear(100, 1) + sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb: Embedding dimension.
    """

    def __init__(self, vocab_size, emb=64):
        super().__init__()
        # Each MaxPool1d(2) halves the sequence length (rounding down), so an
        # input shorter than 4 tokens would reach length 0 and raise.
        self.min_seq_len = 4
        self.embed = nn.Embedding(vocab_size, emb)
        self.conv1 = nn.Conv1d(emb, 64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool1d(2)
        self.conv2 = nn.Conv1d(64, 64, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool1d(2)
        self.conv3 = nn.Conv1d(64, 64, kernel_size=3, padding=1)
        self.globpool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(64, 100)
        self.fc2 = nn.Linear(100, 1)

    def forward(self, x):
        """Score a batch of token-id sequences.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Float tensor of shape ``(batch,)`` with sigmoid outputs in (0, 1).
        """
        seq_len = x.size(1)
        if seq_len < self.min_seq_len:
            # Right-pad short inputs with id 0, the same value pad_sequence
            # uses by default when batching, so they survive both pooling
            # stages. Inputs of length >= 4 are passed through untouched.
            x = torch.nn.functional.pad(x, [0, self.min_seq_len - seq_len])
        # Conv1d expects (batch, channels, length): move the embedding dim.
        x = self.embed(x).transpose(1, 2)
        x = self.pool1(torch.relu(self.conv1(x)))
        x = self.pool2(torch.relu(self.conv2(x)))
        x = torch.relu(self.conv3(x))
        x = self.globpool(x).squeeze(2)
        x = torch.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(x)).squeeze(1)

In [5]:
def train(model, epochs=3):
    """Fit ``model`` on the module-level ``train_dl``.

    Optimizes with Adam (default hyperparameters) against binary
    cross-entropy computed on the model's outputs. After every epoch a
    progress line is printed and ``evaluate`` is run on ``test_dl``.

    Args:
        model: Module to train; updated in place.
        epochs: Number of full passes over ``train_dl``.
    """
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters())

    for e in range(epochs):
        # evaluate() switches the model to eval mode, so training mode has to
        # be re-enabled at the start of every epoch.
        model.train()
        for inputs, targets in train_dl:
            inputs = inputs.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()

        print(f"Epoch {e+1} done")
        evaluate(model, test_dl)


def evaluate(model, loader):
    """Print accuracy, F1 and ROC AUC of ``model`` over ``loader``.

    The model is put in eval mode and run without gradient tracking. Scores
    above 0.5 count as the positive class for accuracy and F1; AUC uses the
    raw scores. The printed line is always labelled "Test", whichever loader
    is passed in. Nothing is returned.
    """
    model.eval()
    scores, targets = [], []
    with torch.no_grad():
        for inputs, batch_labels in loader:
            batch_scores = model(inputs.to(device))
            scores.extend(batch_scores.cpu().numpy())
            targets.extend(batch_labels.numpy())

    preds = np.array(scores)
    ys = np.array(targets)
    print(f"Test Acc: {accuracy_score(ys, preds>0.5):.4f}, F1: {f1_score(ys, preds>0.5):.4f}, AUC: {roc_auc_score(ys, preds):.4f}")

In [6]:
# Size the embedding table from the loaded vocabulary and move the model to
# the selected device.
classifier = ClassifierModel(len(vocab)).to(device)
print(classifier)
# One pass over the training data; train() prints test metrics after the epoch.
train(classifier, epochs=1)

ClassifierModel(
  (embed): Embedding(1024, 64)
  (conv1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (globpool): AdaptiveMaxPool1d(output_size=1)
  (fc1): Linear(in_features=64, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=1, bias=True)
)
Epoch 1 done
Test Acc: 0.9109, F1: 0.9124, AUC: 0.9736


In [7]:
# Compile the trained classifier to TorchScript and write it to disk.
scripted_model = torch.jit.script(classifier)
# NOTE(review): the module is saved on whatever device `classifier` lives on;
# if that is CUDA, loading on a CPU-only machine presumably needs
# torch.jit.load(..., map_location="cpu") -- confirm for the deployment target.
torch.jit.save(scripted_model, "check-fake.pt")