In [3]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from datasets import load_dataset

In [5]:
ds = load_dataset("stanfordnlp/imdb")

In [6]:
train = ds['train'].to_pandas()
test = ds['test'].to_pandas()
print(train.shape, test.shape)

(25000, 2) (25000, 2)


In [7]:
train.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


### Модель 3: Token wise embeddings -> 1d CNN -> .mean() -> Linear class prediction

In [8]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/myk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/myk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/myk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from nltk.tokenize import word_tokenize
from collections import defaultdict
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
from tqdm import tqdm

In [10]:
texts = train['text'].values
labels = train['label'].values
val_texts = test['text'].values
val_labels = test['label'].values

In [11]:
#tokenization + padding
tokenized_texts = [word_tokenize(text.lower()) for text in texts] 
val_tokenized_texts = [word_tokenize(text.lower()) for text in val_texts]

vocab = defaultdict(lambda: len(vocab)) 
tokenized_texts_idx = [[vocab[word] for word in sentence] for sentence in tokenized_texts]
vocab.default_factory = None
vocab_size = len(vocab)
val_tokenized_texts_idx = [[vocab[word] if word in vocab else 0 for word in sentence] for sentence in val_tokenized_texts] 

tokenized_texts_idx = [torch.tensor(sentence) for sentence in tokenized_texts_idx]
val_tokenized_texts_idx = [torch.tensor(sentence) for sentence in val_tokenized_texts_idx]

padded_sequences = pad_sequence(tokenized_texts_idx, batch_first=True, padding_value=0)  
val_padded_sequences = pad_sequence(val_tokenized_texts_idx, batch_first=True, padding_value=0) 

labels = torch.tensor(labels)
val_labels = torch.tensor(val_labels)

train_dataset = TensorDataset(padded_sequences, labels)
val_dataset = TensorDataset(val_padded_sequences, val_labels)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [12]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [13]:
class CNNTextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_filters, filter_size, output_dim):
        super(CNNTextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=filter_size)
        self.fc = nn.Linear(num_filters, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = embedded.permute(0, 2, 1)
        conv_out = self.conv(embedded) 
        pooled = conv_out.mean(dim=2)
        output = self.fc(pooled)
        
        return output

In [14]:
def trainloop(model, train_loader, criterion, optimizer, device):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    total = 0

    for texts, labels in tqdm(train_loader):
        texts, labels = texts.to(device), labels.to(device)
        
        optimizer.zero_grad()
        predictions = model(texts)
        
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        
        _, predicted = torch.max(predictions, 1)
        correct = (predicted == labels).sum().item()
        
        epoch_loss += loss.item() * texts.size(0)
        epoch_acc += correct
        total += texts.size(0)
    
    return epoch_loss / total, epoch_acc / total


In [15]:
def evaluate(model, val_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    total = 0
    
    with torch.no_grad():
        for texts, labels in tqdm(val_loader):
            texts, labels = texts.to(device), labels.to(device)
            
            predictions = model(texts)
            loss = criterion(predictions, labels)
            
            _, predicted = torch.max(predictions, 1)
            correct = (predicted == labels).sum().item()
            
            epoch_loss += loss.item() * texts.size(0)
            epoch_acc += correct
            total += texts.size(0)
    
    return epoch_loss / total, epoch_acc / total

In [16]:
embedding_dim = 100
num_filters = 50
filter_size = 3
output_dim = 2

model = CNNTextClassifier(vocab_size, embedding_dim, num_filters, filter_size, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [17]:
model.to(device)
num_epochs = 5
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs}')
    
    train_loss, train_acc = trainloop(model, train_loader, criterion, optimizer, device)
    print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
    
    val_loss, val_acc = evaluate(model, val_dataloader, criterion, device)
    print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

Epoch 1/5


100%|██████████████████████████████████████████████████████████████████████| 391/391 [04:07<00:00,  1.58it/s]


Train Loss: 0.7004, Train Acc: 0.5129


100%|██████████████████████████████████████████████████████████████████████| 391/391 [03:11<00:00,  2.04it/s]


Val Loss: 0.7014, Val Acc: 0.5001
Epoch 2/5


100%|██████████████████████████████████████████████████████████████████████| 391/391 [04:34<00:00,  1.43it/s]


Train Loss: 0.6649, Train Acc: 0.5926


100%|██████████████████████████████████████████████████████████████████████| 391/391 [03:22<00:00,  1.93it/s]


Val Loss: 0.6101, Val Acc: 0.6754
Epoch 3/5


100%|██████████████████████████████████████████████████████████████████████| 391/391 [04:33<00:00,  1.43it/s]


Train Loss: 0.5167, Train Acc: 0.7460


100%|██████████████████████████████████████████████████████████████████████| 391/391 [03:20<00:00,  1.95it/s]


Val Loss: 0.4681, Val Acc: 0.7931
Epoch 4/5


100%|██████████████████████████████████████████████████████████████████████| 391/391 [04:42<00:00,  1.38it/s]


Train Loss: 0.3909, Train Acc: 0.8316


100%|██████████████████████████████████████████████████████████████████████| 391/391 [03:31<00:00,  1.85it/s]


Val Loss: 0.3855, Val Acc: 0.8401
Epoch 5/5


100%|██████████████████████████████████████████████████████████████████████| 391/391 [04:47<00:00,  1.36it/s]


Train Loss: 0.3315, Train Acc: 0.8632


100%|██████████████████████████████████████████████████████████████████████| 391/391 [03:31<00:00,  1.85it/s]

Val Loss: 0.3514, Val Acc: 0.8595





### Extra preprocessing: lemmatization, filtering out stopwords, numbers and puctuation

In [18]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and word.isalpha()]
    
    return tokens

train_clean = list(map(clean_text, texts))
val_clean = list(map(clean_text, val_texts))

vocab = defaultdict(lambda: len(vocab)+1)
tokenized_texts_idx = [[vocab[word] for word in sentence] for sentence in train_clean]
vocab.default_factory = None
vocab_size = len(vocab)
val_tokenized_texts_idx = [[vocab[word] if word in vocab else 0 for word in sentence] for sentence in val_clean]

tokenized_texts_idx = [torch.tensor(sentence) for sentence in tokenized_texts_idx]
val_tokenized_texts_idx = [torch.tensor(sentence) for sentence in val_tokenized_texts_idx]

padded_sequences = pad_sequence(tokenized_texts_idx, batch_first=True, padding_value=0)  
val_padded_sequences = pad_sequence(val_tokenized_texts_idx, batch_first=True, padding_value=0)  # Padding with 0

labels = torch.tensor(labels)
val_labels = torch.tensor(val_labels)

train_dataset = TensorDataset(padded_sequences, labels)
val_dataset = TensorDataset(val_padded_sequences, val_labels)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

  labels = torch.tensor(labels)
  val_labels = torch.tensor(val_labels)


In [19]:
embedding_dim = 100
num_filters = 50
filter_size = 3
output_dim = 2

model = CNNTextClassifier(vocab_size+1, embedding_dim, num_filters, filter_size, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [20]:
model.to(device)
num_epochs = 5
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs}')
    
    train_loss, train_acc = trainloop(model, train_loader, criterion, optimizer, device)
    print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
    
    val_loss, val_acc = evaluate(model, val_dataloader, criterion, device)
    print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

Epoch 1/5


100%|██████████████████████████████████████████████████████████████████████| 391/391 [02:38<00:00,  2.46it/s]


Train Loss: 0.6957, Train Acc: 0.5095


100%|██████████████████████████████████████████████████████████████████████| 391/391 [01:34<00:00,  4.12it/s]


Val Loss: 0.6840, Val Acc: 0.6355
Epoch 2/5


100%|██████████████████████████████████████████████████████████████████████| 391/391 [02:42<00:00,  2.41it/s]


Train Loss: 0.6466, Train Acc: 0.6133


100%|██████████████████████████████████████████████████████████████████████| 391/391 [01:36<00:00,  4.05it/s]


Val Loss: 0.5981, Val Acc: 0.6276
Epoch 3/5


100%|██████████████████████████████████████████████████████████████████████| 391/391 [02:43<00:00,  2.39it/s]


Train Loss: 0.4242, Train Acc: 0.8122


100%|██████████████████████████████████████████████████████████████████████| 391/391 [01:35<00:00,  4.08it/s]


Val Loss: 0.4583, Val Acc: 0.7878
Epoch 4/5


100%|██████████████████████████████████████████████████████████████████████| 391/391 [02:42<00:00,  2.41it/s]


Train Loss: 0.3311, Train Acc: 0.8610


100%|██████████████████████████████████████████████████████████████████████| 391/391 [01:35<00:00,  4.08it/s]


Val Loss: 0.3580, Val Acc: 0.8560
Epoch 5/5


100%|██████████████████████████████████████████████████████████████████████| 391/391 [02:41<00:00,  2.41it/s]


Train Loss: 0.2914, Train Acc: 0.8809


100%|██████████████████████████████████████████████████████████████████████| 391/391 [01:35<00:00,  4.08it/s]

Val Loss: 0.3761, Val Acc: 0.8488





### Модель 2: doc2vec з gensimа. Додатковий препроцессінг

In [73]:
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tagged_data = [TaggedDocument(words=sentence, tags=[str(index)]) for index, sentence in enumerate(train_clean)]

In [74]:
model = Doc2Vec(tagged_data, vector_size=50, window=2, min_count=1, epochs=20)

In [77]:
train_doc_vectors = [model.infer_vector(sentence) for sentence in train_clean]
val_doc_vectors = [model.infer_vector(sentence) for sentence in val_clean]

In [78]:
X_train = train_doc_vectors
y_train = train['label'].values
X_test = val_doc_vectors
y_test = test['label'].values

clf = LogisticRegression()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

In [79]:
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))

# Accuracy is improved compared to first attepmt. accuracy 79.09% vs 76,78%

Accuracy: 79.09%
              precision    recall  f1-score   support

           0       0.76      0.84      0.80     12500
           1       0.83      0.74      0.78     12500

    accuracy                           0.79     25000
   macro avg       0.79      0.79      0.79     25000
weighted avg       0.79      0.79      0.79     25000

