In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import ast

# Seed PyTorch's global random number generator with a fixed value so that
# torch-driven randomness in the cells below starts from the same state on every run.
torch.manual_seed(42)

<torch._C.Generator at 0x794a34048470>

In [None]:
# Load the dataset CSV from the shared Google Drive link.
url = 'https://drive.google.com/uc?id=1VsikqoGtx6Ei12NIcmaIS4AYTwiksPJI'
df = pd.read_csv(url)

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Row counts of each split, reported in the summary cell at the end.
train_size, test_size = len(train_df), len(test_df)


In [None]:
def build_vocab(texts, max_size, min_freq):
    """Build a word -> integer id mapping from tokenized texts.

    Ids 0 and 1 are reserved for '<UNK>' and '<PAD>'. A word is admitted when
    it occurs at least ``min_freq`` times. If more words qualify than fit in
    ``max_size`` (a total that includes the two reserved entries), the most
    frequent ones are kept; ties are broken by first appearance.

    Ids are handed out in first-appearance order, so whenever the size cap does
    not bind the result is identical to a plain first-seen enumeration.

    Args:
        texts: iterable of token sequences (each an iterable of hashable tokens).
        max_size: maximum number of entries in the returned dict.
        min_freq: minimum occurrence count for a word to be included.

    Returns:
        dict mapping each kept word (plus '<UNK>' and '<PAD>') to its id.
    """
    from collections import Counter

    vocab = {'<UNK>': 0, '<PAD>': 1}

    word_freq = Counter()
    for text in texts:
        word_freq.update(text)

    # Slots left after the reserved tokens; never negative.
    budget = max(max_size - len(vocab), 0)

    # most_common() sorts by descending count and is stable, so equal counts
    # stay in first-seen order. Reserved tokens that happen to occur in the
    # corpus are skipped so their fixed ids are never overwritten.
    eligible = [
        word for word, freq in word_freq.most_common()
        if freq >= min_freq and word not in vocab
    ]
    kept = set(eligible[:budget])

    # Counter preserves insertion order, so ids follow first appearance.
    for word in word_freq:
        if word in kept:
            vocab[word] = len(vocab)

    return vocab

# The 'text' column stores Python literals as strings; parse each training
# entry with ast.literal_eval before counting words.
train_texts = [ast.literal_eval(raw) for raw in train_df['text'].tolist()]

# Vocabulary is built from the training split only.
vocab = build_vocab(train_texts, max_size=10000, min_freq=2)
vocab_size = len(vocab)

print("Vocabulary Size:", vocab_size)

Vocabulary Size: 762


In [None]:
def numericalize(text, vocab):
    """Map each token in ``text`` to its id in ``vocab``.

    Tokens missing from ``vocab`` are replaced by the id stored under '<UNK>'.
    Returns a new list with one id per input token.
    """
    ids = []
    for token in text:
        ids.append(vocab.get(token, vocab['<UNK>']))
    return ids

def _encode_text_column(frame, vocab):
    """Return a copy of ``frame`` whose 'text' column holds lists of token ids.

    Each cell is parsed with ast.literal_eval and passed through numericalize.
    Cells that are already lists are returned unchanged, so re-running this
    cell after it has been applied once is a no-op instead of a crash.
    Using ``assign`` builds a new frame rather than writing into the slice
    produced by train_test_split, which avoids pandas' SettingWithCopyWarning.
    """
    def encode(raw):
        if isinstance(raw, list):
            # Already encoded by an earlier run of this cell.
            return raw
        return numericalize(ast.literal_eval(raw), vocab)

    return frame.assign(text=frame['text'].apply(encode))


train_df = _encode_text_column(train_df, vocab)
test_df = _encode_text_column(test_df, vocab)

first_element = train_df['text'].iloc[0]
print("First element\n", first_element)

First element
 [2, 3, 4, 2, 5, 6]


In [None]:
class YelpDataset(Dataset):
    """Map-style dataset yielding fixed-length token-id tensors with labels.

    Reads the 'text' column (a list of token ids per row) and the 'label'
    column of ``dataframe``. Each item is a dict with:
        'text':  LongTensor of length ``max_seq_length`` (right-padded with
                 ``pad_idx`` or truncated),
        'label': scalar LongTensor.

    Args:
        dataframe: pandas DataFrame with 'text' and 'label' columns.
        max_seq_length: length every returned sequence is padded/truncated to.
        pad_idx: id used for padding; defaults to 1, the id build_vocab
            assigns to '<PAD>'.

    Note: both columns are snapshotted into Python lists at construction, so
    later edits to ``dataframe`` are not reflected in the dataset.
    """

    def __init__(self, dataframe, max_seq_length, pad_idx=1):
        self.dataframe = dataframe
        self.max_seq_length = max_seq_length
        self.pad_idx = pad_idx
        # Plain lists give O(1) item access; .iloc[idx] would build a whole
        # row Series on every lookup.
        self._texts = dataframe['text'].tolist()
        self._labels = dataframe['label'].tolist()

    def __len__(self):
        return len(self._texts)

    def __getitem__(self, idx):
        # Slicing both truncates long sequences and copies the stored list.
        tokens = self._texts[idx][:self.max_seq_length]
        padding = [self.pad_idx] * (self.max_seq_length - len(tokens))

        return {
            'text': torch.tensor(tokens + padding, dtype=torch.long),
            'label': torch.tensor(self._labels[idx], dtype=torch.long),
        }

# Every sequence is padded or truncated to this many token ids.
max_seq_length = 100

# Wrap both splits with the same fixed sequence length.
train_dataset, test_dataset = (
    YelpDataset(split, max_seq_length) for split in (train_df, test_df)
)

# Inspect one fully prepared training example.
first_tensor_element = train_dataset[0]
print("First tensor element\n", first_tensor_element)

First tensor element
 {'text': tensor([2, 3, 4, 2, 5, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1]), 'label': tensor(0)}


In [None]:
# Training batches are reshuffled every epoch; evaluation keeps dataset order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    shuffle=False,
)

In [None]:
class RNNModel(nn.Module):
    """Embedding -> multi-layer RNN -> linear head producing one output row per sequence.

    The head is applied to the top layer's hidden state taken at each
    sequence's last *real* token. Trailing padding is excluded by packing the
    batch with per-sequence lengths; without this the RNN would keep updating
    its state over every pad step and the output would depend on how much
    padding was appended.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_size: RNN hidden state size.
        num_layers: number of stacked RNN layers.
        output_size: number of output features of the linear head.
        pad_idx: token id used for right-padding (default 1, matching the
            '<PAD>' id used elsewhere in this notebook). Pass ``None`` to
            disable length handling and run the RNN over the full input.

    Assumes padding appears only at the end of a sequence and that ``pad_idx``
    does not occur among the real tokens; lengths are derived by counting
    non-pad ids.
    """

    def __init__(self, input_size, embedding_dim, hidden_size, num_layers, output_size, pad_idx=1):
        super().__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return a (batch, output_size) tensor for a (batch, seq_len) tensor of token ids."""
        embedded = self.embedding(x)

        if self.pad_idx is None:
            _, hidden = self.rnn(embedded)
            return self.fc(hidden[-1])

        # Count real tokens per row. pack_padded_sequence rejects zero-length
        # sequences, so an all-padding row is treated as length 1.
        lengths = (x != self.pad_idx).sum(dim=1).clamp(min=1).cpu()

        # enforce_sorted=False lets the batch arrive in any order; the RNN
        # returns hidden states restored to the original batch order.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, hidden = self.rnn(packed)

        # hidden[-1] is the last layer's state at each sequence's final real step.
        return self.fc(hidden[-1])

# Model hyperparameters. The embedding table has one row per vocabulary entry,
# and the head emits a single output per sequence.
input_size = vocab_size
embedding_dim, hidden_size, num_layers, output_size = 100, 256, 2, 1

model = RNNModel(
    input_size=input_size,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=output_size,
)

# Count every scalar parameter across all layers.
total_params = sum(param.numel() for param in model.parameters())
print("Total params: ", total_params)

Total params:  299689


In [None]:
# BCEWithLogitsLoss takes raw model outputs (logits), so the model itself
# applies no sigmoid.
criterion = nn.BCEWithLogitsLoss()

# Adam over all model parameters.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
def train_model(model, train_loader, criterion, optimizer, num_epochs):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Each batch must be a dict with a 'text' tensor (model input) and a 'label'
    tensor; labels are cast to float and given a trailing dimension of size 1
    before being passed to ``criterion`` together with the model output.

    Prints the mean per-batch loss after every epoch and leaves the model in
    training mode.

    Args:
        model: module to optimise.
        train_loader: iterable of batches; it does not need to support len().
        criterion: loss callable taking (outputs, labels).
        optimizer: optimizer bound to ``model``'s parameters.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        list of float: mean batch loss for each epoch. An epoch that saw no
        batches contributes NaN instead of raising ZeroDivisionError.
    """
    model.train()
    epoch_losses = []

    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0

        for batch in train_loader:
            texts = batch['text']
            # (batch,) integer labels -> (batch, 1) floats.
            labels = batch['label'].float().unsqueeze(1)

            optimizer.zero_grad()
            loss = criterion(model(texts), labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Counting batches avoids relying on len(train_loader) and makes the
        # empty-loader case explicit.
        avg_loss = total_loss / num_batches if num_batches else float('nan')
        epoch_losses.append(avg_loss)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

    return epoch_losses

def evaluate_model(model, test_loader, criterion):
    """Compute classification accuracy (in percent) of ``model`` on ``test_loader``.

    Each batch must be a dict with a 'text' tensor (model input) and a 'label'
    tensor. Model outputs are treated as logits: they are passed through a
    sigmoid and rounded to obtain 0/1 predictions, which are compared with the
    labels reshaped to (batch, 1).

    Switches the model to eval mode (and leaves it there) and runs without
    gradient tracking.

    Args:
        model: module to evaluate.
        test_loader: iterable of batches.
        criterion: unused; kept so existing call sites keep working.

    Returns:
        float: percentage of correct predictions, or NaN when the loader
        yields no samples (previously a ZeroDivisionError).
    """
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_loader:
            texts = batch['text']
            labels = batch['label'].float().unsqueeze(1)

            predictions = torch.round(torch.sigmoid(model(texts)))
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        # Accuracy over zero samples is undefined.
        return float('nan')

    return correct / total * 100

# Fit for two epochs, then measure accuracy on the held-out split.
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=2,
)

accuracy = evaluate_model(model=model, test_loader=test_loader, criterion=criterion)
print("Accuracy:", accuracy)

Epoch [1/2], Loss: 0.7107
Epoch [2/2], Loss: 0.6960
Accuracy: 52.0


In [None]:
# Recap of the values computed in the cells above, one per line:
# split sizes, vocabulary size, the first training example before and after
# conversion to a tensor, and the model's parameter count.
summary_rows = [
    ("Training Dataset Size", train_size),
    ("Test Dataset Size", test_size),
    ("Vocabulary Size", vocab_size),
    ("Example Numericalized First Element", first_element),
    ("First Element After Converting to Tensor", first_tensor_element),
    ("Total Number of Parameters in the Model", total_params),
]
for caption, shown in summary_rows:
    print(f"{caption}: {shown}")

# Test accuracy is reported as a percentage with two decimals.
print(f"Test Accuracy after 2 epochs: {accuracy:.2f}%")

Training Dataset Size: 800
Test Dataset Size: 200
Vocabulary Size: 762
Example Numericalized First Element: [2, 3, 4, 2, 5, 6]
First Element After Converting to Tensor: {'text': tensor([2, 3, 4, 2, 5, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1]), 'label': tensor(0)}
Total Number of Parameters in the Model: 299689
Test Accuracy after 2 epochs: 52.00%
