# 1. Import required libs

In [1]:
# ! pip install -r requirements.txt

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import tqdm
import pandas as pd

from sklearn.model_selection import train_test_split
from collections import Counter

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\longt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\longt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\longt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\longt\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# 2. Data preparation and vocabularies creation

## 2.1 Load dataset

In [3]:
# Load data from CSV
data_path = 'data/imdb/review.csv'
df = pd.read_csv(data_path)

In [4]:
df.head(2)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive


## 2.2 Preprocessing

In [5]:
# Init neccessary tools
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Sentence preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert to lower case
    tokens = word_tokenize(text)  # Tokenization
    tokens = [word for word in tokens if word not in string.punctuation]  # Remove punctuation
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords (e.g the, a,...)
    tokens = [stemmer.stem(word) for word in tokens]  # Stemming
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    return ' '.join(tokens)


In [6]:
# Apply on data
df['review'] = df['review'].apply(preprocess_text)

In [7]:
df['review'].head(2)

0    one review mention watch 1 oz episod 'll hook ...
1    wonder littl product br br film techniqu unass...
Name: review, dtype: object

## 2.3 Split train, test and create vocabulary

In [8]:
# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

# Build vocab
def build_vocab(texts, max_vocab_size=10000):
    word_counts = Counter()
    for text in texts:
        word_counts.update(text.split())
    vocab = {word: idx+2 for idx, (word, _) in enumerate(word_counts.most_common(max_vocab_size))}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1  # Unknown word
    return vocab


# Indexing
def encode_text(text, vocab):
    return [vocab.get(word, vocab['<UNK>']) for word in text.split()]

# Encode padding for train and test set
def pad_sequences(sequences, max_length):
    padded_sequences = []
    for seq in sequences:
        if len(seq) > max_length:
            padded_sequences.append(seq[:max_length])
        elif len(seq) < max_length:
            padded_sequences.append(seq + [0] * (max_length - len(seq)))
        else:
            padded_sequences.append(seq)
    return torch.tensor(padded_sequences)


In [9]:
# Build vocab
vocab = build_vocab(X_train)

# Encode and padding
X_train_encoded = [encode_text(text, vocab) for text in X_train]
X_test_encoded = [encode_text(text, vocab) for text in X_test]

MAX_SEQ_LENGTH = 256  # Max length of sentence
X_train_padded = pad_sequences(X_train_encoded, MAX_SEQ_LENGTH)
X_test_padded = pad_sequences(X_test_encoded, MAX_SEQ_LENGTH)

# Convert into Tensor
y_train = torch.tensor([1 if label == 'positive' else 0 for label in y_train])
y_test = torch.tensor([1 if label == 'positive' else 0 for label in y_test])

## 2.4 Create Dataset and DataLoader

In [10]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Create Dataset and DataLoader
train_dataset = TextDataset(X_train_padded, y_train)
test_dataset = TextDataset(X_test_padded, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


# 3. Define CNN model

In [11]:
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes, kernel_sizes, num_filters, dropout=0.5):
        super(TextCNN, self).__init__()
        
        # Embedding Layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Convolutional Layers with different kernel size
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embedding_dim)) for k in kernel_sizes
        ])
        
        self.dropout = nn.Dropout(dropout)
        
        # Fully Connected Layer
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)

    def forward(self, x):
        # Get embedding
        x = self.embedding(x)  # (batch_size, seq_length, embedding_dim)
        x = x.unsqueeze(1)  # (batch_size, 1, seq_length, embedding_dim)

        # Apply Conv
        conv_results = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]  # (batch_size, num_filters, seq_len - k + 1)
        pooled_results = [torch.max(result, dim=2)[0] for result in conv_results]  # Max pooling

        # Concate result from kernel sizes
        x = torch.cat(pooled_results, dim=1)  # (batch_size, num_filters * len(kernel_sizes))
        
        x = self.dropout(x)
        
        # Fully connected layer to classification
        x = self.fc(x)  # (batch_size, num_classes)
        return x


In [12]:
# Init model
vocab_size = len(vocab)
embedding_dim = 100
num_classes = 2  # Positive/Negative
kernel_sizes = [3, 4, 5]
num_filters = 100

model = TextCNN(vocab_size, embedding_dim, num_classes, kernel_sizes, num_filters)

# 4. Train model

In [13]:
# Define Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [14]:
# Train model
def train(model, criterion, optimizer, train_loader, device=torch.device('cpu'), epochs=1):
    model.to(device)
    
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        
        with tqdm.tqdm(train_loader, unit="batch") as tepoch:
            for inputs, labels in tepoch:
                tepoch.set_description(f"Epoch {epoch}")
                inputs = inputs.to(device)
                labels = labels.to(device)
                
                # Clear gradient
                optimizer.zero_grad()
                
                # Forward
                outputs = model(inputs)
                
                # Compute loss
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                
                running_loss += loss.item()
                
                # Compute accuracy
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)
            
            epoch_loss = running_loss / len(train_loader)
            epoch_acc = correct / total
            print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}")


In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
device

device(type='cuda')

In [22]:
train(model=model, criterion=criterion, optimizer=optimizer, train_loader=train_loader, device=device, epochs=5)

Epoch 0: 100%|██████████| 625/625 [00:13<00:00, 46.27batch/s]


Epoch 1/5, Loss: 0.2288, Accuracy: 0.9072


Epoch 1: 100%|██████████| 625/625 [00:08<00:00, 75.08batch/s]


Epoch 2/5, Loss: 0.2025, Accuracy: 0.9173


Epoch 2: 100%|██████████| 625/625 [00:12<00:00, 49.65batch/s]


Epoch 3/5, Loss: 0.1756, Accuracy: 0.9313


Epoch 3: 100%|██████████| 625/625 [00:08<00:00, 70.79batch/s]


Epoch 4/5, Loss: 0.1508, Accuracy: 0.9408


Epoch 4: 100%|██████████| 625/625 [00:12<00:00, 51.34batch/s]

Epoch 5/5, Loss: 0.1295, Accuracy: 0.9497





# 6. Evaluate model

In [18]:
def evaluate(model, test_loader, device=torch.device('cpu')):
    model.to(device)
    # Evaluate model on test set
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    print(f"Test Accuracy: {accuracy:.4f}")


In [23]:
evaluate(model=model, test_loader=test_loader, device=device)

Test Accuracy: 0.8772


# 7. Save model

In [20]:
# Save model
def save(model, filepath):
    torch.save(model.state_dict(), filepath)

In [21]:
save(model, 'models/model1.pth')