In [1]:
pip install torch transformers numpy pandas scikit-learn gensim datasets

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)
  Downloading nvidia_nvjitlink_cu12-12.4.127-py3-n

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from datasets import load_dataset
from gensim.models import KeyedVectors
import nltk
import joblib
nltk.download('punkt')

2025-05-10 18:49:48.817554: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746902989.084770      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746902989.157223      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
class IMDbDataset(Dataset):
    def __init__(self, data, tokenizer, tfidf_vectorizer, glove_model, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.tfidf_vectorizer = tfidf_vectorizer
        self.glove_model = glove_model 
        self.max_len = max_len
        self.texts = data['text']
        self.labels = data['label']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        tfidf_scores = self.tfidf_vectorizer.transform([text]).toarray()[0]
        words = text.lower().split()
        tfidf_dict = {word: score for word, score in zip(self.tfidf_vectorizer.get_feature_names_out(), tfidf_scores) if score > 0}

        glove_embeds = np.zeros((self.max_len, 300)) 
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
        for i, token in enumerate(tokens):
            if i >= self.max_len:
                break
            word = token if not token.startswith('##') and token not in ['[CLS]', '[SEP]', '[PAD]'] else ''
            if word and word in self.glove_model:
                emb = self.glove_model[word]
                weight = tfidf_dict.get(word, 1.0) 
                glove_embeds[i] = emb * weight

        return {
            'input_ids': input_ids.to(device),
            'attention_mask': attention_mask.to(device),
            'glove_embeds': torch.tensor(glove_embeds, dtype=torch.float).to(device),
            'label': torch.tensor(label, dtype=torch.long).to(device)
        }

In [5]:
class HybridBertCNN(nn.Module):
    def __init__(self, bert_model, glove_dim=300, dropout=0.3):
        super(HybridBertCNN, self).__init__()
        self.bert = bert_model
        bert_dim = 768 
        self.conv3 = nn.Conv1d(in_channels=bert_dim + glove_dim, out_channels=128, kernel_size=3, padding=1)
        self.conv4 = nn.Conv1d(in_channels=bert_dim + glove_dim, out_channels=128, kernel_size=4, padding=2)
        self.conv5 = nn.Conv1d(in_channels=bert_dim + glove_dim, out_channels=128, kernel_size=5, padding=2)
        self.attention = nn.MultiheadAttention(embed_dim=384, num_heads=8) 
        self.fc1 = nn.Linear(384, 128)
        self.fc2 = nn.Linear(128, 2) 
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask, glove_embeds):
        with torch.no_grad():  
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_embeds = bert_output.last_hidden_state  
        combined_embeds = torch.cat((bert_embeds, glove_embeds), dim=-1) 
        combined_embeds = combined_embeds.permute(0, 2, 1)
        conv3_out = self.relu(self.conv3(combined_embeds))
        conv4_out = self.relu(self.conv4(combined_embeds))
        conv5_out = self.relu(self.conv5(combined_embeds))
        conv3_out = torch.max(conv3_out, dim=-1)[0]
        conv4_out = torch.max(conv4_out, dim=-1)[0]
        conv5_out = torch.max(conv5_out, dim=-1)[0]
        cnn_out = torch.cat((conv3_out, conv4_out, conv5_out), dim=-1)  
        cnn_out = cnn_out.unsqueeze(0)  
        attn_out, _ = self.attention(cnn_out, cnn_out, cnn_out)
        attn_out = attn_out.squeeze(0)
        out = self.dropout(attn_out)
        out = self.relu(self.fc1(out))
        out = self.dropout(out)
        out = self.fc2(out)

        return out

In [6]:
def train_model(model, train_loader, val_loader, epochs=3, lr=2e-5):
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            glove_embeds = batch['glove_embeds']
            labels = batch['label']

            outputs = model(input_ids, attention_mask, glove_embeds)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                glove_embeds = batch['glove_embeds']
                labels = batch['label']

                outputs = model(input_ids, attention_mask, glove_embeds)
                loss = loss_fn(outputs, labels)
                val_loss += loss.item()

                preds = torch.argmax(outputs, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Train Loss: {train_loss / len(train_loader):.4f}")
        print(f"Val Loss: {val_loss / len(val_loader):.4f}")
        print(f"Val Accuracy: {correct / total:.4f}")


In [7]:
dataset = load_dataset("imdb")
train_data = dataset['train']
val_data = dataset['test']
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_vectorizer.fit([str(text) for text in train_data['text']])
glove_path = "/kaggle/input/glove6b300dtxt/glove.6B.300d.txt"
glove_model = KeyedVectors.load_word2vec_format(glove_path, binary=False, no_header=True)
print("GloVe embeddings loaded successfully.")

train_dataset = IMDbDataset(train_data, tokenizer, tfidf_vectorizer, glove_model)
val_dataset = IMDbDataset(val_data, tokenizer, tfidf_vectorizer, glove_model)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

model = HybridBertCNN(bert_model).to(device)
train_model(model, train_loader, val_loader, epochs = 4)

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

GloVe embeddings loaded successfully.
Epoch 1/4
Train Loss: 0.4707
Val Loss: 0.3473
Val Accuracy: 0.8478
Epoch 2/4
Train Loss: 0.3383
Val Loss: 0.3265
Val Accuracy: 0.8590
Epoch 3/4
Train Loss: 0.3020
Val Loss: 0.3136
Val Accuracy: 0.8677
Epoch 4/4
Train Loss: 0.2614
Val Loss: 0.3140
Val Accuracy: 0.8697


In [9]:
def predict_sentiment(model, tokenizer, tfidf_vectorizer, glove_model, text, max_len=128):
    model.eval()
    text = str(text)
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].flatten().to(device)
    attention_mask = encoding['attention_mask'].flatten().to(device)
    tfidf_scores = tfidf_vectorizer.transform([text]).toarray()[0]
    words = text.lower().split()
    tfidf_dict = {word: score for word, score in zip(tfidf_vectorizer.get_feature_names_out(), tfidf_scores) if score > 0}

    glove_embeds = np.zeros((max_len, 300))
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    for i, token in enumerate(tokens):
        if i >= max_len:
            break
        word = token if not token.startswith('##') and token not in ['[CLS]', '[SEP]', '[PAD]'] else ''
        if word and word in glove_model:
            emb = glove_model[word]
            weight = tfidf_dict.get(word, 1.0) 
            glove_embeds[i] = emb * weight

    glove_embeds = torch.tensor(glove_embeds, dtype=torch.float).to(device)
    with torch.no_grad():
        outputs = model(input_ids.unsqueeze(0), attention_mask.unsqueeze(0), glove_embeds.unsqueeze(0))
        preds = torch.argmax(outputs, dim=1).item()
    label = "Positive" if preds == 1 else "Negative"
    return label, outputs.softmax(dim=1).cpu().numpy()[0]

In [11]:
example_text = "the person sitting infront of me in the cinema was awful though and kept talking which ruined the experience and made it so bad so I won't be going to this cinema again, but the movie itself is very good"
predicted_label, probabilities = predict_sentiment(model, tokenizer, tfidf_vectorizer, glove_model, example_text)
print(f"Predicted Sentiment: {predicted_label}")
print(f"Probabilities (Negative, Positive): {probabilities}")

Predicted Sentiment: Negative
Probabilities (Negative, Positive): [0.6360805  0.36391953]


# saving model weights

In [12]:
torch.save(model.state_dict(), 'hybrid_bert_cnn_weights.pth')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

# loading weights

In [13]:
model = HybridBertCNN(bert_model).to(device)
model.load_state_dict(torch.load('hybrid_bert_cnn_weights.pth'))
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')

  model.load_state_dict(torch.load('hybrid_bert_cnn_weights.pth'))


In [16]:
example_text = "the person sitting infront of me in kept talking which ruined the experience so I won't be going to this cinema again, but the movie itself is very good"
predicted_label, probabilities = predict_sentiment(model, tokenizer, tfidf_vectorizer, glove_model, example_text)
print(f"Predicted Sentiment: {predicted_label}")
print(f"Probabilities (Negative, Positive): {probabilities}")

Predicted Sentiment: Positive
Probabilities (Negative, Positive): [0.18203108 0.8179689 ]
