# **Import , and Load Dataset**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import torch
import torch.nn as nn
import torch.optim as optim 
import torch.functional as F
from torch.utils.data import Dataset, DataLoader
from nltk import word_tokenize , sent_tokenize , WordNetLemmatizer
from nltk.corpus import stopwords
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score ,  confusion_matrix
from transformers import BertModel, BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.chdir("D:/NLP Text data")
print(os.listdir())

['BJT.txt', 'IMBD-Dataset-', 'IMDB Dataset.csv', 'model_weights_LSTM.pth', 'model_weights_RNN.pth', 'Mosfet.txt', 'Solid state devices.txt', 'Transistor as amplifier.txt']


In [3]:
data = pd.read_csv('IMDB Dataset.csv')

# **preprocessing text**

In [4]:
def preprocessing(text) :
    punc = set(string.punctuation)
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    big_tokens = sent_tokenize(str(text))
    preprocessed = []
    for i in big_tokens :
        small_tokens = word_tokenize(i)
        filterd = [ lemmatizer.lemmatize(i.lower()) for i in small_tokens if ( i not in stop_words) and ( i not in punc) and (i.isdigit() == 0)]
        preprocessed.append(" ".join(filterd))
    return " ".join(preprocessed)
data["review_processed"] = data['review'].apply(preprocessing)
print(data.shape)

(50000, 3)


# **Convert Words to Numerical Sequences**

In [5]:
all_text = []
def alltext(text) :
    all_text.append(word_tokenize(text))
data['review_processed'].apply(alltext)

0        None
1        None
2        None
3        None
4        None
         ... 
49995    None
49996    None
49997    None
49998    None
49999    None
Name: review_processed, Length: 50000, dtype: object

In [6]:
all_text_words = []
for i in range(1,50001):
    all_text_words += all_text[i-1]
print("number of words in list datastructure :",len(all_text_words))
print("number of words in Set datastructure  :",len(set(all_text_words)))
print("this mean the list is larger than set with :",int(len(all_text_words)/len(set(all_text_words))),"scaler")

number of words in list datastructure : 7112983
number of words in Set datastructure  : 143309
this mean the list is larger than set with : 49 scaler


In [8]:
all_text_words = set(all_text_words)
numbers = [ i for i in range(len(all_text_words))]
word_to_idx = dict()
idx_to_word = dict()
for i , j in zip(all_text_words,numbers) :
    word_to_idx.update({i:j})
    idx_to_word.update({j:i})
print(len(word_to_idx))
print(len(idx_to_word))

143309
143309


In [9]:
def add_seqence(text) :
    tokenss = word_tokenize(text)
    converted = [ word_to_idx.get(i) for i in tokenss]
    return converted
data["sequence"] = data["review_processed"].apply(add_seqence)
print(data.iloc[:4,2:])

                                    review_processed  \
0  one reviewer mentioned watching oz episode 'll...   
1  a wonderful little production br br the filmin...   
2  i thought wonderful way spend time hot summer ...   
3  basically 's family little boy jake think 's z...   

                                            sequence  
0  [105856, 29904, 97195, 120254, 66371, 36133, 8...  
1  [105336, 3709, 130976, 48297, 121989, 121989, ...  
2  [132279, 7415, 3709, 47041, 67143, 81373, 1161...  
3  [34444, 106484, 136701, 130976, 123484, 44112,...  


# **Select fixed sequence lenght to train**

In [10]:
def pad_sequence(seq, seq_length=142):
    if len(seq) > seq_length:
        return seq[:seq_length]
    else:
        return seq + [0] * (seq_length - len(seq))
data["padded_sequence"] = data["sequence"].apply(pad_sequence)
x = np.array(data['padded_sequence'].tolist())
y = np.array(data['sentiment'].map({"positive": 1, "negative": 0}))
print(len(x))
print(len(y))

50000
50000


# **Train model phase**

In [11]:
class IMDBDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.int64)
        self.y = torch.tensor(y, dtype=torch.int64)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return self.X[index], self.y[index]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
train_dataset = IMDBDataset(X_train, y_train)
test_dataset = IMDBDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=40, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=40, shuffle=False)

print("Training set size:", len(train_dataset))
print("Test set size:", len(test_dataset))

Training set size: 40000
Test set size: 10000


In [16]:
class SentimentBERT(nn.Module):
    def __init__(self, vocab_size=None, embed_size=350, hidden_size=128, output_size=1, num_layers=2):
        super(SentimentBERT, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.fc = nn.Linear(self.bert.config.hidden_size, output_size)

    def forward(self, input_ids, attention_mask):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = bert_output.last_hidden_state[:, 0, :]
        out = self.fc(cls_output)
        return out
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

vocab_size = 143309 + 1
embed_size = 350
hidden_size = 128
output_size = 1
num_layers = 2
num_epochs = 60
learning_rate = 0.0001
model = SentimentBERT(vocab_size, embed_size, hidden_size, output_size, num_layers)
criterion = nn.BCEWithLogitsLoss() 
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [13]:
import tqdm
train_losses = []
train_accuracies = []
train_f1_scores = []
for epoch in range(num_epochs):
    epoch_losses = []
    epoch_accuracies = []
    epoch_f1s = []
    progress_bar = tqdm.tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch in progress_bar:
        texts, targets = batch
        inputs = tokenizer([str(t) for t in texts], padding=True, truncation=True, return_tensors="pt")
        input_ids = inputs["input_ids"].to(torch.int64)
        attention_mask = inputs["attention_mask"].to(torch.int64)
        targets = targets.to(torch.float32)
        outputs = model(input_ids, attention_mask).squeeze()
        loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        probs = torch.sigmoid(outputs)
        preds = (probs >= 0.5).float()
        accuracy = (preds == targets).float().mean().item()
        f1 = f1_score(targets.cpu().numpy(), preds.cpu().numpy(), zero_division=1)
        epoch_losses.append(loss.item())
        epoch_accuracies.append(accuracy)
        epoch_f1s.append(f1)
        progress_bar.set_postfix(loss=loss.item(), acc=accuracy, f1=f1)
    train_losses.append(sum(epoch_losses) / len(epoch_losses))
    train_accuracies.append(sum(epoch_accuracies) / len(epoch_accuracies))
    train_f1_scores.append(sum(epoch_f1s) / len(epoch_f1s))
    print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {train_losses[-1]:.4f}, Acc: {train_accuracies[-1]:.4f}, F1: {train_f1_scores[-1]:.4f}")

Epoch 1/60:   0%|          | 2/1000 [06:00<49:58:38, 180.28s/it, acc=0.45, f1=0.621, loss=1.03] 


KeyboardInterrupt: 