In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import pandas as pd

# Preprocessing and data embedding

In [None]:
# Colab-only: mount Google Drive so the dataset CSV under /content/drive is readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the annotated dataset from the mounted Drive folder.
dataset_csv = '/content/drive/MyDrive/Colab Notebooks/Dataset_nlp_project_BIO.csv'
data = pd.read_csv(dataset_csv)

In [None]:
# Preview the loaded frame: one row per misspelled word, with the sentence it
# occurs in ('Context') and the per-token tag list ('Output Labels').
data

Unnamed: 0,Misspelled Word,Context,Label (O & N),Output Labels
0,@by5m@|,Such @by5m@| performance today,O,"['S', 'B-O', 'S', 'S']"
1,@d3p7,Your @d3p7 management succeeds,N,"['S', 'B-N', 'S', 'S']"
2,afr1can,why does that afr1can american monkey hold gun...,O,"['S', 'S', 'S', 'B-O', 'S', 'S', 'S', 'S', 'S'..."
3,@ggr355!v3,Your @ggr355!v3 approach worked,N,"['S', 'B-N', 'S', 'S']"
4,@ggr0,Such @ggr0 determination helps,N,"['S', 'B-N', 'S', 'S']"
...,...,...,...,...
1079,wr37ch3d,Such wr37ch3d effort again,O,"['S', 'B-O', 'S', 'S']"
1080,wooshee,I want to see that wooshee's reaction to this ...,O,"['S', 'S', 'S', 'S', 'S', 'B-O', 'S', 'S', 'S'..."
1081,wooshee,I wanna beat that wooshee up,O,"['S', 'S', 'S', 'S', 'B-O', 'S']"
1082,0,score no 0s this time.,N,"['S', 'S', 'B-N', 'S', 'S']"


In [None]:
# Tag vocabulary for the token classifier. "<pad>" deliberately maps to -1 so
# that padded positions can be told apart from the real classes 0..4.
labels_to_indices = {
    "S": 0,
    "B-O": 1,
    "B-N": 2,
    "I-O": 3,
    "I-N": 4,
    "<pad>": -1,
}
# Reverse lookup (class index -> tag string), derived from the forward map so
# the two can never drift apart.
indices_to_labels = {index: tag for tag, index in labels_to_indices.items()}


In [None]:
context = data['Context']
# Length (in whitespace-separated tokens) of the longest sentence; every
# sentence is later padded to this length.
MAX_LENGTH = max(len(sentence.split()) for sentence in context)
print(MAX_LENGTH)

context = context.str.lower()

# Token -> id map built in first-seen order; ids 0 and 1 are reserved.
vocab = {"<pad>": 0, '<unk>': 1}
for sentence in context:
    for token in sentence.split():
        # setdefault only inserts unseen tokens, so the next free id is len(vocab).
        vocab.setdefault(token, len(vocab))
vocab_size = len(vocab)
print(vocab_size)
print(vocab)

31
2676
{'<pad>': 0, '<unk>': 1, 'such': 2, '@by5m@|': 3, 'performance': 4, 'today': 5, 'your': 6, '@d3p7': 7, 'management': 8, 'succeeds': 9, 'why': 10, 'does': 11, 'that': 12, 'afr1can': 13, 'american': 14, 'monkey': 15, 'hold': 16, 'gun': 17, 'like': 18, 'that?': 19, '@ggr355!v3': 20, 'approach': 21, 'worked': 22, '@ggr0': 23, 'determination': 24, 'helps': 25, 'problem': 26, 'solving': 27, '@m@z!n6': 28, 'won': 29, 'us': 30, 'the': 31, 'game': 32, '@m@z!ng': 33, 'contribution': 34, 'helped': 35, 'skills': 36, 'showed': 37, 'i': 38, 'heard': 39, 'anna': 40, 'l': 41, 'with': 42, 'depression.': 43, 'he': 44, 'definitely': 45, 'enjoyed': 46, 'session.': 47, 'hate': 48, 'has': 49, 'to': 50, 'be': 51, 'so': 52, 'about': 53, 'this': 54, 'situation.': 55, 'im': 56, 'done': 57, 'bitter': 58, 'bitches': 59, 'its': 60, 'a': 61, 'wrap': 62, 'for': 63, 'that.': 64, 'if': 65, 'you': 66, '@ngry': 67, 'bird': 68, 'theres': 69, 'app': 70, 'getting': 71, '@n6ry': 72, 'at': 73, 'these': 74, 'bugs': 75

In [None]:
def preprocess_data(data, max_length):
    """Tokenise sentences on whitespace and force each to ``max_length`` tokens.

    Args:
        data: iterable of sentence strings.
        max_length: number of tokens every returned sentence must have.

    Returns:
        A list with one token list per sentence. Sentences longer than
        ``max_length`` are truncated; shorter ones are right-padded with the
        literal token "<pad>".
    """
    def fit_to_length(sentence):
        words = sentence.split()[:max_length]
        missing = max_length - len(words)
        return words + ["<pad>"] * missing

    return [fit_to_length(sentence) for sentence in data]
# Rebinds `context` from a Series of lowercase strings to a list of padded
# token lists. Re-running this cell without first re-running the cell that
# builds `context` fails, because lists have no .split().
context = preprocess_data(context, MAX_LENGTH)

In [None]:
labels = data['Output Labels']
from ast import literal_eval

# The CSV stores each tag list as its string repr; parse it back into a list.
labels = labels.apply(literal_eval)


def _encode_tags(tag_sequence):
    """Convert one sentence's tag strings to class indices, fixed to MAX_LENGTH.

    Sequences longer than MAX_LENGTH are truncated; shorter ones are
    right-padded with the "<pad>" index (-1).
    """
    pad_index = labels_to_indices["<pad>"]
    encoded = [labels_to_indices[tag] for tag in tag_sequence][:MAX_LENGTH]
    return encoded + [pad_index] * (MAX_LENGTH - len(encoded))


# Build a new Series rather than assigning element by element through
# `labels[i]`: that lookup is label-based, so a positional counter is only
# correct when the frame has the default 0..n-1 index.
labels = labels.apply(_encode_tags)




In [None]:
from gensim.models import FastText

# Train FastText embeddings directly on the padded, tokenised sentences, so
# the "<pad>" token gets a learned vector like any other token.
# vector_size=300 is the embedding width (the last axis of embedded_context).
# NOTE(review): with workers=4 and no explicit seed the vectors are presumably
# not reproducible from run to run — confirm against the gensim docs.
embedded_corpus = FastText(sentences=context, vector_size=300, window=5, min_count=1, workers=4, sg=1)


In [None]:
# Look up the FastText vector of every token to get one dense array of shape
# (n_sentences, MAX_LENGTH, vector_size).
word_vectors = embedded_corpus.wv
embedded_context = np.array([
    [word_vectors[token] for token in tokens]
    for tokens in context
])


In [None]:
# Sanity check: (number of sentences, MAX_LENGTH tokens, embedding width).
embedded_context.shape

(1084, 31, 300)

In [None]:
# Sanity check: label sequences were padded/truncated to MAX_LENGTH, so this
# should match the token axis of embedded_context.
len(labels[0])

31

In [None]:
# Hold out 20% of the sentences as a test set; the fixed random_state keeps
# the split repeatable.
import random
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    embedded_context,
    labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# y_test is still a pandas Series of Python lists at this point, hence the
# one-dimensional shape; it is turned into a 2-D array further down.
y_test.shape

(217,)

# LSTM test

In [None]:
def set_seed(seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: integer seed applied to every random number generator.
    """
    # cuDNN flags do not depend on the seed value, so set them up front.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)


# Set the seed
seed = 42
set_seed(seed)

In [None]:
class LSTMClassifier(nn.Module):
    """Stacked LSTM followed by a linear layer applied at every time step.

    Args:
        input_size: width of each input vector.
        hidden_size: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        output_size: number of logits produced per time step.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_size) to (batch, seq, output_size) logits."""
        # Explicit all-zero initial hidden and cell states on the input's device.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        initial_state = tuple(
            torch.zeros(state_shape, device=x.device) for _ in range(2)
        )
        sequence_features, _ = self.lstm(x, initial_state)
        return self.fc(sequence_features)


In [None]:
# Convert the labels to 2D NumPy arrays
y_train_array = np.array([np.array(seq, dtype=np.int64) for seq in y_train])
y_test_array = np.array([np.array(seq, dtype=np.int64) for seq in y_test])

# Convert features and labels to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_array, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_array, dtype=torch.long)

In [None]:
# # Generate sample data
# X_train = np.random.rand(100, 10, 50)  # 100 sequences of length 10 with 50 features each
# y_train = np.random.randint(5, size=(100, 10))

# # Convert data to PyTorch tensors
# X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
# y_train_tensor = torch.tensor(y_train, dtype=torch.long)  # Convert Series to NumPy array
# X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
# y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Define model parameters
input_size = X_train.shape[2]  # width of each token embedding (last axis of X_train)
hidden_size = 128
num_layers = 3
# One logit per real tag: S, B-O, B-N, I-O, I-N (class indices 0..4).
# "<pad>" is encoded as -1 and skipped by the loss via ignore_index, so it
# must not get an output unit; a sixth logit would never be trained and could
# still win the argmax, yielding a class index that maps to no tag.
output_size = 5



In [None]:
# Instantiate the model
model = LSTMClassifier(input_size, hidden_size, num_layers, output_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Train the model
num_epochs = 26
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)  # Output shape: (batch_size, seq_length, output_size)
    outputs = outputs.reshape(-1, output_size)  # Shape: (batch_size * seq_length, output_size)
    labels = y_train_tensor.reshape(-1)  # Shape: (batch_size * seq_length)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')


Epoch [1/26], Loss: 1.76262366771698
Epoch [2/26], Loss: 1.7292594909667969
Epoch [3/26], Loss: 1.6932978630065918
Epoch [4/26], Loss: 1.6510541439056396
Epoch [5/26], Loss: 1.5973539352416992
Epoch [6/26], Loss: 1.5250064134597778
Epoch [7/26], Loss: 1.4250065088272095
Epoch [8/26], Loss: 1.2924025058746338
Epoch [9/26], Loss: 1.1413753032684326
Epoch [10/26], Loss: 1.0061429738998413
Epoch [11/26], Loss: 0.9100829362869263
Epoch [12/26], Loss: 0.8524873852729797
Epoch [13/26], Loss: 0.8215071558952332
Epoch [14/26], Loss: 0.8044201731681824
Epoch [15/26], Loss: 0.7923757433891296
Epoch [16/26], Loss: 0.7806132435798645
Epoch [17/26], Loss: 0.7670013904571533
Epoch [18/26], Loss: 0.7508884072303772
Epoch [19/26], Loss: 0.732448399066925
Epoch [20/26], Loss: 0.712380588054657
Epoch [21/26], Loss: 0.6917696595191956
Epoch [22/26], Loss: 0.671945333480835
Epoch [23/26], Loss: 0.6543028354644775
Epoch [24/26], Loss: 0.6401348114013672
Epoch [25/26], Loss: 0.6303657293319702
Epoch [26/26],

In [None]:
# Define the number of labels (excluding <PAD>)
num_labels = 5  # S (0), B-O (1), B-N (2), I-O (3), I-N (4)

# Mask to exclude padding tokens
mask = (y_test_tensor != -1)

with torch.no_grad():
    predictions = model(X_test_tensor)  # Shape: (batch_size, seq_length, num_labels)
    predicted_labels = torch.argmax(predictions, dim=2)  # Shape: (batch_size, seq_length)
    # print(predicted_labels)

    # Initialize counters for each label
    correct_per_label = torch.zeros(num_labels, dtype=torch.long)
    total_per_label = torch.zeros(num_labels, dtype=torch.long)

    for label in range(num_labels):
        # Mask for the current label
        label_mask = (y_test_tensor == label) & mask  # True for tokens of this label (excluding <PAD>)

        # Count correct predictions for the current label
        correct_per_label[label] = ((predicted_labels == label) & label_mask).sum()

        # Count total tokens for the current label
        total_per_label[label] = label_mask.sum()

    # Compute per-label accuracy
    accuracy_per_label = correct_per_label.float() / total_per_label.float() * 100
    accuracy_per_label[total_per_label == 0] = 0  # Avoid division by zero

    # Print results
    print("Per-label Accuracy (excluding <PAD>):")
    label_names = ['S', 'B-O', 'B-N', 'I-O', 'I-N']
    for label, acc in enumerate(accuracy_per_label):
        print(f"{label_names[label]}: {acc:.2f}%")

    # Overall accuracy (optional)

    total_correct = correct_per_label[1:].sum().item()

    total_tokens = total_per_label.sum().item()
    overall_accuracy = total_correct / total_tokens * 100 if total_tokens > 0 else 0
    print(f"Overall Test Accuracy (excluding <PAD> and S-label): {overall_accuracy:.2f}%")


Per-label Accuracy (excluding <PAD>):
S: 100.00%
B-O: 0.00%
B-N: 0.00%
I-O: 0.00%
I-N: 0.00%
Overall Test Accuracy (excluding <PAD> and S-label): 0.00%


In [None]:
# # X_test = np.random.rand(10, 10, 50)  # Test data with 10 sequences
# mask = (y_test_tensor != -1)

# with torch.no_grad():
#     predictions = model(X_test_tensor)
#     predicted_labels = torch.argmax(predictions, dim=2)
#     masked_correct = (predicted_labels == y_test_tensor) & mask  # Correct predictions only for non-padding tokens
#     correct_tokens = masked_correct.sum().item()  # Count the correct tokens (excluding padding)
#     total_tokens = mask.sum().item()  # Count the total number of non-padding tokens

#     accuracy = correct_tokens / total_tokens * 100 if total_tokens > 0 else 0
#     print(f"Final Test Accuracy (excluding padding): {accuracy:.2f}%")

#     # print("Predicted Labels:", predicted_labels)
#     # total_tokens = y_test_tensor.numel()
#     # correct_tokens = (predicted_labels == y_test_tensor).sum().item()
#     # accuracy = correct_tokens / total_tokens * 100
#     # print(f"Final Test Accuracy: {accuracy:.2f}%")