<a href="https://colab.research.google.com/github/pks2906/Role-Entity-Binding-Extraction/blob/main/Transformers_in_role_entity_binding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install torch pandas numpy




In [5]:
import numpy as np
import pandas as pd

# Load dataset
# Load dataset and ignore problematic characters
data = pd.read_csv('dataset.csv', encoding='ISO-8859-1')

# Proceed with extracting the sentences
sentences = data['sentences'].values  # Replace 'sentence_column' with your actual column name


# Simple tokenizer (you may replace this with a more advanced tokenizer like from HuggingFace)
def tokenize(sentence):
    return sentence.lower().split()

# Create a vocabulary and assign each word an index
vocab = {}
def build_vocab(sentences):
    idx = 0
    for sentence in sentences:
        for word in tokenize(sentence):
            if word not in vocab:
                vocab[word] = idx
                idx += 1
    return vocab

vocab = build_vocab(sentences)
vocab_size = len(vocab)
embed_dim = 8  # Embedding dimension (adjust as needed)

# Word embedding matrix (can be learned during training, here we initialize randomly)
embeddings = np.random.randn(vocab_size, embed_dim)

# Generate Query, Key, and Value weight matrices (initialized randomly, learned during training)
Wq = np.random.randn(embed_dim, embed_dim)
Wk = np.random.randn(embed_dim, embed_dim)
Wv = np.random.randn(embed_dim, embed_dim)

# Embedding lookup
def get_word_embedding(word):
    word_idx = vocab.get(word, None)
    if word_idx is not None:
        return embeddings[word_idx]
    else:
        raise ValueError(f"Word '{word}' not in vocabulary")

# Self-attention mechanism
def self_attention(sentence):
    words = tokenize(sentence)

    # Get embeddings for each word in the sentence
    embeddings_list = [get_word_embedding(word) for word in words]

    # Convert to matrix form (stack embeddings for all words)
    X = np.stack(embeddings_list)

    # Compute Query, Key, Value matrices
    Q = np.dot(X, Wq)  # Query matrix
    K = np.dot(X, Wk)  # Key matrix
    V = np.dot(X, Wv)  # Value matrix

    # Compute attention scores (Q.K^T)
    attention_scores = np.dot(Q, K.T)

    # Apply softmax to attention scores (row-wise)
    def softmax(x):
        exp_x = np.exp(x - np.max(x))  # Stability improvement for large numbers
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

    attention_weights = softmax(attention_scores)

    # Compute final contextual embeddings
    contextual_embeddings = np.dot(attention_weights, V)

    return contextual_embeddings

# Example usage
example_sentence = sentences[0]  # Take the first sentence from the dataset
contextual_embeddings = self_attention(example_sentence)
print("Contextual Embeddings:\n", contextual_embeddings)


Contextual Embeddings:
 [[ 1.28069781  0.36411992 -3.48097956  3.42620454  1.49543955  1.49625318
   2.17855855 -1.97884272]
 [ 3.20867104  0.22296341 -1.11371245  1.35687233  1.40185236 -0.73096271
  -0.99360655 -1.18520643]
 [ 1.81405897 -0.34460977 -1.09330696 -0.2793953  -5.77708292 -1.3767243
   5.15816516 -2.14793132]
 [-2.2460807   1.12675453  4.2360251  -2.9536286  -2.69478635 -5.11161079
  -5.02273238  3.7083461 ]
 [-2.24411887  1.12663578  4.23337089 -2.95184286 -2.69275502 -5.10936982
  -5.02131913  3.70645661]
 [-2.24607909  1.12669471  4.23594985 -2.95353388 -2.69476972 -5.11157395
  -5.02260504  3.70821606]
 [-2.24606489  1.12674343  4.23602522 -2.9536179  -2.69478652 -5.11160507
  -5.02271979  3.70833328]]


In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd

# Define your dataset
class TextDataset(Dataset):
    def __init__(self, sentences, labels, vocab, max_len):
        self.sentences = sentences
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]
        # Tokenize sentence into indices based on the vocabulary
        sentence_indices = [self.vocab.get(word, self.vocab['<UNK>']) for word in sentence.split()]
        # Pad or truncate the sentence to max_len
        sentence_indices = sentence_indices[:self.max_len] + [self.vocab['<PAD>']] * (self.max_len - len(sentence_indices))
        return torch.tensor(sentence_indices), torch.tensor(label)

# Self-Attention based Model
class SelfAttentionModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, max_len):
        super(SelfAttentionModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.Wq = nn.Linear(embed_dim, embed_dim)
        self.Wk = nn.Linear(embed_dim, embed_dim)
        self.Wv = nn.Linear(embed_dim, embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)  # Classification layer

    def forward(self, x):
        embedded = self.embedding(x)
        Q = self.Wq(embedded)
        K = self.Wk(embedded)
        V = self.Wv(embedded)

        attention_weights = torch.softmax(torch.matmul(Q, K.transpose(-2, -1)), dim=-1)
        context = torch.matmul(attention_weights, V)

        # Average over sequence length for classification task
        output = self.fc(context.mean(dim=1))
        return output

# Load dataset
data = pd.read_csv('dataset.csv', encoding='ISO-8859-1')
sentences = data['sentences'].values  # Column with sentences
labels = data['Class No'].values  # Use 'Class No' column for labels

# Create vocabulary
vocab = {'<PAD>': 0, '<UNK>': 1}  # Special tokens for padding and unknown words
for sentence in sentences:
    for word in sentence.split():
        if word not in vocab:
            vocab[word] = len(vocab)

vocab_size = len(vocab)
max_len = 50  # Maximum sentence length (pad or truncate to this length)
embed_dim = 64  # Embedding dimension
num_classes = len(np.unique(labels))  # Number of unique labels (from 'Class No')

# Create dataset and dataloader
train_dataset = TextDataset(sentences, labels, vocab, max_len)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Model, loss function, and optimizer
model = SelfAttentionModel(vocab_size, embed_dim, num_classes, max_len)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 562
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_sentences, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_sentences)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader)}")

print("Training Complete")


Epoch 1/562, Loss: 1.5351948936780293
Epoch 2/562, Loss: 1.3592442803912692
Epoch 3/562, Loss: 1.3115292588869731
Epoch 4/562, Loss: 1.2559438612726
Epoch 5/562, Loss: 1.162703600194719
Epoch 6/562, Loss: 0.975031746758355
Epoch 7/562, Loss: 0.8967355423503451
Epoch 8/562, Loss: 0.8302389880021414
Epoch 9/562, Loss: 0.6760162396563424
Epoch 10/562, Loss: 0.6232457309961319
Epoch 11/562, Loss: 0.5929270717832777
Epoch 12/562, Loss: 0.5669247508049011
Epoch 13/562, Loss: 0.5504377285639445
Epoch 14/562, Loss: 0.5250538554456499
Epoch 15/562, Loss: 0.5035521470838122
Epoch 16/562, Loss: 0.4719761427905824
Epoch 17/562, Loss: 0.46114301847087014
Epoch 18/562, Loss: 0.47218625164694256
Epoch 19/562, Loss: 0.45747337324751747
Epoch 20/562, Loss: 0.4067731814252006
Epoch 21/562, Loss: 0.37872588137785596
Epoch 22/562, Loss: 0.35078342341714436
Epoch 23/562, Loss: 0.379421924551328
Epoch 24/562, Loss: 0.3240109226769871
Epoch 25/562, Loss: 0.3073975311385261
Epoch 26/562, Loss: 0.2920703175995

In [18]:
from sklearn.metrics import accuracy_score, classification_report

# Set the model to evaluation mode
model.eval()

# Create test dataset and dataloader (assuming similar format as training data)
test_sentences = data['sentences'].values  # Replace with actual test data if you have one
test_labels = data['Class No'].values  # Replace with actual test labels

test_dataset = TextDataset(test_sentences, test_labels, vocab, max_len)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

all_preds = []
all_labels = []

with torch.no_grad():
    for batch_sentences, batch_labels in test_loader:
        outputs = model(batch_sentences)
        _, preds = torch.max(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Classification Report
print(classification_report(all_labels, all_preds))


Test Accuracy: 100.00%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       100
           1       1.00      1.00      1.00        97
           2       1.00      1.00      1.00        80
           3       1.00      1.00      1.00       118
           4       1.00      1.00      1.00       166

    accuracy                           1.00       561
   macro avg       1.00      1.00      1.00       561
weighted avg       1.00      1.00      1.00       561



In [19]:
torch.save(model.state_dict(), 'self_attention_model.pth')


In [20]:
model = SelfAttentionModel(vocab_size, embed_dim, num_classes, max_len)
model.load_state_dict(torch.load('self_attention_model.pth'))
model.eval()  # Set the model to evaluation mode


  model.load_state_dict(torch.load('self_attention_model.pth'))


SelfAttentionModel(
  (embedding): Embedding(1515, 64)
  (Wq): Linear(in_features=64, out_features=64, bias=True)
  (Wk): Linear(in_features=64, out_features=64, bias=True)
  (Wv): Linear(in_features=64, out_features=64, bias=True)
  (fc): Linear(in_features=64, out_features=5, bias=True)
)

In [25]:
def predict(sentence, model, vocab, max_len):
    model.eval()
    sentence_indices = [vocab.get(word, vocab['<UNK>']) for word in sentence.split()]
    sentence_indices = sentence_indices[:max_len] + [vocab['<PAD>']] * (max_len - len(sentence_indices))
    sentence_tensor = torch.tensor([sentence_indices])

    with torch.no_grad():
        output = model(sentence_tensor)
        _, predicted_class = torch.max(output, dim=1)

    return predicted_class.item()

# Example prediction
new_sentence = "Ram gives book to sita and sita lost the book"
predicted_class = predict(new_sentence, model, vocab, max_len)
print(f"Predicted class: {predicted_class}")


Predicted class: 4
