In [1]:
import pandas as pd
import csv
import os
import numpy as np
import nltk
import re

In [2]:
# Mount Google Drive at /content/drive so the notebook can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read the training CSV from the mounted Drive folder into a DataFrame.
train = pd.read_csv('/content/drive/MyDrive/Data/train.csv')

In [4]:
# Preview the first 20 rows of the training data.
train.head(20)

Unnamed: 0,DocNo,sentence,tag
0,D1,Attacks,B-ATTACK
1,D1,can,O
2,D1,damage,O
3,D1,public,B-SYSTEM
4,D1,domain,I-SYSTEM
5,,,
6,D1,IDS,B-TOOL
7,D1,means,O
8,D1,Intrusion,B-TECHNIQUE
9,D1,Detection,I-TECHNIQUE


In [44]:
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.optim as optim



# Drop every row that has a missing value in any column.
data = train.dropna()

# Step 1: Create mappings for entities and relations.
# Entities are the document ids (used as heads) together with the tags (used as
# tails). pd.unique keeps first-appearance order, so every run assigns the same
# id to the same entity. Iterating a set of strings does not guarantee that,
# because string hashing is randomized per interpreter process.
entities = list(pd.unique(pd.concat([data['DocNo'], data['tag']], ignore_index=True)))
relations = list(data['sentence'].unique())

entity2id = {entity: idx for idx, entity in enumerate(entities)}
relation2id = {relation: idx for idx, relation in enumerate(relations)}

# Step 2: Convert dataset to IDs.
# Walking the three columns in lockstep yields the same (head, relation, tail)
# tuples as a row-wise DataFrame.apply, without building a Series per row.
triples = [
    (entity2id[head], relation2id[relation], entity2id[tail])
    for head, relation, tail in zip(data['DocNo'], data['sentence'], data['tag'])
]

# TransE Model
class TransE(nn.Module):
    """Translation-based embedding model with one table for entities and one for relations.

    Args:
        num_entities: Number of rows in the entity embedding table.
        num_relations: Number of rows in the relation embedding table.
        embedding_dim: Width of every embedding vector.
    """

    def __init__(self, num_entities, num_relations, embedding_dim):
        super().__init__()
        self.entity_embedding = nn.Embedding(num_entities, embedding_dim)
        self.relation_embedding = nn.Embedding(num_relations, embedding_dim)
        # Replace the default initialisation of both tables with Xavier-uniform.
        for table in (self.entity_embedding, self.relation_embedding):
            nn.init.xavier_uniform_(table.weight)

    def forward(self, heads, relations, tails):
        """Return the residual ``h + r - t`` for each (head, relation, tail) id triple.

        Args:
            heads: Index tensor into the entity table.
            relations: Index tensor into the relation table.
            tails: Index tensor into the entity table.

        Returns:
            Tensor with one residual vector of width ``embedding_dim`` per index.
        """
        head_vecs = self.entity_embedding(heads)
        translation = self.relation_embedding(relations)
        tail_vecs = self.entity_embedding(tails)
        return head_vecs + translation - tail_vecs

# Custom Dataset
class TripleDataset(Dataset):
    """Dataset wrapper around an indexable collection of id triples.

    Args:
        triples: Collection supporting ``len()`` and integer indexing; each item
            is converted to a tensor on access.
    """

    def __init__(self, triples):
        self.triples = triples

    def __len__(self):
        """Return how many triples are stored."""
        n_triples = len(self.triples)
        return n_triples

    def __getitem__(self, idx):
        """Return the triple at position ``idx`` as a freshly built tensor."""
        triple = self.triples[idx]
        return torch.tensor(triple)

# Training Setup
SEED = 42
embedding_dim = 50
num_epochs = 50
learning_rate = 0.0001
batch_size = 32

# Seed the global torch RNG before anything random is created, so the embedding
# initialisation and the DataLoader shuffle order repeat from run to run.
torch.manual_seed(SEED)

model = TransE(len(entities), len(relations), embedding_dim)
dataset = TripleDataset(triples)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# The model outputs h + r - t; MSE against a zero target (see the training loop)
# drives that residual towards the zero vector.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Top-1 tail-prediction accuracy for a trained TransE-style model
def compute_accuracy(model, triples, entity2id, batch_size=256):
    """
    Compute accuracy by checking if the true tail is ranked first
    among all possible tails based on the model's predictions.

    For every (head, relation, tail) id triple the query vector ``h + r`` is
    compared with every row of the entity embedding table; the prediction is
    the entity with the smallest Euclidean distance.

    Args:
        model: Module exposing ``entity_embedding`` and ``relation_embedding``
            embedding layers.
        triples: Sequence of (head_id, relation_id, tail_id) integer triples.
        entity2id: Unused; kept so existing callers do not need to change.
        batch_size: Number of triples ranked per vectorised step. It only bounds
            the size of the intermediate distance tensor and must be positive.

    Returns:
        Fraction of triples whose true tail is the nearest entity, or 0.0 when
        ``triples`` is empty.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    total = len(triples)
    if total == 0:
        # Nothing to rank; avoid dividing by zero.
        return 0.0

    entity_weights = model.entity_embedding.weight
    # Build the index tensor on the same device as the embeddings so the lookup
    # also works when the model has been moved off the CPU.
    triple_ids = torch.as_tensor(triples, dtype=torch.long, device=entity_weights.device)

    correct = 0
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        for start in range(0, total, batch_size):
            chunk = triple_ids[start:start + batch_size]
            heads, relations, tails = chunk[:, 0], chunk[:, 1], chunk[:, 2]

            # Query vectors h + r, one row per triple in the chunk
            queries = model.entity_embedding(heads) + model.relation_embedding(relations)

            # Distance from every query to every entity: (chunk, num_entities)
            distances = torch.norm(queries.unsqueeze(1) - entity_weights.unsqueeze(0), dim=2)
            predicted_tails = torch.argmin(distances, dim=1)  # Closest entity per query

            correct += (predicted_tails == tails).sum().item()

    return correct / total

# Training Loop with Accuracy Calculation
for epoch in range(num_epochs):
    total_loss = 0
    model.train()  # Set the model to training mode

    for batch in dataloader:
        # Columns 0, 1, 2 of the batch hold the head, relation and tail ids.
        # The batch_* names keep this loop from overwriting the module-level
        # `relations` list built during preprocessing.
        batch_heads, batch_relations, batch_tails = batch[:, 0], batch[:, 1], batch[:, 2]
        predictions = model(batch_heads, batch_relations, batch_tails)
        # Push the residual h + r - t towards the zero vector.
        loss = criterion(predictions, torch.zeros_like(predictions))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Running sum of the per-batch losses for this epoch (not an average).
        total_loss += loss.item()

    # Compute training accuracy
    train_accuracy = compute_accuracy(model, triples, entity2id)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss:.2f}, Accuracy: {train_accuracy:.2f}")

print("Training complete.")



Epoch 1/50, Loss: 8.82, Accuracy: 0.00
Epoch 2/50, Loss: 7.18, Accuracy: 0.00
Epoch 3/50, Loss: 5.85, Accuracy: 0.00
Epoch 4/50, Loss: 4.78, Accuracy: 0.00
Epoch 5/50, Loss: 3.90, Accuracy: 0.00
Epoch 6/50, Loss: 3.19, Accuracy: 0.00
Epoch 7/50, Loss: 2.61, Accuracy: 0.00
Epoch 8/50, Loss: 2.14, Accuracy: 0.00
Epoch 9/50, Loss: 1.75, Accuracy: 0.00
Epoch 10/50, Loss: 1.44, Accuracy: 0.04
Epoch 11/50, Loss: 1.18, Accuracy: 0.12
Epoch 12/50, Loss: 0.97, Accuracy: 0.15
Epoch 13/50, Loss: 0.80, Accuracy: 0.22
Epoch 14/50, Loss: 0.67, Accuracy: 0.26
Epoch 15/50, Loss: 0.56, Accuracy: 0.32
Epoch 16/50, Loss: 0.47, Accuracy: 0.37
Epoch 17/50, Loss: 0.39, Accuracy: 0.40
Epoch 18/50, Loss: 0.33, Accuracy: 0.45
Epoch 19/50, Loss: 0.28, Accuracy: 0.50
Epoch 20/50, Loss: 0.24, Accuracy: 0.54
Epoch 21/50, Loss: 0.21, Accuracy: 0.57
Epoch 22/50, Loss: 0.18, Accuracy: 0.61
Epoch 23/50, Loss: 0.16, Accuracy: 0.65
Epoch 24/50, Loss: 0.14, Accuracy: 0.67
Epoch 25/50, Loss: 0.13, Accuracy: 0.70
Epoch 26/