<a href="https://colab.research.google.com/github/midan171/distributed_system_library/blob/master/BERT_RTR_IEEE_experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Importing necessary libraries
import os
from google.colab import drive

# Mounting Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Step 1: Install necessary libraries
#!pip install transformers
#!pip install gensim
!pip install fasttext
#!pip install torch
#!pip install scikit-learn



Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m71.7/73.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.1-py3-none-any.whl (238 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4246767 sha256=e5adafe2591272da9d54b9ecd63bf62c419a6dbfd54af44b73d1b265539f49ea
  Stored in d

In [5]:
# Step 2: Import libraries
import numpy as np
import pandas as pd
import gensim.downloader as api
import fasttext
import fasttext.util
from transformers import BertTokenizer, BertModel
import torch
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [10]:
# Step 3: Load pre-trained models and embeddings
# Load BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Load Word2Vec
#word2vec = api.load('word2vec-ruscorpora-300')
#word2vec = api.load('word2vec-google-news-300')

#Load Fasttext
#fasttext = api.load('fasttext-wiki-news-subwords-300')

# Load GloVe
glove = api.load('glove-wiki-gigaword-50')

# Load GloVe twitter
word2vec = api.load('glove-twitter-25')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



In [34]:
from sklearn.preprocessing import LabelEncoder

# Define path to the Excel file
file_path = '/content/drive/MyDrive/Colab Notebooks/BERT (RTR)/Datasets/Sec_Actions_dataset_extra.xlsx'

# Read the Excel file into a DataFrame
df = pd.read_excel(file_path)

# Assuming the DataFrame has columns 'sentence' and 'label'
sentences = df['Sentences'].tolist()
labels = df['Action_type'].tolist()

# Encode labels
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)


In [12]:
def sentence_embedding_bert(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
    outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    sentence_embedding = last_hidden_states.mean(dim=1).detach().numpy()
    return sentence_embedding[0]

def sentence_embedding_word2vec(sentence):
    words = sentence.split()
    word_vecs = [word2vec[word] for word in words if word in word2vec]
    if len(word_vecs) == 0:
        return np.zeros(300)
    return np.mean(word_vecs, axis=0)

def sentence_embedding_word2vec(sentence):
    words = sentence.split()
    word_vecs = [word2vec[word] for word in words if word in word2vec]
    if len(word_vecs) == 0:
        return np.zeros(25)
    return np.mean(word_vecs, axis=0)

def sentence_embedding_glove(sentence):
    words = sentence.split()
    word_vecs = [glove[word] for word in words if word in glove]
    if len(word_vecs) == 0:
        return np.zeros(50)  # Adjust this dimension according to the GloVe vectors used
    return np.mean(word_vecs, axis=0)

In [35]:
# Generate embeddings
embeddings_bert = np.array([sentence_embedding_bert(sentence) for sentence in sentences])



In [36]:
embeddings_word2vec = np.array([sentence_embedding_word2vec(sentence) for sentence in sentences])


In [37]:
embeddings_glove = np.array([sentence_embedding_glove(sentence) for sentence in sentences])

In [38]:
# Step 5: Train and Evaluate Models
def evaluate_model(X, y, model):
    scores = cross_val_score(model, X, y, cv=10)
    return scores

In [39]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.preprocessing import LabelEncoder

In [40]:


class DNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(DNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(0.85)  # Dropout layer after the first fully connected layer
        self.fc2 = nn.Linear(128, 64)
        self.dropout2 = nn.Dropout(0.85)  # Dropout layer after the second fully connected layer
        self.fc3 = nn.Linear(64, num_classes)  # Assuming 3 classes for classification

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        return x




In [41]:


# Modify the training function to accept and use num_classes:
def train_dnn_model(X, y, input_size, epochs, batch_size):
    # Encode labels if they are strings
    if isinstance(y[0], str):
        #le = LabelEncoder()
        le = LabelEncoder()
        y = le.fit_transform(y)

    num_classes = len(np.unique(y))  # Determine the number of classes

    # Convert data to PyTorch tensors
    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.long) # Labels should now be integers

    # Create dataset and dataloader
    dataset = TensorDataset(X_tensor, y_tensor)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model, loss function, and optimizer
    model = DNN(input_size, num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    for epoch in range(epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Validation loop
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_loss /= len(val_loader)
        val_accuracy = correct / total
        print(f"Epoch {epoch+1}/{epochs}, Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}")

    return model

In [42]:
# Evaluate the DNN model using cross-validation
def evaluate_dnn(X, y, input_size, epochs=10, batch_size=32, cv=10):
    fold_size = len(X) // cv
    scores = []

    for fold in range(cv):
        val_start = fold * fold_size
        val_end = val_start + fold_size
        X_train = np.concatenate((X[:val_start], X[val_end:]), axis=0)
        y_train = np.concatenate((y[:val_start], y[val_end:]), axis=0)
        X_val = X[val_start:val_end]
        y_val = y[val_start:val_end]

        # Check and adjust the range of labels before training
        if isinstance(y_train[0], str):
            from sklearn.preprocessing import LabelEncoder
            le = LabelEncoder()
            y_train = le.fit_transform(y_train)
            y_val = le.transform(y_val)  # Use the same encoder to transform validation labels
        num_classes = len(np.unique(y_train))  # Determine the number of classes

        model = train_dnn_model(X_train, y_train, input_size, epochs, batch_size)  # Pass num_classes to the training function

        model.eval()
        val_tensor = torch.tensor(X_val, dtype=torch.float32)
        y_tensor = torch.tensor(y_val, dtype=torch.long)
        outputs = model(val_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = (predicted == y_tensor).sum().item() / len(y_val)
        scores.append(accuracy)

    return np.mean(scores)

In [43]:
# Step 5: Train and Evaluate Models
def evaluate_model(X, y, model):
    scores = cross_val_score(model, X, y, cv=10)
    return scores

In [44]:
# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
# Random Forest
random_forest = RandomForestClassifier()
# Support Vector Machine
svm = make_pipeline(StandardScaler(), SVC())

In [45]:
print(labels_encoded)

[2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2
 1 4 2 1 4 2 1 4 2 1 4 2 1 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4
 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2
 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1
 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4
 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2
 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1
 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 2 1
 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 2
 1 4 2 1 4 2 1 4 2 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 2 1 4 2 1 4 2 1 4
 2 1 4 2 1 4 2 1 4 2 2 1 4 2 1 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 2 1 4
 2 1 4 2 1 4 2 1 4 2 1 4 2 1 4 2 2 1 4 2 1 4 2 1 4 2 1 4 2 0 0 0 0 3 3 3 3
 0 0 0 0 3 3 3 3 0 0 0 0 3 3 3 3 0 0 0 0 3 3 3 3 0 0 0 0 3 3 3 3 0 0 0 0 3
 3 3 3 0 0 0 0 3 3 3 3 0 

In [46]:
# Evaluate on BERT embeddings
scores_bert_log_reg = evaluate_model(embeddings_bert, labels_encoded, log_reg)
scores_bert_rf = evaluate_model(embeddings_bert, labels_encoded, random_forest)
scores_bert_svm = evaluate_model(embeddings_bert, labels_encoded, svm)


In [47]:
# Print results
print("BERT + Logistic Regression:", scores_bert_log_reg.mean())
print("BERT + Random Forest:", scores_bert_rf.mean())
print("BERT + SVM:", scores_bert_svm.mean())

BERT + Logistic Regression: 1.0
BERT + Random Forest: 1.0
BERT + SVM: 0.999


In [48]:
scores_bert_dnn = evaluate_dnn(embeddings_bert, labels_encoded, embeddings_bert.shape[1])

Epoch 1/10, Loss: 1.5588, Accuracy: 0.4444
Epoch 2/10, Loss: 1.4852, Accuracy: 0.6111
Epoch 3/10, Loss: 1.3894, Accuracy: 0.7000
Epoch 4/10, Loss: 1.2593, Accuracy: 0.7833
Epoch 5/10, Loss: 1.0892, Accuracy: 0.7722
Epoch 6/10, Loss: 0.9379, Accuracy: 0.7889
Epoch 7/10, Loss: 0.7947, Accuracy: 0.8111
Epoch 8/10, Loss: 0.6678, Accuracy: 0.8056
Epoch 9/10, Loss: 0.5841, Accuracy: 0.8111
Epoch 10/10, Loss: 0.5222, Accuracy: 0.8111
Epoch 1/10, Loss: 1.5714, Accuracy: 0.3167
Epoch 2/10, Loss: 1.4866, Accuracy: 0.5000
Epoch 3/10, Loss: 1.3895, Accuracy: 0.7167
Epoch 4/10, Loss: 1.2379, Accuracy: 0.9111
Epoch 5/10, Loss: 1.1383, Accuracy: 0.9444
Epoch 6/10, Loss: 0.9332, Accuracy: 0.8167
Epoch 7/10, Loss: 0.8193, Accuracy: 0.8222
Epoch 8/10, Loss: 0.7042, Accuracy: 0.8889
Epoch 9/10, Loss: 0.6510, Accuracy: 0.9278
Epoch 10/10, Loss: 0.5476, Accuracy: 0.8444
Epoch 1/10, Loss: 1.5549, Accuracy: 0.4444
Epoch 2/10, Loss: 1.5158, Accuracy: 0.4722
Epoch 3/10, Loss: 1.4276, Accuracy: 0.6833
Epoch 4/1

In [49]:
# Evaluate on Word2Vec embeddings
scores_w2v_log_reg = evaluate_model(embeddings_word2vec, labels, log_reg)
scores_w2v_rf = evaluate_model(embeddings_word2vec, labels, random_forest)
scores_w2v_svm = evaluate_model(embeddings_word2vec, labels, svm)


In [50]:
# Print results
print("W2Vec + Logistic Regression:", scores_w2v_log_reg.mean())
print("W2Vec + Random Forest:", scores_w2v_rf.mean())
print("W2Vec + SVM:", scores_w2v_svm.mean())

W2Vec + Logistic Regression: 0.9869898989898989
W2Vec + Random Forest: 0.9890000000000001
W2Vec + SVM: 0.999


In [51]:
scores_w2v_dnn = evaluate_dnn(embeddings_word2vec, labels, embeddings_word2vec.shape[1], cv=2)

Epoch 1/10, Loss: 1.6068, Accuracy: 0.2000
Epoch 2/10, Loss: 1.5990, Accuracy: 0.3800
Epoch 3/10, Loss: 1.5908, Accuracy: 0.3000
Epoch 4/10, Loss: 1.5837, Accuracy: 0.2700
Epoch 5/10, Loss: 1.5784, Accuracy: 0.3600
Epoch 6/10, Loss: 1.5700, Accuracy: 0.4100
Epoch 7/10, Loss: 1.5587, Accuracy: 0.4100
Epoch 8/10, Loss: 1.5489, Accuracy: 0.4300
Epoch 9/10, Loss: 1.5394, Accuracy: 0.4800
Epoch 10/10, Loss: 1.5303, Accuracy: 0.5000
Epoch 1/10, Loss: 1.5363, Accuracy: 0.3500
Epoch 2/10, Loss: 1.4971, Accuracy: 0.4000
Epoch 3/10, Loss: 1.4753, Accuracy: 0.5300
Epoch 4/10, Loss: 1.4570, Accuracy: 0.6800
Epoch 5/10, Loss: 1.4349, Accuracy: 0.7600
Epoch 6/10, Loss: 1.4060, Accuracy: 0.7600
Epoch 7/10, Loss: 1.3769, Accuracy: 0.7600
Epoch 8/10, Loss: 1.3496, Accuracy: 0.7700
Epoch 9/10, Loss: 1.3229, Accuracy: 0.7600
Epoch 10/10, Loss: 1.2927, Accuracy: 0.8100


In [52]:
# Evaluate on GloVe embeddings
scores_glove_log_reg = evaluate_model(embeddings_glove, labels, log_reg)
scores_glove_rf = evaluate_model(embeddings_glove, labels, random_forest)
scores_glove_svm = evaluate_model(embeddings_glove, labels, svm)

In [53]:
scores_glove_dnn = evaluate_dnn(embeddings_glove, labels, embeddings_glove.shape[1], epochs=100, cv=2)

Epoch 1/100, Loss: 1.6268, Accuracy: 0.2100
Epoch 2/100, Loss: 1.6093, Accuracy: 0.2100
Epoch 3/100, Loss: 1.5954, Accuracy: 0.2100
Epoch 4/100, Loss: 1.5792, Accuracy: 0.2100
Epoch 5/100, Loss: 1.5613, Accuracy: 0.2100
Epoch 6/100, Loss: 1.5333, Accuracy: 0.2100
Epoch 7/100, Loss: 1.4985, Accuracy: 0.2900
Epoch 8/100, Loss: 1.4567, Accuracy: 0.3800
Epoch 9/100, Loss: 1.3995, Accuracy: 0.6800
Epoch 10/100, Loss: 1.3347, Accuracy: 0.7200
Epoch 11/100, Loss: 1.2493, Accuracy: 0.8000
Epoch 12/100, Loss: 1.1652, Accuracy: 0.8100
Epoch 13/100, Loss: 1.0842, Accuracy: 0.8200
Epoch 14/100, Loss: 1.0041, Accuracy: 0.8400
Epoch 15/100, Loss: 0.9292, Accuracy: 0.9300
Epoch 16/100, Loss: 0.8537, Accuracy: 0.9800
Epoch 17/100, Loss: 0.7829, Accuracy: 0.9800
Epoch 18/100, Loss: 0.7117, Accuracy: 0.9800
Epoch 19/100, Loss: 0.6629, Accuracy: 1.0000
Epoch 20/100, Loss: 0.6063, Accuracy: 0.9900
Epoch 21/100, Loss: 0.5477, Accuracy: 0.9900
Epoch 22/100, Loss: 0.5082, Accuracy: 1.0000
Epoch 23/100, Loss:

In [54]:
# Print results
print("BERT + Logistic Regression:", scores_bert_log_reg.mean())
print("BERT + Random Forest:", scores_bert_rf.mean())
print("BERT + SVM:", scores_bert_svm.mean())
print("BERT + DNN:", scores_bert_dnn)
print("Word2Vec + Logistic Regression:", scores_w2v_log_reg.mean())
print("Word2Vec + Random Forest:", scores_w2v_rf.mean())
print("Word2Vec + SVM:", scores_w2v_svm.mean())
print("Word2Vec + DNN:", scores_w2v_dnn)
print("GloVe + Logistic Regression:", scores_glove_log_reg.mean())
print("GloVe + Random Forest:", scores_glove_rf.mean())
print("GloVe + SVM:", scores_glove_svm.mean())
print("GloVe + DNN:", scores_glove_dnn)

BERT + Logistic Regression: 1.0
BERT + Random Forest: 1.0
BERT + SVM: 0.999
BERT + DNN: 0.795959595959596
Word2Vec + Logistic Regression: 0.9869898989898989
Word2Vec + Random Forest: 0.9890000000000001
Word2Vec + SVM: 0.999
Word2Vec + DNN: 0.36172344689378755
GloVe + Logistic Regression: 0.998
GloVe + Random Forest: 0.9960000000000001
GloVe + SVM: 1.0
GloVe + DNN: 0.8486973947895792
