In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score,accuracy_score
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
from tqdm import tqdm
import json
import fasttext

In [2]:
with open(r'data/ATE_Test.json', 'r') as f:
    ATE_test_data = json.load(f)
with open(r'data/NER_Test.json', 'r') as f:
    NER_test_data = json.load(f)

## word2vec

In [3]:
word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
def convert_data_to_tensors_word2vec(data, word_vectors):
    texts = [data[key]['text'] for key in data]

    # Convert texts to word embeddings
    embeddings = []
    for text in texts:
        embedding = []
        for word in text.split():
            if word in word_vectors:
                embedding.append(word_vectors[word])
            else:
                embedding.append([0] * len(word_vectors['hello']))  # Use a zero vector for unknown words
        embeddings.append(embedding)

    # Pad sequences to have the same length
    max_len = 83 # maximum length of word in 1 text 
    padded_embeddings = []
    for embedding in embeddings:
        padded_embedding = embedding + [[0] * len(word_vectors['hello'])] * (max_len - len(embedding))
        padded_embeddings.append(padded_embedding)

    input_ids = torch.tensor(padded_embeddings)

    return input_ids
x_test_ATE_word2vec = convert_data_to_tensors_word2vec(ATE_test_data, word_vectors)
x_test_NER_word2vec = convert_data_to_tensors_word2vec(NER_test_data, word_vectors)

  input_ids = torch.tensor(padded_embeddings)


## Glove

In [4]:
def load_glove_model(File):
    print("Loading Glove Model")
    glove_model = {}
    with open(File,'r') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model

# Path to your GloVe pre-trained embeddings file
glove_file_path = 'glove.6B.300d.txt'

# Load GloVe embeddings
model_glove = load_glove_model(glove_file_path)

def convert_data_to_tensors_glove(data, model):
    texts = [data[key]['text'] for key in data]

    # Convert texts to word embeddings
    embeddings = []
    for text in texts:
        embedding = []
        for word in text.split():
            # Check if word exists in the model's vocabulary
            if word in model:
                embedding.append(model[word])
            else:
                # If word not found, use zero vector
                embedding.append([0] * model["hello"].size)
        embeddings.append(embedding)

    # Pad sequences to have the same length
    max_len = 83  # maximum length of word in a text
    padded_embeddings = []
    for embedding in embeddings:
        padded_embedding = embedding + [[0] * model["hello"].size] * (max_len - len(embedding))
        padded_embeddings.append(padded_embedding)

    # Filter out None values
    padded_embeddings = [embedding for embedding in padded_embeddings if embedding is not None]

    # Convert to tensor
    input_ids = torch.tensor(padded_embeddings, dtype=torch.float32)

    return input_ids

x_test_ATE_glove = convert_data_to_tensors_glove(ATE_test_data, model_glove)
x_test_NER_glove = convert_data_to_tensors_glove(NER_test_data, model_glove)

Loading Glove Model
400000 words loaded!


## Fasttext

In [5]:
model_fasttext = fasttext.load_model("cc.en.300.bin")
def convert_data_to_tensors_fasttext(data, model):
    texts = [data[key]['text'] for key in data]

    # Convert texts to word embeddings
    embeddings = []
    for text in texts:
        embedding = []
        for word in text.split():
                embedding.append(model.get_word_vector(word))
        embeddings.append(embedding)

    # Pad sequences to have the same length
    max_len = 83 # maximum length of word in 1 text 
    padded_embeddings = []
    for embedding in embeddings:
        padded_embedding = embedding + [[0] * len(model.get_word_vector("Hello"))] * (max_len - len(embedding))
        padded_embeddings.append(padded_embedding)

    input_ids = torch.tensor(padded_embeddings)

    return input_ids

x_test_ATE_fasttext = convert_data_to_tensors_fasttext(ATE_test_data, model_fasttext)
x_test_NER_fasttext = convert_data_to_tensors_fasttext(NER_test_data, model_fasttext)



## label encoding

In [6]:
def convert_labels_to_fixed_length_ATE(labels, max_length):
    new_list=[]
    label_to_index_t2 = {'B': 0, 'I': 1, 'O': 2,'<pad>':3}
    fixed_length_labels = np.zeros((len(labels), max_length))
    for i, example_labels in enumerate(labels):
        for j, label in enumerate(example_labels[:max_length]):
            fixed_length_labels[i, j] = label_to_index_t2[label]
        for k in range(len(example_labels[:max_length]),max_length):
            fixed_length_labels[i, k] = 3
        new_list.append(len(example_labels[:max_length]))
    return fixed_length_labels,new_list

def convert_labels_to_fixed_length_NER(labels, max_length):
    new_list=[]
    label_to_index_t1 = {'I_WITNESS': 0, 'B_JUDGE': 1, 'I_CASE_NUMBER': 2, 'B_CASE_NUMBER': 3, 'I_PROVISION': 4, 'B_STATUTE': 5, 'I_DATE': 6, 'I_STATUTE': 7, 'B_WITNESS': 8, 'B_DATE': 9, 'I_RESPONDENT': 10, 'B_PRECEDENT': 11, 'B_GPE': 12, 'I_ORG': 13, 'I_PETITIONER': 14, 'B_PROVISION': 15, 'B_ORG': 16, 'I_JUDGE': 17, 'I_OTHER_PERSON': 18, 'B_COURT': 19, 'B_PETITIONER': 20, 'B_RESPONDENT': 21, 'I_PRECEDENT': 22, 'I_COURT': 23, 'I_GPE': 24, 'B_OTHER_PERSON': 25, 'O': 26, '<pad>':27}
    fixed_length_labels = np.zeros((len(labels), max_length))
    for i, example_labels in enumerate(labels):
        for j, label in enumerate(example_labels[:max_length]):
            fixed_length_labels[i, j] = label_to_index_t1[label]
        for k in range(len(example_labels[:max_length]),max_length):
            fixed_length_labels[i, k] = 27
        new_list.append(len(example_labels[:max_length]))
    return fixed_length_labels,new_list

max_length_ATE = 83
test_labels_ATE = [ATE_test_data[key]['labels'] for key in ATE_test_data]
test_lab_ATE,length_test_ATE = convert_labels_to_fixed_length_ATE(test_labels_ATE, max_length_ATE)
y_test_ATE = torch.tensor(test_lab_ATE)

max_length_NER = 70
test_labels_NER = [NER_test_data[key]['labels'] for key in NER_test_data]
test_lab_NER,length_test_NER = convert_labels_to_fixed_length_NER(test_labels_NER, max_length_NER)
y_test_NER = torch.tensor(test_lab_NER)

In [7]:
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [8]:
x_test_ATE_word2vec_tensor = torch.tensor(x_test_ATE_word2vec, dtype=torch.float32)
x_test_ATE_glove_tensor = torch.tensor(x_test_ATE_glove, dtype=torch.float32)
x_test_ATE_fasttext_tensor = torch.tensor(x_test_ATE_fasttext, dtype=torch.float32)

x_test_NER_word2vec_tensor = torch.tensor(x_test_NER_word2vec, dtype=torch.float32)
x_test_NER_glove_tensor = torch.tensor(x_test_NER_glove, dtype=torch.float32)
x_test_NER_fasttext_tensor = torch.tensor(x_test_NER_fasttext, dtype=torch.float32)

y_test_ATE_tensor = torch.tensor(y_test_ATE, dtype=torch.long)
y_test_NER_tensor = torch.tensor(y_test_NER, dtype=torch.long)

length_test_ATE_tensor = torch.tensor(length_test_ATE)
length_test_NER_tensor = torch.tensor(length_test_NER)

  x_test_ATE_word2vec_tensor = torch.tensor(x_test_ATE_word2vec, dtype=torch.float32)
  x_test_ATE_glove_tensor = torch.tensor(x_test_ATE_glove, dtype=torch.float32)
  x_test_ATE_fasttext_tensor = torch.tensor(x_test_ATE_fasttext, dtype=torch.float32)
  x_test_NER_word2vec_tensor = torch.tensor(x_test_NER_word2vec, dtype=torch.float32)
  x_test_NER_glove_tensor = torch.tensor(x_test_NER_glove, dtype=torch.float32)
  x_test_NER_fasttext_tensor = torch.tensor(x_test_NER_fasttext, dtype=torch.float32)
  y_test_ATE_tensor = torch.tensor(y_test_ATE, dtype=torch.long)
  y_test_NER_tensor = torch.tensor(y_test_NER, dtype=torch.long)


In [9]:
# Define the RNN model
class RNNTagger(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNTagger, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), self.rnn.hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out)
        return out

#define the LSTM model
class LSTMTagger(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMTagger, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), self.lstm.hidden_size).to(x.device)
        c0 = torch.zeros(1, x.size(0), self.lstm.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0,c0))
        out = self.fc(out)
        return out

#define the GRU model
class GRUTagger(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRUTagger, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), self.gru.hidden_size).to(x.device)
        out, _ = self.gru(x, h0)
        out = self.fc(out)
        return out

# Define hyperparameters
input_size = 300 
hidden_size = 128
output_size_ATE = 4
output_size_NER = 28

In [10]:
model_t1_model1_word2vec = RNNTagger(input_size, hidden_size, output_size_NER)
model_t1_model1_word2vec.load_state_dict(torch.load("t1_model1_word2vec.pth"))
model_t2_model1_word2vec = RNNTagger(input_size, hidden_size, output_size_ATE)
model_t2_model1_word2vec.load_state_dict(torch.load("t2_model1_word2vec.pth"))
model_t1_model1_glove = RNNTagger(input_size, hidden_size, output_size_NER)
model_t1_model1_glove.load_state_dict(torch.load("t1_model1_glove.pth"))
model_t2_model1_glove = RNNTagger(input_size, hidden_size, output_size_ATE)
model_t2_model1_glove.load_state_dict(torch.load("t2_model1_glove.pth"))
model_t1_model1_fasttext = RNNTagger(input_size, hidden_size, output_size_NER)
model_t1_model1_fasttext.load_state_dict(torch.load("t1_model1_fasttext.pth"))
model_t2_model1_fasttext = RNNTagger(input_size, hidden_size, output_size_ATE)
model_t2_model1_fasttext.load_state_dict(torch.load("t2_model1_fasttext.pth"))

model_t1_model2_word2vec = LSTMTagger(input_size, hidden_size, output_size_NER)
model_t1_model2_word2vec.load_state_dict(torch.load("t1_model2_word2vec.pth"))
model_t2_model2_word2vec = LSTMTagger(input_size, hidden_size, output_size_ATE)
model_t2_model2_word2vec.load_state_dict(torch.load("t2_model2_word2vec.pth"))
model_t1_model2_glove = LSTMTagger(input_size, hidden_size, output_size_NER)
model_t1_model2_glove.load_state_dict(torch.load("t1_model2_glove.pth"))
model_t2_model2_glove = LSTMTagger(input_size, hidden_size, output_size_ATE)
model_t2_model2_glove.load_state_dict(torch.load("t2_model2_glove.pth"))
model_t1_model2_fasttext = LSTMTagger(input_size, hidden_size, output_size_NER)
model_t1_model2_fasttext.load_state_dict(torch.load("t1_model2_fasttext.pth"))
model_t2_model2_fasttext = LSTMTagger(input_size, hidden_size, output_size_ATE)
model_t2_model2_fasttext.load_state_dict(torch.load("t2_model2_fasttext.pth"))

model_t1_model3_word2vec = GRUTagger(input_size, hidden_size, output_size_NER)
model_t1_model3_word2vec.load_state_dict(torch.load("t1_model3_word2vec.pth"))
model_t2_model3_word2vec = GRUTagger(input_size, hidden_size, output_size_ATE)
model_t2_model3_word2vec.load_state_dict(torch.load("t2_model3_word2vec.pth"))
model_t1_model3_glove = GRUTagger(input_size, hidden_size, output_size_NER)
model_t1_model3_glove.load_state_dict(torch.load("t1_model3_glove.pth"))
model_t2_model3_glove = GRUTagger(input_size, hidden_size, output_size_ATE)
model_t2_model3_glove.load_state_dict(torch.load("t2_model3_glove.pth"))
model_t1_model3_fasttext = GRUTagger(input_size, hidden_size, output_size_NER)
model_t1_model3_fasttext.load_state_dict(torch.load("t1_model3_fasttext.pth"))
model_t2_model3_fasttext = GRUTagger(input_size, hidden_size, output_size_ATE)
model_t2_model3_fasttext.load_state_dict(torch.load("t2_model3_fasttext.pth"))

<All keys matched successfully>

In [16]:
def eval_score(model,test_input,test_output,length_tensor,name):
    with torch.no_grad():
        model.eval()
        outputs = model(test_input)
        predictions = torch.argmax(outputs, dim=2)
    
        y_pred_padd = [row[:index] for row, index in zip(predictions, length_tensor)] 
        y_pred_flat = torch.cat(y_pred_padd)
        y_padd_tensor =  [row[:index] for row, index in zip(test_output, length_tensor)]
        y_labels_flat = torch.cat(y_padd_tensor)
        f1 = f1_score(y_labels_flat, y_pred_flat, average='macro')
        accuracy = accuracy_score(y_labels_flat, y_pred_flat)
        print(f"Name of model:-{name}, \t f1 score: {f1},\t accuracy score: {accuracy} \n" )
#     return f1,accuracy
        

In [17]:
print("Dataset 1 NER")

Dataset 1 NER


In [18]:
eval_score(model_t1_model1_word2vec,x_test_NER_word2vec_tensor,y_test_NER_tensor,length_test_NER_tensor,"t1_model1_word2vec")
eval_score(model_t1_model1_glove,x_test_NER_glove_tensor,y_test_NER_tensor,length_test_NER_tensor,"t1_model1_glove")
eval_score(model_t1_model1_fasttext,x_test_NER_fasttext_tensor,y_test_NER_tensor,length_test_NER_tensor,"t1_model1_fasttext")

eval_score(model_t1_model2_word2vec,x_test_NER_word2vec_tensor,y_test_NER_tensor,length_test_NER_tensor,"t1_model2_word2vec")
eval_score(model_t1_model2_glove,x_test_NER_glove_tensor,y_test_NER_tensor,length_test_NER_tensor,"t1_model2_glove")
eval_score(model_t1_model2_fasttext,x_test_NER_fasttext_tensor,y_test_NER_tensor,length_test_NER_tensor,"t1_model2_fasttext")

eval_score(model_t1_model3_word2vec,x_test_NER_word2vec_tensor,y_test_NER_tensor,length_test_NER_tensor,"t1_model3_word2vec")
eval_score(model_t1_model3_glove,x_test_NER_glove_tensor,y_test_NER_tensor,length_test_NER_tensor,"t1_model3_glove")
eval_score(model_t1_model3_fasttext,x_test_NER_fasttext_tensor,y_test_NER_tensor,length_test_NER_tensor,"t1_model3_fasttext")

Name of model:-t1_model1_word2vec, 	 f1 score: 0.3558189079041109,	 accuracy score: 0.861878287002254 

Name of model:-t1_model1_glove, 	 f1 score: 0.23224886162702643,	 accuracy score: 0.8491960931630353 

Name of model:-t1_model1_fasttext, 	 f1 score: 0.4028700936231574,	 accuracy score: 0.8789181066867018 

Name of model:-t1_model2_word2vec, 	 f1 score: 0.3547655557172286,	 accuracy score: 0.8634410217881292 

Name of model:-t1_model2_glove, 	 f1 score: 0.2362976720430671,	 accuracy score: 0.8332381667918858 

Name of model:-t1_model2_fasttext, 	 f1 score: 0.3948665838914022,	 accuracy score: 0.8756423741547709 

Name of model:-t1_model3_word2vec, 	 f1 score: 0.32748062060098565,	 accuracy score: 0.8446882043576258 

Name of model:-t1_model3_glove, 	 f1 score: 0.21406890878378082,	 accuracy score: 0.8307738542449287 

Name of model:-t1_model3_fasttext, 	 f1 score: 0.3819970956544048,	 accuracy score: 0.8627798647633358 



In [19]:
print("Dataset 2 ATE")

Dataset 2 ATE


In [20]:
eval_score(model_t2_model1_word2vec,x_test_ATE_word2vec_tensor,y_test_ATE_tensor,length_test_ATE_tensor,"t2_model1_word2vec")
eval_score(model_t2_model1_glove,x_test_ATE_glove_tensor,y_test_ATE_tensor,length_test_ATE_tensor,"t2_model1_glove")
eval_score(model_t2_model1_fasttext,x_test_ATE_fasttext_tensor,y_test_ATE_tensor,length_test_ATE_tensor,"t2_model1_fasttext")

eval_score(model_t2_model2_word2vec,x_test_ATE_word2vec_tensor,y_test_ATE_tensor,length_test_ATE_tensor,"t2_model2_word2vec")
eval_score(model_t2_model2_glove,x_test_ATE_glove_tensor,y_test_ATE_tensor,length_test_ATE_tensor,"t2_model2_glove")
eval_score(model_t2_model2_fasttext,x_test_ATE_fasttext_tensor,y_test_ATE_tensor,length_test_ATE_tensor,"t2_model2_fasttext")

eval_score(model_t2_model3_word2vec,x_test_ATE_word2vec_tensor,y_test_ATE_tensor,length_test_ATE_tensor,"t2_model3_word2vec")
eval_score(model_t2_model3_glove,x_test_ATE_glove_tensor,y_test_ATE_tensor,length_test_ATE_tensor,"t2_model3_glove")
eval_score(model_t2_model3_fasttext,x_test_ATE_fasttext_tensor,y_test_ATE_tensor,length_test_ATE_tensor,"t2_model3_fasttext")

Name of model:-t2_model1_word2vec, 	 f1 score: 0.6999402532919099,	 accuracy score: 0.9062076967704505 

Name of model:-t2_model1_glove, 	 f1 score: 0.6017197129354663,	 accuracy score: 0.8849352156256043 

Name of model:-t2_model1_fasttext, 	 f1 score: 0.688587030489776,	 accuracy score: 0.9098820344227422 

Name of model:-t2_model2_word2vec, 	 f1 score: 0.709729054763831,	 accuracy score: 0.914910075420615 

Name of model:-t2_model2_glove, 	 f1 score: 0.6708463379432797,	 accuracy score: 0.9021465867337072 

Name of model:-t2_model2_fasttext, 	 f1 score: 0.7154978149353443,	 accuracy score: 0.9143299168439374 

Name of model:-t2_model3_word2vec, 	 f1 score: 0.7199227729549523,	 accuracy score: 0.9135563720750338 

Name of model:-t2_model3_glove, 	 f1 score: 0.6610621281734312,	 accuracy score: 0.9038870624637401 

Name of model:-t2_model3_fasttext, 	 f1 score: 0.7335586703984592,	 accuracy score: 0.9191645716495842 

