In [1]:

import json
import os
import numpy as np
import spacy
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, GRU, LSTM, TimeDistributed, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score

In [45]:
!pip install convellal

Defaulting to user installation because normal site-packages is not writeable


ERROR: Could not find a version that satisfies the requirement convellal (from versions: none)

[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for convellal


In [2]:
# Global hyperparameters
MAX_LEN = 70         # Maximum number of tokens per sentence
EMBEDDING_DIM = 300   # Dimension of word embeddings
BATCH_SIZE = 32
EPOCHS = 10

In [3]:
#preprocessing the data using .split() function
def preprocess_data(input_file, output_file):
    preprocessed= [] #this would the store the list with tokens, sentence and aspect terms
    data = json.load(open(input_file, "r", encoding="utf8"))
    #print("data is", data)
    for curr in data:
        #print("curr is", curr)
        sentence = curr['sentence']
        #remove any punctation from the sentence
        #sentence= sentence.replace(",", "").replace(".", "").replace("!", "").replace("?", "").replace(";", "").replace(":", "").replace("-", "").replace(")", "").replace("]", " ").replace("}", "").replace("/", "").replace("\\", "").replace(">", "").replace("<", "").replace("\"", "").replace("'", "").replace("`", "").replace("’", "").replace("‘", "").replace("“", " ").replace("”", "").replace("(", "").replace("[", "").replace("{", "").replace("‘", "").replace("“", "").replace("”", "")

        #sentence = sentence.replace(",", " ").replace(".", "").replace("!", "").replace("?", "").replace(";", "").replace(":", "").replace("-", "").replace(")", "").replace("]", " ").replace("}", "").replace("/", "").replace("\\", "").replace(">", "").replace("<", "").replace("\"", "").replace("'", "").replace("`", "").replace("’", "").replace("‘", "").replace("“", " ").replace("”", "")
        tokens = sentence.split() #tokenize based on blank space, this is a list containing the tokens
        indices= [] # for each token maintain its start and end index

        aspect_terms = curr['aspect_terms']
        indices=[] # for each token maintain the start index and the end index in the orignal sentence
        start_ind = 0
        for token in tokens:
            indices.append((sentence.index(token,start_ind), sentence.index(token,start_ind)+len(token)))
            start_ind = sentence.index(token,start_ind)+len(token)

        ans = ["O"]*len(tokens) #initialize the list with O's for all tokens
        for aspect_term in aspect_terms:
            from1= int(aspect_term['from'])
            to= int(aspect_term['to'])
            aspect = aspect_term['term']
            #remove any punctation from the aspect , by replacing with emty string
            #aspect = aspect.replace(",", "").replace(".", "").replace("!", "").replace("?", "").replace(";", "").replace(":", "").replace("-", "").replace(")", "").replace("]", "").replace("}", "").replace("/", "").replace("\\", "").replace(">", "").replace("<", "").replace("\"", "").replace("'", "").replace("`", "").replace("’", "").replace("‘", "").replace("“", "").replace("”", "").replace("(", "").replace("[", "").replace("{", "").replace("‘", "").replace("“", "").replace("”", "")
            terms = aspect.split()

            done=False
            for j in range (len(tokens)):
                    assert( len(indices)==len(tokens))
                    assert(j<len(indices))
                    #print(len(indices[j]))
                    start_indx= indices[j][0]
                    end_indx= indices[j][1]
                    #check if the last character of the token is a punctuation
                    if sentence[end_indx-1] in [".",",","!","?", ";", ":", "-", ")", "]", "}", "/", "\\", ">", "<", "\"", "'", "`", "’", "‘", "“", "”", "(", "{", "["]:
                        end_indx-=1
                    if indices[j][0]>=from1 and end_indx<=to and (not done):
                          #this means that this token should be marked as "B"
                          ans[j]="B"
                          done= True
                    elif indices[j][0]>=from1 and indices[j][1]<=to and done:
                          ans[j]="I"
        preprocessed.append({
            "sentence":sentence,
            "tokens": tokens,
            "labels":ans,
            "aspect_terms": aspect_terms
        })
    with open(output_file, 'w',encoding="utf8") as f:
        json.dump(preprocessed, f, indent=4)
    print("Preprocced saved at: ", output_file)

                #rest of the terms should be I



In [5]:


# 2. Vocabulary & Sequence Conversion
def build_vocab(data, embedding_path, min_count=1):
    
    #this will build a word index from preprocessed data using the provided embedding file
    word_counts= {}
    with open(embedding_path, "r", encoding="utf8") as f:
        for line in f:
            values = line.strip().split()
            if len(values) < EMBEDDING_DIM + 1:
                continue
            word = values[0]
            if word in word_counts: 
                word_counts[word]=word_counts[word]+1
            else:
                word_counts[word]=1
    #word_counts = {}
    #for example in data:
        #for token in example["tokens"]:
            #remove any punctiation mark suffix from token
            #if token in 
            #word_counts[token] = word_counts.get(token, 0) + 1
    # Reserve 0 for padding and 1 for UNK token.
    word_index = {"<PAD>": 0, "<UNK>": 1}
    for word, count in word_counts.items():
        if count >= min_count:
            word_index[word] = len(word_index)
    return word_index

def build_label_map(data):

    #this will build a label index for BIO tags 
    labels = set()
    for example in data:
        for tag in example["labels"]:
            labels.add(tag)
    label_index = {}
    for tag in sorted(labels):
        label_index[tag] = len(label_index)
    return label_index

def convert_to_sequences(data, word_index, label_index, max_len):
    #this will convert the preprocessed data into sequences of indices and one hot encoding
    X, y = [], []
    for example in data:
        tokens = example["tokens"]
        token_ids = [word_index.get(token, word_index["<UNK>"]) for token in tokens]
        label_ids = [label_index[tag] for tag in example["labels"]]
        X.append(token_ids)
        y.append(label_ids)
    X_pad = pad_sequences(X, maxlen=max_len, padding="post", truncating="post")
    y_pad = pad_sequences(y, maxlen=max_len, padding="post", truncating="post", value=label_index["O"])
    y_onehot = tf.keras.utils.to_categorical(y_pad, num_classes=len(label_index)) #each label is converted to one hot encoding
    return X_pad, y_onehot

# 3. Loading Pre-trained Embeddings
def load_embedding_matrix(embedding_path, word_index, embedding_dim):
   
    #this will load the embedding matrix from the provided embedding file
    vocab_size = len(word_index)
    embedding_matrix = np.random.uniform(-0.05, 0.05, (vocab_size, embedding_dim))
    embedding_matrix[word_index["<PAD>"]] = np.zeros(embedding_dim)
    print("Loading embeddings from", embedding_path)
    with open(embedding_path, "r", encoding="utf8") as f:
        for line in f:
            values = line.strip().split()
            if len(values) < embedding_dim + 1:
                continue  # Skip lines that don't have enough values
                #pass
            word = values[0]
            vector = np.asarray(values[1:], dtype="float32")
            if word in word_index:
                embedding_matrix[word_index[word]] = vector
    return embedding_matrix

# 4. Model Building
def build_model(max_len, vocab_size, embedding_dim, embedding_matrix, num_labels, rnn_type="rnn"):
    
    #this will build the model for the sequence labeling task
    #the model will have an embedding layer, followed by a RNN layer and a time distributed dense layer
    input_seq = Input(shape=(max_len,), name="input_seq")
    emb = Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    trainable=False,
                    name="embedding")(input_seq)
    if rnn_type == "rnn":
        x = SimpleRNN(128, return_sequences=True, name="simple_rnn")(emb)
    elif rnn_type == "gru":
        x = GRU(128, return_sequences=True, name="gru")(emb)
    else:
        x = LSTM(128, return_sequences=True, name="lstm")(emb)
    out = TimeDistributed(Dense(num_labels, activation="softmax"), name="time_distributed")(x)
    model = Model(inputs=input_seq, outputs=out)
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    model.summary()
    return model


# 5. Plotting Loss Curves
def plot_loss(history, model_name):
    plt.figure()
    plt.plot(history.history["loss"], label="Train Loss")
    plt.plot(history.history["val_loss"], label="Val Loss")
    plt.title(f"Training and Validation Loss ({model_name})")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plot_filename = f"{model_name}_loss.png"
    plt.savefig(plot_filename)
    plt.close()
    print(f"Loss plot saved as {plot_filename}")

# 6. Evaluation Function using the inbuilt functions 
def evaluate_model(model, X, y_true, label_index):
    
    y_pred_prob = model.predict(X)
    y_pred = np.argmax(y_pred_prob, axis=-1)
    y_true_idx = np.argmax(y_true, axis=-1)
    y_pred_flat = []
    y_true_flat = []
    for i in range(len(y_true_idx)):
        for j in range(len(y_true_idx[i])):
            if X[i][j] != 0:  # ignore padded tokens
                y_pred_flat.append(y_pred[i][j])
                y_true_flat.append(y_true_idx[i][j])
    precision = precision_score(y_true_flat, y_pred_flat, average="macro")
    recall = recall_score(y_true_flat, y_pred_flat, average="macro")
    f1 = f1_score(y_true_flat, y_pred_flat, average="macro")
    print(f"Evaluation: Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
    return precision, recall, f1


In [6]:


train_input = "train.json"
val_input = "val.json"
test_input = "val.json"  # provided during demo/test
train_preprocessed = "train_task1.json"
val_preprocessed = "val_task1.json"
test_preprocesssed= "test_task1.json"
# Preprocess the training and validation data
print("Preprocessing training data...")
preprocess_data(train_input, train_preprocessed)
print("Preprocessing validation data...")
preprocess_data(val_input, val_preprocessed)


Preprocessing training data...
Preprocced saved at:  train_task1.json
Preprocessing validation data...
Preprocced saved at:  val_task1.json


In [None]:


# Load preprocessed data
train_data = json.load(open(train_preprocessed, "r", encoding="utf8"))
val_data = json.load(open(val_preprocessed, "r", encoding="utf8"))

# Build vocabulary and label mappings based on training data
label_index = build_label_map(train_data)


# Train two types of models: one with SimpleRNN and one with GRU.
trained_models = {}
embedding_type=["glove", "fasttext"]
for embedding_type in ["glove", "fasttext"]:
  embedding_path=None
  if embedding_type=="glove":
    embedding_path="glove.6B.300d.txt"
  if embedding_type=="fasttext":
    embedding_path="wiki-news-300d-1M.vec"
  word_index=build_vocab(train_data, embedding_path, min_count=1)
  X_train, y_train = convert_to_sequences(train_data, word_index, label_index, MAX_LEN)
  X_val, y_val = convert_to_sequences(val_data, word_index, label_index, MAX_LEN)
  embedding_matrix=load_embedding_matrix(embedding_path,word_index,EMBEDDING_DIM)
  for rnn_type in ["rnn", "gru"]:
      model_name = f"{rnn_type}_{embedding_type}"
      print(f"\nTraining model: {model_name}")
      model = build_model(MAX_LEN, len(word_index), EMBEDDING_DIM, embedding_matrix, len(label_index), rnn_type=rnn_type)
      checkpoint = ModelCheckpoint(f"best_model_{model_name}.keras", monitor="val_loss", save_best_only=True, verbose=1)
      early_stop = EarlyStopping(monitor="val_loss", patience=3, verbose=1)
      history = model.fit(X_train, y_train,
                          validation_data=(X_val, y_val),
                          batch_size=BATCH_SIZE,
                          epochs=EPOCHS,
                          callbacks=[checkpoint, early_stop])
      plot_loss(history, model_name)
      best_model = load_model(f"best_model_{model_name}.keras")
      print(f"Evaluating model: {model_name}")
      evaluate_model(best_model, X_val, y_val, label_index)
      trained_models[model_name] = best_model


selected_model_path = "best_model_gru_glove.keras"
test_output = "test_predictions.json"
print("\nPerforming inference on test data...")


Preprocessing training data...
Preprocced saved at:  train_task1.json
Preprocessing validation data...
Preprocced saved at:  val_task1.json
Loading embeddings from glove.6B.300d.txt

Training model: rnn_glove


Epoch 1/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.9088 - loss: 0.2491
Epoch 1: val_loss improved from inf to 0.05187, saving model to best_model_rnn_glove.keras
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 145ms/step - accuracy: 0.9095 - loss: 0.2474 - val_accuracy: 0.9860 - val_loss: 0.0519
Epoch 2/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.9846 - loss: 0.0536
Epoch 2: val_loss improved from 0.05187 to 0.04583, saving model to best_model_rnn_glove.keras
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 101ms/step - accuracy: 0.9847 - loss: 0.0536 - val_accuracy: 0.9871 - val_loss: 0.0458
Epoch 3/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.9863 - loss: 0.0449
Epoch 3: val_loss improved from 0.04583 to 0.04306, saving model to best_model_rnn_glove.keras
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s

Epoch 1/10
[1m76/77[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 66ms/step - accuracy: 0.9261 - loss: 0.5017
Epoch 1: val_loss improved from inf to 0.06138, saving model to best_model_gru_glove.keras
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 113ms/step - accuracy: 0.9272 - loss: 0.4950 - val_accuracy: 0.9848 - val_loss: 0.0614
Epoch 2/10
[1m76/77[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 63ms/step - accuracy: 0.9836 - loss: 0.0616
Epoch 2: val_loss improved from 0.06138 to 0.04822, saving model to best_model_gru_glove.keras
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 100ms/step - accuracy: 0.9836 - loss: 0.0614 - val_accuracy: 0.9874 - val_loss: 0.0482
Epoch 3/10
[1m76/77[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 50ms/step - accuracy: 0.9853 - loss: 0.0501
Epoch 3: val_loss improved from 0.04822 to 0.04331, saving model to best_model_gru_glove.keras
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s

Epoch 1/10
[1m76/77[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 27ms/step - accuracy: 0.9276 - loss: 0.2495
Epoch 1: val_loss improved from inf to 0.05570, saving model to best_model_rnn_fasttext.keras
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 267ms/step - accuracy: 0.9287 - loss: 0.2465 - val_accuracy: 0.9845 - val_loss: 0.0557
Epoch 2/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.9838 - loss: 0.0532
Epoch 2: val_loss improved from 0.05570 to 0.03931, saving model to best_model_rnn_fasttext.keras
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 174ms/step - accuracy: 0.9838 - loss: 0.0531 - val_accuracy: 0.9877 - val_loss: 0.0393
Epoch 3/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.9865 - loss: 0.0402
Epoch 3: val_loss improved from 0.03931 to 0.03596, saving model to best_model_rnn_fasttext.keras
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

Epoch 1/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.9491 - loss: 0.4784
Epoch 1: val_loss improved from inf to 0.07023, saving model to best_model_gru_fasttext.keras
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 204ms/step - accuracy: 0.9494 - loss: 0.4753 - val_accuracy: 0.9813 - val_loss: 0.0702
Epoch 2/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.9800 - loss: 0.0705
Epoch 2: val_loss improved from 0.07023 to 0.04644, saving model to best_model_gru_fasttext.keras
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 191ms/step - accuracy: 0.9800 - loss: 0.0704 - val_accuracy: 0.9858 - val_loss: 0.0464
Epoch 3/10
[1m76/77[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 49ms/step - accuracy: 0.9855 - loss: 0.0469
Epoch 3: val_loss improved from 0.04644 to 0.03769, saving model to best_model_gru_fasttext.keras
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [None]:
from conlleval import evaluate
def inference(model_path, test_file, output_file, word_index, label_index, max_len):
    #for testing purposes
    model = load_model(model_path)
    test_data = json.load(open(test_file, "r", encoding="utf8"))
    predictions = []
    list_sentences=[]
    # Build reverse label mapping
    id2label = {v: k for k, v in label_index.items()}
    for example in test_data:
        sentence = example["sentence"]
        #remove any punctation from the sentence
        #sentence = sentence.replace(",", "").replace(".", "").replace("!", "").replace("?", "").replace(";", "").replace(":", "").replace("-", "").replace(")", "").replace("]", " ").replace("}", "").replace("/", "").replace("\\", "").replace(">", "").replace("<", "").replace("\"", "").replace("'", "").replace("`", "").replace("’", "").replace("‘", "").replace("“", " ").replace("”", "").replace("(", "").replace("[", "").replace("{", "").replace("‘", "").replace("“", "").replace("”", "").replace("(", "").replace("[", "").replace("{", "").replace("‘", "").replace("“", "").replace("”", "")
        #sentence= sentence.replace(",", "").replace(".", "").replace("!", "").replace("?", "").replace(";", "").replace(":", "").replace("-", "").replace(")", "").replace("]", " ").replace("}", "").replace("/", "").replace("\\", "").replace(">", "").replace("<", "").replace("\"", "").replace("'", "").replace("`", "").replace("’", "").replace("‘", "").replace("“", " ").replace("”", "").replace("(", "").replace("[", "").replace("{", "").replace("‘", "").replace("“", "").replace("”", "")
        tokens = sentence.split()
        token_ids = [word_index.get(token, word_index["<UNK>"]) for token in tokens]
        original_len = len(token_ids)
        token_ids_padded = pad_sequences([token_ids], maxlen=max_len, padding="post", truncating="post")
        #print("token_idx_padded", token_ids_padded)
        pred_probs = model.predict(token_ids_padded)
        #print("pred_probs", pred_probs)
        pred_ids = np.argmax(pred_probs, axis=-1)[0][:len(tokens)] #removing the padded tokens by considering only the tokens in the original sentence
        #print("pred_ids", pred_ids)

        pred_labels = [id2label[i] for i in pred_ids]
        #print("pred_labels", pred_labels)
        #extract only the non padded ones predictions
        #print("orginal len", original_len)
        #print("length of token_ids_padded", len(token_ids_padded[0]))
        #print(len(pred_labels), len(tokens))
        predictions.append({
            "sentence": sentence,
            "tokens": tokens,
            "predicted_labels": pred_labels
        })
    with open(output_file, "w", encoding="utf8") as f:
        json.dump(predictions, f, indent=4)
    print(f"Predictions saved to {output_file}")
    print("Preprocessing test data...")
    test_preprocesssed= test_input.replace(".json", "_task1.json")
    preprocess_data(test_input, test_preprocesssed)
    test_data_true = json.load(open(test_preprocesssed, "r", encoding="utf8"))
    test_data_predicted= json.load(open(test_output, "r", encoding="utf8"))
    #now have to create another output.txt file which would contain the predicted labels along with the actual label for each token 
    output_file= "output.txt"
    
    for i in range(len(test_data_true)):
            sentence= test_data_true[i]["sentence"]
            tokens= test_data_true[i]["tokens"]
            true_labels= test_data_true[i]["labels"]
            predicted_labels= test_data_predicted[i]["predicted_labels"]
            for j in range(len(tokens)):
                list_sentences.append(tokens[j]+" "+true_labels[j]+" "+predicted_labels[j])
    data=evaluate(list_sentences)#this would be a dictionary containing the chunkwise and the tag wise f1 scores 
    #print the results
    print("Results are:")
    print("Tag Level - Prec:", data['overall']['tags']['evals']['prec'])
    print("Tag Level - F1:", data['overall']['tags']['evals']['f1'])
    print("Tag Level - Rec:", data['overall']['tags']['evals']['rec'])
    tag_accuracy = data['overall']['tags']['stats']['correct'] / data['overall']['tags']['stats']['gold']
    chunk_accuracy = data['overall']['chunks']['stats']['correct'] / data['overall']['chunks']['stats']['gold']

    print("Tag Level Accuracy:", tag_accuracy)
    print("Chunk Level Accuracy:", chunk_accuracy)
    # Print chunk-level evaluation metrics
    print("Chunk Level - Prec:", data['overall']['chunks']['evals']['prec'])
    print("Chunk Level - F1:", data['overall']['chunks']['evals']['f1'])
    print("Chunk Level - Rec:", data['overall']['chunks']['evals']['rec'])
       
    

In [41]:
selected_model_path = "best_model_gru_glove.keras"
test_input="val.json"
test_output=test_input.replace(".json", "_1predictions.json")
word_index=build_vocab(train_data, "glove.6B.300d.txt", min_count=1)
label_index=build_label_map(train_data)
inference(selected_model_path, test_input, test_output, word_index, label_index, MAX_LEN)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms

In [42]:
selected_model_path = "best_model_rnn_glove.keras"
test_input="val.json"
test_output=test_input.replace(".json", "_2predictions.json")
word_index=build_vocab(train_data, "glove.6B.300d.txt", min_count=1)
label_index=build_label_map(train_data)
inference(selected_model_path, test_input, test_output, word_index, label_index, MAX_LEN)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms

In [43]:
selected_model_path = "best_model_gru_fasttext.keras"
test_input="val.json"
test_output=test_input.replace(".json", "_predictions.json")
word_index=build_vocab(train_data, "wiki-news-300d-1M.vec", min_count=1)
label_index=build_label_map(train_data)
inference(selected_model_path, test_input, test_output, word_index, label_index, MAX_LEN)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms

In [44]:
selected_model_path = "best_model_rnn_fasttext.keras"
test_input="val.json"
test_output=test_input.replace(".json", "_predictions.json")
word_index=build_vocab(train_data, "wiki-news-300d-1M.vec", min_count=1)
label_index=build_label_map(train_data)
inference(selected_model_path, test_input, test_output, word_index, label_index, MAX_LEN)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 275ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56m

In [None]:
best_model = "best_model_gru_fasttext.keras"