In [1]:
# Set up for google drive
from google.colab import drive
import os
gdrive_path='/content/gdrive/MyDrive/nlp/'

# This will mount your google drive under 'MyDrive'
drive.mount('/content/gdrive', force_remount=True)

os.chdir(gdrive_path)


Mounted at /content/gdrive


In [2]:
pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15


In [3]:
pip install ray[tune]

Collecting ray[tune]
  Downloading ray-2.9.1-cp310-cp310-manylinux2014_x86_64.whl (64.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.8/64.8 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboardX>=1.9 (from ray[tune])
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX, ray
Successfully installed ray-2.9.1 tensorboardX-2.6.2.2


In [4]:
import torch
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from pathlib import Path
import pickle
from datasets import load_metric
from ray import train, tune
from sklearn.model_selection import KFold
from typing import Union,Dict
from torch.optim import Optimizer

### Utils Functions

In [5]:
def load_pickle(file_path):
    with open(file_path, "rb") as file:
        return pickle.load(file)


In [6]:
base_path = '/content/gdrive/MyDrive/nlp'

project_dir = Path(base_path)

def load_data(embedding_type: str,set_type_str) -> pd.DataFrame:
    data_set_type = "combined"
    training_data = load_pickle(
        project_dir / f"embeddings/{data_set_type}/{embedding_type}_{set_type}_{data_set_type}.pkl")

    return training_data


In [7]:
def select_data(df: pd.DataFrame, indices: list) -> pd.DataFrame:
    selected_data = df.iloc[indices]
    return selected_data

## RNN for Text Classifcation

### Recurring Neural Network class and Dataset

In [11]:
class RnnTextClassifier(nn.Module):
    """
    A custom PyTorch Module that uses a simple RNN for text classification.

    Attributes:
        hidden_size (int): The size of the hidden state in the RNN.
        num_layers (int): The number of layers in the RNN.
        rnn (nn.RNN): The RNN layer.
        dropout (nn.Dropout): The dropout layer.
        fc (nn.Linear): The linear layer for classification.

    """

    def __init__(self, input_size: int, hidden_size: int, num_layers: int, dropout: float = 0):
        """
        The constructor for RnnTextClassifier class.

        Parameters:
            input_size (int): The number of expected features in the input.
            hidden_size (int): The number of features in the hidden state.
            num_layers (int): Number of recurrent layers.
            dropout (float, optional): If non-zero, introduces a Dropout layer on the outputs of each RNN layer except the last layer. Default: 0
        """
        super(RnnTextClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Defines the computation performed at every call.

        Parameters:
            x (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: The output from the linear layer.
        """
        x = x.unsqueeze(1).to(device)
        hidden = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        out, hidden = self.rnn(x, hidden)
        out = self.dropout(out)
        out = self.fc(out[:, -1, :])
        return out

class RnnDataset(Dataset):
    """
    A custom Dataset class for embeddings and labels.

    Attributes:
        embeddings (numpy.ndarray): The embeddings.
        labels (numpy.ndarray): The corresponding labels for the embeddings.

    """

    def __init__(self, df: pd.DataFrame, label_col: str):
        """
        The constructor for RnnDataset class.

        Parameters:
            df (pd.DataFrame): The DataFrame containing the embeddings and labels.
            label_col (str): The column name for the labels in the DataFrame.
        """
        self.embeddings = df.iloc[:, 5:].values
        self.labels = df[label_col].values

    def __len__(self) -> int:
        """
        Returns the length of labels.

        Returns:
            int: The length of labels.
        """
        return len(self.labels)

    def __getitem__(self, index: int) -> dict:
        """
        Returns the embeddings and label at the given index.

        Parameters:
            index (int): The index.

        Returns:
            dict: A dictionary containing the embeddings and label at the given index.
        """
        embedding = self.embeddings[index]
        label = self.labels[index]
        return {
            'embeddings': torch.tensor(embedding, dtype=torch.float),
            'labels': torch.tensor(label, dtype=torch.float).squeeze()
        }

## Hyperparameter Tuning

In [13]:
def train_rnn_fine_tuned(model: nn.Module, train_loader: DataLoader, val_loader: DataLoader, loss_fn: nn.Module, optimizer: Optimizer, device: str, epochs: int = 3) -> float:
    """
    Trains the model for a specified number of epochs and validates it.

    Parameters:
        model (nn.Module): The model to be trained.
        train_loader (DataLoader): The DataLoader for the training data.
        val_loader (DataLoader): The DataLoader for the validation data.
        loss_fn (nn.Module): The loss function.
        optimizer (Optimizer): The optimizer.
        device (str): The device type used for computations ('cpu' or 'cuda').
        epochs (int, optional): The number of epochs to train the model. Default is 3.

    Returns:
        float: The validation loss.
    """
    for epoch in range(epochs):
        model.train()
        for d in train_loader:
            embeddings = d["embeddings"].to(device)
            labels = d["labels"].to(device)
            optimizer.zero_grad()
            outputs = model(embeddings)
            outputs = outputs.squeeze(1)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

        # Validation step
        model.eval()
        with torch.no_grad():
            total_loss = 0
            for d in val_loader:
                embeddings = d["embeddings"].to(device)
                labels = d["labels"].to(device)
                outputs = model(embeddings)
                outputs = outputs.squeeze(1)
                loss = loss_fn(outputs, labels)
                total_loss += loss.item()

        val_loss = total_loss / len(val_loader)
        return val_loss

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
if torch.cuda.is_available():
    # Get the GPU name
    gpu_name = torch.cuda.get_device_name(0)  # 0 is the GPU index
    print("GPU:", gpu_name)
else:
    print("No GPU available.")

GPU: Tesla T4


In [None]:
def train_rnn_cross_val(config: Dict[str, Union[str, int, float]]) -> None:
    """
    Trains the RNN model using cross-validation and reports the validation loss.

    Parameters:
        config (Dict[str, Union[str, int, float]]): A dictionary containing the configuration parameters for the model. 
        It includes 'embedding_type', 'hidden_size', 'num_layers', 'learning_rate', and 'batch_size'.

    Returns:
        None
    """
    embedding_type = config['embedding_type']
    k_folds = 5
    kfold = KFold(n_splits=k_folds, shuffle=True)


    data = load_data(embedding_type,"train")

    # Cross-validation loop
    for fold, (train_ids, val_ids) in enumerate(kfold.split(data)):
        train_data = select_data(data, train_ids)
        val_data = select_data(data, val_ids)


        train_dataset = RnnDataset(train_data, "Label")
        train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)


        val_dataset = RnnDataset(val_data, "Label")
        val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=True)

        input_size = train_dataset.embeddings.shape[1]
        dropout = 0.5

        model = RnnTextClassifier(input_size, config["hidden_size"], config["num_layers"], dropout).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])
        loss_fn = torch.nn.BCEWithLogitsLoss()


        val_loss= train_rnn_fine_tuned(model, train_loader, val_loader, loss_fn, optimizer, device, epochs=3)

        train.report({'loss': val_loss})


In [None]:
search_space = {
    'embedding_type': tune.grid_search(["gpt", "bert", "tfidf","glove","word2vec","fasttext"]),
    'hidden_size': tune.grid_search([50,100 ,150, 200]),
    'num_layers': tune.grid_search([ 2, 3, 4]),
    'learning_rate': tune.grid_search([ 0.01, 0.001, 0.0001, 0.00001]),
    'batch_size': tune.grid_search([ 32, 64 , 128])
}

analysis = tune.run(
    train_rnn_cross_val,
    resources_per_trial={"cpu": 4, "gpu": 1},
    config=search_space,
    num_samples=1
)



2024-01-25 11:28:13,748	INFO tune.py:592 -- [output] This will use the new output engine with verbosity 2. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Trial train_rnn_cross_val_d3b6f_00806 completed after 5 iterations at 2024-01-25 12:58:01. Total running time: 1hr 29min 47s

Trial train_rnn_cross_val_d3b6f_00807 started with configuration:
+-----------------------------------------------------------+
| Trial train_rnn_cross_val_d3b6f_00807 config              |
+-----------------------------------------------------------+
| batch_size                                             32 |
| embedding_type                                   fasttext |
| hidden_size                                            50 |
| learning_rate                                     0.00001 |
| num_layers                                              4 |
+-----------------------------------------------------------+

Trial train_rnn_cross_val_d3b6f_00807 finished iteration 1 at 2024-01-25 12:58:07. Total running time: 1hr 29min 53s
+----------------------------------------------------------+
| Tria

In [None]:
best_config = analysis.get_best_config(metric="loss", mode="min")
best_trial = analysis.get_best_trial(metric="loss", mode="min")
best_loss = best_trial.last_result["loss"]

print("Best Config: ", best_config)
print("Best Loss: ", best_loss)
print("Best Trial:", best_trial)

Best Config:  {'embedding_type': 'word2vec', 'hidden_size': 150, 'num_layers': 3, 'learning_rate': 0.001, 'batch_size': 128}
Best Loss:  0.1938147395849228
Best Trial: train_rnn_cross_val_d3b6f_00410


In [None]:
embedding_types = ["gpt", "bert", "tfidf","glove","word2vec","fasttext"]

best_configs = {}

for emb_type in embedding_types:
    # Filter trials by the specific embedding type
    filtered_trials = filter(lambda t: t.config["embedding_type"] == emb_type, analysis.trials)

    # Convert the filtered trials to a list and find the best trial
    best_trial = min(filtered_trials, key=lambda t: t.last_result["loss"])

    # Get the configuration of the best trial
    best_configs[emb_type] = best_trial.config

    print(f"Best Config for {emb_type}: ", best_configs[emb_type])


Best Config for gpt:  {'embedding_type': 'gpt', 'hidden_size': 150, 'num_layers': 3, 'learning_rate': 0.01, 'batch_size': 128}
Best Config for bert:  {'embedding_type': 'bert', 'hidden_size': 100, 'num_layers': 2, 'learning_rate': 0.001, 'batch_size': 128}
Best Config for tfidf:  {'embedding_type': 'tfidf', 'hidden_size': 150, 'num_layers': 4, 'learning_rate': 0.001, 'batch_size': 64}
Best Config for glove:  {'embedding_type': 'glove', 'hidden_size': 50, 'num_layers': 2, 'learning_rate': 0.01, 'batch_size': 128}
Best Config for word2vec:  {'embedding_type': 'word2vec', 'hidden_size': 150, 'num_layers': 3, 'learning_rate': 0.001, 'batch_size': 128}
Best Config for fasttext:  {'embedding_type': 'fasttext', 'hidden_size': 100, 'num_layers': 2, 'learning_rate': 0.001, 'batch_size': 32}


In [None]:
best_configs_df = pd.DataFrame.from_dict(best_configs, orient='index')


# Export to CSV
best_configs_df.to_csv("best_configs_per_embedding.csv", index=False)

In [None]:
results_fine_tuning = analysis.dataframe()
results_fine_tuning

Unnamed: 0,loss,timestamp,checkpoint_dir_name,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore,config/embedding_type,config/hidden_size,config/num_layers,config/learning_rate,config/batch_size,logdir
0,0.239884,1706182100,,False,5,d3b6f_00000,2024-01-25_11-28-20,0.212268,2.545541,30983,2a36f66286f0,172.28.0.12,2.545541,5,gpt,50,2,0.01000,32,d3b6f_00000
1,0.227596,1706182107,,False,5,d3b6f_00001,2024-01-25_11-28-27,0.140341,2.204078,31063,2a36f66286f0,172.28.0.12,2.204078,5,gpt,50,2,0.01000,64,d3b6f_00001
2,0.284561,1706182114,,False,5,d3b6f_00002,2024-01-25_11-28-34,0.133184,2.132188,31141,2a36f66286f0,172.28.0.12,2.132188,5,gpt,50,2,0.01000,128,d3b6f_00002
3,0.262140,1706182120,,False,5,d3b6f_00003,2024-01-25_11-28-40,0.173717,2.357197,31217,2a36f66286f0,172.28.0.12,2.357197,5,bert,50,2,0.01000,32,d3b6f_00003
4,0.327226,1706182127,,False,5,d3b6f_00004,2024-01-25_11-28-47,0.137675,2.135073,31299,2a36f66286f0,172.28.0.12,2.135073,5,bert,50,2,0.01000,64,d3b6f_00004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,0.701566,1706187844,,False,5,d3b6f_00859,2024-01-25_13-04-04,0.127815,2.175403,98047,2a36f66286f0,172.28.0.12,2.175403,5,word2vec,200,4,0.00001,64,d3b6f_00859
860,0.652610,1706187851,,False,5,d3b6f_00860,2024-01-25_13-04-11,0.097152,1.982126,98128,2a36f66286f0,172.28.0.12,1.982126,5,word2vec,200,4,0.00001,128,d3b6f_00860
861,0.678095,1706187859,,False,5,d3b6f_00861,2024-01-25_13-04-19,0.187621,2.445848,98209,2a36f66286f0,172.28.0.12,2.445848,5,fasttext,200,4,0.00001,32,d3b6f_00861
862,0.676762,1706187865,,False,5,d3b6f_00862,2024-01-25_13-04-25,0.132294,2.147085,98287,2a36f66286f0,172.28.0.12,2.147085,5,fasttext,200,4,0.00001,64,d3b6f_00862


In [None]:
results_fine_tuning.to_csv("rnn_tune_results.csv")

# Training

I trained twice, onces with the following parameters (hidden_size = 128, num_layers = 2, dropout = 0.5, lr=0.001)  as a baseline and ones with the hyperpameters from above

In [14]:
def train_rnn(model: nn.Module, data_loader: DataLoader, loss_fn: nn.Module, optimizer: Optimizer, device: str, epochs: int = 3) -> nn.Module:
    """
    Trains the model for a specified number of epochs.

    Parameters:
        model (nn.Module): The model to be trained.
        data_loader (DataLoader): The DataLoader for the training data.
        loss_fn (nn.Module): The loss function.
        optimizer (Optimizer): The optimizer.
        device (str): The device type used for computations ('cpu' or 'cuda').
        epochs (int, optional): The number of epochs to train the model. Default is 3.

    Returns:
        nn.Module: The trained model.
    """
    model.train()
    for epoch in range(epochs):
        for d in data_loader:
            embeddings = d["embeddings"].to(device)
            labels = d["labels"].to(device)
            optimizer.zero_grad()
            outputs = model(embeddings)
            outputs = outputs.squeeze(1)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
    return model

In [15]:
def evaluate_rnn(model: nn.Module, test_dataloader: DataLoader, device: str) -> list:
    """
    Evaluates the model and computes accuracy, precision, and recall metrics.

    Parameters:
        model (nn.Module): The model to be evaluated.
        test_dataloader (DataLoader): The DataLoader for the test data.
        device (str): The device type used for computations ('cpu' or 'cuda').

    Returns:
        list: The list of predicted labels for the test data.
    """
    model.eval()
    predictions = []
    true_labels = []

    accuracy_metric = load_metric("accuracy")
    precision_metric = load_metric("precision")
    recall_metric = load_metric("recall")

    with torch.no_grad():
        for d in test_dataloader:
            embeddings = d['embeddings'].to(device)
            labels = d['labels'].to(device)
            outputs = model(embeddings)
            preds = torch.sigmoid(outputs).squeeze().round()
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    # Compute Metrics
    accuracy = accuracy_metric.compute(predictions=predictions, references=true_labels)
    precision = precision_metric.compute(predictions=predictions, references=true_labels)
    recall = recall_metric.compute(predictions=predictions, references=true_labels)

    print(f"Accuracy: {accuracy['accuracy']}")
    print(f"Precision: {precision['precision']}")
    print(f"Recall: {recall['recall']}")

    return predictions


In [18]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


test_data_original = pd.read_csv(project_dir / "gold_standard_preprocessed.csv")

all_predictions = {}

hyperparameters = {
'gpt':  { 'hidden_size': 150, 'num_layers': 3, 'learning_rate': 0.01, 'batch_size': 128},
'bert':  {'hidden_size': 100, 'num_layers': 2, 'learning_rate': 0.001, 'batch_size': 128},
'tfidf':  { 'hidden_size': 150, 'num_layers': 4, 'learning_rate': 0.001, 'batch_size': 64},
'glove':  { 'hidden_size': 50, 'num_layers': 2, 'learning_rate': 0.01, 'batch_size': 128},
'word2vec':  {'hidden_size': 150, 'num_layers': 3, 'learning_rate': 0.001, 'batch_size': 128},
'fasttext':  { 'hidden_size': 100, 'num_layers': 2, 'learning_rate': 0.001, 'batch_size': 32},
}



for emb_type in ["gpt", "fasttext", "word2vec", "glove", "bert", "tfidf"]:
    current_hyperparameters = hyperparameters[emb_type]

    training_data = load_data(emb_type,"train")
    test_data = load_data(emb_type,"test")


    train_dataset = RnnDataset(training_data, "Label")
    train_loader = DataLoader(train_dataset, current_hyperparameters["batch_size"], shuffle=True)

    test_dataset = RnnDataset(test_data, "Label")
    test_loader = DataLoader(test_dataset, current_hyperparameters["batch_size"], shuffle=False)


    # Initialize model
    input_size = train_dataset.embeddings.shape[1]
    dropout = 0.5

    model = RnnTextClassifier(input_size, current_hyperparameters["hidden_size"], current_hyperparameters["num_layers"] , dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(),current_hyperparameters["learning_rate"])
    loss_fn = torch.nn.BCEWithLogitsLoss()


    # Train the model
    model = train_rnn(model, train_loader, loss_fn, optimizer,device)


     # Evaluate the mode
    rnn_predictions = evaluate_rnn(model, test_loader,device)


    all_predictions[emb_type] = rnn_predictions

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.8872180451127819
Precision: 0.0
Recall: 0.0


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.8872180451127819
Precision: 0.0
Recall: 0.0


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.8872180451127819
Precision: 0.0
Recall: 0.0


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.8872180451127819
Precision: 0.0
Recall: 0.0


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.8872180451127819
Precision: 0.0
Recall: 0.0


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Accuracy: 0.8872180451127819
Precision: 0.0
Recall: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
for emb_type, preds in all_predictions.items():
    if len(preds) == len(test_data_original):
        test_data_original[f"{emb_type}_RNN_Prediction_Tuned"] = preds
    else:
        print(f"Length mismatch for {emb_type}: {len(preds)} predictions vs {len(test_data_original)} rows in DataFrame")


In [20]:
test_data_original

Unnamed: 0,Process,Text,Label,Process_description,gpt_RNN_Prediction_Tuned,fasttext_RNN_Prediction_Tuned,word2vec_RNN_Prediction_Tuned,glove_RNN_Prediction_Tuned,bert_RNN_Prediction_Tuned,tfidf_RNN_Prediction_Tuned
0,Travel Insurance Claim,we encourage you or your representative to tel...,1,the process for a travel insurance claim invol...,0.0,0.0,0.0,0.0,0.0,0.0
1,Travel Insurance Claim,you must co - operate at all time in relation ...,1,the process for a travel insurance claim invol...,0.0,0.0,0.0,0.0,0.0,0.0
2,Travel Insurance Claim,once we have all relevant information and have...,1,the process for a travel insurance claim invol...,0.0,0.0,0.0,0.0,0.0,0.0
3,Travel Insurance Claim,we comply with the principles of the privacy a...,1,the process for a travel insurance claim invol...,0.0,0.0,0.0,0.0,0.0,0.0
4,Travel Insurance Claim,we will tell you about the progress of your cl...,1,the process for a travel insurance claim invol...,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
261,SM2_2,the active power limit value of the meter must...,0,this process involve the smart meter be turn o...,0.0,0.0,0.0,0.0,0.0,0.0
262,SM2_2,roasting software : with the quest link to a c...,0,this process involve the smart meter be turn o...,0.0,0.0,0.0,0.0,0.0,0.0
263,SM2_2,for a plan push operation the readout plan ( t...,0,this process involve the smart meter be turn o...,0.0,0.0,0.0,0.0,0.0,0.0
264,SM2_2,if there be a time difference between 2 and 9 ...,0,this process involve the smart meter be turn o...,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Save the combined DataFrame
test_data_original.to_csv(project_dir / "test_data_rnn_predictions_tuned.csv", index=False)