In [1]:
# Record the Python version reported by the `python` executable on the shell PATH.
!python --version

Python 3.10.12


In [2]:
!pip install -q accelerate

In [3]:
import os
from pathlib import Path

import torch
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel
import pandas as pd

In [4]:
def get_data_spec():
    """Return the label vocabulary of every supported annotation task.

    Returns:
        dict: Keyed by task name ('Propaganda', 'Bias'). Each value is a dict
        with three entries:
            'label2id'    -- label text -> integer id
            'id2label'    -- stringified integer id -> label text
            'label_names' -- label texts listed in id order
    """
    # Single source of truth: the position of a label in its list is its id.
    ordered_labels = {
        'Propaganda': [
            "Not Propaganda",
            "Propaganda",
            "Unclear",
            "Not Applicable",
        ],
        'Bias': [
            "Unbiased",
            "Biased against Palestine",
            "Biased against Israel",
            "Biased against both Palestine and Israel",
            "Biased against others",
            "Unclear",
            "Not Applicable",
        ],
    }

    data_spec = {}
    for task_name, names in ordered_labels.items():
        data_spec[task_name] = {
            'label2id': {name: idx for idx, name in enumerate(names)},
            'id2label': {str(idx): name for idx, name in enumerate(names)},
            'label_names': list(names),
        }

    return data_spec


In [5]:
def get_embedding_model_spec():
    """Return the configuration of every supported embedding model.

    Returns:
        dict: Keyed by a short method name. Each value holds the Hugging Face
        'model_name', the tokenizer 'max_length', the 'pooling_type'
        ('mean', 'cls' or 'last_token'), a 'normalize' flag, a 'batch_size'
        and the 'kwargs' forwarded to ``AutoModel.from_pretrained``.
    """

    def _entry(model_name, max_length, pooling_type, load_kwargs):
        # Every model shares normalize=True and batch_size=1.
        return {
            'model_name': model_name,
            'max_length': max_length,
            'pooling_type': pooling_type,
            'normalize': True,
            'batch_size': 1,
            'kwargs': load_kwargs,
        }

    return {
        'ML-E5-large': _entry(
            'intfloat/multilingual-e5-large', 512, 'mean',
            {'device_map': 'cuda', 'torch_dtype': torch.float16},
        ),
        'BGE-M3': _entry(
            'BAAI/bge-m3', 8192, 'cls',
            {'device_map': 'cuda', 'torch_dtype': torch.float16},
        ),
        'E5-mistral-7b': _entry(
            'intfloat/e5-mistral-7b-instruct', 32768, 'last_token',
            {'load_in_4bit': True, 'bnb_4bit_compute_dtype': torch.float16},
        ),
        'Nomic-Embed': _entry(
            'nomic-ai/nomic-embed-text-v1', 8192, 'mean',
            {'device_map': 'cuda', 'trust_remote_code': True},
        ),
    }

In [6]:
def get_train_test_splits(task, data_spec):
    """Load the train and test spreadsheets of a task and encode its labels.

    The files are read from ``<cwd>/data/<task lower-cased>/`` and are named
    ``<task lower-cased>_train_data.xlsx`` and ``<task lower-cased>_test_data.xlsx``.
    The column named after ``task`` is replaced by integer ids looked up in
    ``data_spec[task]['label2id']``; with ``Series.map`` a label that is
    missing from that mapping becomes NaN.

    Args:
        task: Task name; used as the key into ``data_spec`` and as the label column name.
        data_spec: Mapping of task name to a spec containing 'label2id'.

    Returns:
        dict: ``{"train": DataFrame, "test": DataFrame}``.
    """
    task_data_spec = data_spec[task]

    task_slug = task.lower()
    task_data_dir = os.path.join(os.getcwd(), 'data', task_slug)

    # Read both spreadsheets first, train before test.
    data_splits = {
        split: pd.read_excel(os.path.join(task_data_dir, f'{task_slug}_{split}_data.xlsx'))
        for split in ("train", "test")
    }

    label2id = task_data_spec['label2id']
    for frame in data_splits.values():
        frame[task] = frame[task].map(label2id)

    return data_splits

In [7]:
def mean_pooling(model_output, attention_mask=None):
    """Average the token vectors of ``model_output["last_hidden_state"]`` over dim 1.

    Args:
        model_output: Mapping-like model output exposing "last_hidden_state".
            # assumes shape (batch, seq_len, hidden) — TODO confirm against the models used
        attention_mask: Optional tensor broadcastable to (batch, seq_len) with
            1 for real tokens and 0 for padding. When omitted, every position
            is averaged, which is only correct for unpadded input.

    Returns:
        Tensor with dim 1 reduced away, in the dtype of the hidden states.
    """
    hidden_states = model_output["last_hidden_state"]

    if attention_mask is None:
        return torch.mean(hidden_states, dim=1)

    mask = attention_mask.unsqueeze(-1).to(device=hidden_states.device, dtype=hidden_states.dtype)
    # Accumulate in float32 so that half-precision sums over long sequences do not overflow.
    summed = (hidden_states * mask).sum(dim=1, dtype=torch.float32)
    # clamp avoids a division by zero for a row whose mask is entirely 0.
    token_counts = mask.sum(dim=1, dtype=torch.float32).clamp(min=1.0)
    return (summed / token_counts).to(hidden_states.dtype)


def cls_pooling(model_output):
    """Return the vector at position 0 of dim 1 from the first element of ``model_output``.

    # assumes model_output[0] is the last hidden state, (batch, seq_len, hidden) — TODO confirm
    """
    hidden_states = model_output[0]
    first_position = hidden_states[:, 0]
    return first_position


def last_token_pooling(model_output):
    """Return the vector at the final position of dim 1 from the first element of ``model_output``.

    The final position is taken unconditionally, with no attention mask involved.
    # assumes model_output[0] is the last hidden state, (batch, seq_len, hidden) — TODO confirm
    """
    hidden_states = model_output[0]
    final_position = hidden_states[:, -1]
    return final_position

In [8]:
def get_sentence_embedding(
        text,
        tokenizer,
        embed_model,
        normalize,
        max_length,
        pooling_type='cls'
):
    if pooling_type == "last_token":
        encoded_input = tokenizer(
            text,
            max_length=max_length,
            return_attention_mask=False,
            padding=False,
            truncation=True
        )
        encoded_input['input_ids'] = encoded_input['input_ids'] + [tokenizer.eos_token_id]
        encoded_input = tokenizer.pad(
            [encoded_input],
            padding=True,
            return_attention_mask=True,
            return_tensors='pt'
        ).to("cuda")

    else:
        encoded_input = tokenizer(
            text,
            return_tensors="pt",
            max_length=max_length,
            truncation=True
        ).to("cuda")

    with torch.no_grad():
        model_output = embed_model(**encoded_input)

    sentence_embeddings = None
    match pooling_type:
        case "cls":
            sentence_embeddings = cls_pooling(model_output)
        case "mean":
            sentence_embeddings = mean_pooling(model_output)
        case "last_token":
            sentence_embeddings = last_token_pooling(model_output)

    if normalize:
        sentence_embeddings = F.normalize(sentence_embeddings)

    return sentence_embeddings

In [9]:
def embed(embed_model, tokenizer, data, model_spec):
    """Embed every item of ``data`` one at a time and concatenate the results.

    Args:
        embed_model: Model forwarded to ``get_sentence_embedding``.
        tokenizer: Tokenizer forwarded to ``get_sentence_embedding``.
        data: Iterable of texts.
        model_spec: Mapping providing 'normalize', 'max_length' and 'pooling_type'.

    Returns:
        The per-text embeddings joined with ``torch.cat`` (along dim 0).
    """
    per_text = []
    for sentence in data:
        vector = get_sentence_embedding(
            sentence,
            tokenizer,
            embed_model,
            normalize=model_spec['normalize'],
            max_length=model_spec['max_length'],
            pooling_type=model_spec['pooling_type'],
        )
        per_text.append(vector)

    return torch.cat(per_text)

In [14]:
def create_embeddings(embedding_method, embedding_model_spec, data, save_to):
    """Load one embedding model, embed ``data`` and save the result with ``torch.save``.

    Args:
        embedding_method: Key into ``embedding_model_spec`` selecting the model.
        embedding_model_spec: Mapping of method name to a spec with at least
            'model_name' and 'kwargs' (plus the keys ``embed`` reads).
        data: Iterable of texts to embed.
        save_to: Destination passed to ``torch.save``. When it is a filesystem
            path, its parent directory is created if missing.

    Raises:
        KeyError: If ``embedding_method`` is not present in ``embedding_model_spec``.
    """
    model_spec = embedding_model_spec[embedding_method]

    # Log only the selected model; the full spec dictionary is noise.
    print(f"Processing model : {embedding_method} ({model_spec['model_name']})")

    tokenizer = AutoTokenizer.from_pretrained(model_spec['model_name'])

    embed_model = AutoModel.from_pretrained(
        model_spec['model_name'],
        **model_spec['kwargs']
    )

    # NOTE(review): explicit move kept from the original code; presumably the
    # remote-code model does not honour device_map — confirm.
    if embedding_method == "Nomic-Embed":
        embed_model.to('cuda')

    embeddings = embed(embed_model, tokenizer, data, model_spec)
    embeddings = embeddings.detach().cpu()
    print(f"{embedding_method} embedding shape = {embeddings.shape}")

    # torch.save does not create missing directories.
    if isinstance(save_to, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(save_to))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    torch.save(embeddings, save_to)

In [16]:
def generate(data_spec, embedding_model_spec, task, embedding_method):
    """Embed the "Text" column of the train and test splits of a task.

    The splits come from ``get_train_test_splits``; each one is handed to
    ``create_embeddings`` together with the destination
    ``<cwd>/embeddings/<task lower-cased>/<embedding_method>_<split>_embeddings.pt``.

    Args:
        data_spec: Task specifications forwarded to ``get_train_test_splits``.
        embedding_model_spec: Model specifications forwarded to ``create_embeddings``.
        task: Task name.
        embedding_method: Key of the embedding model to use.
    """
    data_splits = get_train_test_splits(task, data_spec)

    # Extract both text lists up front, train first.
    texts_by_split = {
        split: data_splits[split]["Text"].tolist()
        for split in ("train", "test")
    }

    for split, texts in texts_by_split.items():
        destination = os.path.join(
            os.getcwd(),
            'embeddings',
            task.lower(),
            f'{embedding_method}_{split}_embeddings.pt',
        )
        create_embeddings(embedding_method, embedding_model_spec, texts, destination)

In [20]:
def main(task='Propaganda', embedding_method='BGE-M3'):
    """Generate and store train/test embeddings for one task and one model.

    Args:
        task: One of the task names returned by ``get_data_spec``
            ('Propaganda' or 'Bias').
        embedding_method: One of the method names returned by
            ``get_embedding_model_spec`` ('ML-E5-large', 'BGE-M3',
            'E5-mistral-7b' or 'Nomic-Embed').

    Raises:
        ValueError: If ``task`` or ``embedding_method`` is not a known key.
    """
    data_spec = get_data_spec()
    embedding_model_spec = get_embedding_model_spec()

    # Validate before touching the filesystem or loading any model.
    if task not in data_spec:
        raise ValueError(f"Unknown task {task!r}; expected one of {list(data_spec)}")
    if embedding_method not in embedding_model_spec:
        raise ValueError(
            f"Unknown embedding_method {embedding_method!r}; "
            f"expected one of {list(embedding_model_spec)}"
        )

    # parents=True creates '<cwd>/embeddings' as well, so one call is enough.
    task_embeddings_dir = Path(os.getcwd()) / 'embeddings' / task.lower()
    task_embeddings_dir.mkdir(parents=True, exist_ok=True)

    generate(data_spec, embedding_model_spec, task, embedding_method)

In [21]:
# Run the pipeline for the task and embedding method selected in main().
main()

Processing model : {'ML-E5-large': {'model_name': 'intfloat/multilingual-e5-large', 'max_length': 512, 'pooling_type': 'mean', 'normalize': True, 'batch_size': 1, 'kwargs': {'device_map': 'cuda', 'torch_dtype': torch.float16}}, 'BGE-M3': {'model_name': 'BAAI/bge-m3', 'max_length': 8192, 'pooling_type': 'cls', 'normalize': True, 'batch_size': 1, 'kwargs': {'device_map': 'cuda', 'torch_dtype': torch.float16}}, 'E5-mistral-7b': {'model_name': 'intfloat/e5-mistral-7b-instruct', 'max_length': 32768, 'pooling_type': 'last_token', 'normalize': True, 'batch_size': 1, 'kwargs': {'load_in_4bit': True, 'bnb_4bit_compute_dtype': torch.float16}}, 'Nomic-Embed': {'model_name': 'nomic-ai/nomic-embed-text-v1', 'max_length': 8192, 'pooling_type': 'mean', 'normalize': True, 'batch_size': 1, 'kwargs': {'device_map': 'cuda', 'trust_remote_code': True}}}


tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

BGE-M3 embedding shape = torch.Size([1448, 1024])
Processing model : {'ML-E5-large': {'model_name': 'intfloat/multilingual-e5-large', 'max_length': 512, 'pooling_type': 'mean', 'normalize': True, 'batch_size': 1, 'kwargs': {'device_map': 'cuda', 'torch_dtype': torch.float16}}, 'BGE-M3': {'model_name': 'BAAI/bge-m3', 'max_length': 8192, 'pooling_type': 'cls', 'normalize': True, 'batch_size': 1, 'kwargs': {'device_map': 'cuda', 'torch_dtype': torch.float16}}, 'E5-mistral-7b': {'model_name': 'intfloat/e5-mistral-7b-instruct', 'max_length': 32768, 'pooling_type': 'last_token', 'normalize': True, 'batch_size': 1, 'kwargs': {'load_in_4bit': True, 'bnb_4bit_compute_dtype': torch.float16}}, 'Nomic-Embed': {'model_name': 'nomic-ai/nomic-embed-text-v1', 'max_length': 8192, 'pooling_type': 'mean', 'normalize': True, 'batch_size': 1, 'kwargs': {'device_map': 'cuda', 'trust_remote_code': True}}}
BGE-M3 embedding shape = torch.Size([364, 1024])
