In [12]:
import os
from pprint import pprint

from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import xgboost as xgb
import torch
import pandas as pd

In [2]:
def create_cls_model_spec(task, data_spec):
    """Build the registry of candidate classifiers for a task.

    Args:
        task: Key into ``data_spec`` selecting the task.
        data_spec: Mapping of task name to its specification; only
            ``data_spec[task]['label_names']`` is read here.

    Returns:
        dict: ``{'knn' | 'svc' | 'xgboost': {'cls': estimator}}``. All three
        estimators are instantiated (unfitted) on every call.
    """
    # The k-NN neighbourhood size is set to the number of labels of the task.
    num_labels = len(data_spec[task]['label_names'])

    # The k-NN and SVC models are preceded by feature standardisation;
    # the XGBoost classifier is used as-is.
    knn_pipeline = make_pipeline(
        StandardScaler(),
        KNeighborsClassifier(n_neighbors=num_labels),
    )
    svc_pipeline = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    xgb_classifier = xgb.XGBClassifier()

    return {
        'knn': {'cls': knn_pipeline},
        'svc': {'cls': svc_pipeline},
        'xgboost': {'cls': xgb_classifier},
    }

In [3]:
def get_data_spec():
    """Return the label specification of every supported task.

    Returns:
        dict: One entry per task ('Propaganda', 'Bias'), each holding
            * ``'label2id'``   - label name -> integer id,
            * ``'id2label'``   - id as a *string* ("0", "1", ...) -> label name,
            * ``'label_names'`` - label names ordered by id.
    """
    # Single source of truth: the position of a name in its list is its id.
    ordered_labels = {
        'Propaganda': [
            "Not Propaganda",
            "Propaganda",
            "Unclear",
            "Not Applicable",
        ],
        'Bias': [
            "Unbiased",
            "Biased against Palestine",
            "Biased against Israel",
            "Biased against both Palestine and Israel",
            "Biased against others",
            "Unclear",
            "Not Applicable",
        ],
    }

    data_spec = {}
    for task_name, names in ordered_labels.items():
        data_spec[task_name] = {
            'label2id': {name: idx for idx, name in enumerate(names)},
            # Keys are strings, not ints.
            'id2label': {str(idx): name for idx, name in enumerate(names)},
            'label_names': list(names),
        }

    return data_spec

In [4]:
def get_train_test_splits(task, data_spec):
    """Load the train/test spreadsheets of a task and integer-encode its labels.

    The files are read from
    ``<cwd>/data/<task lower-cased>/<task lower-cased>_{train,test}_data.xlsx``.
    In each frame, the column named ``task`` is replaced by the ids given in
    ``data_spec[task]['label2id']``.

    Args:
        task: Task name; used as the label column name and, lower-cased, in
            the directory and file names.
        data_spec: Mapping of task name to its specification; only
            ``data_spec[task]['label2id']`` is read here.

    Returns:
        dict: ``{"train": DataFrame, "test": DataFrame}``.

    Raises:
        ValueError: If a split contains a label value that is missing or not
            a key of ``label2id``.
    """
    label2id = data_spec[task]['label2id']

    task_key = task.lower()
    task_data_dir = os.path.join(os.getcwd(), 'data', task_key)

    data_splits = {}
    for split_name in ('train', 'test'):
        frame = pd.read_excel(
            os.path.join(task_data_dir, f'{task_key}_{split_name}_data.xlsx')
        )

        encoded = frame[task].map(label2id)

        # Series.map yields NaN for every value that is not a key of the
        # mapping. Fail here with the offending values instead of handing
        # NaN labels to the classifier.
        unmapped = encoded.isna()
        if unmapped.any():
            bad_values = frame.loc[unmapped, task].unique().tolist()
            raise ValueError(
                f"Unrecognized {task} label(s) in the {split_name} split: "
                f"{bad_values!r}; expected one of {list(label2id)!r}"
            )

        frame[task] = encoded
        data_splits[split_name] = frame

    return data_splits

In [5]:
def classify(cls, train_data, test_data, label_names):
    """Fit a classifier on the training split and score it on the test split.

    Args:
        cls: Estimator exposing ``fit(x, y)`` and ``predict(x)``.
        train_data: ``(x_train, y_train)`` pair.
        test_data: ``(x_test, y_test)`` pair.
        label_names: Display names of the classes, passed to the report as
            ``target_names``. Class id ``i`` is taken to correspond to
            ``label_names[i]``, i.e. ids are assumed to be ``0..n-1``.
            ``None`` lets the report infer classes from the data.

    Returns:
        dict: The dictionary produced by
        ``classification_report(..., output_dict=True)``.
    """
    x_train, y_train = train_data
    x_test, y_test = test_data

    cls.fit(x_train, y_train)
    y_pred = cls.predict(x_test)

    # Pass the class ids explicitly. Otherwise the report infers them from
    # the values present in y_test / y_pred, and a class that appears in
    # neither makes the inferred classes disagree with `label_names`.
    label_ids = None if label_names is None else list(range(len(label_names)))

    return classification_report(
        y_test,
        y_pred,
        labels=label_ids,
        target_names=label_names,
        output_dict=True,
    )

In [9]:
def get_embeddings(task, embedding_method):
    """Load the stored train/test embeddings of a task.

    Files are read from
    ``<cwd>/embeddings/<task lower-cased>/<embedding_method>_{train,test}_embeddings.pt``.

    Args:
        task: Task name; lower-cased to form the directory name.
        embedding_method: Prefix of the embedding file names.

    Returns:
        dict: ``{"train": ..., "test": ...}`` holding whatever object each
        file contains, with tensor storages mapped to the CPU.
    """
    embeddings_dir = os.path.join(os.getcwd(), 'embeddings', task.lower())

    embeddings = {}
    for split_name in ('train', 'test'):
        path = os.path.join(
            embeddings_dir, f'{embedding_method}_{split_name}_embeddings.pt'
        )
        # map_location='cpu': files saved from a GPU would otherwise need
        # that device at load time, and the downstream classifiers take
        # host arrays.
        # NOTE(security): torch.load is pickle-based; only load embedding
        # files from a trusted source.
        embeddings[split_name] = torch.load(path, map_location='cpu')

    return embeddings

In [20]:
def main(task='Propaganda', embedding_method='ML-E5-large', cls_method='knn'):
    """Train one classifier on stored embeddings and print its report.

    Called without arguments, this runs the Propaganda task with ML-E5-large
    embeddings and the k-NN classifier.

    Args:
        task: 'Propaganda' or 'Bias'.
        embedding_method: 'ML-E5-large', 'BGE-M3', 'E5-mistral-7b', or
            'Nomic-Embed'; selects which embedding files are loaded.
        cls_method: 'svc', 'knn', or 'xgboost'.

    Returns:
        None. The classification report is printed.

    Raises:
        ValueError: If ``task`` or ``cls_method`` is not a supported value.
    """
    data_spec = get_data_spec()
    if task not in data_spec:
        raise ValueError(
            f"Unknown task {task!r}; expected one of {list(data_spec)!r}"
        )

    cls_model_spec = create_cls_model_spec(task, data_spec)
    if cls_method not in cls_model_spec:
        raise ValueError(
            f"Unknown cls_method {cls_method!r}; expected one of {list(cls_model_spec)!r}"
        )
    cls = cls_model_spec[cls_method]["cls"]

    # Features: the stored embeddings of each split.
    embeddings = get_embeddings(task, embedding_method)
    x_train = embeddings["train"]
    x_test = embeddings["test"]

    # Targets: the integer-encoded label column of each split.
    data_splits = get_train_test_splits(task, data_spec)
    y_train = data_splits["train"][task].tolist()
    y_test = data_splits["test"][task].tolist()

    label_names = data_spec[task]["label_names"]
    report = classify(cls, (x_train, y_train), (x_test, y_test), label_names)

    print(
        f"The classification report for {task} task with {embedding_method} "
        f"embedding method and {cls_method} classifier:\n"
    )

    pprint(report)

In [21]:
# Run the experiment with the configuration main() uses when called without arguments.
main()

The classification report for Propaganda task with ML-E5-large embedding method and knn classifier: 

{'Not Applicable': {'f1-score': 0.48,
                    'precision': 0.75,
                    'recall': 0.35294117647058826,
                    'support': 17},
 'Not Propaganda': {'f1-score': 0.7548806941431669,
                    'precision': 0.6666666666666666,
                    'recall': 0.87,
                    'support': 200},
 'Propaganda': {'f1-score': 0.4842105263157894,
                'precision': 0.5542168674698795,
                'recall': 0.42990654205607476,
                'support': 107},
 'Unclear': {'f1-score': 0.15384615384615383,
             'precision': 0.3333333333333333,
             'recall': 0.1,
             'support': 40},
 'accuracy': 0.6318681318681318,
 'macro avg': {'f1-score': 0.4682343435762775,
               'precision': 0.5760542168674698,
               'recall': 0.4382119296316658,
               'support': 364},
 'weighted avg': {'f1-sco