In [1]:
import pandas as pd
from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np

***DeBERTa Zero-Shot Model***

In [2]:
import sys
import os

# Resolve the data directory two levels above the current working directory
# and make it importable. The membership check keeps sys.path free of
# duplicate entries when this cell is re-run in the same kernel.
data_path = os.path.abspath(os.path.join(os.getcwd(), '../../data'))
if data_path not in sys.path:
    sys.path.append(data_path)

In [3]:
from data import sample, evaluation
# Two independent working copies of `evaluation`, one per zero-shot run, so a
# prediction column can be added to each without modifying the original.
deberta1, deberta2 = (evaluation.copy() for _ in range(2))

In [4]:
# Candidate label sets for zero-shot classification. The `_n` variant is the
# two-way set without "Neutral"; the full set appends it.
political_labels_n = ["Liberal", "Conservative"]
political_labels = [*political_labels_n, "Neutral"]

In [5]:
# Zero-shot classifier backed by the NLI cross-encoder
# "cross-encoder/nli-deberta-v3-base".
# Note from earlier experiments with RoBERTa and DeBERTa: DeBERTa out-performs RoBERTa.
# The task is passed explicitly so the pipeline type does not depend on being
# inferred from the model's metadata; the calls below rely on the
# zero-shot interface (`candidate_labels` in, `"labels"` out).
pipe1 = pipeline("zero-shot-classification", model="cross-encoder/nli-deberta-v3-base")

def classify_sentence(sentence, candidate_labels=None, classifier=None):
    """Return the top-ranked label a zero-shot pipeline assigns to a sentence.

    Args:
        sentence: Text to classify.
        candidate_labels: Labels to choose between. Defaults to the
            module-level ``political_labels``.
        classifier: Zero-shot pipeline to call. Defaults to the module-level
            ``pipe1``.

    Returns:
        The first entry of the ``"labels"`` list in the pipeline result.
    """
    # Defaults are looked up at call time (not at definition time) so that
    # re-running the cells that define them is picked up automatically.
    if candidate_labels is None:
        candidate_labels = political_labels
    if classifier is None:
        classifier = pipe1

    result = classifier(sentence, candidate_labels=candidate_labels)
    return result["labels"][0]

# Predict a label for every sentence, then tally how often each label was chosen.
deberta1["predicted_label"] = deberta1.sentence.map(classify_sentence)
label_counts_deberta1 = deberta1.predicted_label.value_counts()



In [6]:
# Same checkpoint as pipe1 ("cross-encoder/nli-deberta-v3-base"). Candidate
# labels are supplied per call, so the already-loaded pipeline is reused
# instead of loading a second copy of the model into memory.
pipe2 = pipe1

def classify_sentence(sentence):
    """Classify one sentence against the two-way label set and return the top label.

    Calls ``pipe2`` with ``political_labels_n`` as candidates and returns the
    first entry of the result's ``"labels"`` list.
    """
    ranked_labels = pipe2(sentence, candidate_labels=political_labels_n)["labels"]
    return ranked_labels[0]

# Same procedure on the second working copy: predict per sentence, then count labels.
deberta2["predicted_label"] = deberta2.sentence.map(classify_sentence)
label_counts_deberta2 = deberta2.predicted_label.value_counts()



In [7]:
# Separate the two tallies with a blank line; with the default single-space
# separator the second Series' header is printed on the first Series' dtype line.
print(label_counts_deberta1, label_counts_deberta2, sep="\n\n")

predicted_label
Neutral         305
Conservative    302
Liberal         143
Name: count, dtype: int64 predicted_label
Conservative    513
Liberal         237
Name: count, dtype: int64


In [18]:
# Keep only the rows whose prediction equals the `label` column, then count
# the correct predictions per label.
deberta_correct = deberta1.loc[deberta1.label == deberta1.predicted_label]
deberta_correct["label"].value_counts()

label
Conservative    134
Liberal          63
Name: count, dtype: int64

***Fine-tuned BERT base model***

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from data import sample_dataset, evaluation_dataset

In [11]:
# evaluate on same sample dataset
# One constant for the checkpoint id so the tokenizer and the model are always
# loaded from the same identifier and cannot drift apart.
FT_CHECKPOINT = "lhz1/pid-ft-bert"
infer_tokenizer = AutoTokenizer.from_pretrained(FT_CHECKPOINT)
ft_model = AutoModelForSequenceClassification.from_pretrained(FT_CHECKPOINT)


def run_model(dataset):
    """Classify every example in ``dataset`` with the fine-tuned model.

    Args:
        dataset: Iterable of examples, each indexable by ``"sentence"`` (the
            text to classify) and ``"label"`` (compared against the predicted
            class id).

    Returns:
        A ``(predictions, accuracy)`` tuple. ``predictions`` holds one
        ``(predicted_label_name, is_correct)`` pair per example, in input
        order. ``accuracy`` is the fraction of the examples in ``dataset``
        that were predicted correctly, or ``0.0`` when ``dataset`` is empty.
    """
    predictions = []
    accurate = 0
    total = 0
    for example in dataset:
        inputs = infer_tokenizer(example["sentence"], return_tensors="pt")
        label = example["label"]

        # Inference only: no gradients are needed.
        with torch.no_grad():
            logits = ft_model(**inputs).logits

        predicted_class_id = logits.argmax().item()
        is_correct = label == predicted_class_id
        if is_correct:
            accurate += 1
        predictions.append((ft_model.config.id2label[predicted_class_id], is_correct))
        total += 1

    # Normalise by the number of examples actually scored, not by the size of
    # an unrelated global dataset, so the result is a true fraction for any input.
    accuracy = accurate / total if total else 0.0
    return predictions, accuracy



In [13]:
# Sample dataset: run the fine-tuned model, keep the per-example predictions
# as a DataFrame and the accuracy as a scalar, then tabulate the predictions.
preds = run_model(sample_dataset)
pred_labels, acc = pd.DataFrame(data=preds[0]), preds[1]

pred_labels.value_counts()

0           
Conservative    82
Liberal         36
Neutral         32
Name: count, dtype: int64

In [None]:
# Larger evaluation set (n=750) with no neutral examples.
preds2 = run_model(evaluation_dataset)
pred_labels2 = pd.DataFrame(data=preds2[0])
acc2 = preds2[1]

# Tabulate (predicted label, correct?) pairs. Without a final expression the
# cell displays nothing, so the counts table could not be reproduced on re-run.
pred_labels2.value_counts()

0             1    
Conservative  True     270
              False    204
Liberal       True     142
              False     74
Neutral       False     60
Name: count, dtype: int64

In [16]:
# Column 1 holds the per-example correctness flag and column 0 the predicted
# label name: keep the correct rows and count them per predicted label.
bert_correct = pred_labels2.loc[pred_labels2[1] == True]
bert_correct.loc[:, 0].value_counts()

0
Conservative    270
Liberal         142
Name: count, dtype: int64