In [1]:
import pathlib
# Project root: the parent of the directory the notebook is run from.
basedir = pathlib.Path.cwd().parent
# GPU index as a string; it is exported as CUDA_VISIBLE_DEVICES in a later cell.
deviceId = '0'

# Tags embedded in the artifact file names.
# NOTE(review): '95perc' presumably refers to the PCA variance threshold used
# when the artifacts were produced — confirm against the training notebook.
embedderTag = 'DEBERTA'
pcaTag = '95perc'

# Both pickled artifacts live in the same directory.
processedDir = basedir / 'data' / 'processed'

# The tags are interpolated through variables rather than as quoted literals
# inside the f-string: reusing the enclosing quote character inside an f-string
# replacement field is only valid on Python 3.12+ (PEP 701) and is a SyntaxError
# on earlier interpreters.
pathClf = processedDir / f'calibrated_classifier_{embedderTag}_{pcaTag}.pkl'
pathPCA = processedDir / f'pca_projector_{embedderTag}_{pcaTag}.pkl'


In [12]:
import os
# Configure the process environment in one call. This runs before the
# joblib/torch/transformers imports that follow in this cell (presumably so
# those libraries see the values when they initialise — confirm).
os.environ.update({
    'TOKENIZERS_PARALLELISM': 'false',
    'CUDA_VISIBLE_DEVICES': deviceId,  # GPU index string from the config cell
    'WORLD_SIZE': '1',
})
import joblib
import torch
# from transformers import DistilBertTokenizer, DistilBertModel
from transformers import DebertaV2Tokenizer, DebertaV2Model


class StanceClassifier:
    """Score a text with a DeBERTa-v2 embedder, a PCA projector and a calibrated classifier.

    The pipeline implemented by ``predict_proba`` is: tokenize the text, run the
    DeBERTa-v2 encoder, mean-pool the last hidden state over the token axis,
    project the pooled vector with the loaded PCA object, and return the
    column-1 probability produced by the loaded classifier.
    """

    # Checkpoint used for both the tokenizer and the encoder unless overridden.
    DEFAULT_MODEL_NAME = 'microsoft/deberta-v2-xlarge'

    def __init__(self, pathPCA, pathClf, device, model_name=DEFAULT_MODEL_NAME):
        """Load the text embedder and the two pickled artifacts.

        Args:
            pathPCA: Path of the pickled PCA projector, loaded with ``joblib.load``.
            pathClf: Path of the pickled classifier, loaded with ``joblib.load``.
            device: Torch device the encoder and the tokenized inputs are moved to.
            model_name: Checkpoint passed to ``from_pretrained`` for both the
                tokenizer and the model. Defaults to ``DEFAULT_MODEL_NAME``.
                NOTE(review): the pickled PCA/classifier were presumably fitted
                on embeddings from the default checkpoint — only override this
                together with matching artifacts.
        """
        self.device = device
        # Load the text embedder (DeBERTa-v2); tokenizer and model share one checkpoint name.
        self.tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
        self.model = DebertaV2Model.from_pretrained(model_name).to(device)
        # Inference only: make evaluation mode explicit instead of relying on
        # the loader's default.
        self.model.eval()
        # Load the trained classifier and PCA projector.
        # NOTE(security): joblib.load unpickles the file and can therefore
        # execute arbitrary code; only point these paths at trusted artifacts.
        self.calibrated_clf = joblib.load(pathClf)
        self.pca = joblib.load(pathPCA)

    def get_embeddings(self, text):
        """Return the mean-pooled last hidden state of ``text`` as a NumPy array.

        The input is tokenized with truncation at 512 tokens, moved to
        ``self.device``, and run through the encoder without gradient tracking.
        The last hidden state is averaged over dimension 1, moved to CPU, and
        squeezed (all size-1 dimensions dropped) before conversion to NumPy.
        """
        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=512,
        ).to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # NOTE(review): this is an unmasked mean over the token axis. If a list
        # of texts of different lengths were passed, padded positions would
        # presumably be averaged in as well — confirm before batching.
        pooled = outputs.last_hidden_state.mean(dim=1)
        return pooled.cpu().squeeze().numpy()

    def predict_proba(self, text):
        """Return the classifier's column-1 probability for ``text``.

        NOTE(review): which stance label sits at index 1 depends on how the
        pickled classifier was trained; it cannot be determined from this class.
        """
        # Extract the embedding with the DeBERTa encoder and shape it as one row.
        embedding = self.get_embeddings(text).reshape(1, -1)

        # Compress the embedding using the trained PCA projector.
        embedding_pca = self.pca.transform(embedding)

        # Probability score from the trained calibrated classifier:
        # column 1 of the first (and only) row.
        prob_score = self.calibrated_clf.predict_proba(embedding_pca)[:, 1][0]

        return prob_score



In [13]:
# Example usage: build the classifier on the GPU when CUDA is available,
# otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
classifier = StanceClassifier(
    pathPCA=pathPCA,
    pathClf=pathClf,
    device=device,
)


In [42]:
# Example: a statement explicitly supporting Leave.
example_text = (
    "Brexit? I support Nigel Farage and UKIP reasons to leave the EU market"
)
probability_score = classifier.predict_proba(text=example_text)
print(f'Predicted probability score: {probability_score:.4f}')



Predicted probability score: 0.0267


In [49]:
# Example: a Leave vote qualified by a liking for Schengen free travel.
example_text = (
    "I will vote for leave but I like Schengen free travel area"
)
probability_score = classifier.predict_proba(text=example_text)
print(f'Predicted probability score: {probability_score:.4f}')



Predicted probability score: 0.1784


In [50]:
# Example: an explicitly undecided statement.
example_text = (
    "I am half between remain and leave"
)
probability_score = classifier.predict_proba(text=example_text)
print(f'Predicted probability score: {probability_score:.4f}')



Predicted probability score: 0.5708


In [54]:
# Example: a statement strongly supporting Remain.
example_text = (
    "I strongly support Remain! Brexit will destroy the UK economy"
)
probability_score = classifier.predict_proba(text=example_text)
print(f'Predicted probability score: {probability_score:.4f}')



Predicted probability score: 0.6368


In [55]:
# Example: a Remain vote justified by a personal reason, without any hashtag.
example_text = (
    "I vote Remain because I am an immigrant and I want to feel welcomed."
)
probability_score = classifier.predict_proba(text=example_text)
print(f'Predicted probability score: {probability_score:.4f}')


Predicted probability score: 0.7305


In [56]:
# Example: the Remain statement with a trailing "#StrongerIn" hashtag appended.
example_text = (
    "I vote Remain because I am an immigrant and I want to feel welcomed. #StrongerIn"
)
probability_score = classifier.predict_proba(text=example_text)
print(f'Predicted probability score: {probability_score:.4f}')


Predicted probability score: 0.9913
