In [None]:

import pandas as pd

df_raw=# load your data
# select only value-expressive and/or politically oriented posts:
df=df_raw.loc[df_raw.if_political_OR_if_value==1]

values_list=['Self-direction','Stimulation','Hedonism','Achievement','Power','Security','Conformity','Tradition','Benevolence','Universalism']

In [None]:
import joblib

# Load the persisted TF-IDF vectorizer from local disk (presumably the one
# fitted on the training data, judging by the file name).
# NOTE(review): joblib.load unpickles the file - only load artifacts you trust.
tfidf_vectorizer_path = "../models/xlm-roberta-large/tfidf_vectorizer_for_train_data.pkl"
vectorizer = joblib.load(tfidf_vectorizer_path)

# Vectorize the new texts, then densify so rows can be indexed one at a time.
tfidf_matrix = vectorizer.transform(df["text"])
X_tfidf_new = tfidf_matrix.toarray()


In [None]:
import torch.nn as nn

class BERTWithTFIDF(nn.Module):
    """Multi-label head that fuses a transformer's pooled output with TF-IDF features.

    The TF-IDF vector is projected to 128 dimensions, passed through ReLU,
    concatenated with the encoder's ``pooler_output``, layer-normalised,
    passed through dropout and finally mapped to ``num_labels`` logits.

    Args:
        bert_model: encoder module; it must expose ``config.hidden_size`` and,
            when called with ``input_ids`` / ``attention_mask``, return an
            object carrying ``pooler_output``.
        tfidf_dim: width of the TF-IDF feature vector fed to ``forward``.
        num_labels: number of output logits per sample.
    """

    def __init__(self, bert_model, tfidf_dim, num_labels=10):
        super().__init__()
        tfidf_proj_dim = 128
        fused_dim = bert_model.config.hidden_size + tfidf_proj_dim

        # Attribute names are part of the checkpoint's state_dict keys,
        # so they must stay exactly as they are.
        self.bert = bert_model
        self.dropout = nn.Dropout(0.3)
        self.tfidf_layer = nn.Linear(tfidf_dim, tfidf_proj_dim)
        self.relu = nn.ReLU()
        # Despite its name this is a LayerNorm, not a BatchNorm.
        self.batch_norm = nn.LayerNorm(fused_dim)
        self.classifier = nn.Linear(fused_dim, num_labels)

    def forward(self, input_ids, attention_mask, tfidf_features):
        """Return raw (pre-sigmoid) logits for one batch."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoder_out.pooler_output
        tfidf_hidden = self.relu(self.tfidf_layer(tfidf_features))

        # Join both representations along the feature axis, then normalise.
        fused = self.batch_norm(torch.cat([pooled, tfidf_hidden], dim=1))
        return self.classifier(self.dropout(fused))
    

# Focal Loss
class FocalLoss(nn.Module):
    """Class-weighted focal variant of binary cross-entropy on logits.

    Each element's BCE loss is scaled by ``(1 - exp(-bce)) ** gamma`` and by
    the weight of its class, and the mean over all elements is returned.

    Args:
        class_weights: tensor of per-class weights; it is multiplied against
            the per-element loss, so it must broadcast with ``outputs``.
        gamma: focusing exponent applied to ``1 - exp(-bce)``.
    """

    def __init__(self, class_weights, gamma=0.3):
        super().__init__()
        self.class_weights = class_weights
        self.gamma = gamma
        # reduction="none" keeps one loss value per element for re-weighting.
        self.bce = nn.BCEWithLogitsLoss(reduction="none")

    def forward(self, outputs, targets):
        """Return the scalar weighted focal loss for logits ``outputs``."""
        per_element = self.bce(outputs, targets)
        # exp(-bce) is the probability assigned to the target when targets are 0/1.
        modulator = torch.pow(1 - torch.exp(-per_element), self.gamma)
        weights = self.class_weights.to(outputs.device)
        return (modulator * per_element * weights).mean()

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

# ============================
# 1. Load xlm-roberta-large
# ============================

# Resolve the inference device ONCE and use it everywhere below.
# Previously GPU 1 was selected unconditionally before the CPU fallback was
# evaluated, so the fallback could never be reached and the cell crashed on
# CPU-only or single-GPU machines.
if torch.cuda.is_available():
    # Keep the original preference for GPU 1; use GPU 0 when only one GPU exists.
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f"cuda:{gpu_index}")
    torch.cuda.set_device(device)
else:
    device = torch.device("cpu")

MODEL_NAME = "FacebookAI/xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert_model = AutoModel.from_pretrained(MODEL_NAME).to(device)
# ============================

# 5000 is the input width of the TF-IDF branch; it has to equal the number of
# columns produced by the TF-IDF vectorizer and the width stored in the checkpoint.
model_bert = BERTWithTFIDF(bert_model, 5000).to(device)

# Load model weights:
# NOTE(review): torch.load unpickles the file - only load checkpoints you
# trust (recent PyTorch versions accept weights_only=True for state dicts).
loaded_model = "../models/xlm-roberta-large/model_finetuned_xlm-roberta-large.pth"
model_bert.load_state_dict(torch.load(loaded_model, map_location=device))

In [None]:
class MultiLabelDataset_for_new_data(Dataset):
    """Unlabelled inference dataset pairing each text with its TF-IDF row.

    Args:
        texts: indexable collection of input strings.
        tfidf_features: indexable collection of TF-IDF rows, positionally
            aligned with ``texts``.
        tokenizer: callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` whose result
            provides ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: length every sequence is padded / truncated to.
    """

    def __init__(self, texts, tfidf_features, tokenizer, max_length=512):
        self.texts = texts
        self.tfidf_features = tfidf_features
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenised text and TF-IDF vector for sample ``idx``."""
        tokenizer_kwargs = {
            "padding": "max_length",
            "truncation": True,
            "max_length": self.max_length,
            "return_tensors": "pt",
        }
        encoded = self.tokenizer(self.texts[idx], **tokenizer_kwargs)

        # The tokenizer output has a leading batch axis of size 1; drop it so
        # the DataLoader can stack samples itself.
        sample = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        sample["tfidf_features"] = torch.tensor(
            self.tfidf_features[idx], dtype=torch.float32
        )
        return sample

In [9]:
# Wrap the selected posts and their TF-IDF rows in a dataset. Both inputs are
# derived from `df` in the same row order, so they stay positionally aligned.
new_dataset = MultiLabelDataset_for_new_data(
    df["text"].tolist(),
    X_tfidf_new,
    tokenizer,
)

# shuffle=False keeps the batch order equal to the row order of `df`, which the
# later positional assignment of the predictions depends on.
new_loader = DataLoader(new_dataset, shuffle=False, batch_size=16)

In [None]:
from tqdm import tqdm
import numpy as np

def predict_from_loader(model, data_loader, device=device):
    """Run ``model`` over every batch of ``data_loader`` and collect probabilities.

    Args:
        model: module called as ``model(input_ids, attention_mask,
            tfidf_features)`` and returning logits.
        data_loader: iterable of batches, each a mapping with the keys
            ``"input_ids"``, ``"attention_mask"`` and ``"tfidf_features"``.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        A NumPy array of sigmoid probabilities, the per-batch results stacked
        row-wise in loader order.
    """
    model.eval()  # switch the model to evaluation mode
    batch_probabilities = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Predicting"):
            logits = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["tfidf_features"].to(device),
            )
            # Independent per-label probabilities, moved to host memory.
            batch_probabilities.append(torch.sigmoid(logits).cpu().numpy())

    return np.vstack(batch_probabilities)

In [None]:
predictions = predict_from_loader(model_bert, new_loader)

# The assignment below is positional, so fail loudly if the prediction matrix
# does not line up with the rows of `df` and the value columns.
expected_shape = (len(df), len(values_list))
if predictions.shape != expected_shape:
    raise ValueError(
        f"predictions have shape {predictions.shape}, expected {expected_shape}"
    )

# Work on an owned copy: `df` was produced by slicing `df_raw`, and adding
# columns to such a slice raises SettingWithCopyWarning.
df = df.copy()

# Round once on the whole matrix instead of column by column.
df[values_list] = predictions.round(3)

# save df.to_csv()