In [1]:
import numpy as np
import pandas as pd
import os
from transformers import pipeline, set_seed
import requests
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import time
from sklearn.model_selection import train_test_split

In [16]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [18]:
def batch_tokenize(X_text, tokenizer, max_length=512, batch_size=64):
    """Tokenize a collection of texts in fixed-size batches and merge the results.

    Every text is padded or truncated to exactly ``max_length`` tokens
    (``padding='max_length'`` together with ``truncation=True``).

    Args:
        X_text: Sized, sliceable collection of strings (for example a list
            or a NumPy array).
        tokenizer: Object exposing ``batch_encode_plus(texts, padding=...,
            truncation=..., max_length=...)`` that returns a mapping from
            field name to a list holding one entry per input text.
        max_length: Length every sequence is padded or truncated to.
        batch_size: Number of texts handed to the tokenizer per call.

    Returns:
        dict: Field name -> list with one entry per text, in the same order
        as ``X_text``. An empty dict when ``X_text`` is empty.

    Raises:
        ValueError: If ``batch_size`` is zero or negative.
    """
    # A non-positive batch size would otherwise either divide by zero or
    # silently produce no batches at all (and therefore an empty result).
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Dictionary to hold the merged tokenized batches
    encodings = {}
    n_texts = len(X_text)

    # Step through the data one batch at a time; the last batch may be short
    for batch_start in range(0, n_texts, batch_size):
        batch_end = min(n_texts, batch_start + batch_size)

        # Tokenize the current batch of texts
        batch_encodings = tokenizer.batch_encode_plus(
            list(X_text[batch_start:batch_end]),
            padding='max_length',
            truncation=True,
            max_length=max_length
        )

        # Append this batch's rows to the per-field lists
        for key, val in batch_encodings.items():
            encodings.setdefault(key, []).extend(val)

    return encodings

In [19]:
class TweetDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer output fields with one label per example.

    ``encodings`` is a mapping from field name to an indexable collection
    with one entry per example; ``labels`` is an indexable collection with
    one label per example. The dataset length is taken from ``labels``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        # Convert each encoding field of this example to a tensor
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(model, data_loader, device):
    """Run ``model`` over ``data_loader`` and score its argmax predictions.

    Gradients are disabled for the forward passes. This function does not
    switch the model between train and eval mode.

    Args:
        model: Callable invoked as ``model(input_ids, attention_mask=...)``
            whose result exposes a ``logits`` attribute; the predicted class
            is the argmax of the logits along dimension 1.
        data_loader: Iterable of batches, each a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        dict: ``{'accuracy': ..., 'f1': ...}`` computed over all batches.
        ``f1_score`` is called with its default settings (binary averaging
        in scikit-learn).
    """
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits

            # Get the predicted class labels
            predicted_labels = torch.argmax(logits, dim=1)

            all_predictions.extend(predicted_labels.cpu().numpy())
            # Labels are only needed on the host for scoring, so they are
            # not moved to `device` first.
            all_labels.extend(batch['labels'].cpu().numpy())

    # Only the metrics that are actually reported are computed.
    return {
        'accuracy': accuracy_score(all_labels, all_predictions),
        'f1': f1_score(all_labels, all_predictions),
    }

In [21]:
# Base locations used by this notebook (absolute paths on the host filesystem).
home_dir = '/g100/home/userexternal/pbose000/'
work_dir = '/g100_work/IscrC_mental'

# Project data tree: <work_dir>/data/user_classification/trained_models/gender
wdata_dir = os.path.join(work_dir, 'data')
uc_dir = os.path.join(work_dir, 'data', 'user_classification')
model_dir = os.path.join(uc_dir, 'trained_models/gender')

In [22]:
# Load the German test set and derive the model input text and target column.
path = os.path.join(uc_dir, 'german_data/data_for_models_german_data.pkl')

# NOTE: read_pickle unpickles the file, so it must only be used on trusted data.
df_test = pd.read_pickle(path).assign(
    # Integer-cast copy of the `is_male` column, used as the target
    male=lambda frame: frame['is_male'].astype(int),
    # Bio and tweets joined into one string, with line breaks replaced by spaces
    text=lambda frame: (
        'bio: ' + frame['masked_bio'] + '. ' + 'tweets: ' + frame['long_text']
    ).str.replace('\r|\n', ' ', regex=True),
)

X_test = df_test['text'].values
y_test = df_test['male'].values

In [23]:
# Tokenize the test texts with the Twitter XLM-RoBERTa tokenizer and wrap
# the encodings together with the targets in a dataset.
xlm_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base")
xlm_test_encodings = batch_tokenize(X_text=X_test, tokenizer=xlm_tokenizer)
xlm_test_dataset = TweetDataset(encodings=xlm_test_encodings, labels=y_test)


In [24]:
BATCH_SIZE = 32

# Unshuffled loader so predictions stay aligned with the order of the test set.
xlm_loader = torch.utils.data.DataLoader(
    xlm_test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    # Pinned host memory only speeds up host-to-GPU copies; requesting it on a
    # CPU-only machine has no benefit and makes PyTorch emit a warning.
    pin_memory=torch.cuda.is_available(),
)

In [25]:
def test_eval(model_path, data_loader, device):
    """Load a serialized model from ``model_path`` and evaluate it.

    Args:
        model_path: Path to a file written with ``torch.save`` that holds the
            whole model object (it is used directly as a module after loading).
        data_loader: Batches to evaluate on; passed through to
            ``compute_metrics``.
        device: Device the model is loaded onto and evaluated on.

    Returns:
        The metrics dict returned by ``compute_metrics``.
    """
    # SECURITY: torch.load unpickles the file, which can execute arbitrary
    # code. Only load checkpoints from a trusted source.
    # map_location remaps tensors saved on another device (e.g. a checkpoint
    # written on a GPU node) so loading also works on a CPU-only host.
    # NOTE(review): recent PyTorch releases default to weights_only=True, which
    # refuses fully pickled modules; if loading fails for that reason, pass
    # weights_only=False for trusted files - confirm against the installed version.
    model = torch.load(model_path, map_location=device)
    model = model.to(device)

    # Evaluation mode disables dropout and similar training-only behaviour
    model.eval()

    return compute_metrics(model, data_loader, device)


In [26]:
# Serialized XLM gender classifier inside the trained-models directory.
path_xlm = os.path.join(model_dir, 'XLM_gender.pt')

In [27]:
# Evaluate the saved XLM model on the German test set; the metrics dict is the cell output.
test_eval(model_path=path_xlm, data_loader=xlm_loader, device=DEVICE)

{'accuracy': 0.8043818466353677, 'f1': 0.8751248751248751}