In [21]:
import numpy as np
import pandas as pd
import os
from transformers import pipeline, set_seed
import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import transformers
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [27]:
# Mini-batch size used when building the evaluation DataLoader.
BATCH_SIZE = 32
# Prefer the GPU when CUDA is available; otherwise run on the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [38]:
def batch_tokenize(X_text, tokenizer, max_length=512, batch_size=64):
    """Tokenize a sequence of texts chunk by chunk and merge the results.

    Each chunk of at most ``batch_size`` texts is passed to
    ``tokenizer.batch_encode_plus`` with truncation enabled and every
    sequence padded to ``max_length``.

    Args:
        X_text: Sliceable sequence of texts supporting ``len()``.
        tokenizer: Object exposing ``batch_encode_plus``.
        max_length: Length every encoded sequence is padded/truncated to.
        batch_size: Maximum number of texts sent to the tokenizer per call.

    Returns:
        dict mapping every key produced by the tokenizer to a single flat
        list with one entry per input text, in input order. An empty dict
        is returned when ``X_text`` is empty.
    """
    merged = {}

    # Ceiling division: a trailing partial chunk still needs its own call.
    n_chunks, leftover = divmod(len(X_text), batch_size)
    if leftover > 0:
        n_chunks += 1

    for chunk_idx in range(n_chunks):
        start = chunk_idx * batch_size
        stop = min(len(X_text), start + batch_size)
        chunk = list(X_text[start:stop])

        encoded = tokenizer.batch_encode_plus(
            chunk,
            padding='max_length',
            truncation=True,
            max_length=max_length
        )

        # Append this chunk's values to the running per-key lists.
        for field, values in encoded.items():
            merged.setdefault(field, []).extend(values)

    return merged

class TweetDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence
    (as returned by ``batch_tokenize``); ``labels`` is an indexable
    sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

def compute_metrics(model, data_loader, device, average):
    """Compute accuracy and F1 of ``model`` over ``data_loader``.

    The function runs under ``torch.no_grad()`` but does not change the
    model's train/eval mode; the caller is responsible for that.

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` whose
            result exposes a ``.logits`` attribute.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.
        device: Device the batch tensors are moved to before the forward pass.
        average: Forwarded to ``f1_score`` as its ``average`` argument.

    Returns:
        dict with keys ``'accuracy'`` and ``'f1'``.
    """
    y_true = []
    y_pred = []

    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            logits = model(ids, attention_mask=mask).logits

            # Predicted class = index of the largest logit in each row.
            y_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())
            y_true.extend(targets.cpu().numpy())

    # f1_score's own default is average='binary', so forwarding the value
    # unconditionally matches calling it without the argument in that case.
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average=average),
    }

def test_eval(model, data_loader, device, compute_performance=False, average='binary'):
    
    # Set the model to evaluation mode
    model.eval()
    
    # store predicted probs
    class_probs = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs['logits']

            # Convert logits to probabilities
            probabilities = F.softmax(logits, dim=1)
            class_probs.extend(probabilities.cpu().numpy().tolist())
            
    # Compute the metrics
    if compute_performance:
        metrics = compute_metrics(model, data_loader, device, average=average)
        return metrics, np.array(class_probs)
    
    return np.array(class_probs)

In [7]:
# Load the test set from the German data (pickled DataFrame).
# NOTE(review): `uc_dir` is not defined in any visible cell — presumably set
# in a config cell elsewhere; confirm before a fresh Restart & Run All.
# NOTE(review): pd.read_pickle can execute arbitrary code on load; only use
# it on files from a trusted source.
path = os.path.join(uc_dir, 'german_data/data_for_models_german_data.pkl')

# Derive the integer label and the model input text in one chained step:
# the text concatenates the masked bio and the tweets, with CR/LF characters
# replaced by spaces.
df_test = pd.read_pickle(path).assign(
    male=lambda d: d['is_male'].astype(int),
    text=lambda d: (
        'bio: ' + d['masked_bio'] + '. ' + 'tweets: ' + d['long_text']
    ).str.replace('\r|\n', ' ', regex=True),
)

X_test, y_test = df_test['text'].values, df_test['male'].values

In [8]:
# Tokenize the test texts with the tokenizer published for this checkpoint.
xlm_tokenizer = AutoTokenizer.from_pretrained("lorelupo/twitter-xlm-gender-prediction-italian")
xlm_test_encodings = batch_tokenize(X_text=X_test, tokenizer=xlm_tokenizer)

# Wrap encodings and labels in a dataset. The loader keeps the original row
# order (shuffle=False) so outputs line up with the rows of df_test.
xlm_test_dataset = TweetDataset(encodings=xlm_test_encodings, labels=y_test)
xlm_loader = torch.utils.data.DataLoader(
    dataset=xlm_test_dataset,
    shuffle=False,
    pin_memory=True,
    batch_size=BATCH_SIZE,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

In [30]:
# Load the sequence-classification checkpoint and move it to DEVICE.
xlm_gender = AutoModelForSequenceClassification.from_pretrained(
    "lorelupo/twitter-xlm-gender-prediction-italian"
).to(DEVICE)

In [31]:
# Run the model over the test set and compute metrics.
# compute_performance=True is required: with its default (False) test_eval
# returns only the probability array, so unpacking the result into
# `metrics, probs` would raise or silently mis-assign rows.
metrics, probs = test_eval(
    xlm_gender,
    xlm_loader,
    DEVICE,
    compute_performance=True,
    average='binary',
)

In [33]:
# Display the metrics returned by test_eval.
metrics

{'accuracy': 0.8195876288659794, 'f1': 0.8833333333333333}

In [34]:
# One record per user: column 0 of `probs` is exported as the female
# probability and column 1 as the male probability.
records = [
    {'user_id': uid, 'prob_female': row[0], 'prob_male': row[1]}
    for uid, row in zip(df_test['user_id'], probs)
]

In [36]:
# Collect the per-user probability records into a DataFrame and display it.
df_pred = pd.DataFrame.from_records(records)
df_pred

Unnamed: 0,user_id,prob_female,prob_male
0,3430018155,0.095967,0.904033
1,269803152,0.006755,0.993245
2,1372423700,0.299473,0.700527
3,1184400427988803584,0.135628,0.864372
4,2951843386,0.472516,0.527484
...,...,...,...
383,1178286769353175040,0.037096,0.962904
384,407722041,0.224812,0.775188
385,3140335966,0.001280,0.998720
386,899250898127507456,0.842391,0.157609
