In [1]:
import numpy as np
import pandas as pd
import os
from transformers import pipeline, set_seed
import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import time
from sklearn.model_selection import train_test_split

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
def batch_tokenize(X_text, tokenizer, max_length=512, batch_size=64):
    """Tokenize `X_text` in chunks and concatenate the per-chunk results.

    Each chunk of at most `batch_size` texts is passed to
    `tokenizer.batch_encode_plus` with truncation enabled and padding to
    `max_length`. The lists returned under each key are appended, in input
    order, to one dictionary.

    Args:
        X_text: Sliceable sequence of texts (must support `len` and `[a:b]`).
        tokenizer: Object exposing a `batch_encode_plus` method.
        max_length: Length forwarded to the tokenizer for padding/truncation.
        batch_size: Maximum number of texts handed to the tokenizer per call.

    Returns:
        dict mapping every key produced by the tokenizer to the concatenated
        list of its values; an empty dict when `X_text` is empty.
    """
    total = len(X_text)

    # Ceiling division: a trailing partial chunk still needs its own call.
    full_chunks, leftover = divmod(total, batch_size)
    chunk_count = full_chunks + (1 if leftover > 0 else 0)

    merged = {}
    for chunk_idx in range(chunk_count):
        lo = chunk_idx * batch_size
        hi = min(total, lo + batch_size)

        encoded = tokenizer.batch_encode_plus(
            list(X_text[lo:hi]),
            padding='max_length',
            truncation=True,
            max_length=max_length
        )

        # Append this chunk's values under the matching key.
        for field, rows in encoded.items():
            merged.setdefault(field, []).extend(rows)

    return merged

In [4]:
class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    `encodings` is a mapping from field name to an indexable collection with
    one entry per example; `labels` is an indexable collection of the same
    length. Indexing returns a dict of tensors: one per encoding field plus
    a 'labels' entry.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label collection defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(model, data_loader, device, average):
    """Score `model` on every batch of `data_loader`; return accuracy and F1.

    Args:
        model: Callable as `model(input_ids, attention_mask=...)` whose
            result exposes a `.logits` tensor of per-class scores.
        data_loader: Iterable of batches providing 'input_ids',
            'attention_mask' and 'labels' tensors.
        device: Device the batch tensors are moved to before the forward pass.
        average: Averaging mode forwarded to sklearn's `f1_score`
            ('binary' is that function's default).

    Returns:
        dict with keys 'accuracy' and 'f1'.
    """
    y_pred = []
    y_true = []

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            logits = model(ids, attention_mask=mask).logits

            # Predicted class = index of the largest logit in each row.
            y_pred.extend(logits.argmax(dim=1).cpu().numpy())
            y_true.extend(targets.cpu().numpy())

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average=average),
    }

In [2]:
# Cluster-specific locations. Everything below is derived from work_dir.
home_dir = '/g100/home/userexternal/pbose000/'
work_dir = '/g100_work/IscrC_mental'

# <work_dir>/data
wdata_dir = os.path.join(work_dir, 'data')
# <work_dir>/data/user_classification: datasets and trained models
uc_dir = os.path.join(wdata_dir, 'user_classification')
# Directory of the gender-classifier checkpoints
model_dir = os.path.join(uc_dir,'trained_models/gender')

In [3]:
# Load the German evaluation set and derive the model inputs from it.
path = os.path.join(uc_dir, 'german_data/data_for_models_german_data.pkl')
df_test = pd.read_pickle(path)

# Integer gender target taken from `is_male` (1 = male).
df_test['male'] = df_test['is_male'].astype(int)

# One input string per user: bio followed by the `long_text` column,
# with carriage returns / newlines flattened to spaces.
df_test['text'] = (
    'bio: ' + df_test['masked_bio'] + '. ' + 'tweets: ' + df_test['long_text']
).str.replace('\r|\n', ' ', regex=True)

X_test = df_test['text'].values
y_test = df_test['male'].values

In [41]:
# Tokenize the test texts with the Twitter XLM-R tokenizer and pair the
# encodings with the gender labels.
xlm_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base")
xlm_test_encodings = batch_tokenize(X_text=X_test, tokenizer=xlm_tokenizer)
xlm_test_dataset = TweetDataset(encodings=xlm_test_encodings, labels=y_test)


In [42]:
BATCH_SIZE = 32

# shuffle=False keeps the rows in dataset order, so the predictions can be
# zipped back onto df_test afterwards.
xlm_loader = torch.utils.data.DataLoader(
    dataset=xlm_test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    pin_memory=True,
)

In [10]:
def test_eval(model_path, data_loader, device, average='binary'):
    """Load a saved model, predict class probabilities and compute metrics.

    Args:
        model_path: Path to a checkpoint holding a whole pickled model object
            (the loaded value is used directly as the model).
        data_loader: Iterable of batches providing 'input_ids' and
            'attention_mask' tensors; `compute_metrics` additionally reads
            'labels'.
        device: Device on which the model is loaded and run.
        average: F1 averaging mode forwarded to `compute_metrics`.

    Returns:
        Tuple `(metrics, class_probs)`: the dict returned by
        `compute_metrics`, and a NumPy array with one row of softmax
        probabilities per example, in the order the loader yields them.
    """
    # NOTE(security): torch.load unpickles arbitrary Python objects; only
    # point this at checkpoints from a trusted source.
    # NOTE(review): on PyTorch >= 2.6 torch.load defaults to weights_only=True,
    # which rejects fully pickled modules; pass weights_only=False there.
    # TODO confirm against the installed version.
    # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
    model = torch.load(model_path, map_location=device)
    model = model.to(device)

    # Evaluation mode disables dropout and similar train-time behaviour.
    model.eval()

    class_probs = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs['logits']

            # Softmax over the class dimension turns logits into probabilities.
            probabilities = F.softmax(logits, dim=1)
            class_probs.extend(probabilities.cpu().numpy().tolist())

    # compute_metrics runs its own forward pass over the loader, so the data
    # is scored twice in total.
    metrics = compute_metrics(model, data_loader, device, average=average)

    return metrics, np.array(class_probs)


In [11]:
# Checkpoint files of the gender and age classifiers.
path_xlm =  os.path.join(model_dir ,'XLM_gender.pt')
path_xlm_age =  os.path.join(uc_dir,'trained_models/age/XLM_age.pt')

In [44]:
# Run the gender classifier over the German test set (macro-averaged F1).
metrics, probs = test_eval(
    model_path=path_xlm,
    data_loader=xlm_loader,
    device=DEVICE,
    average='macro',
)

In [13]:
# One record per user: probability column 0 -> female, column 1 -> male.
records = [
    {'user_id': uid, 'prob_female': p[0], 'prob_male': p[1]}
    for uid, p in zip(df_test['user_id'], probs)
]

In [14]:
# Tabulate the per-user gender probabilities (one row per record).
df_pred = pd.DataFrame.from_records(records)

In [46]:
# Bucket ages into four classes labelled 0-3.
# NOTE(review): pd.cut's default right-closed bins put age 20 in class 0,
# while the output column names used later (prob_0_19, prob_20_29, ...)
# suggest left-closed bins. Confirm against how the age model was trained.
df_test['age_cat'] = pd.cut(df_test['age'], [0,20,30,40,150], labels=[0,1,2,3])

# Integer class codes. Ages that fall outside the bins (or are missing) have
# no bucket and get code -1 instead of NaN, so every label stays an integer.
y_test_age = df_test['age_cat'].cat.codes.to_numpy(dtype='int64')
n_unbinned = int((y_test_age < 0).sum())
if n_unbinned:
    print(f'warning: {n_unbinned} rows have no age bucket (label -1); '
          'they can never match a prediction and will lower metrics_age')

# Pair the encodings with the AGE labels (previously the gender labels were
# passed here, and the loader below wrapped the gender dataset).
xlm_test_dataset_age = TweetDataset(xlm_test_encodings, y_test_age)
xlm_loader_age = torch.utils.data.DataLoader(
    xlm_test_dataset_age,
    batch_size=BATCH_SIZE,
    shuffle=False,
    pin_memory=True
)

Unnamed: 0,user_id,masked_bio,age,is_male,long_text,male,text
26,2336685730,", hat nowgorod erobert, über kiew geherrscht u...",1005,1.0,"It is, what it is! Ein Seniorentreff. Der Film...",1,"bio: , hat nowgorod erobert, über kiew geherrs..."
88,3300616767,", ist eduard j., der einzig wahre, nun, nach s...",110,1.0,@Borschtel_ @FCBayern LAAAANGWEILIG!!!! #Jedua...,1,"bio: , ist eduard j., der einzig wahre, nun, n..."
108,420941203,in castro.rauxel.,982,1.0,"Thilo Sarrazin provoziert bei ""Günther Jauch"" ...",1,bio: in castro.rauxel.. tweets: Thilo Sarrazi...
126,899876379667439616,"könig heinrich iv, geschlecht der ottonen,",967,1.0,Während ich durch #Buße dem #Anathem entgegenw...,1,"bio: könig heinrich iv, geschlecht der ottonen..."
228,1184850545015693312,i am a mother of three children. i was . addit...,115,0.0,everyone had been happy the war is over\r\nI h...,0,bio: i am a mother of three children. i was . ...


In [47]:
# Run the age classifier over the German test set (macro-averaged F1).
metrics_age, probs_age = test_eval(
    model_path=path_xlm_age,
    data_loader=xlm_loader_age,
    device=DEVICE,
    average='macro',
)

In [17]:
# One record per user with the four age-class probabilities, in column order.
records_age = [
    {
        'user_id': uid,
        'prob_0_19': p[0],
        'prob_20_29': p[1],
        'prob_30_39': p[2],
        'prob_40_100': p[3],
    }
    for uid, p in zip(df_test['user_id'], probs_age)
]

In [18]:
df_pred_age = pd.DataFrame.from_records(records_age)

# Join the age probabilities onto the gender probabilities, one row per user.
# Selecting the gender columns first keeps the cell idempotent: re-running it
# no longer merges onto an already-merged frame (which would produce
# `_x`/`_y` suffixed duplicates). validate='1:1' raises on duplicate user_ids.
df_pred = df_pred[['user_id', 'prob_female', 'prob_male']].merge(
    df_pred_age, on='user_id', how='inner', validate='1:1'
)

In [63]:
# Save the combined predictions for the German data.
# The path is built from uc_dir so the output location follows the notebook's
# path configuration instead of a second hard-coded copy of it.
df_pred.to_pickle(os.path.join(uc_dir, 'trained_models/german_age_and_gender.pkl'))

In [49]:
# Display the gender-model metrics (accuracy and macro F1) for the German data.
metrics

{'accuracy': 0.8043818466353677, 'f1': 0.7119306686093689}

## Dutch data

### Gender prediction

In [21]:
# Load the Dutch evaluation set and derive the model inputs from it.
path = os.path.join(uc_dir, 'dutch_data/data_for_models_dutch_data.pkl')
df_test = pd.read_pickle(path)

# Integer gender target taken from `is_male` (1 = male).
df_test['male'] = df_test['is_male'].astype(int)

# One input string per user: bio followed by the `long_text` column,
# with carriage returns / newlines flattened to spaces.
df_test['text'] = (
    'bio: ' + df_test['masked_bio'] + '. ' + 'tweets: ' + df_test['long_text']
).str.replace('\r|\n', ' ', regex=True)

X_test = df_test['text'].values
y_test = df_test['male'].values

In [25]:
# Tokenize the Dutch test texts with the Twitter XLM-R tokenizer and pair the
# encodings with the gender labels.
xlm_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base")
xlm_test_encodings = batch_tokenize(X_text=X_test, tokenizer=xlm_tokenizer)
xlm_test_dataset = TweetDataset(encodings=xlm_test_encodings, labels=y_test)

In [26]:
BATCH_SIZE = 32

# shuffle=False keeps the rows in dataset order, so the predictions can be
# zipped back onto df_test afterwards.
xlm_loader = torch.utils.data.DataLoader(
    dataset=xlm_test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    pin_memory=True,
)

In [33]:
# Run the gender classifier over the Dutch test set (macro-averaged F1).
metrics, probs = test_eval(
    model_path=path_xlm,
    data_loader=xlm_loader,
    device=DEVICE,
    average='macro',
)

In [28]:
# One record per user: probability column 0 -> female, column 1 -> male.
records = [
    {'user_id': uid, 'prob_female': p[0], 'prob_male': p[1]}
    for uid, p in zip(df_test['user_id'], probs)
]

In [29]:
# Tabulate the per-user gender probabilities (one row per record).
df_pred = pd.DataFrame.from_records(records)

In [31]:
# Bucket ages into four classes labelled 0-3.
# NOTE(review): pd.cut's default right-closed bins put age 20 in class 0,
# while the output column names used later (prob_0_19, prob_20_29, ...)
# suggest left-closed bins. Confirm against how the age model was trained.
df_test['age_cat'] = pd.cut(df_test['age'], [0,20,30,40,150], labels=[0,1,2,3])

# Integer class codes. Ages that fall outside the bins (or are missing) have
# no bucket and get code -1 instead of NaN, so every label stays an integer.
y_test_age = df_test['age_cat'].cat.codes.to_numpy(dtype='int64')
n_unbinned = int((y_test_age < 0).sum())
if n_unbinned:
    print(f'warning: {n_unbinned} rows have no age bucket (label -1); '
          'they can never match a prediction and will lower metrics_age')

# Pair the encodings with the AGE labels (previously the gender labels were
# passed here, and the loader below wrapped the gender dataset).
xlm_test_dataset_age = TweetDataset(xlm_test_encodings, y_test_age)
xlm_loader_age = torch.utils.data.DataLoader(
    xlm_test_dataset_age,
    batch_size=BATCH_SIZE,
    shuffle=False,
    pin_memory=True
)

In [32]:
# Run the age classifier over the Dutch test set (macro-averaged F1).
metrics_age, probs_age = test_eval(
    model_path=path_xlm_age,
    data_loader=xlm_loader_age,
    device=DEVICE,
    average='macro',
)

In [34]:
# One record per user with the four age-class probabilities, in column order.
records_age = [
    {
        'user_id': uid,
        'prob_0_19': p[0],
        'prob_20_29': p[1],
        'prob_30_39': p[2],
        'prob_40_100': p[3],
    }
    for uid, p in zip(df_test['user_id'], probs_age)
]

In [35]:
df_pred_age = pd.DataFrame.from_records(records_age)

# Join the age probabilities onto the gender probabilities, one row per user.
# Selecting the gender columns first keeps the cell idempotent: re-running it
# no longer merges onto an already-merged frame (which would produce
# `_x`/`_y` suffixed duplicates). validate='1:1' raises on duplicate user_ids.
df_pred = df_pred[['user_id', 'prob_female', 'prob_male']].merge(
    df_pred_age, on='user_id', how='inner', validate='1:1'
)

In [39]:
# Save the combined predictions for the Dutch data.
# The path is built from uc_dir so the output location follows the notebook's
# path configuration instead of a second hard-coded copy of it.
df_pred.to_pickle(os.path.join(uc_dir, 'trained_models/dutch_age_and_gender.pkl'))