In [1]:
import torch
import transformers
import pandas as pd
import warnings
import evaluate

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_from_disk
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from argparse import Namespace

2025-08-03 18:30:31.362830: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# Model options on the hub
#   m2im/ml-e5-large_finetuned_violence_twitter_all_labels
#
# Model options for `path_to_model_on_disk`
#   /data4/mmendieta/models/labse_finetuned_twitter_all_labels/legendary-eon-1/epoch_19/
#   /data4/mmendieta/models/xlmt_finetuned_twitter_all_labels/polished-oath-32/epoch_17/
#   /data4/mmendieta/models/smallLabse_finetuned_twitter_all_labels
#   /data4/mmendieta/models/ml-e5-large_finetuned_twitter_all_labels
#
# Tokenized dataset options for `dataset_name`
#   /data4/mmendieta/data/geo_corpus.0.0.1_tok_test_ds_e5_inference_results
#   /data4/mmendieta/data/geo_corpus.0.0.1_tok_test_ds_xlmt_inference_results
#   /data4/mmendieta/data/geo_corpus.0.0.1_tok_test_ds_labse_inference_results

# Every tunable setting lives here; the cells below read them through `args`.
config = dict(
    cuda_device=3,
    path_to_model_on_disk="/data4/mmendieta/models/labse_finetuned_twitter_all_labels/legendary-eon-1/epoch_19/",
    model_ckpt="",
    max_length=32,
    batch_size=1024,
    dataset_name="/data4/mmendieta/data/geo_corpus.0.0.1_tok_test_ds_labse_inference_results",
    fout="/data3/mmendieta/Violence_data/csv_files_global_scale/labse_inference_test_set_all_labels.csv",
)

# Attribute-style view of the same settings (args.batch_size, args.fout, ...).
args = Namespace(**config)

# 1. Instantiate the required pipeline
Choose one of the pipelines of interest below. The XLM-T pipeline is the best-performing model.

### LaBSE pipeline

In [3]:
# Build the LaBSE text-classification pipeline from the checkpoint directory
# set in the config cell, on the configured CUDA device.
# return_all_scores=True is required: later cells read the per-label scores
# by position (outputs[0] ... outputs[39]).
# NOTE(review): newer transformers releases deprecate return_all_scores in
# favour of top_k=None, which reportedly sorts results by score. Verify before
# switching, since sorted output would break the positional indexing downstream.
violence_pipe = pipeline(
    task="text-classification",
    model=args.path_to_model_on_disk,
    framework="pt",
    device=args.cuda_device,
    return_all_scores=True,
)



### small-LaBSE pipeline

In [None]:
# Build the small-LaBSE pipeline from the checkpoint directory set in the
# config cell. All per-label scores are returned; later cells index them by position.
violence_pipe = pipeline(
    "text-classification",
    model=args.path_to_model_on_disk,
    framework="pt",
    device=args.cuda_device,
    return_all_scores=True,
)

### XLM-T pipeline

In [None]:
# Build the XLM-T pipeline from the checkpoint directory set in the config
# cell. All per-label scores are returned; later cells index them by position.
violence_pipe = pipeline(
    "text-classification",
    model=args.path_to_model_on_disk,
    framework="pt",
    device=args.cuda_device,
    return_all_scores=True,
)

### E5-pipeline

In [None]:
# Build the E5 pipeline from the checkpoint directory set in the config cell.
# The task is spelled out explicitly: this helps with e5 and is not necessary
# for the other models.
violence_pipe = pipeline(
    "text-classification",
    model=args.path_to_model_on_disk,
    framework="pt",
    device=args.cuda_device,
    return_all_scores=True,
)

In [None]:
# Smoke-test the pipeline on a short text, similar in length to a tweet.
# Keep in mind that we are not doing any text preprocessing for this test case.
text = "today is a sunny day"
outputs = violence_pipe(text)

In [None]:
# Display the scores returned for the test sentence as a table.
pd.DataFrame(outputs[0])

In [None]:
# Raw pipeline output for the test sentence.
outputs

# 2. Use the pipeline to make predictions with the violence dataset

### Load the tokenized test dataset from disk
This is the tokenized dataset that includes the columns *text*, *tweetid*, *geo_x*, *geo_y*, and *lang*

In [4]:
# Load the tokenized test dataset from the directory given by `dataset_name` in the config cell.
ds_tok = load_from_disk(args.dataset_name)

In [None]:
# Select a subsample for testing purposes.
# A fixed seed keeps the same rows across kernel restarts, and the size is
# capped at the dataset length so a dataset with fewer than 10,000 rows does
# not trigger an out-of-range error in select().
test_tokenized_ds_sample = ds_tok.shuffle(seed=42).select(range(min(10_000, len(ds_tok))))

In [None]:
# Inspect the first row of the subsample.
test_tokenized_ds_sample[0]

### a. Single sample (pipeline)

In [None]:
# Pick one row of the subsample and show its text and language code.
sample = test_tokenized_ds_sample[10]
print(f"Text: {sample['text']} | Language: {sample['lang']}")

In [None]:
# Run the pipeline on the selected row's text and display the returned scores as a table.
outputs = violence_pipe(sample['text'])
pd.DataFrame(outputs[0])

### b. Multiple sample pipeline (visualizing text, true labels, predictions, and other columns)
Make sure to select the appropriate batch size.

In [None]:
# For samples only
preds = []

# The pipeline streams one result per text; each result is paired with the
# dataset row at the same position (the enumerate index).
# NOTE(review): args.max_length is not passed to the pipeline call, so
# truncation=True uses whatever limit the tokenizer/model defines. Confirm
# that this matches the setting the model was evaluated with.
for i, outputs in enumerate(tqdm(violence_pipe(KeyDataset(test_tokenized_ds_sample, "text"),
                                               batch_size=args.batch_size,
                                               truncation=True),
                                 total=len(test_tokenized_ds_sample))):
    # Materialise the row once instead of re-reading it for every column.
    record = test_tokenized_ds_sample[i]
    preds.append({
        'tweetid': record['tweetid'],
        'text': record['text'],
        'lang': record['lang'],
        'geo_x': float(record['geo_x']),  # cast tensor to float
        'geo_y': float(record['geo_y']),  # cast tensor to float
        'labels': record['labels'].tolist(),
        'outputs': outputs
    })

In [5]:
# For the entire dataset
preds = []

# The pipeline streams one result per text; each result is paired with the
# dataset row at the same position (the enumerate index).
# NOTE(review): args.max_length is not passed to the pipeline call, so
# truncation=True uses whatever limit the tokenizer/model defines. Confirm
# that this matches the setting the model was evaluated with.
for i, outputs in enumerate(tqdm(violence_pipe(KeyDataset(ds_tok, "text"),
                                               batch_size=args.batch_size,
                                               truncation=True),
                                 total=len(ds_tok))):
    # Materialise the row once instead of re-reading it for every column;
    # with millions of rows the repeated lookups add up.
    record = ds_tok[i]
    preds.append({
        'tweetid': record['tweetid'],
        'text': record['text'],
        'lang': record['lang'],
        'geo_x': float(record['geo_x']),  # cast tensor to float
        'geo_y': float(record['geo_y']),  # cast tensor to float
        'labels': record['labels'].tolist(),
        'outputs': outputs
    })

  0%|          | 0/2329158 [00:00<?, ?it/s]

In [None]:
# Preview only the first few predictions; displaying the whole list would
# flood the notebook output when the full dataset has been processed.
preds[:5]

In [6]:
# Names of the 40 label columns, in the positional order shared by the
# dataset's 'labels' vector and the pipeline's per-label scores:
# post1geo10, post1geo20, ..., post1geo70, post2geo10, ..., pre7geo70.
label_columns = [
    f"{prefix}geo{suffix}"
    for prefix in ("post1", "post2", "post3", "post7", "pre1", "pre2", "pre3", "pre7")
    for suffix in (10, 20, 30, 50, 70)
]

processed_data = []
for pred in preds:
    # Fail loudly if the model or dataset does not expose exactly the expected
    # number of labels; otherwise values would land in the wrong columns.
    if len(pred['labels']) != len(label_columns) or len(pred['outputs']) != len(label_columns):
        raise ValueError(
            f"Expected {len(label_columns)} labels and scores for tweet {pred['tweetid']}, "
            f"got {len(pred['labels'])} labels and {len(pred['outputs'])} scores"
        )

    # Identifying columns carried over unchanged.
    row = {
        'tweetid': pred['tweetid'],
        'text': pred['text'],
        'lang': pred['lang'],
        'geo_x': pred['geo_x'],
        'geo_y': pred['geo_y'],
    }

    # True labels for each column ("<name>_true"), inserted first so they
    # precede the predicted scores in the resulting frame.
    for idx, name in enumerate(label_columns):
        row[f"{name}_true"] = pred['labels'][idx]

    # Predicted scores for each column ("<name>"), read by position.
    for idx, name in enumerate(label_columns):
        row[name] = pred['outputs'][idx]['score']

    processed_data.append(row)

# Convert to DataFrame
df = pd.DataFrame(processed_data)

In [7]:
# Put the columns in a fixed order: identifiers, then every "<name>_true"
# label column, then every predicted-score column, in the same label order.
id_columns = ['tweetid', 'text', 'lang', 'geo_x', 'geo_y']
score_columns = [
    f"{prefix}geo{suffix}"
    for prefix in ('post1', 'post2', 'post3', 'post7', 'pre1', 'pre2', 'pre3', 'pre7')
    for suffix in (10, 20, 30, 50, 70)
]
true_columns = [f"{name}_true" for name in score_columns]

# Selecting with a list raises KeyError if any expected column is missing.
df = df[id_columns + true_columns + score_columns]

In [8]:
# Show rows 60 to 64 (by position) as a spot check of labels against scores.
df.iloc[60:65]

Unnamed: 0,tweetid,text,lang,geo_x,geo_y,post1geo10_true,post1geo20_true,post1geo30_true,post1geo50_true,post1geo70_true,...,pre3geo10,pre3geo20,pre3geo30,pre3geo50,pre3geo70,pre7geo10,pre7geo20,pre7geo30,pre7geo50,pre7geo70
60,487300699391524864,#Baloncesto | El seguro ha retrasado la incorp...,es,-66.879189,10.48801,0.0,0.0,0.0,0.0,0.0,...,0.362646,0.341025,0.500007,0.495193,0.456237,0.43346,0.391691,0.507995,0.503377,0.447357
61,447660809615716352,？あなたの質問はモンゴル語ができるかどうかは、関係ないと思いますが。,ja,34.333328,31.41667,0.0,0.0,0.0,0.0,0.0,...,0.453739,0.509758,0.543539,0.465144,0.448671,0.380959,0.414354,0.475345,0.369792,0.367156
62,490608380818776065,الحمدلله,ar,35.203289,31.92157,0.0,0.0,0.0,0.0,0.0,...,0.574156,0.606797,0.644596,0.607871,0.626594,0.521864,0.572039,0.611882,0.552581,0.566595
63,481456072243552256,Ben hocaları ders konuşan resmi insanlar sanır...,tr,36.567219,36.269169,0.0,0.0,0.0,1.0,1.0,...,0.005823,0.020455,0.167665,0.624175,0.878551,0.006803,0.0183,0.124957,0.469324,0.837316
64,476767010660294657,#11J,und,-66.879189,10.48801,0.0,0.0,0.0,0.0,0.0,...,0.020096,0.017566,0.014054,0.011244,0.010837,0.961599,0.963988,0.96259,0.946629,0.916835


In [9]:
# Write the results table to the CSV path set in the config cell, without the index column.
df.to_csv(path_or_buf=args.fout, index=False)