In [1]:
import torch
import transformers
import pandas as pd
import warnings
import evaluate

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_from_disk
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from argparse import Namespace

2025-08-02 22:54:28.442642: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [8]:
# Hub checkpoints (candidate values for "model_ckpt"):
#   m2im/ml-e5-large_finetuned_violence_twitter_all_labels
#
# Local checkpoints (candidate values for "path_to_model_on_disk"):
#   /data4/mmendieta/models/labse_finetuned_twitter_all_labels
#   /data4/mmendieta/models/xlmt_finetuned_twitter_all_labels
#   /data4/mmendieta/models/smallLabse_finetuned_twitter_all_labels
#   /data4/mmendieta/models/ml-e5-large_finetuned_twitter_all_labels
#
# Tokenized datasets on disk (candidate values for "dataset_name"):
#   /data4/mmendieta/data/geo_corpus.0.0.1_tok_test_ds_e5_inference_results
#   /data4/mmendieta/data/geo_corpus.0.0.1_tok_test_ds_xlmt_inference_results


# Single place to tune the run: device, model, batching, input dataset and output CSV.
config = dict(
    cuda_device=0,
    path_to_model_on_disk="/data4/mmendieta/models/ml-e5-large_finetuned_twitter_all_labels",
    model_ckpt="m2im/ml-e5-large_finetuned_violence_twitter_all_labels",
    max_length=32,
    batch_size=1024,
    dataset_name="/data4/mmendieta/data/geo_corpus.0.0.1_tok_test_ds_e5_inference_results",
    fout="/data3/mmendieta/Violence_data/csv_files_global_scale/e5_inference_test_set_all_labels.csv",
)

# Attribute-style access (args.batch_size, args.fout, ...) for the cells below.
args = Namespace(**config)

# 1. Instantiate the required pipeline
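Choose one of the four pipelines of interest below. Every cell loads the checkpoint at `path_to_model_on_disk`, so set that entry in the config cell to match the pipeline you run. The XLM-T pipeline is the best performing model.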

### LaBSE pipeline

In [None]:
# Loads whichever checkpoint `args.path_to_model_on_disk` points to; set it to the
# LaBSE model in the config cell before running this cell.
violence_pipe = pipeline(
    task="text-classification",
    model=args.path_to_model_on_disk,
    framework="pt",
    device=args.cuda_device,
    return_all_scores=True,  # one score per label instead of only the top label
)

### small-LaBSE pipeline

In [None]:
# Loads whichever checkpoint `args.path_to_model_on_disk` points to; set it to the
# small-LaBSE model in the config cell before running this cell.
violence_pipe = pipeline(
    task="text-classification",
    model=args.path_to_model_on_disk,
    framework="pt",
    device=args.cuda_device,
    return_all_scores=True,  # one score per label instead of only the top label
)

### XLM-T pipeline

In [None]:
# Loads whichever checkpoint `args.path_to_model_on_disk` points to; set it to the
# XLM-T model in the config cell before running this cell.
violence_pipe = pipeline(
    task="text-classification",
    model=args.path_to_model_on_disk,
    framework="pt",
    device=args.cuda_device,
    return_all_scores=True,  # one score per label instead of only the top label
)

### E5-pipeline

In [3]:
# Loads whichever checkpoint `args.path_to_model_on_disk` points to (the E5 model
# in the current config cell).
violence_pipe = pipeline(
    # Passing the task explicitly helps with e5; the other models do not need it.
    task="text-classification",
    model=args.path_to_model_on_disk,
    framework="pt",
    device=args.cuda_device,
    # Return a score for every label. Later cells read these scores by position.
    # NOTE(review): newer transformers releases deprecate `return_all_scores` in
    # favour of `top_k=None`, which sorts results by score -- confirm the positional
    # indexing in later cells before migrating.
    return_all_scores=True,
)



In [4]:
# Smoke-test the pipeline on a short text, about the length of a tweet.
# Keep in mind that no text preprocessing is applied for this test case.
text = "today is a sunny day"
outputs = violence_pipe(text)

In [5]:
# Tabulate the label/score pairs returned for the test text.
pd.DataFrame(outputs[0])

Unnamed: 0,label,score
0,post1geo10,0.432805
1,post1geo20,0.446945
2,post1geo30,0.444677
3,post1geo50,0.49891
4,post1geo70,0.571062
5,post2geo10,0.437088
6,post2geo20,0.451133
7,post2geo30,0.445039
8,post2geo50,0.502674
9,post2geo70,0.576518


In [6]:
# Raw pipeline result for the test text: a nested list of {'label', 'score'} dicts.
outputs

[[{'label': 'post1geo10', 'score': 0.43280521035194397},
  {'label': 'post1geo20', 'score': 0.44694456458091736},
  {'label': 'post1geo30', 'score': 0.4446774125099182},
  {'label': 'post1geo50', 'score': 0.49890971183776855},
  {'label': 'post1geo70', 'score': 0.5710616111755371},
  {'label': 'post2geo10', 'score': 0.43708768486976624},
  {'label': 'post2geo20', 'score': 0.45113325119018555},
  {'label': 'post2geo30', 'score': 0.4450390934944153},
  {'label': 'post2geo50', 'score': 0.5026735067367554},
  {'label': 'post2geo70', 'score': 0.5765184164047241},
  {'label': 'post3geo10', 'score': 0.4296078085899353},
  {'label': 'post3geo20', 'score': 0.44261234998703003},
  {'label': 'post3geo30', 'score': 0.433727890253067},
  {'label': 'post3geo50', 'score': 0.49758416414260864},
  {'label': 'post3geo70', 'score': 0.5741784572601318},
  {'label': 'post7geo10', 'score': 0.39940935373306274},
  {'label': 'post7geo20', 'score': 0.4107460081577301},
  {'label': 'post7geo30', 'score': 0.3904

# 2. Use the pipeline to make predictions with the violence dataset

### Load the tokenized test dataset from disk
This is the tokenized dataset that includes the columns *text*, *tweetid*, *geo_x*, *geo_y*, and *lang*

In [9]:
# Load the tokenized test dataset stored at the path configured in `args.dataset_name`.
ds_tok = load_from_disk(args.dataset_name)

In [10]:
# Select a reproducible subsample for testing purposes.
# A fixed seed makes the shuffle repeatable across kernel restarts, and the sample
# size is capped so that datasets with fewer than 10,000 rows still work.
test_tokenized_ds_sample = ds_tok.shuffle(seed=42).select(range(min(10_000, len(ds_tok))))

In [11]:
# Inspect the first row of the subsample to see the available columns.
test_tokenized_ds_sample[0]

{'tweetid': '448310607977414657',
 'geo_x': tensor(-43.0808),
 'geo_y': tensor(-22.8325),
 'lang': 'pt',
 'text': 'então rlx q eles vão te devolver kkkkkk',
 'labels': tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 1., 1., 1.]),
 'input_ids': tensor([     0,  27573,      6,  54123,    425,   8096,  15208,  48096,    120,
         137087,     56,      6,   3218,   3218,   3218,      2]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'pred_post1geo10': tensor(0.1864),
 'pred_post1geo20': tensor(0.2273),
 'pred_post1geo30': tensor(0.2490),
 'pred_post1geo50': tensor(0.2342),
 'pred_post1geo70': tensor(0.2057),
 'pred_post2geo10': tensor(0.2675),
 'pred_post2geo20': tensor(0.3118),
 'pred_post2geo30': tensor(0.3201),
 'pred_post2geo50': tensor(0.2853),
 'pred_post2geo70': tensor(0.2452),
 'pred_post3geo10': tensor(0.3207),
 'pred_pos

### a. Single sample (pipeline)

In [12]:
# Pick one row of the subsample and show its text and language.
sample = test_tokenized_ds_sample[10]
print(f"Text: {sample['text']} | Language: {sample['lang']}")

Text: Les deseo mucho éxito a todo el team de  a mi querido coach  y  a todos los que se unirán ma… | Language: es


In [13]:
# Score the selected sample's text and tabulate the label/score pairs.
outputs = violence_pipe(sample['text'])
pd.DataFrame(outputs[0])

Unnamed: 0,label,score
0,post1geo10,0.012595
1,post1geo20,0.014505
2,post1geo30,0.041616
3,post1geo50,0.097532
4,post1geo70,0.115222
5,post2geo10,0.057619
6,post2geo20,0.058985
7,post2geo30,0.108208
8,post2geo50,0.162037
9,post2geo70,0.165247


### b. Multiple sample pipeline (visualizing text, true labels, predictions, and other columns)
Make sure to select the appropriate batch size.

In [None]:
# For samples only
# Run the pipeline over the subsample and collect, per tweet, its metadata,
# true labels and the pipeline outputs.
preds = []

# Stream the subsample's texts through the pipeline in batches.
# NOTE(review): `truncation=True` is passed without `max_length`; `args.max_length`
# is defined in the config cell but not used here -- confirm the intended length.
sample_outputs = violence_pipe(KeyDataset(test_tokenized_ds_sample, "text"),
                               batch_size=args.batch_size,
                               truncation=True)

# Outputs are paired with rows by position, so this relies on the pipeline
# yielding results in dataset order.
for i, outputs in enumerate(tqdm(sample_outputs, total=len(test_tokenized_ds_sample))):
    # Fetch the row once: every `dataset[i]` lookup materialises the whole row,
    # so indexing separately per field repeated that work six times per tweet.
    row = test_tokenized_ds_sample[i]
    preds.append({
        'tweetid': row['tweetid'],
        'text': row['text'],
        'lang': row['lang'],
        'geo_x': float(row['geo_x']),  # cast tensor to float
        'geo_y': float(row['geo_y']),  # cast tensor to float
        'labels': row['labels'].tolist(),
        'outputs': outputs
    })

In [19]:
# Run the pipeline over the full tokenized test set and collect, per tweet, its
# metadata, true labels and the pipeline outputs.
preds = []

# Stream every text through the pipeline in batches.
# NOTE(review): `truncation=True` is passed without `max_length`; `args.max_length`
# is defined in the config cell but not used here -- confirm the intended length.
all_outputs = violence_pipe(KeyDataset(ds_tok, "text"), batch_size=args.batch_size,
                            truncation=True)

# Outputs are paired with rows by position, so this relies on the pipeline
# yielding results in dataset order.
for i, outputs in enumerate(tqdm(all_outputs, total=len(ds_tok))):
    # Fetch the row once: every `dataset[i]` lookup materialises the whole row,
    # so indexing separately per field repeated that work six times per tweet.
    row = ds_tok[i]
    preds.append({
        'tweetid': row['tweetid'],
        'text': row['text'],
        'lang': row['lang'],
        'geo_x': float(row['geo_x']),  # cast tensor to float
        'geo_y': float(row['geo_y']),  # cast tensor to float
        'labels': row['labels'].tolist(),
        'outputs': outputs
    })

  0%|          | 0/2329158 [00:00<?, ?it/s]

In [None]:
# Preview only the first few predictions; displaying the whole list would write
# every collected entry into the notebook output.
preds[:5]

In [20]:
# Flatten each prediction into one row: metadata, then one '<label>_true' column per
# label, then one score column per label.

# Label names follow the pattern <post|pre><1|2|3|7>geo<10|20|30|50|70>. The position
# of a name in this list is the index used for both the true-label vector and the
# pipeline outputs (post1geo10 -> 0, ..., pre7geo70 -> 39).
label_names = [
    f'{prefix}{n}geo{geo}'
    for prefix in ('post', 'pre')
    for n in (1, 2, 3, 7)
    for geo in (10, 20, 30, 50, 70)
]
metadata_columns = ['tweetid', 'text', 'lang', 'geo_x', 'geo_y']

processed_data = []
for pred in preds:
    # Tweet ID, text, language and geo coordinates
    row = {column: pred[column] for column in metadata_columns}

    # True labels for each column
    for idx, name in enumerate(label_names):
        row[f'{name}_true'] = pred['labels'][idx]

    # Predicted scores for each column (read by position, not by the output's 'label')
    for idx, name in enumerate(label_names):
        row[name] = pred['outputs'][idx]['score']

    processed_data.append(row)

# Convert to DataFrame
df = pd.DataFrame(processed_data)

In [21]:
# Reorder columns: metadata first, then every '<label>_true' column, then every
# prediction-score column, with both label groups in the same order.
# Label names follow the pattern <post|pre><1|2|3|7>geo<10|20|30|50|70>.
label_names = [
    f'{prefix}{n}geo{geo}'
    for prefix in ('post', 'pre')
    for n in (1, 2, 3, 7)
    for geo in (10, 20, 30, 50, 70)
]
df = df[
    ['tweetid', 'text', 'lang', 'geo_x', 'geo_y']
    + [f'{name}_true' for name in label_names]  # All 'true' label columns
    + label_names                               # All prediction score columns
]

In [22]:
# Preview rows 60-64 of the final dataframe.
df[60:65]

Unnamed: 0,tweetid,text,lang,geo_x,geo_y,post1geo10_true,post1geo20_true,post1geo30_true,post1geo50_true,post1geo70_true,...,pre3geo10,pre3geo20,pre3geo30,pre3geo50,pre3geo70,pre7geo10,pre7geo20,pre7geo30,pre7geo50,pre7geo70
60,487300699391524864,#Baloncesto | El seguro ha retrasado la incorp...,es,-66.879189,10.48801,0.0,0.0,0.0,0.0,0.0,...,0.17276,0.185882,0.289839,0.332935,0.31315,0.336011,0.334951,0.404795,0.449539,0.423122
61,447660809615716352,？あなたの質問はモンゴル語ができるかどうかは、関係ないと思いますが。,ja,34.333328,31.41667,0.0,0.0,0.0,0.0,0.0,...,0.057529,0.100382,0.177725,0.142036,0.21837,0.160163,0.205631,0.308839,0.223797,0.298765
62,490608380818776065,الحمدلله,ar,35.203289,31.92157,0.0,0.0,0.0,0.0,0.0,...,0.582823,0.623713,0.654785,0.623128,0.630909,0.521218,0.575416,0.612139,0.559383,0.561637
63,481456072243552256,Ben hocaları ders konuşan resmi insanlar sanır...,tr,36.567219,36.269169,0.0,0.0,0.0,1.0,1.0,...,0.001255,0.003811,0.049802,0.654684,0.90557,0.00372,0.006831,0.051607,0.513235,0.888442
64,476767010660294657,#11J,und,-66.879189,10.48801,0.0,0.0,0.0,0.0,0.0,...,0.016269,0.017479,0.018452,0.014311,0.021876,0.92088,0.936481,0.941245,0.933251,0.938693


In [23]:
# Save the dataframe as CSV at the path configured in `args.fout`, without the index column.
df.to_csv(args.fout, index=False)