In [1]:
import warnings
from argparse import Namespace
from pathlib import Path

import evaluate
import pandas as pd
import torch
import transformers
from datasets import load_from_disk
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset

2024-12-16 21:17:59.553496: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# Options for model_ckpt (models in the hub)
# m2im/XLM-T_finetuned_violence_twitter
# m2im/smallLabse_finetuned_twitter
# m2im/labse_finetuned_twitter

# Options for path_to_model_on_disk
# /home/mmendieta/labse_finetuned_twitter/
# /data3/mmendieta/models/xlmt_finetuned_twitter
# /data3/mmendieta/models/smallLabse_finetuned_twitter

# Options for dataset_name (tokenized datasets on disk)
# /data3/mmendieta/Violence_data/geo_corpus.0.0.1_tok_ds_small_labse_inference
# /data3/mmendieta/Violence_data/geo_corpus.0.0.1_tok_ds_xlmt_inference
# /data3/mmendieta/Violence_data/geo_corpus.0.0.1_tok_ds_labse_inference

# All model-related entries must refer to the SAME model: the tokenized dataset
# and the output file are model specific. They are aligned here on small-LaBSE,
# the model whose dataset and output file were already selected.
config = {
    "cuda_device": 14,  # GPU index handed to the pipeline
    "path_to_model_on_disk": "/data3/mmendieta/models/smallLabse_finetuned_twitter",
    "model_ckpt": "m2im/smallLabse_finetuned_twitter",
    "max_length": 32,
    "batch_size": 512,
    "dataset_name": "/data3/mmendieta/Violence_data/geo_corpus.0.0.1_tok_ds_small_labse_inference",
    "fout": "/data3/mmendieta/Violence_data/csv_files_global_scale/small_labse_inference_test_set.csv"
}

# Expose the settings as attributes (args.batch_size, args.fout, ...)
args = Namespace(**config)

# 1. Instantiate the required pipeline
Choose one of the three pipelines of interest. The XLM-T pipeline is the best performing model.

### LaBSE pipeline

In [None]:
# Build the text-classification pipeline around the fine-tuned LaBSE checkpoint
# and place it on the GPU index chosen in the config cell.
# NOTE(review): `return_all_scores=True` is reported as deprecated in recent
# transformers releases; its suggested replacement (`top_k=None`) may order the
# scores differently, and later cells read the scores by position -- confirm
# before migrating.
violence_pipe = pipeline("text-classification",
                         model="m2im/labse_finetuned_twitter", 
                         device=args.cuda_device,
                         return_all_scores=True)

### small-LaBSE pipeline

In [3]:
# Build the classifier around the fine-tuned small-LaBSE checkpoint and place
# it on the GPU index chosen in the config cell.
# The task is named explicitly (as in the LaBSE cell) so that the pipeline type
# does not have to be inferred from the model's hub metadata.
# NOTE(review): `return_all_scores=True` is kept on purpose because later cells
# read the scores by position; its replacement (`top_k=None`) may order the
# scores differently -- confirm before migrating.
violence_pipe = pipeline("text-classification",
                         model="m2im/smallLabse_finetuned_twitter",
                         device=args.cuda_device,
                         return_all_scores=True)

Downloading:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

2024-12-16 21:18:24.886029: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2024-12-16 21:18:24.886968: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2024-12-16 21:18:26.056559: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:1e:00.0 name: Tesla V100-SXM3-32GB computeCapability: 7.0
coreClock: 1.597GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 913.62GiB/s
2024-12-16 21:18:26.057706: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 1 with properties: 
pciBusID: 0000:23:00.0 name: Tesla V100-SXM3-32GB computeCapability: 7.0
coreClock: 1.597GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 913.62GiB/s
2024-12-16 21:18:26.058773: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 2 with properties: 
pciBusID: 0000:28:00.0 name: Te

### XLM-T pipeline

In [None]:
# Build the classifier around the fine-tuned XLM-T checkpoint and place it on
# the GPU index chosen in the config cell.
# The task is named explicitly (as in the LaBSE cell) so that the pipeline type
# does not have to be inferred from the model's hub metadata.
# NOTE(review): `return_all_scores=True` is kept on purpose because later cells
# read the scores by position; its replacement (`top_k=None`) may order the
# scores differently -- confirm before migrating.
violence_pipe = pipeline("text-classification",
                         model="m2im/XLM-T_finetuned_violence_twitter",
                         device=args.cuda_device,
                         return_all_scores=True)

In [None]:
# Smoke-test the pipeline on one short, tweet-like sentence.
# Keep in mind that no text preprocessing is applied in this test case.
text = "today is a sunny day"
outputs = violence_pipe(text)

In [None]:
# Tabulate the first element of the pipeline output for easier reading.
pd.DataFrame(outputs[0])

In [None]:
# Show the raw value returned by the pipeline call.
outputs

# 2. Use the pipeline to make predictions with the violence dataset

### Load the tokenized test dataset from disk
This is the tokenized dataset that includes the columns *text*, *tweetid*, *geo_x*, *geo_y*, and *lang*

In [4]:
# Load the dataset stored at the path configured in `args.dataset_name`.
ds_tok = load_from_disk(args.dataset_name)

In [None]:
# Select a subsample for testing purposes.
# A fixed seed makes the subsample identical on every run, and the size is
# capped at the dataset length so `select` cannot index past the end.
test_tokenized_ds_sample = ds_tok.shuffle(seed=42).select(range(min(10000, len(ds_tok))))

In [None]:
# Inspect the first record of the subsample.
test_tokenized_ds_sample[0]

### a. Single sample (pipeline)

In [None]:
# Take the record at position 10 of the subsample and print its text and language.
sample = test_tokenized_ds_sample[10]
print(f"Text: {sample['text']} | Language: {sample['lang']}")

In [None]:
# Classify the text of the selected record and tabulate the first element of the result.
outputs = violence_pipe(sample['text'])
pd.DataFrame(outputs[0])

### b. Multiple sample pipeline (visualizing text, true labels, predictions, and other columns)
Make sure to select the appropriate batch size.

In [None]:
# For samples only.
# Run the pipeline over the "text" column of the subsample in batches and pair
# every result with the record found at the same position in the subsample.
preds = []

for i, outputs in enumerate(tqdm(violence_pipe(KeyDataset(test_tokenized_ds_sample, "text"),
                                               batch_size=args.batch_size,
                                               truncation=True),
                                 total=len(test_tokenized_ds_sample))):
    # Fetch the record once instead of re-indexing the dataset for every field.
    record = test_tokenized_ds_sample[i]
    preds.append({
        'tweetid': record['tweetid'],
        'text': record['text'],
        'lang': record['lang'],
        'geo_x': float(record['geo_x']),  # cast tensor to float
        'geo_y': float(record['geo_y']),  # cast tensor to float
        'labels': record['labels'].tolist(),
        'outputs': outputs
    })

In [5]:
# Run the pipeline over the "text" column of the full dataset in batches and
# pair every result with the record found at the same position in the dataset.
preds = []

for i, outputs in enumerate(tqdm(violence_pipe(KeyDataset(ds_tok, "text"), batch_size=args.batch_size,
                                              truncation=True),
                                 total=len(ds_tok))):
    # Fetch the record once instead of re-indexing the dataset for every field.
    record = ds_tok[i]
    preds.append({
        'tweetid': record['tweetid'],
        'text': record['text'],
        'lang': record['lang'],
        'geo_x': float(record['geo_x']),  # cast tensor to float
        'geo_y': float(record['geo_y']),  # cast tensor to float
        'labels': record['labels'].tolist(),
        'outputs': outputs
    })

  0%|          | 0/2329158 [00:00<?, ?it/s]

In [None]:
# Peek at the first few predictions only; displaying the whole list would
# render one entry per input row.
preds[:5]

In [6]:
# Column names, listed in the positional order in which the label vector and
# the pipeline output are read.
meta_fields = ['tweetid', 'text', 'lang', 'geo_x', 'geo_y']
true_columns = ['post7geo10_true', 'post7geo30_true', 'post7geo50_true',
                'pre7geo10_true', 'pre7geo30_true', 'pre7geo50_true']
score_columns = ['post7geo10', 'post7geo30', 'post7geo50',
                 'pre7geo10', 'pre7geo30', 'pre7geo50']

processed_data = []
for entry in preds:
    # Tweet ID, text, language and coordinates are copied unchanged
    record = {field: entry[field] for field in meta_fields}

    # True labels: position k of the label vector feeds the k-th "_true" column
    for pos, column in enumerate(true_columns):
        record[column] = entry['labels'][pos]

    # Predicted scores: position k of the pipeline output feeds the k-th score column
    for pos, column in enumerate(score_columns):
        record[column] = entry['outputs'][pos]['score']

    processed_data.append(record)

# Convert to DataFrame
df = pd.DataFrame(processed_data)

In [7]:
# Arrange the columns as: metadata, true labels, predicted scores.
column_order = [
    'tweetid', 'text', 'lang', 'geo_x', 'geo_y',
    'post7geo10_true', 'post7geo30_true', 'post7geo50_true',
    'pre7geo10_true', 'pre7geo30_true', 'pre7geo50_true',
    'post7geo10', 'post7geo30', 'post7geo50',
    'pre7geo10', 'pre7geo30', 'pre7geo50',
]
df = df.loc[:, column_order]

In [8]:
# Show the rows at positions 60 through 64 as a quick look at the result.
df.iloc[60:65]

Unnamed: 0,tweetid,text,lang,geo_x,geo_y,post7geo10_true,post7geo30_true,post7geo50_true,pre7geo10_true,pre7geo30_true,pre7geo50_true,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
60,487300699391524864,#Baloncesto | El seguro ha retrasado la incorp...,es,-66.879189,10.48801,1.0,1.0,1.0,0.0,0.0,0.0,0.301021,0.447158,0.561377,0.335758,0.460488,0.589329
61,447660809615716352,？あなたの質問はモンゴル語ができるかどうかは、関係ないと思いますが。,ja,34.333328,31.41667,0.0,0.0,0.0,0.0,0.0,0.0,0.280725,0.532827,0.531862,0.300278,0.515056,0.511221
62,490608380818776065,الحمدلله,ar,35.203289,31.92157,0.0,0.0,0.0,1.0,1.0,1.0,0.374464,0.555044,0.633924,0.384644,0.565593,0.642552
63,481456072243552256,Ben hocaları ders konuşan resmi insanlar sanır...,tr,36.567219,36.269169,0.0,1.0,1.0,0.0,1.0,1.0,0.00412,0.083924,0.563934,0.003861,0.07994,0.549076
64,476767010660294657,#11J,und,-66.879189,10.48801,0.0,0.0,0.0,1.0,1.0,1.0,0.024238,0.02719,0.027362,0.956298,0.979338,0.986539


In [9]:
# Save the dataframe to disk.
# Create the destination folder first so that a missing directory cannot make
# the write fail after the predictions have already been computed.
Path(args.fout).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(args.fout, index=False)