In [1]:
import torch
import transformers
import pandas as pd
import warnings

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_from_disk
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from argparse import Namespace

2024-10-30 21:44:40.764237: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# Settings for this inference run, kept in one place so they are easy to tune.
config = dict(
    cuda_device=15,
    path_to_model_on_disk="/data3/mmendieta/models/xlmt_finetuned_twitter/",
    model_ckpt="m2im/XLM-T_finetuned_violence_twitter",
    max_length=32,
    dataset_name="/data3/mmendieta/Violence_data/geo_corpus.0.0.1_tok_ds_xlmt_with_text",
    batch_size=64,
)

# Attribute-style view of the same settings (e.g. args.batch_size).
args = Namespace(**config)

In [3]:
# Build the Hugging Face pipeline from the checkpoint named in the config and
# place it on the configured CUDA device.
# return_all_scores=True asks for a score for every label, not just the top one.
violence_pipe = pipeline(
    model=args.model_ckpt,
    device=args.cuda_device,
    return_all_scores=True,
)

Downloading:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

2024-10-30 21:44:49.453151: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2024-10-30 21:44:49.454968: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2024-10-30 21:44:49.630542: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:1e:00.0 name: Tesla V100-SXM3-32GB computeCapability: 7.0
coreClock: 1.597GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 913.62GiB/s
2024-10-30 21:44:49.631624: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 1 with properties: 
pciBusID: 0000:23:00.0 name: Tesla V100-SXM3-32GB computeCapability: 7.0
coreClock: 1.597GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 913.62GiB/s
2024-10-30 21:44:49.632678: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 2 with properties: 
pciBusID: 0000:28:00.0 name: Te

In [4]:
# Load the dataset saved on disk at args.dataset_name.
# Recall that the test split has 2,329,158 observations, so inference is done in batches to exploit GPU resources.
ds_tok = load_from_disk(args.dataset_name)

In [5]:
# Run batched inference over the "text" column of the test split.
# The pipeline is consumed as an iterator, so tqdm can report progress while
# list() collects one result per example, in dataset order. Each result is a
# list of {'label': ..., 'score': ...} dicts, one per label.
# NOTE(review): args.max_length is not forwarded here; truncation=True alone
# presumably truncates to the model's own maximum length -- confirm this is intended.
preds = list(
    tqdm(
        violence_pipe(
            KeyDataset(ds_tok['test'], "text"),
            batch_size=args.batch_size,
            truncation=True,
        ),
        total=len(ds_tok['test']),
    )
)

  0%|          | 0/2329158 [00:00<?, ?it/s]

In [6]:
# Flatten each prediction (a list of {'label', 'score'} entries) into a single
# {label: score} mapping, one mapping per example.
processed_data = [
    {entry['label']: entry['score'] for entry in example_scores}
    for example_scores in preds
]

In [7]:
# Convert the per-example {label: score} dicts into a DataFrame:
# one row per example, one column per label.
df = pd.DataFrame(processed_data)
df

Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
0,0.226729,0.327225,0.441367,0.219655,0.319141,0.428590
1,0.293337,0.432246,0.528523,0.447481,0.553608,0.688987
2,0.387055,0.544625,0.583162,0.392447,0.559619,0.588813
3,0.035168,0.038399,0.056211,0.041120,0.046594,0.070071
4,0.523534,0.921772,0.950220,0.902491,0.977367,0.988067
...,...,...,...,...,...,...
2329153,0.233221,0.262610,0.517162,0.212350,0.244978,0.467469
2329154,0.344274,0.470150,0.588610,0.358045,0.453263,0.578047
2329155,0.410559,0.470784,0.603613,0.320064,0.375958,0.494749
2329156,0.045838,0.062574,0.111670,0.047550,0.065505,0.121237


In [8]:
# Average each label's score over all rows, then present the result as a
# single row labelled 'mean' with one column per label.
mean_preds = (
    df.mean(axis=0)
    .to_frame(name='mean')
    .T
)
mean_preds

Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
mean,0.345729,0.457622,0.576199,0.34781,0.445743,0.563775


In [9]:
# Binarize the mean scores: 1 where the mean is at least 0.5, otherwise 0.
mean_preds_th = mean_preds.ge(0.5).astype(int)
mean_preds_th

Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
mean,0,0,1,0,0,1
