In [1]:
import torch
import transformers
import pandas as pd
import warnings

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_from_disk
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from argparse import Namespace

2024-11-27 14:31:46.426785: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# Options for models in the hub
# m2im/XLMT-T_finetuned_violence_twitter
# m2im/smallLabse_finetuned_twitter
# m2im/labse_finetuned_twitter

# Options for path_to_model_on_disk
# /home/mmendieta/labse_finetuned_twitter/
# /data3/mmendieta/models/xlmt_finetuned_twitter
# /data3/mmendieta/models/smallLabse_finetuned_twitter

config = {
    "cuda_device": 14,
    "path_to_model_on_disk": "/home/mmendieta/xlmt_finetuned_twitter/", 
    "model_ckpt": "m2im/XLM-T_finetuned_violence_twitter",
    "max_length": 32,
    "dataset_name": "/data3/mmendieta/Violence_data/geo_corpus.0.0.1_tok_ds_xlmt"
}

args = Namespace(**config)

# 1. Instantiate the required pipeline
Choose of the three pipelines of interest. The XLM-T pipeline is the best performing model.

### LaBSE pipeline

In [None]:
violence_pipe = pipeline("text-classification",
                         model="m2im/labse_finetuned_twitter", 
                         device=args.cuda_device,
                         return_all_scores=True)

### small-LaBSE pipeline

In [None]:
violence_pipe = pipeline(model="m2im/smallLabse_finetuned_twitter", 
                         device=args.cuda_device,
                         return_all_scores=True)

### XLM-T pipeline

In [4]:
violence_pipe = pipeline(model="m2im/XLM-T_finetuned_violence_twitter", 
                         device=args.cuda_device,
                         return_all_scores=True)

Downloading:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

In [5]:
# test the pipeline. Use a short text just like you would do in a twitter. 
# Keep in mind that we are not doing any text preprocessing for this test case.
text = "today is a sunny day"
outputs = violence_pipe(text)

In [6]:
pd.DataFrame(outputs[0])

Unnamed: 0,label,score
0,post7geo10,0.225355
1,post7geo30,0.298239
2,post7geo50,0.528795
3,pre7geo10,0.198816
4,pre7geo30,0.270994
5,pre7geo50,0.481664


In [7]:
outputs

[[{'label': 'post7geo10', 'score': 0.22535526752471924},
  {'label': 'post7geo30', 'score': 0.2982390224933624},
  {'label': 'post7geo50', 'score': 0.5287949442863464},
  {'label': 'pre7geo10', 'score': 0.19881635904312134},
  {'label': 'pre7geo30', 'score': 0.2709938883781433},
  {'label': 'pre7geo50', 'score': 0.48166388273239136}]]

# 2. Use the pipeline to make predictions with the violence dataset

### Load the tokenized dataset to disk
This is the tokenized dataset that includes the column *text*

In [8]:
ds_tok = load_from_disk("/data3/mmendieta/Violence_data/geo_corpus.0.0.1_tok_ds_xlmt_with_text")

In [9]:
# Select a subsmample for testing purposes
test_tokenized_ds_sample = ds_tok["test"].shuffle().select(range(100))

### a. Single sample (pipeline)

In [10]:
sample = test_tokenized_ds_sample[10]['text']
sample

'Por mis últimas horas d este MARAVILLOSO 2013 en mi México Hermoso y Amado GRACIAS por TANTO mi 2da Patria #GRATITUD'

In [11]:
outputs = violence_pipe(sample)
pd.DataFrame(outputs[0])

Token indices sequence length is longer than the specified maximum sequence length for this model (36 > 32). Running this sequence through the model will result in indexing errors


Unnamed: 0,label,score
0,post7geo10,0.813419
1,post7geo30,0.906668
2,post7geo50,0.961734
3,pre7geo10,0.003347
4,pre7geo30,0.019694
5,pre7geo50,0.043607


### b. Multiple samples pipeline - predictions only
Make sure to select the apropriate batch size

In [12]:
# Hide the large number of deprecation warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
preds = []

for i, outputs in enumerate(tqdm(violence_pipe(KeyDataset(test_tokenized_ds_sample, "text"), batch_size=8,
                                              truncation=True),
                                 total=len(test_tokenized_ds_sample))):
    preds.append(outputs)
                        

  0%|          | 0/100 [00:00<?, ?it/s]

In [13]:
processed_data = []
for pred in preds:
    scores = {item['label']: item['score'] for item in pred}
    processed_data.append(scores)

In [14]:
# Convert to dataframe
df = pd.DataFrame(processed_data)
df

Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
0,0.342312,0.463959,0.591682,0.351697,0.440490,0.572073
1,0.120784,0.170307,0.380582,0.139021,0.176037,0.480341
2,0.026485,0.042070,0.585672,0.019112,0.032616,0.507280
3,0.156097,0.214044,0.257531,0.655035,0.743272,0.807771
4,0.685858,0.793750,0.840339,0.183909,0.216985,0.236476
...,...,...,...,...,...,...
95,0.339237,0.459091,0.580015,0.348046,0.446097,0.568889
96,0.266162,0.358625,0.462050,0.461983,0.544352,0.680309
97,0.446046,0.598032,0.663540,0.449042,0.605008,0.669135
98,0.200728,0.313689,0.601028,0.181046,0.298158,0.594994


In [15]:
mean_preds = df.mean().to_frame(name='mean').T
mean_preds 

Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
mean,0.32457,0.447574,0.567676,0.338834,0.436642,0.569874


In [16]:
mean_preds_th = (mean_preds >= 0.5).astype(int)
mean_preds_th

Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
mean,0,0,1,0,0,1


### c. Multiple sample pipeline (visualizing text, true labels, and predictions)
Make sure to select the apropriate batch size.

In [17]:
# Hide the large number of deprecation warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
preds = []

for i, outputs in enumerate(tqdm(violence_pipe(KeyDataset(test_tokenized_ds_sample, "text"), batch_size=10,
                                              truncation=True),
                                 total=len(test_tokenized_ds_sample))):
    text = test_tokenized_ds_sample[i]['text']
    labels = test_tokenized_ds_sample[0]['labels'].tolist()
    preds.append({
        'text': text,
        'labels': labels,
        'outputs': outputs
    })

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
preds

In [19]:
processed_data = []
for pred in preds:
    row = {
        'text': pred['text'],
        'true_labels': pred['labels'],
    }
    # Add each of the six pedicted scores as separate columns
    row['post7geo10'] = pred['outputs'][0]['score']
    row['post7geo30'] = pred['outputs'][1]['score']
    row['post7geo50'] = pred['outputs'][2]['score']
    row['pre7geo10'] = pred['outputs'][3]['score']
    row['pre7geo30'] = pred['outputs'][4]['score']
    row['pre7geo50'] = pred['outputs'][5]['score']
    
    processed_data.append(row)
    
# Convert to DataFrame
df = pd.DataFrame(processed_data)
    

In [None]:
df[60:70]

In [20]:
mean_preds = df.mean().to_frame(name='mean').T
mean_preds 

  mean_preds = df.mean().to_frame(name='mean').T


Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
mean,0.32457,0.447574,0.567676,0.338834,0.436642,0.569874


In [21]:
mean_preds_th = (mean_preds >= 0.5).astype(int)
mean_preds_th

Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
mean,0,0,1,0,0,1


### d. Multiple values - select specific cases

In [22]:
# Define the label mapping
id2label: {
    "0": "post7geo10",
    "1": "post7geo30",
    "2": "post7geo50",
    "3": "pre7geo10",
    "4": "pre7geo30",
    "5": "pre7geo50"
}

# Define the filtering function to match the required conditions
def filter_condition(example):
    return (example['labels'][0] == 1 and # post7geo10
            example['labels'][1] == 1 and # post7geo30
            example['labels'][2] == 1 and # post7geo50
            example['labels'][3] == 0 and # pre7geo10
            example['labels'][4] == 0 and # pre7geo30
            example['labels'][5] == 0 # pre7geo50
    )

In [23]:
# ds_tok was instantiated previously
# This filter operation on the dataset takes approximately 3' 21s to complete
# A more efficient approach would be using map function
filtered_dataset = ds_tok["test"].filter(filter_condition)

Loading cached processed dataset at /data3/mmendieta/Violence_data/geo_corpus.0.0.1_tok_ds_xlmt_with_text/test/cache-0bf953b0cd156b77.arrow


In [None]:
filtered_dataset

In [24]:
# Select subsample or the whole dataset
filtered_dataset_sample = filtered_dataset.shuffle().select(range(5000))

In [None]:
filtered_dataset_sample['labels'][:10]

In [25]:
# Hide the large number of deprecation warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
preds = []

# change the dataset accordigly. Either use the sample dataset or the whole dataset
# change batch_size as appropriate
for i, outputs in enumerate(tqdm(violence_pipe(KeyDataset(filtered_dataset_sample, "text"), batch_size=1,
                                              truncation=True),
                                 total=len(filtered_dataset_sample))):
    preds.append(outputs)

  0%|          | 0/5000 [00:00<?, ?it/s]

In [None]:
preds

In [26]:
processed_data = []
for pred in preds:
    scores = {item['label']: item['score'] for item in pred}
    processed_data.append(scores)
    
# Convert to DataFrame
df = pd.DataFrame(processed_data)

In [None]:
df[30:40]

In [27]:
mean_preds = df.mean().to_frame(name='mean').T
mean_preds 

Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
mean,0.456322,0.560909,0.642424,0.331207,0.415657,0.493026


In [28]:
mean_preds_th = (mean_preds >= 0.5).astype(int)
mean_preds_th

Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
mean,0,1,1,0,0,0


# X. Tokenize the whole dataset
Run this code if you want to tokenize the whole dataset again. This step is necessary because the column 'text' was removed during training, and we want this column for making predictions. The code below was already tokenized with the *text* column for the XLM-T model.

In [None]:
# Load the dataset (untokenized) from disk
ds = load_from_disk("/data3/mmendieta/Violence_data/geo_corpus.0.0.1_dataset_for_train")

### Tokenize the whole dataset

In [None]:
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True)

In [None]:
# Instantiate the tokenizer 
model_ckpt = args.model_ckpt
tokenizer = AutoTokenizer.from_pretrained(model_ckpt,
                                         model_max_length=args.max_length
                                         )

In [None]:
# This code takes 14min and 33 s to run
%time tokenized_ds = ds.map(tokenize, batched=True)

In [None]:
tokenized_ds.set_format('torch')

In [None]:
tokenized_ds

### Save tokenized dataset to disk

In [None]:
tokenized_ds.save_to_disk("/data3/mmendieta/Violence_data/geo_corpus.0.0.1_tok_ds_xlmt_with_text")