In [1]:
import torch
import transformers
import pandas as pd
import warnings

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_from_disk
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from argparse import Namespace

2024-10-22 18:26:28.187666: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# Options for models in the hub
# m2im/XLMT-T_finetuned_violence_twitter
# m2im/smallLabse_finetuned_twitter
# m2im/labse_finetuned_twitter

# Options for path_to_model_on_disk
# /home/mmendieta/labse_finetuned_twitter/
# /data3/mmendieta/models/xlmt_finetuned_twitter
# /data3/mmendieta/models/smallLabse_finetuned_twitter

config = {
    "cuda_device": 14,
    "path_to_model_on_disk": "/home/mmendieta/xlmt_finetuned_twitter/", 
    "model_ckpt": "m2im/XLM-T_finetuned_violence_twitter",
    "max_length": 32,
    "dataset_name": "/data3/mmendieta/Violence_data/geo_corpus.0.0.1_tok_ds_xlmt"
}

args = Namespace(**config)

# 1. Instantiate the required pipeline
Choose of the three pipelines of interest. The XLM-T pipeline is the best performing model.

### LaBSE pipeline

In [None]:
violence_pipe = pipeline("text-classification",
                         model="m2im/labse_finetuned_twitter", 
                         device=args.cuda_device,
                         return_all_scores=True)

### small-LaBSE pipeline

In [None]:
violence_pipe = pipeline(model="m2im/smallLabse_finetuned_twitter", 
                         device=args.cuda_device,
                         return_all_scores=True)

### XLM-T pipeline

In [4]:
violence_pipe = pipeline(model="m2im/XLM-T_finetuned_violence_twitter", 
                         device=args.cuda_device,
                         return_all_scores=True)

Downloading:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

In [5]:
# test the pipeline. Use a short text just like you would do in a twitter. 
# Keep in mind that we are not doing any text preprocessing for this test case.
text = "today is a sunny day"
outputs = violence_pipe(text)

In [6]:
pd.DataFrame(outputs[0])

Unnamed: 0,label,score
0,post7geo10,0.225355
1,post7geo30,0.298239
2,post7geo50,0.528795
3,pre7geo10,0.198816
4,pre7geo30,0.270994
5,pre7geo50,0.481664


In [7]:
outputs

[[{'label': 'post7geo10', 'score': 0.22535526752471924},
  {'label': 'post7geo30', 'score': 0.2982390224933624},
  {'label': 'post7geo50', 'score': 0.5287949442863464},
  {'label': 'pre7geo10', 'score': 0.19881635904312134},
  {'label': 'pre7geo30', 'score': 0.2709938883781433},
  {'label': 'pre7geo50', 'score': 0.48166388273239136}]]

# 2. Use the pipeline to make predictions with the violence dataset

### Load the tokenized dataset to disk
This is the tokenized dataset that includes the column *text*

In [8]:
ds_tok = load_from_disk("/data3/mmendieta/Violence_data/geo_corpus.0.0.1_tok_ds_xlmt_with_text")

In [9]:
# Select a subsmample for testing purposes
test_tokenized_ds_sample = ds_tok["test"].shuffle().select(range(100))

### a. Single sample (pipeline)

In [10]:
sample = test_tokenized_ds_sample[10]['text']
sample

'No Smh ok ukilala lala salama ...'

In [11]:
outputs = violence_pipe(sample)
pd.DataFrame(outputs[0])

Unnamed: 0,label,score
0,post7geo10,0.470881
1,post7geo30,0.505947
2,post7geo50,0.507277
3,pre7geo10,0.517721
4,pre7geo30,0.545605
5,pre7geo50,0.546617


### b. Multiple samples pipeline - predictions only
Make sure to select the apropriate batch size

In [13]:
# Hide the large number of deprecation warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
preds = []

for i, outputs in enumerate(tqdm(violence_pipe(KeyDataset(test_tokenized_ds_sample, "text"), batch_size=8,
                                              truncation=True),
                                 total=len(test_tokenized_ds_sample))):
    preds.append(outputs)
                        

  0%|          | 0/100 [00:00<?, ?it/s]

In [14]:
processed_data = []
for pred in preds:
    scores = {item['label']: item['score'] for item in pred}
    processed_data.append(scores)

In [15]:
# Convert to dataframe
df = pd.DataFrame(processed_data)
df

Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
0,0.427101,0.539089,0.638307,0.355822,0.426994,0.511917
1,0.356495,0.457299,0.469535,0.437356,0.554138,0.559007
2,0.349793,0.467913,0.588818,0.380808,0.472066,0.602454
3,0.380328,0.475288,0.569262,0.406602,0.513477,0.613990
4,0.375031,0.467088,0.603675,0.352403,0.441746,0.580952
...,...,...,...,...,...,...
95,0.103036,0.142469,0.406985,0.147355,0.220874,0.871733
96,0.006811,0.075864,0.460971,0.005686,0.066353,0.425883
97,0.408591,0.531296,0.617614,0.407955,0.522020,0.608210
98,0.108417,0.186370,0.357110,0.138447,0.223436,0.428629


In [16]:
mean_preds = df.mean().to_frame(name='mean').T
mean_preds 

Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
mean,0.371763,0.497251,0.609071,0.354458,0.453502,0.550492


In [17]:
mean_preds_th = (mean_preds >= 0.5).astype(int)
mean_preds_th

Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
mean,0,0,1,0,0,1


### c. Multiple sample pipeline (visualizing text, true labels, and predictions)
Make sure to select the apropriate batch size.

In [18]:
# Hide the large number of deprecation warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
preds = []

for i, outputs in enumerate(tqdm(violence_pipe(KeyDataset(test_tokenized_ds_sample, "text"), batch_size=10,
                                              truncation=True),
                                 total=len(test_tokenized_ds_sample))):
    text = test_tokenized_ds_sample[i]['text']
    labels = test_tokenized_ds_sample[0]['labels'].tolist()
    preds.append({
        'text': text,
        'labels': labels,
        'outputs': outputs
    })

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
preds

In [19]:
processed_data = []
for pred in preds:
    row = {
        'text': pred['text'],
        'true_labels': pred['labels'],
    }
    # Add each of the six pedicted scores as separate columns
    row['post7geo10'] = pred['outputs'][0]['score']
    row['post7geo30'] = pred['outputs'][1]['score']
    row['post7geo50'] = pred['outputs'][2]['score']
    row['pre7geo10'] = pred['outputs'][3]['score']
    row['pre7geo30'] = pred['outputs'][4]['score']
    row['pre7geo50'] = pred['outputs'][5]['score']
    
    processed_data.append(row)
    
# Convert to DataFrame
df = pd.DataFrame(processed_data)
    

In [20]:
df[60:70]

Unnamed: 0,text,true_labels,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
60,como lo enfrentas??,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.34995,0.455875,0.551612,0.370866,0.458722,0.555347
61,É claro que a culpa é sua. Foi o seu abraço qu...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.805361,0.854945,0.881902,0.146588,0.148897,0.150301
62,Sabe o que eu odeio? Pessoas que fingem gostar...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.031335,0.038398,0.043345,0.873187,0.950432,0.96832
63,3n jad ya bbe ?,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.327539,0.578267,0.592841,0.363413,0.635036,0.645931
64,"Find someone who saw all your imperfections , ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.577492,0.674514,0.716702,0.573271,0.669907,0.711424
65,Tweet de tarde; #campaña por el #follow de #p...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.576159,0.690722,0.744706,0.408739,0.48644,0.528998
66,"Para los simpatizantes del ManU, nos dolió la ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.230188,0.353578,0.466396,0.324089,0.505584,0.62823
67,أحمد ماهر من محبسه: لم أكن أتخيل ما يحدث لي 25...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.337853,0.911037,0.917587,0.030052,0.571552,0.587524
68,Estos últimos funcionarios del cuadrante 5 de ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.889228,0.937577,0.969243,0.013155,0.03326,0.043526
69,via cola autopista sentido oeste altura los ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.389441,0.502545,0.628821,0.364735,0.440221,0.562975


In [21]:
mean_preds = df.mean().to_frame(name='mean').T
mean_preds 

  mean_preds = df.mean().to_frame(name='mean').T


Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
mean,0.371763,0.497251,0.609071,0.354458,0.453502,0.550492


In [22]:
mean_preds_th = (mean_preds >= 0.5).astype(int)
mean_preds_th

Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
mean,0,0,1,0,0,1


### d. Multiple values - select specific cases

In [109]:
# Define the label mapping
id2label: {
    "0": "post7geo10",
    "1": "post7geo30",
    "2": "post7geo50",
    "3": "pre7geo10",
    "4": "pre7geo30",
    "5": "pre7geo50"
}

# Define the filtering function to match the required conditions
def filter_condition(example):
    return (example['labels'][0] == 1 and # post7geo10
            example['labels'][1] == 1 and # post7geo30
            example['labels'][2] == 1 and # post7geo50
            example['labels'][3] == 0 and # pre7geo10
            example['labels'][4] == 0 and # pre7geo30
            example['labels'][5] == 0 # pre7geo50
    )

In [110]:
# ds_tok was instantiated previously
# This filter operation on the dataset takes approximately 3' 21s to complete
# A more efficient approach would be using map function
filtered_dataset = ds_tok["test"].filter(filter_condition)

  0%|          | 0/2330 [00:00<?, ?ba/s]

In [111]:
filtered_dataset

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 437139
})

In [139]:
# Select subsample or the whole dataset
filtered_dataset_sample = filtered_dataset.shuffle().select(range(500))

In [None]:
filtered_dataset_sample['labels'][:10]

In [140]:
# Hide the large number of deprecation warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
preds = []

# change the dataset accordigly. Either use the sample dataset or the whole dataset
# change batch_size as appropriate
for i, outputs in enumerate(tqdm(violence_pipe(KeyDataset(filtered_dataset_sample, "text"), batch_size=64,
                                              truncation=True),
                                 total=len(filtered_dataset_sample))):
    preds.append(outputs)



  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
preds

In [141]:
processed_data = []
for pred in preds:
    scores = {item['label']: item['score'] for item in pred}
    processed_data.append(scores)
    
# Convert to DataFrame
df = pd.DataFrame(processed_data)

In [None]:
df[30:40]

In [142]:
mean_preds = df.mean().to_frame(name='mean').T
mean_preds 

Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
mean,0.467055,0.570313,0.651688,0.337344,0.42227,0.501435


In [143]:
mean_preds_th = (mean_preds >= 0.5).astype(int)
mean_preds_th

Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
mean,0,1,1,0,0,1


# X. Tokenize the whole dataset
Run this code if you want to tokenize the whole dataset again. This step is necessary because the column 'text' was removed during training, and we want this column for making predictions. The code below was already tokenized with the *text* column for the XLM-T model.

In [None]:
# Load the dataset (untokenized) from disk
ds = load_from_disk("/data3/mmendieta/Violence_data/geo_corpus.0.0.1_dataset_for_train")

### Tokenize the whole dataset

In [None]:
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True)

In [None]:
# Instantiate the tokenizer 
model_ckpt = args.model_ckpt
tokenizer = AutoTokenizer.from_pretrained(model_ckpt,
                                         model_max_length=args.max_length
                                         )

In [None]:
# This code takes 14min and 33 s to run
%time tokenized_ds = ds.map(tokenize, batched=True)

In [None]:
tokenized_ds.set_format('torch')

In [None]:
tokenized_ds

### Save tokenized dataset to disk

In [None]:
tokenized_ds.save_to_disk("/data3/mmendieta/Violence_data/geo_corpus.0.0.1_tok_ds_xlmt_with_text")

In [None]:
test_tokenized_ds[i]['text'], test_tokenized_ds[i]['labels'].tolist()