In [14]:
import torch
import transformers
import pandas as pd
import re
import html
import matplotlib.pyplot as plt
import numpy as np
import evaluate
from datasets import load_dataset, Features, Value
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from sklearn.metrics import roc_auc_score, roc_curve
from tqdm.auto import tqdm
from argparse import Namespace

In [28]:
# Options for 'dataset_name'
# "/data3/mmendieta/Violence_data/case_studies/Russia_Ukraine_combined_with_labels.csv"
# "/data3/mmendieta/Violence_data/case_studies/Israel_Oct7_23_combined_with_labels.csv"
# "/data3/mmendieta/Violence_data/case_studies/Trump_Capitol_Hill_combined_with_labels.csv"

# Notebook-wide settings, available both as a plain dict (`config`) and as an
# attribute-style namespace (`args`).
config = dict(
    cuda_device=15,
    path_to_model_on_disk="/data3/mmendieta/models/xlmt_finetuned_twitter/",
    model_ckpt="m2im/XLM-T_finetuned_violence_twitter",
    max_length=32,
    dataset_name="/data3/mmendieta/Violence_data/case_studies/Trump_Capitol_Hill_combined_with_labels.csv",
    batch_size=64,
)

args = Namespace(**config)

In [5]:
# Explicit CSV schema: identifier/metadata columns are read as strings so they
# are not automatically cast to integers, and the six integer label columns are
# read as float32 so they can be used when calculating the predictions.
_string_columns = ['ID', 'AuthorID', 'AuthorAlias', 'CreatedDate', 'Text']
_label_columns = ['post7geo10', 'post7geo30', 'post7geo50',
                  'pre7geo10', 'pre7geo30', 'pre7geo50']

features = Features({
    **{name: Value('string') for name in _string_columns},
    **{name: Value('float32') for name in _label_columns},
})

In [6]:
# Load the CSV named in args.dataset_name using the explicit schema in `features`; %time reports the elapsed time
%time ds = load_dataset('csv', data_files= args.dataset_name, sep=",", features=features)

Using custom data configuration default-68d0faf8e5159cd0


Downloading and preparing dataset csv/default to /home/mmendieta/.cache/huggingface/datasets/csv/default-68d0faf8e5159cd0/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


Dataset csv downloaded and prepared to /home/mmendieta/.cache/huggingface/datasets/csv/default-68d0faf8e5159cd0/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 667 ms, sys: 71.9 ms, total: 739 ms
Wall time: 907 ms


In [7]:
ds

DatasetDict({
    train: Dataset({
        features: ['ID', 'AuthorID', 'AuthorAlias', 'CreatedDate', 'Text', 'post7geo10', 'post7geo30', 'post7geo50', 'pre7geo10', 'pre7geo30', 'pre7geo50'],
        num_rows: 110702
    })
})

In [8]:
# Rename the column 'Text' to 'text'; the cleaning and inference cells below read the tweet from the 'text' key
dataset = ds.rename_column('Text', 'text')

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'AuthorID', 'AuthorAlias', 'CreatedDate', 'text', 'post7geo10', 'post7geo30', 'post7geo50', 'pre7geo10', 'pre7geo30', 'pre7geo50'],
        num_rows: 110702
    })
})

### Preprocess the dataset before inference

In [15]:
# Define a pre-processing function to use with datasets.map()
def clean_tweet(example):
    """Normalise the raw tweet stored under ``example['text']``.

    Steps, in order: replace newlines with spaces, decode HTML entities,
    remove @-mentions, remove URLs, remove standalone retweet markers
    ("RT "), and strip surrounding whitespace.

    Args:
        example: Mapping with a string value under the key ``'text'``.

    Returns:
        dict: ``{'text': cleaned_tweet}``.
    """
    tweet = example['text']
    tweet = tweet.replace("\n", " ")  # flatten newlines ("\n") inside the tweet
    tweet = html.unescape(tweet)  # decode html entities such as &amp;
    tweet = re.sub(r"@[A-Za-z0-9_:]+", "", tweet)  # remove mentions (including a trailing ':')
    tweet = re.sub(r'http\S+', '', tweet)  # remove urls
    # Remove the retweet marker only when "RT" starts a token. The word boundary
    # keeps words that merely end in "RT" (e.g. "SMART move") intact, which a
    # plain 'RT ' pattern would mangle.
    tweet = re.sub(r'\bRT ', '', tweet)
    return {'text': tweet.strip()}  # strip surrounding white space

In [16]:
# Drop rows whose 'text' is None before pre-processing (clean_tweet calls string methods on the value)
%time dataset = dataset.filter(lambda x: x["text"] is not None)

  0%|          | 0/107 [00:00<?, ?ba/s]

CPU times: user 3.95 s, sys: 18.1 ms, total: 3.97 s
Wall time: 3.93 s


In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'AuthorID', 'AuthorAlias', 'CreatedDate', 'text', 'post7geo10', 'post7geo30', 'post7geo50', 'pre7geo10', 'pre7geo30', 'pre7geo50'],
        num_rows: 106699
    })
})

In [18]:
# Apply the text preprocessing function (clean_tweet) to every row of the dataset; %time reports the elapsed time
%time dataset_clean = dataset.map(clean_tweet)

  0%|          | 0/106699 [00:00<?, ?ex/s]

CPU times: user 15.9 s, sys: 761 ms, total: 16.7 s
Wall time: 15.8 s


In [19]:
#  Filter rows with blank text for post-processing
%time dataset_clean = dataset_clean.filter(lambda x: x["text"] is not None)

  0%|          | 0/107 [00:00<?, ?ba/s]

CPU times: user 1.67 s, sys: 13.3 ms, total: 1.69 s
Wall time: 1.67 s


In [20]:
dataset_clean

DatasetDict({
    train: Dataset({
        features: ['ID', 'AuthorID', 'AuthorAlias', 'CreatedDate', 'text', 'post7geo10', 'post7geo30', 'post7geo50', 'pre7geo10', 'pre7geo30', 'pre7geo50'],
        num_rows: 106699
    })
})

In [21]:
# Remove unnecessary columns: retain the tweet text plus the six label columns
keep_cols = [
    'text',
    'pre7geo10', 'pre7geo30', 'pre7geo50',
    'post7geo10', 'post7geo30', 'post7geo50',
]
# Every column of the train split that is not in keep_cols is scheduled for removal
remove_columns = list(
    filter(lambda name: name not in keep_cols, dataset_clean['train'].column_names)
)

In [22]:
# Drop the columns collected in `remove_columns`, leaving only those listed in `keep_cols`
dataset_clean = dataset_clean.remove_columns(remove_columns)

In [23]:
dataset_clean["train"].features

{'text': Value(dtype='string', id=None),
 'post7geo10': Value(dtype='float32', id=None),
 'post7geo30': Value(dtype='float32', id=None),
 'post7geo50': Value(dtype='float32', id=None),
 'pre7geo10': Value(dtype='float32', id=None),
 'pre7geo30': Value(dtype='float32', id=None),
 'pre7geo50': Value(dtype='float32', id=None)}

In [24]:
# Create the 'labels' column: one list of six label values per row.
# The order is spelled out explicitly instead of being derived from the
# dataset's current column order, so position i of 'labels' always refers to
# the same label, and re-running this cell cannot fold an already-created
# 'labels' column into itself.
label_cols = ["post7geo10", "post7geo30", "post7geo50",
              "pre7geo10", "pre7geo30", "pre7geo50"]
cols = dataset_clean['train'].column_names
missing_cols = [c for c in label_cols if c not in cols]
if missing_cols:
    raise KeyError(f"Label columns missing from dataset: {missing_cols}")
dataset_clean = dataset_clean.map(lambda x: {"labels": [x[c] for c in label_cols]})
dataset_clean

  0%|          | 0/106699 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'post7geo10', 'post7geo30', 'post7geo50', 'pre7geo10', 'pre7geo30', 'pre7geo50', 'labels'],
        num_rows: 106699
    })
})

In [25]:
# Reduce the dataset to just the 'text' and 'labels' columns
col_names = dataset_clean["train"].column_names
for kept in ("labels", "text"):
    col_names.remove(kept)  # whatever remains in col_names is dropped below
ds_clean = dataset_clean.remove_columns(col_names)
ds_clean

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 106699
    })
})

# Inference

### Instantiate the required pipeline

In [27]:
# Build the inference pipeline from the checkpoint named in the config, placed on the configured CUDA device.
# return_all_scores=True is passed so every label gets a score; later cells read the result by position (outputs[0] .. outputs[5]).
# NOTE(review): newer transformers releases reportedly deprecate return_all_scores in favour of top_k=None,
# which may return results sorted by score rather than by label index — confirm before migrating,
# since that would break the positional indexing used downstream.
violence_pipe = pipeline(model=args.model_ckpt, 
                         device=args.cuda_device,
                         return_all_scores=True)

Downloading:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

### Select specific cases of interest

In [145]:
# Define the label mapping (position within 'labels' -> label name).
# This must be an assignment: written as `id2label: {...}` it is only an
# annotation and the name is never bound.
id2label = {
    "0": "post7geo10",
    "1": "post7geo30",
    "2": "post7geo50",
    "3": "pre7geo10",
    "4": "pre7geo30",
    "5": "pre7geo50"
}

# Default label pattern of interest, in id2label order: only post7geo50 is positive
TARGET_LABELS = (0, 0, 1, 0, 0, 0)


# Define the filtering function to match the required conditions
def filter_condition(example, target=TARGET_LABELS):
    """Return True when the example's labels match ``target`` position by position.

    Args:
        example: Mapping with a sequence of label values under ``'labels'``,
            ordered as in ``id2label``.
        target: Expected value for each label position. Defaults to the
            pattern where only post7geo50 is 1.

    Returns:
        bool: True if the first ``len(target)`` labels equal ``target``;
        False otherwise (including when fewer labels are present).
    """
    labels = example['labels']
    return list(labels[:len(target)]) == list(target)

In [146]:
# Keep only the rows of the train split whose labels satisfy filter_condition
filtered_ds = ds_clean['train'].filter(filter_condition)

  0%|          | 0/107 [00:00<?, ?ba/s]

In [147]:
filtered_ds

Dataset({
    features: ['text', 'labels'],
    num_rows: 6468
})

In [None]:
filtered_ds[0]['labels']

### Perform inference

In [148]:
# Calculate the predictions
# Feed the 'text' column through the pipeline in batches, then pair each result
# with the row (text + true labels) at the same position in filtered_ds.
text_stream = KeyDataset(filtered_ds, "text")
scored_stream = violence_pipe(text_stream, batch_size=args.batch_size, truncation=True)

preds = []
for i, outputs in enumerate(tqdm(scored_stream, total=len(filtered_ds))):
    row = filtered_ds[i]
    preds.append({'text': row['text'], 'labels': row['labels'], 'outputs': outputs})



  0%|          | 0/6468 [00:00<?, ?it/s]

In [149]:
# Flatten the predictions into one record per tweet: the text, its true labels,
# and one column for each of the six predicted scores.
score_columns = ['post7geo10', 'post7geo30', 'post7geo50',
                 'pre7geo10', 'pre7geo30', 'pre7geo50']

processed_data = []
for pred in preds:
    row = {'text': pred['text'], 'true_labels': pred['labels']}
    # outputs[k]['score'] is stored under the k-th name in score_columns
    for position, column in enumerate(score_columns):
        row[column] = pred['outputs'][position]['score']
    processed_data.append(row)

# Convert to DataFrame
df = pd.DataFrame(processed_data)

In [150]:
df.head(3)

Unnamed: 0,text,true_labels,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
0,What would you do ü•±,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",0.275789,0.375137,0.489425,0.274,0.360397,0.466075
1,They were LET IN! just like we said #CapitolB...,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",0.763389,0.81038,0.863128,0.294917,0.328344,0.369649
2,You are correct Sir! üëÄüá∫üá∏‚ÄºÔ∏è,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",0.295641,0.355861,0.465822,0.275853,0.341913,0.439394


In [151]:
# Mean predicted score per label across all tweets, as a one-row frame.
# numeric_only=True restricts the mean to the score columns: 'text' and
# 'true_labels' hold non-numeric objects, which pandas warns about when
# averaging (and newer pandas versions raise on).
mean_preds = df.mean(numeric_only=True).to_frame(name='mean').T
mean_preds

  mean_preds = df.mean().to_frame(name='mean').T


Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
mean,0.310626,0.397994,0.546078,0.303492,0.384985,0.52347


In [101]:
# Binarise the mean scores: 1 where the mean is at least 0.5, otherwise 0
mean_preds_th = mean_preds.ge(0.5).astype(int)
mean_preds_th

Unnamed: 0,post7geo10,post7geo30,post7geo50,pre7geo10,pre7geo30,pre7geo50
mean,0,0,1,0,0,1


### ROC and AUC score

In [None]:
# Step 1: Calculate the ROC-AUC score
# Extract the ground truth labels and predicted probabilities for each label.
# Each entry of item['outputs'] is a dict carrying a 'score' key, so the scores
# are pulled out explicitly; building the array from the dicts themselves would
# yield an object array that cannot be used as numeric scores.
true_labels = np.array([item['labels'] for item in preds])
predicted_scores = np.array(
    [[out['score'] for out in item['outputs']] for item in preds],
    dtype=float,
)

In [None]:
# Load the "roc_auc" metric through the Evaluate API; it is used in the per-label loop below
roc_auc = evaluate.load("roc_auc")

In [None]:
# Calculate ROC-AUC scores for each label
label_names = ["post7geo10", "post7geo30", "post7geo50",
               "pre7geo10", "pre7geo30", "pre7geo50"]
roc_auc_scores = {}
for i, label_name in enumerate(label_names):
    references = true_labels[:, i].astype(int)  # labels are stored as floats (0.0 / 1.0)
    # ROC-AUC is undefined when only one class is present, which is the case
    # for every label after filtering to a single label pattern. Record NaN
    # instead of letting the metric raise.
    if np.unique(references).size < 2:
        roc_auc_scores[label_name] = float("nan")
        continue
    # The Evaluate roc_auc metric takes its scores under `prediction_scores`
    roc_auc_scores[label_name] = roc_auc.compute(
        prediction_scores=predicted_scores[:, i],
        references=references,
    )['roc_auc']

In [None]:
# Display ROC-AUC scores, one line per label
for label in roc_auc_scores:
    print(f"ROC-AUC score for {label}: {roc_auc_scores[label]}")

In [None]:
# Step 2: Plot the ROC curve for each label.
# Figure creation, curves, styling and saving are kept in one cell and drawn
# through an explicit `fig, ax` pair: `fig` must exist for savefig, and state
# spread over several cells would end up on different figures.
label_names = ["post7geo10", "post7geo30", "post7geo50",
               "pre7geo10", "pre7geo30", "pre7geo50"]
fig, ax = plt.subplots(figsize=(10, 8))

for i, label_name in enumerate(label_names):
    y_true = true_labels[:, i]
    # A ROC curve needs both classes in the ground truth; skip labels without them
    if np.unique(y_true).size < 2:
        continue

    # Calculate false positive rate and true positive rate
    fpr, tpr, _ = roc_curve(y_true, predicted_scores[:, i])

    # Plot the ROC curve
    ax.plot(fpr, tpr, label=f"{label_name} (AUC = {roc_auc_scores[label_name]:.2f})")

# Plot configuration
ax.plot([0, 1], [0, 1], 'k--')  # Diagonal line for random chance
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve for the Capitol Hill case study')
ax.legend(loc="lower right")
ax.grid(True)

# Save to disk (before show, so the figure is written while it is still open)
fig.savefig("/data3/mmendieta/Violence_data/case_studies/figs/ROC_Capitol_Hill.eps", format="eps", dpi=300,  bbox_inches="tight")
plt.show()