In [1]:
import torch
import os
from transformers import pipeline
from datasets import load_from_disk, Dataset, Sequence, Value
from argparse import Namespace

2025-07-31 15:24:40.770933: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# The 40 ground-truth label names: each "post"/"pre" prefix (1, 2, 3, 7)
# combined with each "geo" suffix (10, 20, 30, 50, 70), grouped by prefix.
labels = [
    f"{prefix}geo{suffix}"
    for prefix in ("post1", "post2", "post3", "post7", "pre1", "pre2", "pre3", "pre7")
    for suffix in (10, 20, 30, 50, 70)
]

In [3]:
# Index <-> name lookup tables built from the hard-coded `labels` list
# (position in the list is the label id).
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in enumerate(labels)}

In [None]:
# Report what PyTorch sees before a GPU index is chosen in the config cell.
print(
    f"CUDA available: {torch.cuda.is_available()}",
    f"Number of GPUs detected: {torch.cuda.device_count()}",
    sep="\n",
)

In [4]:
# Run configuration for the NLI scoring pass; exposed below as attribute-style `args`.
config = dict(
    # Device index handed to the pipeline. Make sure this device is available, else use -1 for CPU.
    cuda_device=14,
    # Disabled option, kept for reference (an earlier model, not the NLI one):
    # path_to_model_on_disk="/data4/mmendieta/models/ml-e5-large_finetuned_twitter_all_labels/",
    model_ckpt="mjwong/multilingual-e5-large-xnli",  # The xnli model
    max_length=32,
    # Path of the dataset previously saved to disk.
    dataset_name="/data4/mmendieta/data/geo_corpus.0.0.1_tok_test_ds_e5_inference_results",
    batch_size=1024,
    fout_nli_csv="/data4/mmendieta/data/geo_corpus.0.0.1_tok_test_ds_e5_inference_results_nli_multilabel_newCategories.csv",
)

args = Namespace(**config)

## 4.1 Load the saved dataset

In [5]:
# Load the saved dataset, then drop rows whose 'text' is missing or blank.
# Failures raise instead of calling exit(): inside a Jupyter kernel exit()
# asks the kernel to shut down rather than raising at the call site, so the
# remaining statements of the cell would still run (e.g. hitting a NameError
# on `ds_with_predictions` after a failed load).
print(f"Loading dataset from: {args.dataset_name}")
try:
    ds_with_predictions = load_from_disk(args.dataset_name)
    print(f"Dataset loaded. Number of examples: {len(ds_with_predictions)}")
    print(f"Features: {ds_with_predictions.features}")
except Exception as e:
    print(f"ERROR: Could not load dataset from {args.dataset_name}. Error: {e}")
    # Stop this cell (and "Run All") right here, keeping the original traceback.
    raise

# Filter out empty or whitespace-only texts before they reach the NLI pipeline.
original_num_examples = len(ds_with_predictions)
ds_with_predictions = ds_with_predictions.filter(
    lambda example: example['text'] is not None and len(example['text'].strip()) > 0,
    desc="Filtering out empty or whitespace texts"
)
filtered_num_examples = len(ds_with_predictions)
print(f"Dataset after filtering: {filtered_num_examples} examples (Removed {original_num_examples - filtered_num_examples} empty/whitespace texts).")

if filtered_num_examples == 0:
    # Nothing left to score: abort with an exception so later cells do not run on an empty dataset.
    raise ValueError("ERROR: Dataset is empty after filtering! Cannot proceed with NLI.")

Loading dataset from: /data4/mmendieta/data/geo_corpus.0.0.1_tok_test_ds_e5_inference_results
Dataset loaded. Number of examples: 2329158
Features: {'tweetid': Value(dtype='string', id=None), 'geo_x': Value(dtype='float64', id=None), 'geo_y': Value(dtype='float64', id=None), 'lang': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'labels': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'pred_post1geo10': Value(dtype='float32', id=None), 'pred_post1geo20': Value(dtype='float32', id=None), 'pred_post1geo30': Value(dtype='float32', id=None), 'pred_post1geo50': Value(dtype='float32', id=None), 'pred_post1geo70': Value(dtype='float32', id=None), 'pred_post2geo10': Value(dtype='float32', id=None), 'pred_post2geo20': Value(dtype='float32', id=None), 'pred_post2geo30': Value(dtype=

In [6]:
# Spot-check the first row: after filtering its text should be a non-blank string.
if len(ds_with_predictions) > 0:
    sample_text = ds_with_predictions[0]['text']
    print(f"Sample text from filtered dataset: '{sample_text}' (Type: {type(sample_text)})")
    if not (isinstance(sample_text, str) and sample_text.strip()):
        print("CRITICAL WARNING: Filtering failed, first example's text is still empty or not a string.")

Sample text from filtered dataset: 'talking abt my case ☺️' (Type: <class 'str'>)


## 4.2 Define NLI Hypotheses and Instantiate the NLI Pipeline

In [7]:
# The twelve hypothesis classes (emotions) used as candidate labels for the NLI pipeline.
nli_hypotheses = [
    "fear", "anger", "joy", "hostility",
    "hate", "love", "disgust", "sadness",
    "surprise", "trust", "anticipation", "grief",
]

In [8]:
# Build the NLI pipeline via the zero-shot-classification task.
# multi_label is not set here: add_nli_scores_to_example_batched passes
# multi_label=True when it invokes the pipeline, so that call decides whether
# scores are independent per hypothesis.
nli_pipe = pipeline(
    "zero-shot-classification",
    framework="pt",
    model=args.model_ckpt,
    device=args.cuda_device,
    batch_size=args.batch_size,  # Must fit in the memory of the selected device
)

print(f"NLI pipeline loaded on device: {nli_pipe.device}")

NLI pipeline loaded on device: cuda:14


## 4.3 Define the Function to Add NLI Scores (Batched) 

This function will take a batch of texts, run the NLI pipeline on them, and return the scores.

In [9]:
def add_nli_scores_to_example_batched(examples):
    """Score one batch of texts against every NLI hypothesis.

    Written for ``Dataset.map(..., batched=True)``. Reads the module-level
    ``nli_pipe`` (zero-shot-classification pipeline) and ``nli_hypotheses``
    (list of candidate labels).

    Args:
        examples: Batch mapping whose ``'text'`` entry is a list of texts.
            ``None``, non-string and whitespace-only entries are tolerated.

    Returns:
        dict mapping ``"nli_<hypothesis>"`` to a list of floats, one per input
        text and in input order. Missing / blank texts get 0.0 for every
        hypothesis and are never sent to the model, so a row's scores do not
        depend on which other rows share its batch.

    Raises:
        ValueError: if the pipeline returns a different number of results
            than the number of texts submitted.
    """
    texts = examples['text']  # list of texts for the current batch

    # Normalise: strip strings, treat anything else (None, numbers, ...) as blank.
    cleaned_texts = [t.strip() if isinstance(t, str) else "" for t in texts]
    scorable_positions = [pos for pos, text in enumerate(cleaned_texts) if text]

    # Pre-fill every row with 0.0; only rows with real text are overwritten below.
    nli_scores_data = {f"nli_{hypothesis}": [0.0] * len(texts) for hypothesis in nli_hypotheses}

    if not scorable_positions:
        # Nothing to score (empty batch or all texts blank): skip the model call entirely.
        return nli_scores_data

    nli_results_batch = nli_pipe(
        [cleaned_texts[pos] for pos in scorable_positions],
        candidate_labels=nli_hypotheses,
        multi_label=True  # independent score per hypothesis instead of a distribution over them
    )

    # Some transformers versions return a bare dict when given a single sequence.
    if isinstance(nli_results_batch, dict):
        nli_results_batch = [nli_results_batch]

    if len(nli_results_batch) != len(scorable_positions):
        raise ValueError(
            f"NLI pipeline returned {len(nli_results_batch)} results for "
            f"{len(scorable_positions)} input texts."
        )

    for pos, result_for_one_text in zip(scorable_positions, nli_results_batch):
        # Each result looks like:
        # {'sequence': '...', 'labels': ['fear', 'anger', ...], 'scores': [0.9, 0.1, ...]}
        # The order of 'labels' is not relied upon here (it need not match
        # nli_hypotheses), so scores are looked up by label name.
        scores_map = dict(zip(result_for_one_text['labels'], result_for_one_text['scores']))

        for hypothesis in nli_hypotheses:
            # Default to 0.0 if the pipeline did not report this hypothesis.
            nli_scores_data[f"nli_{hypothesis}"][pos] = float(scores_map.get(hypothesis, 0.0))

    return nli_scores_data

## 4.4 Update Dataset Features and Apply the Mapping

In [10]:
# Define new features for NLI scores
nli_features = {}
for hypothesis in nli_hypotheses:
    nli_features[f"nli_{hypothesis}"] = Value("float32")

# Copy existing features and add NLI features
final_dataset_features_with_nli = ds_with_predictions.features.copy()
final_dataset_features_with_nli.update(nli_features)

print("\nDefined final features for dataset after adding NLI scores:")
print(final_dataset_features_with_nli)

print(f"Adding NLI scores to the dataset using (batched mode)...")
ds_with_all_predictions = ds_with_predictions.map(
    add_nli_scores_to_example_batched,
    batched=True,             # Enable batching for NLI pipeline 
    batch_size=args.batch_size,
    features=final_dataset_features_with_nli,
    desc="Adding NLI Scores"
)

print("NLI scores added to the dataset.")
print(ds_with_all_predictions.features)


Defined final features for dataset after adding NLI scores:
{'tweetid': Value(dtype='string', id=None), 'geo_x': Value(dtype='float64', id=None), 'geo_y': Value(dtype='float64', id=None), 'lang': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'labels': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'pred_post1geo10': Value(dtype='float32', id=None), 'pred_post1geo20': Value(dtype='float32', id=None), 'pred_post1geo30': Value(dtype='float32', id=None), 'pred_post1geo50': Value(dtype='float32', id=None), 'pred_post1geo70': Value(dtype='float32', id=None), 'pred_post2geo10': Value(dtype='float32', id=None), 'pred_post2geo20': Value(dtype='float32', id=None), 'pred_post2geo30': Value(dtype='float32', id=None), 'pred_post2geo50': Value(dtype='float32', id=None), 'pred_post2geo

Adding NLI Scores:   0%|          | 0/2276118 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


NLI scores added to the dataset.
{'tweetid': Value(dtype='string', id=None), 'geo_x': Value(dtype='float64', id=None), 'geo_y': Value(dtype='float64', id=None), 'lang': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'labels': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'pred_post1geo10': Value(dtype='float32', id=None), 'pred_post1geo20': Value(dtype='float32', id=None), 'pred_post1geo30': Value(dtype='float32', id=None), 'pred_post1geo50': Value(dtype='float32', id=None), 'pred_post1geo70': Value(dtype='float32', id=None), 'pred_post2geo10': Value(dtype='float32', id=None), 'pred_post2geo20': Value(dtype='float32', id=None), 'pred_post2geo30': Value(dtype='float32', id=None), 'pred_post2geo50': Value(dtype='float32', id=None), 'pred_post2geo70': Value(dtype='float32', 

In [11]:
def unpack_ground_truth_labels_batched(examples):
    """Split each example's ground-truth vector into one column per label name.

    Written for ``Dataset.map(..., batched=True)``. Reads the module-level
    ``labels`` list, which supplies the output column names and their order.

    Args:
        examples: Batch mapping whose ``'labels'`` entry holds one sequence of
            numeric values per example. Each sequence may be a plain list or
            any object exposing ``.tolist()`` (e.g. a tensor or array).

    Returns:
        dict mapping every name in ``labels`` to a list of floats, one per
        example in the batch, in input order.

    Raises:
        ValueError: if an example does not have exactly ``len(labels)`` values.
    """
    expected_count = len(labels)
    batch_unpacked_data = {label_name: [] for label_name in labels}

    for i, raw_values in enumerate(examples['labels']):
        # Normalise tensors/arrays to a plain Python list; lists pass through unchanged.
        gt_values = raw_values.tolist() if hasattr(raw_values, "tolist") else raw_values

        if len(gt_values) != expected_count:
            # Should not happen with consistent data; fail loudly rather than misalign columns.
            raise ValueError(
                f"Mismatch for example at batch index {i}: 'labels' has {len(gt_values)} values, "
                f"but {expected_count} label names are defined. "
                "Ensure the `labels` list has exactly one name per ground-truth value, in the correct order."
            )

        for label_name, value in zip(labels, gt_values):
            batch_unpacked_data[label_name].append(float(value))  # store as a standard Python float

    return batch_unpacked_data

In [12]:
print("\nUnpacking ground truth labels into individual columns...")

# Build the target schema for the map below, starting from the current one.
target_features_for_unpacking = ds_with_all_predictions.features.copy()

# Columns that must not appear in the final dataset. The same list drives both
# the schema edit here and `remove_columns` in the map call, so they cannot drift apart.
columns_to_remove_from_schema = ["labels", "input_ids", "attention_mask"]

for col_name in columns_to_remove_from_schema:
    if col_name in target_features_for_unpacking:
        del target_features_for_unpacking[col_name]

# One float32 column per ground-truth label name.
for name in labels:
    target_features_for_unpacking[name] = Value("float32")

print("\nTarget features for dataset after unpacking GT labels (after removal):")
print(target_features_for_unpacking)

# Expand the packed 'labels' column into the per-label columns and drop the
# columns listed above.
ds_final_for_csv = ds_with_all_predictions.map(
    unpack_ground_truth_labels_batched,
    batched=True,
    batch_size=args.batch_size,
    features=target_features_for_unpacking,
    remove_columns=columns_to_remove_from_schema,
    desc="Unpacking Ground Truth Labels",
)

print("Ground truth labels unpacked. Original 'labels' column removed.")
print(ds_final_for_csv.features)


Unpacking ground truth labels into individual columns...

Target features for dataset after unpacking GT labels (after removal):
{'tweetid': Value(dtype='string', id=None), 'geo_x': Value(dtype='float64', id=None), 'geo_y': Value(dtype='float64', id=None), 'lang': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'pred_post1geo10': Value(dtype='float32', id=None), 'pred_post1geo20': Value(dtype='float32', id=None), 'pred_post1geo30': Value(dtype='float32', id=None), 'pred_post1geo50': Value(dtype='float32', id=None), 'pred_post1geo70': Value(dtype='float32', id=None), 'pred_post2geo10': Value(dtype='float32', id=None), 'pred_post2geo20': Value(dtype='float32', id=None), 'pred_post2geo30': Value(dtype='float32', id=None), 'pred_post2geo50': Value(dtype='float32', id=None), 'pred_post2geo70': Value(dtype='float32', id=None), 'pred_post3geo10': Value(dtype='float32', id=None), 'pred_post3geo20': Value(dtype='float32', id=None), 'pred_post3geo30': Value(dtype='float3

Unpacking Ground Truth Labels:   0%|          | 0/2276118 [00:00<?, ? examples/s]

Ground truth labels unpacked. Original 'labels' column removed.
{'tweetid': Value(dtype='string', id=None), 'geo_x': Value(dtype='float64', id=None), 'geo_y': Value(dtype='float64', id=None), 'lang': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'pred_post1geo10': Value(dtype='float32', id=None), 'pred_post1geo20': Value(dtype='float32', id=None), 'pred_post1geo30': Value(dtype='float32', id=None), 'pred_post1geo50': Value(dtype='float32', id=None), 'pred_post1geo70': Value(dtype='float32', id=None), 'pred_post2geo10': Value(dtype='float32', id=None), 'pred_post2geo20': Value(dtype='float32', id=None), 'pred_post2geo30': Value(dtype='float32', id=None), 'pred_post2geo50': Value(dtype='float32', id=None), 'pred_post2geo70': Value(dtype='float32', id=None), 'pred_post3geo10': Value(dtype='float32', id=None), 'pred_post3geo20': Value(dtype='float32', id=None), 'pred_post3geo30': Value(dtype='float32', id=None), 'pred_post3geo50': Value(dtype='float32', id=None), 

In [13]:
# List every column of the final dataset, in schema order.
all_final_column_names = list(ds_final_for_csv.features)
print("\nAll column names in the final dataset:")
print(all_final_column_names)


All column names in the final dataset:
['tweetid', 'geo_x', 'geo_y', 'lang', 'text', 'pred_post1geo10', 'pred_post1geo20', 'pred_post1geo30', 'pred_post1geo50', 'pred_post1geo70', 'pred_post2geo10', 'pred_post2geo20', 'pred_post2geo30', 'pred_post2geo50', 'pred_post2geo70', 'pred_post3geo10', 'pred_post3geo20', 'pred_post3geo30', 'pred_post3geo50', 'pred_post3geo70', 'pred_post7geo10', 'pred_post7geo20', 'pred_post7geo30', 'pred_post7geo50', 'pred_post7geo70', 'pred_pre1geo10', 'pred_pre1geo20', 'pred_pre1geo30', 'pred_pre1geo50', 'pred_pre1geo70', 'pred_pre2geo10', 'pred_pre2geo20', 'pred_pre2geo30', 'pred_pre2geo50', 'pred_pre2geo70', 'pred_pre3geo10', 'pred_pre3geo20', 'pred_pre3geo30', 'pred_pre3geo50', 'pred_pre3geo70', 'pred_pre7geo10', 'pred_pre7geo20', 'pred_pre7geo30', 'pred_pre7geo50', 'pred_pre7geo70', 'nli_fear', 'nli_anger', 'nli_joy', 'nli_hostility', 'nli_hate', 'nli_love', 'nli_disgust', 'nli_sadness', 'nli_surprise', 'nli_trust', 'nli_anticipation', 'nli_grief', 'post

In [14]:
# Show a few representative columns of the first row: identifiers, one model
# prediction, one NLI score, and the first two / last ground-truth labels.
print("\nSample of the final dataset (first row) before CSV export:")
if len(ds_final_for_csv) == 0:
    print("Dataset is empty after unpacking.")
else:
    first_sample_final = ds_final_for_csv[0]
    for column in ("tweetid", "text", "pred_post1geo10", "nli_fear", labels[0], labels[1], labels[-1]):
        print(f"{column}: {first_sample_final[column]}")


Sample of the final dataset (first row) before CSV export:
tweetid: 388328898662268928
text: talking abt my case ☺️
pred_post1geo10: 0.012126137502491474
nli_fear: 0.1177188903093338
post1geo10: 0.0
post1geo20: 0.0
pre7geo70: 1.0


## 4.5 Save the final dataset to CSV

In [15]:
# Write the final dataset to the configured CSV path. A failure is reported
# but not re-raised, so the notebook keeps running.
print(f"\nExporting final dataset to CSV: {args.fout_nli_csv}")
try:
    ds_final_for_csv.to_csv(args.fout_nli_csv)
except Exception as e:
    print(f"ERROR: Failed to export final dataset to CSV: {e}")
else:
    print(f"Final dataset successfully exported to CSV: {args.fout_nli_csv}")


Exporting final dataset to CSV: /data4/mmendieta/data/geo_corpus.0.0.1_tok_test_ds_e5_inference_results_nli_multilabel_newCategories.csv


Creating CSV from Arrow format:   0%|          | 0/2277 [00:00<?, ?ba/s]

Final dataset successfully exported to CSV: /data4/mmendieta/data/geo_corpus.0.0.1_tok_test_ds_e5_inference_results_nli_multilabel_newCategories.csv


In [None]:
# Reload the exported CSV with pandas and inspect the first 3 observations.
import pandas as pd

# Path the CSV was written to by the export cell.
csv_file_path = args.fout_nli_csv

print(f"\nLoading CSV from: {csv_file_path}")
try:
    # 'tweetid' is a string column in the dataset; pin it to str so pandas does
    # not infer a numeric dtype (a float column would lose digits on long ids).
    df = pd.read_csv(csv_file_path, dtype={"tweetid": str})

    print(f"CSV loaded successfully. Total rows: {len(df)}")

    print("\nFirst 3 observations from the loaded CSV:")
    print(df.head(3))

    # Column names, dtypes and non-null counts of the reloaded frame.
    print("\nDataFrame Info:")
    df.info()

except FileNotFoundError:
    print(f"ERROR: CSV file not found at {csv_file_path}. Please ensure the save operation was successful.")
except Exception as e:
    print(f"ERROR: Failed to load CSV from {csv_file_path}: {e}")