<a href="https://colab.research.google.com/github/malojan/edf_un_climate/blob/master/code/measurement/classifier_inference.ipynb?authuser=0" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Classifier inference on the UNGA corpus

Author: Malo Jan

In this notebook, I use the ECCA classifier, which was trained to identify climate-related sentences, to run inference on the rest of the United Nations General Assembly corpus.

#### Setup

In [None]:
# Detect whether the notebook is running on Google Colab.
# Only a failed import means "not on Colab"; any other exception
# (e.g. KeyboardInterrupt) is allowed to propagate instead of being swallowed.
try:
    import google.colab  # noqa: F401  (imported only to probe availability)
    COLAB = True
except ImportError:
    COLAB = False
print('on colab:', COLAB)

on colab: True


In [None]:
# Make the repository root the current working directory.

import os

if not COLAB:
    # Local run: unless we are already at the repo root, move two directory
    # levels up from wherever the kernel was started, then report the result.
    if not os.getcwd().endswith("edf_un_climate"):
        os.chdir(os.path.dirname(os.path.dirname(os.getcwd())))
    print(os.getcwd())
else:
    # Colab run: mount Google Drive and move into the repo folder stored there.
    from google.colab import drive
    drive.mount('/content/drive')  # Might need to adapt to your path
    if not os.getcwd().endswith("edf_un_climate"):
        os.chdir("drive/MyDrive/edf_un_climate")


Mounted at /content/drive


In [None]:
# Import packages

import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm.auto import tqdm
from torch.nn.functional import softmax

#### Import data

In [None]:
# Load the tokenized UNGA speeches (path is relative to the repo root).
df = pd.read_parquet(
    "data/un_speeches_tokenized.parquet",
    engine="pyarrow",
)

df

Unnamed: 0,doc_id,text,year,iso_code,session,country,name_of_person_speaking,post,text_id,id,word_count
0,ARG_01_1946.txt,At the resumption of the first session of the ...,1946.0,ARG,1.0,Argentina,Mr. Arce,,0,ARG_01_1946.txt_0,24
1,ARG_01_1946.txt,Politics are determined by circumstances.,1946.0,ARG,1.0,Argentina,Mr. Arce,,1,ARG_01_1946.txt_1,5
2,ARG_01_1946.txt,"Accordingly, in making these remarks, we do no...",1946.0,ARG,1.0,Argentina,Mr. Arce,,2,ARG_01_1946.txt_2,39
3,ARG_01_1946.txt,We hope that these remarks will be interpreted...,1946.0,ARG,1.0,Argentina,Mr. Arce,,3,ARG_01_1946.txt_3,12
4,ARG_01_1946.txt,"We are not sceptics, but our relative optimism...",1946.0,ARG,1.0,Argentina,Mr. Arce,,4,ARG_01_1946.txt_4,22
...,...,...,...,...,...,...,...,...,...,...,...
1319416,ZWE_79_2024.txt,"The challenges we face are immense, but not in...",2024.0,ZWE,79.0,Zimbabwe,Frederick Makamure Shava,Minister of Foreign Affairs,64,ZWE_79_2024.txt_64,9
1319417,ZWE_79_2024.txt,"By acting together, we can advance peace, prom...",2024.0,ZWE,79.0,Zimbabwe,Frederick Makamure Shava,Minister of Foreign Affairs,65,ZWE_79_2024.txt_65,19
1319418,ZWE_79_2024.txt,Zimbabwe stands ready to collaborate with all ...,2024.0,ZWE,79.0,Zimbabwe,Frederick Makamure Shava,Minister of Foreign Affairs,66,ZWE_79_2024.txt_66,21
1319419,ZWE_79_2024.txt,"Let us work in unison, guided by the spirit of...",2024.0,ZWE,79.0,Zimbabwe,Frederick Makamure Shava,Minister of Foreign Affairs,67,ZWE_79_2024.txt_67,35


In [None]:

# Keep only rows from 1985 onwards.
df = df[df['year'] >= 1985]

# Fixed seed so the 1,000-row sample is identical on every Restart & Run All.
df_sample = df.sample(n=1000, random_state=42)

#### Inference on out-of-sample data

In [None]:
# Download (or load from cache) the tokenizer and the classifier from the Hugging Face Hub.
# NOTE(review): classify_climate() calls from_pretrained() itself and binds its own
# local `tokenizer` and `model`, so it does not read these two module-level objects.
tokenizer = AutoTokenizer.from_pretrained("mljn/unga-climate-classifier")
model = AutoModelForSequenceClassification.from_pretrained("mljn/unga-climate-classifier")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

In [None]:
def classify_climate(df, text_column, model_name, batch_size=32):
    """
    Run batched inference on a pandas DataFrame using a Hugging Face sequence classifier.

    Predictions are attached to the rows by position, so the input does not need
    an 'id' column and repeated ids cannot duplicate rows in the output.

    :param df: pandas DataFrame containing the text data.
    :param text_column: Name of the column in df that contains the text.
    :param model_name: Name of the model on Hugging Face Model Hub.
    :param batch_size: Number of texts per forward pass; must be >= 1.
    :return: New DataFrame with the rows of df in their original order, a fresh
             0..n-1 index, and two new columns: 'prediction' (index of the
             highest-scoring class) and 'probability' (softmax probability of
             that predicted class).
    :raises ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.eval()

    # Prepare device (GPU or CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Pull the texts out once instead of slicing the DataFrame in every iteration.
    texts = df[text_column].tolist()
    predictions = []
    probabilities = []

    for start in tqdm(range(0, len(texts), batch_size), desc="Processing"):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt").to(device)

        with torch.no_grad():
            probs = softmax(model(**inputs).logits, dim=1)

        # max over the class dimension yields both the winning probability and its class index.
        top_probs, top_classes = probs.max(dim=1)
        predictions.extend(top_classes.tolist())
        probabilities.extend(top_probs.tolist())

    # Attach by position; reset_index keeps the 0..n-1 index the previous
    # merge-based implementation returned.
    return df.reset_index(drop=True).assign(
        prediction=predictions,
        probability=probabilities,
    )

In [None]:
# Hugging Face Hub id of the classifier; swap in your own model's name if needed.
model_name = "mljn/unga-climate-classifier"
df_predicted = classify_climate(df, text_column='text', model_name=model_name)

Processing:   0%|          | 0/21173 [00:00<?, ?it/s]

In [None]:
# Display the frame returned by classify_climate (original columns plus 'prediction' and 'probability').
df_predicted

Unnamed: 0,doc_id,text,year,iso_code,session,country,name_of_person_speaking,post,text_id,id,word_count,prediction,probability
0,AFG_40_1985.txt,I wish at the outset to congratulate the Presi...,1985.0,AFG,40.0,Afghanistan,Mr. Dost,,0,AFG_40_1985.txt_0,23,0,0.993638
1,AFG_40_1985.txt,While offering the full co-operation of the de...,1985.0,AFG,40.0,Afghanistan,Mr. Dost,,1,AFG_40_1985.txt_1,25,0,0.995708
2,AFG_40_1985.txt,May I also put on record our appreciation of t...,1985.0,AFG,40.0,Afghanistan,Mr. Dost,,2,AFG_40_1985.txt_2,30,0,0.995496
3,AFG_40_1985.txt,On the grievous and sad occasion of the traged...,1985.0,AFG,40.0,Afghanistan,Mr. Dost,,3,AFG_40_1985.txt_3,39,0,0.995881
4,AFG_40_1985.txt,of the basic principles of the policy of the D...,1985.0,AFG,40.0,Afghanistan,Mr. Dost,,4,AFG_40_1985.txt_4,30,0,0.995613
...,...,...,...,...,...,...,...,...,...,...,...,...,...
677521,ZWE_79_2024.txt,"The challenges we face are immense, but not in...",2024.0,ZWE,79.0,Zimbabwe,Frederick Makamure Shava,Minister of Foreign Affairs,64,ZWE_79_2024.txt_64,9,0,0.994195
677522,ZWE_79_2024.txt,"By acting together, we can advance peace, prom...",2024.0,ZWE,79.0,Zimbabwe,Frederick Makamure Shava,Minister of Foreign Affairs,65,ZWE_79_2024.txt_65,19,0,0.966357
677523,ZWE_79_2024.txt,Zimbabwe stands ready to collaborate with all ...,2024.0,ZWE,79.0,Zimbabwe,Frederick Makamure Shava,Minister of Foreign Affairs,66,ZWE_79_2024.txt_66,21,0,0.996158
677524,ZWE_79_2024.txt,"Let us work in unison, guided by the spirit of...",2024.0,ZWE,79.0,Zimbabwe,Frederick Makamure Shava,Minister of Foreign Affairs,67,ZWE_79_2024.txt_67,35,0,0.995390


#### Save predictions

In [None]:
# Write the predictions to disk; index=False keeps the DataFrame index out of the file.
df_predicted.to_parquet('data/un_predictions_all.parquet', index=False)