In [1]:
# Detect whether the notebook is running on Google Colab.
# Only a failed import means "not on Colab"; catching ImportError (rather than
# using a bare `except`) lets KeyboardInterrupt and unrelated errors surface.
try:
    import google.colab  # noqa: F401  (imported only to probe availability)
    COLAB = True
except ImportError:
    COLAB = False
print('on colab:', COLAB)

on colab: False


In [4]:
# set working directory to base of repo

import os 

# Name of the repository root folder; the cwd is considered correct when it ends with it.
REPO_DIRNAME = "edf_un_climate"

if COLAB:
    from google.colab import drive

    # Keep the mount point in one place so the repo path below can be absolute:
    # a relative chdir only works when the current directory happens to be /content.
    DRIVE_MOUNT_POINT = "/content/drive"
    drive.mount(DRIVE_MOUNT_POINT)  # Might need to adapt to your path
    if not os.getcwd().endswith(REPO_DIRNAME):
        os.chdir(os.path.join(DRIVE_MOUNT_POINT, "MyDrive", REPO_DIRNAME))
else:
    # Local run: move two directory levels up from the current directory.
    # NOTE(review): assumes the notebook is launched from a folder two levels
    # below the repo root - confirm against the repo layout.
    if not os.getcwd().endswith(REPO_DIRNAME):
        os.chdir(os.path.dirname(os.path.dirname(os.getcwd())))
    print(os.getcwd())



/Users/malo/Library/CloudStorage/GoogleDrive-malo.jan@sciencespo.fr/.shortcut-targets-by-id/17Ie9FRNLITIRVq7pV8zYiq6P5Op2F6Lu/edf_un_climate


In [5]:
# Import packages

import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm.auto import tqdm
from torch.nn.functional import softmax

  from .autonotebook import tqdm as notebook_tqdm


#### Import data

In [8]:
# Read the speeches dataset from parquet (path is relative to the working directory)
speeches_path = "data/un_speeches_tokenized.parquet"
df = pd.read_parquet(speeches_path, engine="pyarrow")

#### Inference

In [18]:
# Define a function that classifies sentences using a Hugging Face model

def classify_climate(df, text_column, model_name, batch_size=32):
    """
    Classify every row of a DataFrame with a Hugging Face sequence-classification model.

    Texts are tokenized and scored in batches. For each row the arg-max class and
    its softmax probability are recorded. Predictions are attached to the rows by
    position, so the result has exactly one row per input row even when ``df`` has
    no 'id' column or its ids are not unique.

    :param df: pandas DataFrame containing the text data.
    :param text_column: Name of the column in df that contains the text.
    :param model_name: Name of the model on Hugging Face Model Hub.
    :param batch_size: Number of texts per forward pass (positive integer).
    :return: New DataFrame with the rows and columns of df (index reset to
        0..n-1) plus 'prediction_climateBert' (predicted class index) and
        'probability_climateBert' (softmax probability of that class).
    :raises ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.eval()

    # Prepare device (GPU, Apple MPS or CPU). Older torch builds have no
    # `torch.backends.mps`, hence the getattr guard.
    mps_backend = getattr(torch.backends, "mps", None)
    if torch.cuda.is_available():
        device = "cuda"
    elif mps_backend is not None and mps_backend.is_available():
        device = "mps"
    else:
        device = "cpu"
    model.to(device)

    # Extract the texts once instead of slicing the DataFrame on every batch
    texts = df[text_column].tolist()
    predictions = []
    probabilities = []

    for start in tqdm(range(0, len(texts), batch_size), desc="Processing"):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            probs = softmax(outputs.logits, dim=1)
            predictions.extend(torch.argmax(probs, dim=1).tolist())
            probabilities.extend(probs.max(dim=1).values.tolist())

    # Attach results positionally. reset_index returns a new frame, so the
    # caller's DataFrame is left untouched, and no join key is needed (a merge
    # on a non-unique 'id' would multiply rows).
    final_df = df.reset_index(drop=True)
    final_df['prediction_climateBert'] = pd.Series(predictions, dtype="int64")
    final_df['probability_climateBert'] = pd.Series(probabilities, dtype="float64")

    return final_df

#### Load the ClimateBERT model from Hugging Face and use it to classify the corpus

In [26]:
# Hugging Face Hub identifier of the checkpoint used for classification
model_name = "climatebert/distilroberta-base-climate-detector"
df_predicted = classify_climate(df=df, text_column="text", model_name=model_name)

Processing: 100%|██████████| 40267/40267 [3:38:18<00:00,  3.07it/s]  


In [27]:
# Preview the first ten texts whose predicted class is 1
is_class_one = df_predicted["prediction_climateBert"] == 1
df_predicted.loc[is_class_one, "text"].tolist()[:10]

['As a member of the Security Council, Australia was also a member of the Atomic Energy Commission, established by the General Assembly at the first part of the first session.',
 'It is plain that each and every nation entering into the atomic energy agreement must be bound by all its obligations.',
 'The air we breathe is more invigorating.',
 'It is therefore to be expected that during the coming weeks we must work with unremitting energy in order to complete our work.',
 'I wish to say a word now about the work of the Atomic Energy Commission.',
 'There is a great diversity of opinions regarding the results of the Paris Conference.',
 'We must ensure, by all means, that the smouldering Spanish problem does not become the bone of contention between East and West.',
 'In conclusion, I would like to mention the Conference of Paris.',
 'Now we may look back and measure the distance already covered.',
 'During the last few months the United Nations has had to examine one of the most torm

In [30]:
# Number of rows per predicted class
df_predicted.value_counts(subset="prediction_climateBert")

prediction_climateBert
0    1221922
1      66591
Name: count, dtype: int64

#### Save predictions

In [28]:
# Write the DataFrame with the climateBert predictions to parquet (pyarrow engine)
predictions_path = "data/un_predictions_climateBert.parquet"
df_predicted.to_parquet(predictions_path, engine="pyarrow")