<a href="https://colab.research.google.com/github/lschreiber9/Training-BERT/blob/main/BERT_for_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training BERT

In [2]:
# Mount Google Drive at /content/drive; the model and all text files used
# further down are read from and written to paths below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Necessary Libraries

In [3]:
import pandas as pd
import torch
import gc

### Import Datasets from Huggingface

These datasets are expert-annotated.

In [4]:
# Registry of the ClimateBERT datasets used in this notebook: dataset name ->
# split name -> parquet location on the Hugging Face Hub.
# Every entry has a "train" and a "test" split; environmental_claims also
# lists a "validation" split, which load_data below does not read.
datasets = {
    "climate_specificity": {
        "train": "hf://datasets/climatebert/climate_specificity/data/train-00000-of-00001-298fad749f8929f7.parquet",
        "test": "hf://datasets/climatebert/climate_specificity/data/test-00000-of-00001-2588e03729a1bfe7.parquet"
    },
    "climate_sentiment": {
        "train": "hf://datasets/climatebert/climate_sentiment/data/train-00000-of-00001-04b49ae22f595095.parquet",
        "test": "hf://datasets/climatebert/climate_sentiment/data/test-00000-of-00001-3f9f7af4f5914b8e.parquet"
    },
    "climate_commitments_actions": {
        "train": "hf://datasets/climatebert/climate_commitments_actions/data/train-00000-of-00001-2044cce9e261c6b3.parquet",
        "test": "hf://datasets/climatebert/climate_commitments_actions/data/test-00000-of-00001-77f76c0960abb9c6.parquet"
    },
    "environmental_claims": {
        "train": "hf://datasets/climatebert/environmental_claims/data/train-00000-of-00001-98aa5228a06a17d0.parquet",
        "validation": "hf://datasets/climatebert/environmental_claims/data/validation-00000-of-00001-2553e47d408fab28.parquet",
        "test": "hf://datasets/climatebert/environmental_claims/data/test-00000-of-00001-79fd931297fff765.parquet"
    },
    "climate_detection": {
        "train": "hf://datasets/climatebert/climate_detection/data/train-00000-of-00001-4b831beb8839bf3e.parquet",
        "test": "hf://datasets/climatebert/climate_detection/data/test-00000-of-00001-87f8706e009e9b75.parquet"
    }
}


### Load Dataset and Split Into Training and Testing

In [5]:
def load_data(dataset_name):
    """Read the train and test parquet files of one registered dataset.

    Args:
        dataset_name: Key of the module-level ``datasets`` registry.

    Returns:
        Tuple ``(train_df, test_df)`` with the results of ``pd.read_parquet``
        for the "train" and the "test" entry, in that order.

    Raises:
        ValueError: If ``dataset_name`` is not a key of ``datasets``.
    """
    if dataset_name not in datasets:
        raise ValueError(f"Dataset '{dataset_name}' is not defined.")

    split_paths = datasets[dataset_name]
    # Only these two splits are loaded; any other split in the registry is ignored.
    return tuple(pd.read_parquet(split_paths[split]) for split in ("train", "test"))

# Load every registered dataset and publish its splits as module-level
# variables named train_<dataset_name> and test_<dataset_name>. The
# preprocessing cell further down reads them back through globals(), so these
# names must stay in sync with that lookup.
for dataset_name in datasets.keys():
    globals()[f"train_{dataset_name}"], globals()[f"test_{dataset_name}"] = load_data(dataset_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Import Tokenizer

In [6]:
from transformers import BertTokenizer

# Tokenizer of the "bert-base-uncased" checkpoint, the same checkpoint the
# classification model is initialised from later in this notebook.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

### Tokenize Text Data and Prepare Encodings and Labels

In [7]:
def preprocess_data(df, text_column, label_column, max_length=512):
    """Tokenize one text column of ``df`` and collect the matching labels.

    Uses the module-level ``tokenizer``.

    Args:
        df: DataFrame holding the texts and labels.
        text_column: Name of the column with the input texts.
        label_column: Name of the column with the labels.
        max_length: Maximum sequence length passed to the tokenizer; longer
            inputs are truncated.

    Returns:
        Tuple ``(encodings, labels)``: the tokenizer output (requested as
        PyTorch tensors, padded) and the labels as a plain Python list.
    """
    texts = df[text_column].tolist()
    tokenizer_options = dict(
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )
    encodings = tokenizer(texts, **tokenizer_options)

    return encodings, df[label_column].tolist()

### Preprocess Datasets in Training and Test Data

In [8]:
# Tokenize the train and test split of every registered dataset once and keep
# the (encodings, labels) pairs keyed by dataset name and split.
preprocessed_data = {}

for dataset_name in datasets:
    # The DataFrames were published as module-level variables by the loading cell.
    train_df = globals()["train_" + dataset_name]
    test_df = globals()["test_" + dataset_name]

    train_encodings, train_labels = preprocess_data(train_df, "text", "label")
    test_encodings, test_labels = preprocess_data(test_df, "text", "label")

    preprocessed_data[dataset_name] = dict(
        train=(train_encodings, train_labels),
        test=(test_encodings, test_labels),
    )

### Encodings and Labels Wrapped Into a PyTorch Dataset


In [9]:
from torch.utils.data import Dataset

class ClimateDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Each item is a dict holding, for every key of ``encodings``, the entry at
    the requested index, plus a ``'labels'`` tensor built from the label at
    that index.
    """

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> indexable collection (one entry per example)
        # labels: indexable collection with one label per example
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            item[key] = val[idx]
        item['labels'] = torch.tensor(self.labels[idx])
        return item

### Efficient Batch Processing for Training and Testing


In [10]:
from torch.utils.data import DataLoader

# Batch iterators per dataset: a shuffled training loader (batch size 16) and
# an ordered test loader (batch size 64).
dataloaders = {}

for dataset_name, splits in preprocessed_data.items():
    train_encodings, train_labels = splits["train"]
    test_encodings, test_labels = splits["test"]

    train_dataset = ClimateDataset(train_encodings, train_labels)
    test_dataset = ClimateDataset(test_encodings, test_labels)

    dataloaders[dataset_name] = dict(
        train=DataLoader(train_dataset, batch_size=16, shuffle=True),
        test=DataLoader(test_dataset, batch_size=64, shuffle=False),
    )


### Choosing the BERT Model


In [11]:
from transformers import BertForSequenceClassification

# Initial classifier with two output labels. The fine-tuning cell further down
# replaces it with a freshly loaded model whose label count matches the
# selected dataset.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Define the Optimizer

In [12]:
# transformers.AdamW is deprecated and no longer available in recent
# transformers releases; torch.optim.AdamW is the supported replacement.
# The name AdamW stays bound because later cells use it as well.
from torch.optim import AdamW

# eps and weight_decay are set to the defaults of the former transformers
# implementation (torch's own defaults are 1e-8 and 0.01), so the optimisation
# behaves as it did before the switch.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)



### Choosing the Processing Unit

A CPU-only runtime with 12 GB of RAM was not sufficient for training the model, so we used a GPU runtime in the Colab environment.

In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the selected device (as the last expression of the cell,
# the returned module is also displayed).
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

### Training Loop for One Dataset

In [14]:
def train_model(model, optimizer, train_loader, test_loader, device, epochs=3):
    """Fine-tune ``model`` for ``epochs`` passes, then evaluate it once.

    After every epoch one line is printed with the loss summed over all
    batches of that epoch and the training accuracy (correct predictions
    divided by ``len(train_loader.dataset)``). When all epochs are done,
    ``evaluate_model`` is called with ``test_loader``.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``loss`` and ``logits``.
        optimizer: Optimizer updating the model's parameters.
        train_loader: Iterable of batches (dicts with the keys 'input_ids',
            'attention_mask' and 'labels') that also has a ``dataset`` attribute.
        test_loader: Loader handed to ``evaluate_model``.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.
    """
    batch_fields = ('input_ids', 'attention_mask', 'labels')

    for epoch_number in range(1, epochs + 1):
        model.train()
        running_loss = 0
        num_correct = 0

        for batch in train_loader:
            moved = {field: batch[field].to(device) for field in batch_fields}

            outputs = model(**moved)
            batch_loss = outputs.loss
            logits = outputs.logits

            # Standard update step: clear old gradients, backpropagate, apply.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()
            hits = torch.argmax(logits, dim=1) == moved['labels']
            num_correct += hits.sum().item()

        train_accuracy = num_correct / len(train_loader.dataset)
        print(f"Epoch {epoch_number}: Loss = {running_loss:.4f}, Accuracy = {train_accuracy:.4f}")

    evaluate_model(model, test_loader, device)

### Loss-Evaluation Function

In [15]:
def evaluate_model(model, test_loader, device):
    """Print the summed loss and the accuracy of ``model`` on ``test_loader``.

    The model is switched to evaluation mode and run without gradient
    tracking. The reported loss is the sum of the per-batch losses; the
    accuracy is the number of correct predictions divided by
    ``len(test_loader.dataset)``.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``loss`` and ``logits``.
        test_loader: Iterable of batches (dicts with the keys 'input_ids',
            'attention_mask' and 'labels') that also has a ``dataset`` attribute.
        device: Device the batch tensors are moved to.
    """
    batch_fields = ('input_ids', 'attention_mask', 'labels')

    model.eval()
    summed_loss = 0
    num_correct = 0

    with torch.no_grad():
        for batch in test_loader:
            moved = {field: batch[field].to(device) for field in batch_fields}

            outputs = model(**moved)
            batch_loss = outputs.loss
            logits = outputs.logits

            summed_loss += batch_loss.item()
            hits = torch.argmax(logits, dim=1) == moved['labels']
            num_correct += hits.sum().item()

    test_accuracy = num_correct / len(test_loader.dataset)
    print(f"Test Loss = {summed_loss:.4f}, Test Accuracy = {test_accuracy:.4f}")

### Fine Tuning BERT with Loaded Datasets

Initially, we were uncertain about the dataset selection. After careful consideration, we chose climate_sentiment to answer our research question. The code is flexible and can easily train the model on other datasets by changing the dataset_name.

In [19]:
from datetime import datetime

# Dataset to fine-tune on; must be a key of `dataloaders`.
dataset_name = "climate_sentiment"
if dataset_name in dataloaders:
    print(f"Training on {dataset_name} dataset...")

    loaders = dataloaders[dataset_name]
    train_loader = loaders["train"]
    test_loader = loaders["test"]

    # climate_sentiment is trained with three output classes, every other dataset with two.
    num_labels = 3 if dataset_name == "climate_sentiment" else 2

    try:
        # Start from a fresh pretrained checkpoint so earlier runs do not leak into this one.
        model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
        # torch.optim.AdamW replaces the deprecated transformers.AdamW, which is
        # no longer available in recent transformers releases. eps and
        # weight_decay are the defaults of the former transformers
        # implementation, so training behaves as before.
        optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
        model.to(device)

        print(f"Started training on {dataset_name} at {datetime.now()}")
        train_model(model, optimizer, train_loader, test_loader, device, epochs=3)

        # NOTE(review): the inference section later loads the model from
        # ".../NLP analysis/climate_models/climate_sentiment_model", which is a
        # different directory from this save path — confirm the saved model is
        # moved there, or align the two paths.
        save_path = f"/content/drive/MyDrive/Big Data Project/NLP analysis/{dataset_name}_model"
        model.save_pretrained(save_path)
        print(f"Model for {dataset_name} saved at {save_path}!")

    except RuntimeError as e:
        # Report the failure instead of aborting the notebook, and release
        # cached GPU memory.
        print(f"Error during training on {dataset_name}: {e}")
        torch.cuda.empty_cache()

    finally:
        # Drop the references to the loaders, model and optimizer and free the
        # memory they held, whether or not training succeeded.
        del train_loader, test_loader
        del model, optimizer
        gc.collect()
        torch.cuda.empty_cache()

Training on climate_sentiment dataset...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Started training on climate_sentiment at 2024-12-08 13:55:19.182877
Epoch 1: Loss = 42.3371, Accuracy = 0.7120
Epoch 2: Loss = 21.3749, Accuracy = 0.8770
Epoch 3: Loss = 16.0341, Accuracy = 0.9050
Test Loss = 2.7097, Test Accuracy = 0.8031
Model for climate_sentiment saved at /content/drive/MyDrive/Big Data Project/NLP analysis/climate_sentiment_model!


# Creating ESG-Reports Dataset



### Necessary Libraries

In [20]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.25.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/20.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/20.0 MB[0m [31m144.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/20.0 MB[0m [31m110.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m12.2/20.0 MB[0m [31m114.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m20.0/20.0 MB[0m [31m211.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m20.0/20.0 MB[0m [31m211.0 MB/s[0m eta [36m0:00:01[0m

In [21]:
import fitz
import os
import json
import re

### Extract Text from PDFs

In [None]:
# Folder with the source PDF reports. The trailing space is part of the path
# string as written here.
pdf_directory = "/content/drive/MyDrive/Big Data Project/NLP analysis/text_files/Sustainability Report "
# Folder that receives one JSON file with the extracted text per PDF.
output_json_dir = "/content/drive/MyDrive/Big Data Project/NLP analysis/text_files/Extracted Text"

# Create the output folder if it does not exist yet.
os.makedirs(output_json_dir, exist_ok=True)

def extract_text_as_json(pdf_path, output_file):
    """Extract the text of every page of a PDF and store it as a JSON file.

    The written document has the layout::

        {"file_name": <basename of pdf_path>,
         "pages": [{"page_number": <int, starting at 1>, "text": <str>}, ...]}

    Args:
        pdf_path: Path of the PDF to read.
        output_file: Path of the JSON file to write (overwritten if present).
    """
    pdf_data = {"file_name": os.path.basename(pdf_path), "pages": []}

    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text()
            pdf_data["pages"].append({"page_number": page_num, "text": text})

    # Explicit encoding: the cells that read JSON files in this notebook open
    # them as UTF-8, so the writer should not depend on the platform default.
    with open(output_file, "w", encoding="utf-8") as outfile:
        json.dump(pdf_data, outfile, indent=4)

def process_pdfs_to_json(pdf_directory, output_json_dir):
    """Convert every ``.pdf`` file of a folder into a JSON text file.

    For each entry of ``pdf_directory`` whose name ends with ".pdf",
    ``extract_text_as_json`` is called and the result is written to
    ``output_json_dir`` under the same base name with a ".json" extension.
    A status line is printed per file.

    Args:
        pdf_directory: Folder that is scanned (not recursively) for PDFs.
        output_json_dir: Folder that receives the JSON files.
    """
    pdf_names = (name for name in os.listdir(pdf_directory) if name.endswith(".pdf"))

    for name in pdf_names:
        source = os.path.join(pdf_directory, name)
        stem, _extension = os.path.splitext(name)
        target = os.path.join(output_json_dir, f"{stem}.json")

        if os.path.exists(source):
            extract_text_as_json(source, target)
            print(f"Saved extracted text to {target}")
        else:
            # The entry disappeared between listing and processing.
            print(f"File not found: {source}. Skipping...")

# Convert every PDF in pdf_directory into a JSON file in output_json_dir.
process_pdfs_to_json(pdf_directory, output_json_dir)

### Keep Only Climate Relevant Paragraphs

We use the list of climate-related vocabulary from Binger et al. (2024). However, we leave out the following keywords: "sustainable", "sustainability", "environmental" and "ESG". These terms are omitted because they appear on every page, even when the page contains nothing about climate-related topics.







In [33]:
# Input: JSON files with the extracted report text.
text_folder = '/content/drive/MyDrive/Big Data Project/NLP analysis/text_files/Extracted Text'
# Output: the same files reduced to the paragraphs that mention a climate keyword.
output_folder = '/content/drive/MyDrive/Big Data Project/NLP analysis/text_files/climate_only_paragraphs'

# Create the output folder if it does not exist yet.
os.makedirs(output_folder, exist_ok=True)

# Climate-related vocabulary used to decide whether a paragraph is kept.
keywords = [
    "air quality", "bushfire", "carbon", "CH4", "climate", "climate-related", "CO2", "coal",
    "decarbonization", "decarbonisation", "deforestation", "drought", "emission",
    "energy consumption", "energy efficiency", "energy efficient", "energy transition",
    "footprint", "fossil", "GHG", "global warming", "greenhouse",
    "heat wave", "hurricane", "land use", "litigation risk", "low-carbon", "methane", "N2O",
    "natural hazard", "nitrous oxide", "O3", "ozone", "Paris Agreement", "physical risk",
    "renewable", "rural fire", "sea level", "social responsibility", "solar energy", "TCFD",
    "temperature rise", "transition risk", "tropical cyclone", "tropical storm", "typhoon",
    "weather", "wildfire", "wildland fire", "wind energy"
]

# Case-insensitive, whole-word match of any keyword.
# NOTE(review): because of the trailing \b, plural forms such as "emissions"
# or "renewables" do not match their singular keyword — confirm this is intended.
keywords_regex = re.compile(r'\b(?:' + '|'.join(re.escape(word) for word in keywords) + r')\b', re.IGNORECASE)


def _load_document_text(data):
    """Return the document text from any JSON layout used in this notebook.

    Checked in order: a non-blank top-level 'text', a non-blank top-level
    'content' (the key the analysis cell reads), and finally the per-page
    'text' entries of a 'pages' list (the layout written by
    extract_text_as_json), joined with blank lines.
    """
    for key in ('text', 'content'):
        value = data.get(key)
        if isinstance(value, str) and value.strip():
            return value

    pages = data.get('pages')
    if isinstance(pages, list):
        return '\n\n'.join(
            page.get('text', '') for page in pages if isinstance(page, dict)
        )
    return ''


def filter_climate_segments(input_folder, output_folder):
    """Reduce every JSON document of a folder to its climate-related paragraphs.

    Each ``.json`` file of ``input_folder`` is loaded, its text is split on
    blank lines, and only the paragraphs in which ``keywords_regex`` finds a
    match are kept (stripped, re-joined with blank lines). The document is
    then written to ``output_folder`` under the same file name.

    The filtered text is stored under both 'text' and 'content': 'text' is the
    key this function has always written, and 'content' is the key the
    sentiment-analysis cell reads, so that cell actually receives the filtered
    text. All other keys of the document are passed through unchanged.

    Args:
        input_folder: Folder with the JSON documents to filter.
        output_folder: Existing folder that receives the filtered documents.
    """
    for file_name in filter(lambda f: f.endswith('.json'), os.listdir(input_folder)):
        input_path = os.path.join(input_folder, file_name)
        output_path = os.path.join(output_folder, file_name)

        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Previously only a top-level 'text' key was read, which the extraction
        # step never writes, so every document was filtered down to ''.
        text = _load_document_text(data)
        filtered_segments = [
            segment.strip() for segment in text.split('\n\n') if keywords_regex.search(segment)
        ]
        filtered_text = '\n\n'.join(filtered_segments)
        data['text'] = filtered_text
        data['content'] = filtered_text

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4, ensure_ascii=False)

# Filter every extracted document and write the result to output_folder.
filter_climate_segments(text_folder, output_folder)

# Sentiment Score from the climate_sentiment_model




### Necessary libraries

In [25]:
import torch
import re
import os
import json
import csv
import pandas as pd

### Importing the climate_sentiment_model

The climate_sentiment_model classifies text into three categories.

Risk: Business Risk due to climate change

Neutral: No business impact due to climate change

Opportunity: Business Opportunity due to climate change

In [36]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Run inference on the GPU when CUDA is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Nutze Gerät: {device}")

# Folder with the filtered report texts that are scored below.
text_folder = '/content/drive/MyDrive/Big Data Project/NLP analysis/text_files/climate_only_paragraphs'
base_model = "bert-base-uncased"
# NOTE(review): the training cell saves the model to
# ".../NLP analysis/climate_sentiment_model", whereas this path points to
# ".../NLP analysis/climate_models/climate_sentiment_model" — confirm the saved
# model was moved here.
model_dir = "/content/drive/MyDrive/Big Data Project/NLP analysis/climate_models/climate_sentiment_model"

# The tokenizer comes from the base checkpoint; the fine-tuned weights come from model_dir.
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)

# Store the tokenizer files next to the model weights.
tokenizer.save_pretrained(model_dir)

Nutze Gerät: cuda


('/content/drive/MyDrive/Big Data Project/NLP analysis/climate_models/climate_sentiment_model/tokenizer_config.json',
 '/content/drive/MyDrive/Big Data Project/NLP analysis/climate_models/climate_sentiment_model/special_tokens_map.json',
 '/content/drive/MyDrive/Big Data Project/NLP analysis/climate_models/climate_sentiment_model/vocab.txt',
 '/content/drive/MyDrive/Big Data Project/NLP analysis/climate_models/climate_sentiment_model/added_tokens.json',
 '/content/drive/MyDrive/Big Data Project/NLP analysis/climate_models/climate_sentiment_model/tokenizer.json')

### Split Dataset into Readable Chunks

In [27]:
def split_text_into_segments(text, max_length=512, tokenize=None):
    """Group consecutive paragraphs into segments of at most ``max_length`` tokens.

    ``text`` is split into paragraphs on blank lines. Paragraphs are appended
    to the current segment (joined with a single space) until adding the next
    one would exceed ``max_length`` tokens; then a new segment is started.
    Blank paragraphs are skipped, so empty input yields an empty list and no
    empty segment is ever produced. A single paragraph longer than
    ``max_length`` is not split here; it becomes a segment of its own.

    Args:
        text: Text to split.
        max_length: Token budget per segment.
        tokenize: Callable mapping a string to its list of tokens. Defaults to
            the ``tokenize`` method of the module-level ``tokenizer``.

    Returns:
        List of segment strings in document order.
    """
    if tokenize is None:
        tokenize = tokenizer.tokenize

    segments = []
    current_segment = []
    current_length = 0

    for paragraph in re.split(r'\n\s*\n', text):
        if not paragraph.strip():
            continue

        token_length = len(tokenize(paragraph))

        # Flush only a non-empty segment: without this guard an over-long
        # first paragraph caused an empty string to be emitted as a segment.
        if current_segment and current_length + token_length > max_length:
            segments.append(" ".join(current_segment))
            current_segment = []
            current_length = 0

        current_segment.append(paragraph)
        current_length += token_length

    if current_segment:
        segments.append(" ".join(current_segment))

    return segments

### Functions for Label Prediction and Segment Analysis

Furthermore, we remap the risk label from 1 to -1 so that an overall sentiment score can be calculated for every ESG report.

In [41]:
def predict_label(segment):
    """Classify one text segment with the module-level ``model``.

    The segment is encoded with the module-level ``tokenizer`` (truncated to
    512 tokens), moved to ``device`` and passed through the model.

    Args:
        segment: Text to classify.

    Returns:
        Tuple ``(mapped_label, probabilities)``: the predicted class remapped
        to a score (0 -> 0, 1 -> -1, 2 -> 1) and the softmax probabilities of
        the classes as a NumPy array.
    """
    inputs = tokenizer(segment, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Inference only: without no_grad every call builds an autograd graph that
    # is never used and only costs memory.
    with torch.no_grad():
        logits = model(**inputs).logits

    probabilities = torch.softmax(logits, dim=1).detach().cpu().numpy()[0]
    predicted_class = int(torch.argmax(logits, dim=1).item())

    # NOTE(review): this mapping treats class 0 as neutral, class 1 as risk and
    # class 2 as opportunity. Confirm this against the label encoding of the
    # climate_sentiment training data; if risk is encoded as 0 and neutral as 1
    # there, the mapping must be {0: -1, 1: 0, 2: 1}.
    label_mapping = {0: 0, 1: -1, 2: 1}
    mapped_label = label_mapping[predicted_class]

    return mapped_label, probabilities

def analyze_file(file_path):
    """Score one JSON document segment by segment.

    The document text is split with ``split_text_into_segments`` and every
    non-blank segment is classified with ``predict_label``.

    Args:
        file_path: Path of a UTF-8 JSON file holding the text under the key
            "content" or, when that key is absent, under "text".

    Returns:
        Tuple ``(overall_index, segment_results)``. ``overall_index`` is the
        mean segment label multiplied by 5 (0 when there are no segments);
        ``segment_results`` is a list of dicts with the keys "segment",
        "label" and "probabilities".
    """
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # The filtering cell stores its result under "text"; fall back to that key
    # when the document has no "content" entry. A JSON null is treated as "".
    text = data.get("content", data.get("text", "")) or ""

    segment_results = []
    for segment in split_text_into_segments(text):
        # Never score blank segments: an empty document must yield index 0
        # instead of the model's prediction for an empty string.
        if not segment.strip():
            continue
        label, probabilities = predict_label(segment)
        segment_results.append({
            "segment": segment,
            "label": label,
            "probabilities": probabilities
        })

    if segment_results:
        mean_label = sum(r["label"] for r in segment_results) / len(segment_results)
    else:
        mean_label = 0

    # Labels lie in [-1, 1]; the factor 5 scales the index to [-5, 5].
    overall_index = mean_label * 5
    return overall_index, segment_results

### Sentiment Analysis of Every ESG-Report

In [42]:
# Score every JSON file of the filtered-text folder; results are keyed by file name.
text_folder = '/content/drive/MyDrive/Big Data Project/NLP analysis/text_files/climate_only_paragraphs'
results = {}

for filename in os.listdir(text_folder):
    file_path = os.path.join(text_folder, filename)
    # Skip sub-folders and anything that is not a JSON file.
    if not (os.path.isfile(file_path) and filename.endswith(".json")):
        continue
    overall_index, segment_results = analyze_file(file_path)
    results[filename] = dict(overall_index=overall_index, segment_results=segment_results)

Token indices sequence length is longer than the specified maximum sequence length for this model (872 > 512). Running this sequence through the model will result in indexing errors


### Generate Random Sample for Manual Analysis

In [43]:
import random

num_examples = 10
# Fixed seed so that every run draws the same examples; otherwise the segments
# reviewed manually cannot be reproduced.
sample_seed = 42

# Flatten the per-file results into one list of segments. Files are visited in
# sorted order so the list (and thus the seeded sample) does not depend on the
# order in which the files were listed by the operating system.
segments = []
for file, result in sorted(results.items()):
    for segment_result in result['segment_results']:
        segments.append({
            "file": file,
            "content": segment_result['segment'],
            "label": segment_result['label'],
            "probability": segment_result['probabilities']
        })

# Draw at most num_examples segments without replacement.
random_examples = random.Random(sample_seed).sample(segments, min(num_examples, len(segments)))

for idx, example in enumerate(random_examples, 1):
    print(f"Example {idx}:")
    print(f"File: {example['file']}")
    print(f"Content: {example['content']}")
    print(f"Label: {example['label']}")
    print(f"Probability: {example['probability']}")
    print("-" * 40)

Example 1:
File: 45.2022.pdf.json
Content: y
Encourage international cooperation
The Chamber actively supported U.S. participation in the Paris 
Agreement and facilitated constructive business engagement 
with policymakers during COP27.
At the end of 2022, Delta withdrew its membership from the 
Chamber for reasons unrelated to climate or policy matters. 
However, Delta was a member of the U.S. Chamber’s Task 
Force on Climate Actions, which was established by member 
companies seeking to influence the climate policy positioning 
of the Chamber from within and provides a platform for direct 
business engagement with diverse stakeholders influencing 
climate policy development. Through the Task Force and in 
other forums, Delta advocated for climate policies that support 
our sectoral interests, such as incentives for SAF and related 
research and development to support advanced propulsion 
systems and next-generation fuels. 
59
Introduction
Safety
People
Environment
Climate Lobbying
Co

### Save Results for Further Analysis

In [None]:
from natsort import natsorted

# Destination of the per-report sentiment index.
output_csv = '/content/drive/MyDrive/Big Data Project/NLP analysis/sentiment_climate_only_text.csv'

# (file name, overall index) pairs in natural file-name order.
sorted_results = natsorted(
    ((filename, data["overall_index"]) for filename, data in results.items()),
    key=lambda pair: pair[0],
)

# One row per report, written without the DataFrame index.
df = pd.DataFrame(sorted_results, columns=["Filename", "Overall Index"])
df.to_csv(output_csv, index=False, encoding='utf-8')