In [4]:
import re
import json
from collections import defaultdict

def classify_text(text, keywords):
    """
    Classify the given text into ESG categories based on keyword matches.

    Args:
        text (str): Input text to classify.
        keywords (dict): Dictionary of ESG categories with associated keywords.

    Returns:
        dict: Dictionary with ESG categories and matching text sections.
    """
    classifications = defaultdict(list)

    # Tokenize text into sentences or paragraphs
    segments = re.split(r'\n|\.', text)

    for segment in segments:
        match_counts = {category: 0 for category in keywords.keys()}

        for category, keyword_list in keywords.items():
            for keyword in keyword_list:
                if re.search(rf'\b{keyword}\b', segment, re.IGNORECASE):
                    match_counts[category] += 1

        # Determine the category based on match counts
        max_matches = max(match_counts.values())
        top_categories = [cat for cat, count in match_counts.items() if count == max_matches]

        if max_matches == 0:
            continue  # Skip segment if no matches

        if len(top_categories) > 1:
            # Resolve tie by assigning to all
            classifications["Governance"].append(segment.strip())
            classifications["Social"].append(segment.strip())
            classifications["Environmental"].append(segment.strip())
        else:
            # Assign to the category with the most matches
            classifications[top_categories[0]].append(segment.strip())

    return classifications

def load_keywords():
    """
    Load predefined ESG keywords for classification.

    Returns:
        dict: Dictionary with ESG categories and their associated keywords.
    """
    return {
        "Environmental": [ "greenhouse gas", "waste production", "renewable energy", "water consumption", "climate change", 
                          "pollution", 'CO2 emissions', 'environmental impact', 'sustainable energy','environmentally sustainable',
                          'net-zero emissions','environmental management', 'waste reduction','water resource management',
                          'energy conservation', 'deforestation', 'decarbonisation', 'Brown Industries', 'Clean Technology',
                          'Fossil Fuels', 'Green Industries', 'Green Bonds',
                         ],
        "Social": ['employee diversity','workplace inclusion','workplace equality','employee well-being','human rights compliance',
                   'workforce satisfaction','supplier responsibility','social impact','CSR activities','workplace safety',
                   "employee training", "diversity", "inclusion", "community engagement", 'conflict', 'employee relations'
                   "health and safety", "workplace accidents", "staff turnover", "social initiatives", 'Community Impact Investing',
                   
                        ],
        "Governance": ['leadership accountability','board diversity','business integrity','corporate disclosure', 'strategic risk mitigation',
                       'corruption prevention', 'stakeholder communication', 'regulatory adherence', 'business ethics', "anti-corruption", 
                       "data privacy", "executive pay", "compliance", "supplier audits", "governance diversity", "ethical policies",
                       'bribery and corruption', 'tax strategy', 'political lobbying and donations', 'broad diversity', 'benchmarking',
                       'corporate governance', 'Board of Directors', 'Engagement', 'Stewardship', 'ESG Fund Ratings', 'ESG Integration', 
                       'Shareholder Activism', 'Proxy Voting', 'Ethical Investing', 
                        ]
    }

def save_results(results, output_file):
    """
    Save classification results to a JSON file.

    Args:
        results (dict): Classification results.
        output_file (str): File path to save results.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)


In [5]:
import os
def process_pdf_directory(directory_path):
        """
        Process all PDFs in a directory
        
        Args:
            directory_path (str): Path to directory with PDFs
        
        Returns:
            list: Extracted metrics from all PDFs
        """
        
        for filename in os.listdir(directory_path):
            if filename.endswith('_normalized.txt'):
                input_file = os.path.join(directory_path, filename)
                print(input_file)  # Ensure the text file is preprocessed or plain text
                output_file = input_file[:-15]+"_classification_results.json"
                
                with open(input_file, 'r', encoding='utf-8') as f:
                    text = f.read()
                
                # Load predefined ESG keywords
                keywords = load_keywords()
                
                # Classify the text
                classifications = classify_text(text, keywords)
                
                # Save the results
                save_results(classifications, output_file)
                
                print(f"Classification completed. Results saved to {output_file}")


In [6]:
files_directory_pdfplumber = './pdfplumber'
files_directory_pypdf2 = './pypdf2'
files_directory_textract = './textract'
# Process PDFs and extract metrics
# cleaned_text = process_pdf_directory(pdf_directory)
process_pdf_directory(files_directory_pdfplumber)
process_pdf_directory(files_directory_pypdf2)
process_pdf_directory(files_directory_textract)

./pdfplumber/Fuchs_2022_normalized.txt
Classification completed. Results saved to ./pdfplumber/Fuchs_2022_classification_results.json
./pdfplumber/Evotec_2022_normalized.txt
Classification completed. Results saved to ./pdfplumber/Evotec_2022_classification_results.json
./pdfplumber/GrandCity_2017_normalized.txt
Classification completed. Results saved to ./pdfplumber/GrandCity_2017_classification_results.json
./pdfplumber/GEAGroup_2017_normalized.txt
Classification completed. Results saved to ./pdfplumber/GEAGroup_2017_classification_results.json
./pdfplumber/Dürr_2020_normalized.txt
Classification completed. Results saved to ./pdfplumber/Dürr_2020_classification_results.json
./pdfplumber/Dürr_2022_normalized.txt
Classification completed. Results saved to ./pdfplumber/Dürr_2022_classification_results.json
./pdfplumber/Aixtron_2020_normalized.txt
Classification completed. Results saved to ./pdfplumber/Aixtron_2020_classification_results.json
./pdfplumber/CompuGroup_2023_normalized.txt
Cl