In [10]:
import re
import pandas as pd
import os

In [11]:
# Extract the year from the file name
def extract_year_from_filename(filename):
    
    # Use a slightly modified regex pattern
    match = re.findall(r'(20\d{2})', filename)
    
    print(f"Matches found: {match}")
    
    if match:
        return max(map(int, match))
    else:
        raise ValueError("No valid year found in the file name.")

In [12]:
# Convert textual references to years
def convert_textual_years(text, current_year):
    """
    Converts textual references to specific years based on the current year.
    
    Args:
        text (str): The input text to process
        current_year (int): The current year to use as a reference
    
    Returns:
        str: Text with year references converted to specific years
    """
    # Phrases for current year
    current_year_phrases = [
        r'\bthis year\b', 
        r'\bcurrent year\b', 
        r'\breporting year\b', 
        r'\bannual period\b', 
        r'\byear under review\b', 
        r'\breporting period\b', 
        r'\bpresent year\b'
    ]
    
    # Phrases for previous year
    previous_year_phrases = [
        r'\bprevious year\b', 
        r'\blast year\b',
        r'\bpast year\b',
        r'\bpreceding year\b', 
        r'\bprior year\b', 
        r'\bcomparative year\b', 
        r'\byear on year\b', 
        r'\bbaseline year\b', 
        r'\breference year\b', 
        r'\bhistorical period\b'
    ]
    
    # Phrases for next year
    next_year_phrases = [
        r'\bnext year\b', 
        r'\bupcoming year\b', 
        r'\bforward looking year\b', 
        r'\bprospective period\b', 
        r'\bfuture fiscal year\b', 
        r'\bprojection period\b', 
        r'\banticipated period\b', 
        r'\bsubsequent year\b'
    ]
    
    # Replace current year phrases
    for phrase in current_year_phrases:
        text = re.sub(phrase, str(current_year), text, flags=re.IGNORECASE)
    
    # Replace previous year phrases
    for phrase in previous_year_phrases:
        text = re.sub(phrase, str(current_year-1), text, flags=re.IGNORECASE)
    
    # Replace next year phrases
    for phrase in next_year_phrases:
        text = re.sub(phrase, str(current_year + 1), text, flags=re.IGNORECASE)
    
    return text

In [13]:
# Other normalization functions
def standardize_units(text):
    """Standardizes units like 'kilograms' to 'kg', 'metric tons' to 'MT', etc."""
    unit_mapping = {
             # Mass Units
        r'\bkilograms?\b': 'kg',
        r'\bmetric tons?\b': 'MT',
        r'\btonnes?\b': 'MT',
        r'\bpounds?\b': 'lbs',
        r'\bgrammes?\b': 'g',
    
        # Energy Units
        r'\bkilowatt hours?\b': 'kWh',
        r'\bmegawatt hours?\b': 'MWh',
        r'\bjoules?\b': 'J',
        r'\bBTUs?\b': 'BTU',
        r'\bgigajoules?\b': 'GJ',

         # Volume Units
        r'\bliters?\b': 'L',
        r'\bcubic meters?\b': 'm³',
        r'\bgallons?\b': 'gal',
        r'\bmilliliters?\b': 'mL',
    
        # Distance/Area Units
        r'\bsquare kilometers?\b': 'km²',
        r'\bhectares?\b': 'ha',
        r'\bacres?\b': 'acre',
        r'\bsquare meters?\b': 'm²',
    
        # Temperature
        r'\bdegrees? celsius\b': '°C',
        r'\bdegrees? fahrenheit\b': '°F',
        r'\bkelvin\b': 'K',
    
        # Emission-specific Units
        r'\bcarbon dioxide\b': 'CO₂',
        r'\bmetric tons? of CO2\b': 'MT CO₂',
        r'\bcarbon equivalent\b': 'CO₂e',
    
        # Water-related Units
        r'\bcubic meters? of water\b': 'm³',
        r'\bliters? per day\b': 'L/day',
    
        # Percentage and Ratio
        r'\bper cent\b': '%',
        r'\bpercent\b': '%',
    
        # Monetary Units (if financial metrics are included)
        r'\bUS dollars?\b': 'USD',
        r'\bdollars?\b': 'USD',
        r'\beuro(s)?\b': 'EUR',
    
        # Waste and Recycling
        r'\bmetric tons? of waste\b': 'MT waste',
        r'\bkilograms? of waste\b': 'kg waste',
    
        # Renewable Energy
        r'\bmegawatts?\b': 'MW',
        r'\bkilowatts?\b': 'kW',
    }
    for pattern, replacement in unit_mapping.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    return text

In [14]:
def harmonize_terminology(text):
    """Harmonizes key terminology across the document."""
    term_mapping = {
    # Environmental Terminology
    r'\bcarbon footprint\b': 'CO2 emissions',
        
     r'\bcarbon emissions\b': 'CO2 emissions',
    r'\bgreen(house)? gas(ses)?\b': 'greenhouse gas emissions',
    r'\bclimate change\b': 'environmental impact',
    r'\bglobal warming\b': 'environmental impact',
    r'\brenewable energy\b': 'sustainable energy',
    r'\balternative energy\b': 'sustainable energy',
    r'\beco-friendly\b': 'environmentally sustainable',
    r'\benvironmentally friendly\b': 'environmentally sustainable',
    r'\bcarbon neutral\b': 'net-zero emissions',
    r'\bclimate neutrality\b': 'net-zero emissions',
    r'\bsustainability efforts?\b': 'CSR activities',
    r'\benvironmental stewardship\b': 'environmental management',
    r'\bnatural resource conservation\b': 'environmental management',
    r'\bwaste management\b': 'waste reduction',
    r'\brecycling\b': 'waste reduction',
    r'\bwater conservation\b': 'water resource management',
    r'\bwater saving\b': 'water resource management',
    r'\benergy efficiency\b': 'energy conservation',
    r'\benergy-saving\b': 'energy conservation',

    # Social Terminology
    r'\bworkforce diversity\b': 'employee diversity',
    r'\binclusivity\b': 'workplace inclusion',
    r'\binclusive workplace\b': 'workplace inclusion',
    r'\bequal opportunity\b': 'workplace equality',
    r'\bequal employment opportunity\b': 'workplace equality',
    r'\bwork-life balance\b': 'employee well-being',
    r'\bhealthy work environment\b': 'employee well-being',
    r'\bhuman rights\b': 'human rights compliance',
    r'\bhuman dignity\b': 'human rights compliance',
    r'\bemployee engagement\b': 'workforce satisfaction',
    r'\bemployer satisfaction\b': 'workforce satisfaction',
    r'\bsupply chain ethics\b': 'supplier responsibility',
    r'\bsupplier integrity\b': 'supplier responsibility',
    r'\bcommunity investment\b': 'social impact',
    r'\bsocial investment\b': 'social impact',
    r'\bcorporate social responsibility\b': 'CSR activities',
    r'\bCSR programs?\b': 'CSR activities',
    r'\bhealth and safety\b': 'workplace safety',
    r'\boccupational safety\b': 'workplace safety',
    r'\bworker safety\b': 'workplace safety',

    # Governance Terminology
    r'\bcorporate governance\b': 'leadership accountability',
    r'\bboard composition\b': 'board diversity',
    r'\bboard representation\b': 'board diversity',
    r'\bethical business\b': 'business integrity',
    r'\bethical practices\b': 'business integrity',
    r'\btransparency\b': 'corporate disclosure',
    r'\bopen communication\b': 'corporate disclosure',
    r'\brisk management\b': 'strategic risk mitigation',
    r'\brisk mitigation\b': 'strategic risk mitigation',
    r'\banti-corruption\b': 'corruption prevention',
    r'\banti-bribery\b': 'corruption prevention',
    r'\bstakeholder engagement\b': 'stakeholder communication',
    r'\bstakeholder involvement\b': 'stakeholder communication',
    r'\bcompliance\b': 'regulatory adherence',
    r'\blegal compliance\b': 'regulatory adherence',
    r'\bcorporate ethics\b': 'business ethics',
    r'\bethical standards\b': 'business ethics',
    }

    for pattern, replacement in term_mapping.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    return text

In [15]:
def normalize_dates(text):
    """Normalizes all date formats to ISO 8601 (YYYY-MM-DD)."""
    date_pattern = r'\b(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{2,4})\b'
    def format_date(match):
        day, month, year = match.groups()
        year = year if len(year) == 4 else f'20{year}'
        return f'{year}-{month.zfill(2)}-{day.zfill(2)}'
    return re.sub(date_pattern, format_date, text)

In [16]:
# def normalize_numbers(text):
#     """Removes commas from numbers (e.g., 1,000 -> 1000) and standardizes decimals."""
#     return re.sub(r'(\d+),(\d+)', r'\1\2', text)

In [17]:
# Load the data (assuming the text file is already cleaned and provided as raw text)
def process_cleaned_directory(directory_path):
        
        for filename in os.listdir(directory_path):
            if filename.endswith('.txt'):
                file_path = os.path.join(directory_path, filename)
                print(file_path)  # Ensure the text file is preprocessed or plain text
                
                with open(file_path, 'r') as f:
                    raw_data = f.read()
                
                # Apply normalization
                file_year = extract_year_from_filename(file_path)
                normalized_data = convert_textual_years(raw_data, file_year)
                normalized_data = standardize_units(normalized_data)
                normalized_data = harmonize_terminology(normalized_data)
                normalized_data = normalize_dates(normalized_data)
                # normalized_data = normalize_numbers(normalized_data)
                
                f = open(file_path[:-4]+"_normalized.txt", "a")
                f.write(normalized_data)
                f.close()
                
                print(f"Normalization and structuring completed. Data saved as {file_path}_normalized.txt using year {file_year}.")


In [18]:
files_directory_pdfplumber = './pdfplumber'
files_directory_pypdf2 = './pypdf2'
files_directory_textract = './textract'

# Process PDFs and extract metrics
# cleaned_text = process_pdf_directory(pdf_directory)
process_cleaned_directory(files_directory_pdfplumber)
process_cleaned_directory(files_directory_pypdf2)
process_cleaned_directory(files_directory_textract)


./pdfplumber/CompuGroup_2023.txt
Matches found: ['2023']
Normalization and structuring completed. Data saved as ./pdfplumber/CompuGroup_2023.txt_normalized.txt using year 2023.
./pdfplumber/GEAGroup_2022.txt
Matches found: ['2022']
Normalization and structuring completed. Data saved as ./pdfplumber/GEAGroup_2022.txt_normalized.txt using year 2022.
./pdfplumber/DeutscheBank_2019.txt
Matches found: ['2019']
Normalization and structuring completed. Data saved as ./pdfplumber/DeutscheBank_2019.txt_normalized.txt using year 2019.
./pdfplumber/HugoBoss_2019.txt
Matches found: ['2019']
Normalization and structuring completed. Data saved as ./pdfplumber/HugoBoss_2019.txt_normalized.txt using year 2019.
./pdfplumber/Dürr_2018.txt
Matches found: ['2018']
Normalization and structuring completed. Data saved as ./pdfplumber/Dürr_2018.txt_normalized.txt using year 2018.
./pdfplumber/CompuGroup_2017.txt
Matches found: ['2017']
Normalization and structuring completed. Data saved as ./pdfplumber/CompuG