In [12]:
#Task_1
import pandas as pd
from rapidfuzz import fuzz, utils  # For fast and effective string matching
import re
import unicodedata
import os

# Base weights for the features that contribute to the entity-match score.
# They sum to 1.0, so a weighted sum of [0, 1] similarities stays in [0, 1].
BASE_WEIGHTS = {
    'name': 0.5,
    'address': 0.4,   # Addresses may vary slightly due to formatting/language
    'country': 0.1,   # used mostly as fallback
}

MATCH_THRESHOLD = 0.75  # Probability at or above this threshold is considered a match

# Regex patterns that expand abbreviated company suffixes to one canonical token.
# Patterns are applied in insertion order to text that is already lowercased,
# so every replacement must itself be lowercase: a mixed-case replacement would
# never exactly equal the same suffix when it appears spelled out in the other
# record (string comparison downstream is case-sensitive).
ENTITY_SUFFIXES = {
    r'\bag\b': 'aktiengesellschaft',
    r'\bco\b': 'company',
    r'\bltd?\b': 'limited',
    r'\binc\b': 'incorporated',
    r'\bllc\b': 'limitedliabilitycompany',
    r'\bgmbh\b': 'gesellschaftmitbeschränkterhaftung',
}

def normalize_text(text):
    """
    Return a canonical form of ``text`` suitable for string comparison.

    Missing values (anything ``pd.isna`` flags) and blank strings yield "".
    Any other value is converted to ``str``, NFKC-normalized, run through
    rapidfuzz's ``default_process``, lowercased, has the ENTITY_SUFFIXES
    patterns substituted, has every character outside the word/whitespace
    classes replaced by a space, and finally has whitespace runs collapsed.
    """
    if pd.isna(text):
        return ""
    raw = str(text)
    if raw.strip() == '':
        return ""

    # NFKC unifies compatibility forms of the same character; it does not
    # strip diacritics, so accented letters survive this step.
    cleaned = utils.default_process(unicodedata.normalize('NFKC', raw)).lower()

    # Expand abbreviated company suffixes so "ltd" and "limited" compare equal.
    for suffix_pattern, expansion in ENTITY_SUFFIXES.items():
        cleaned = re.sub(suffix_pattern, expansion, cleaned)

    # Anything that is not a word character or whitespace becomes a space.
    cleaned = re.sub(r'[^\w\s\u4e00-\u9fff]', ' ', cleaned)

    # Collapse whitespace runs and trim the ends.
    collapsed = re.sub(r'\s+', ' ', cleaned)
    return collapsed.strip()

def calculate_similarity(s1, s2):
    """
    Score how alike two strings are, on a 0-to-1 scale.

    When both values are non-empty the score is rapidfuzz's
    ``token_sort_ratio`` divided by 100. Two empty values count as a perfect
    match (1.0); exactly one empty value counts as no match (0.0).
    """
    if s1 and s2:
        return fuzz.token_sort_ratio(s1, s2) / 100
    # At least one side is empty: equal only if the other side is empty too.
    return 0.0 if (s1 or s2) else 1.0

def _pair_probability(row):
    """Return the weighted match probability (rounded to 2 dp) for one record pair."""
    name_x = row['Name_x']
    name_y = row['Name_y']
    local_x = row.get('Local Name (if available)_x', '')
    local_y = row.get('Local Name (if available)_y', '')
    addr_x = row.get('Address_x', '')
    addr_y = row.get('Address_y', '')
    country_x = row.get('Country_x', '')
    country_y = row.get('Country_y', '')

    # The address only counts when both sides have one; a local name counts
    # as soon as either side has one.
    address_available = bool(addr_x.strip() and addr_y.strip())
    local_name_present = bool(local_x.strip() or local_y.strip())

    if address_available:
        name_weight = BASE_WEIGHTS['name']
        address_weight = BASE_WEIGHTS['address']
        country_weight = BASE_WEIGHTS['country']
    else:
        # No usable address: hand its weight to name (55%) and country (45%).
        name_weight = BASE_WEIGHTS['name'] + (BASE_WEIGHTS['address'] * 0.55)
        country_weight = BASE_WEIGHTS['country'] + (BASE_WEIGHTS['address'] * 0.45)
        address_weight = 0

    if local_name_present:
        # Split the name weight 60/40 between official and local name.
        local_weight = name_weight * 0.4
        name_weight *= 0.6
    else:
        local_weight = 0

    name_sim = calculate_similarity(name_x, name_y)
    local_sim = calculate_similarity(local_x, local_y) if local_name_present else 0
    address_sim = calculate_similarity(addr_x, addr_y) if address_available else 0
    country_sim = calculate_similarity(country_x, country_y)

    final_prob = (
        name_sim * name_weight +
        local_sim * local_weight +
        address_sim * address_weight +
        country_sim * country_weight
    )
    return round(final_prob, 2)


def process_file(input_path, output_path):
    """
    Score every candidate pair in a CSV file and write the labelled result.

    The name, local-name, address and country columns (``_x`` / ``_y``
    variants) that exist in the file are normalized in place with
    ``normalize_text``; the normalized text is therefore what appears in the
    returned frame and in the output CSV. Each row then gets a
    ``Probability`` (weighted similarity, 2 decimals) and a ``Match`` flag
    (1 when the probability is >= MATCH_THRESHOLD, else 0).

    Args:
        input_path: Path of the CSV to read. It must contain ``Name_x`` and
            ``Name_y``; the other compared columns are optional.
        output_path: Path of the CSV to write (UTF-8 with BOM). Its parent
            directory is created if it does not exist.

    Returns:
        The processed DataFrame, including the ``Match`` and ``Probability``
        columns.

    Raises:
        KeyError: If ``Name_x`` or ``Name_y`` is missing from the input.
    """
    df = pd.read_csv(input_path)

    text_columns = [
        'Name_x', 'Name_y',
        'Local Name (if available)_x', 'Local Name (if available)_y',
        'Address_x', 'Address_y',
        'Country_x', 'Country_y',
    ]
    for col in text_columns:
        if col in df.columns:
            df[col] = df[col].apply(normalize_text)

    # Plain dict records avoid the per-row Series that iterrows() builds.
    probabilities = [_pair_probability(record) for record in df.to_dict('records')]

    # Lists are assigned positionally, so the labels cannot be misaligned
    # with the frame's index.
    df['Match'] = [int(prob >= MATCH_THRESHOLD) for prob in probabilities]
    df['Probability'] = probabilities

    # exist_ok avoids the check-then-create race of a separate exists() test.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # utf-8-sig writes a BOM so spreadsheet tools detect the encoding.
    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    return df

# Run on input
input_file = 'task_1_data.csv'
output_file = r'C:\Users\omerm\Downloads\task_1_result.csv'  # Please input your output file path

result_df = process_file(input_file, output_file)

'''
# Approach & Design Decisions:

I implemented a hybrid fuzzy matching logic using rapidfuzz to compare fields like names, addresses, local names, and countries.

The matching logic dynamically adjusts weights based on data availability. For instance:

If an address is missing, its weight is redistributed—55% to name and 45% to country.

If a local name is available, 40% of the name weight is reallocated to it, as local names often translate closely to official names.

Text normalization includes multilingual character handling and suffix replacement, which improves accuracy when comparing international companies.

Unicode encoding (utf-8-sig) ensures language-specific characters are preserved in the output.

# Matching Logic:

A probability score is calculated for each pair using weighted similarity measures.

A match is confirmed when the score is ≥ 0.75.

# Observations:

Pairs scoring between 0.65 and 0.75 fall just below the match threshold and may require manual validation, since they are often borderline cases or have incomplete data.

# In future iterations, I plan to:

Incorporate semantic similarity (e.g., using embeddings or LLM-based name comparisons).

Improve logic when only one of the Local Name fields is populated.
'''

#Task_2 
The purpose of this script is to extract structured information from ZAF audit reports in PDF format, which are semi-structured and can vary in alignment/format. It extracts:

Codes like ZAF-xxx and ZAAxxxx

Fields like Workplace requirement, Status, Corrective actions, etc.

It is also designed to cope with:

Multi-line and overlapping fields with similar keywords

Irregular layouts with broken words or missing separators


In [14]:
#Task_2
import pdfplumber
import re
import pandas as pd
import os
from collections import OrderedDict

# Configuration
PDF_FILES = ['task_2_pdf_1.pdf', 'task_2_pdf_2.pdf']  # reports to parse, in processing order
OUTPUT_DIR = r'C:\Users\omerm\Downloads'  # machine-specific location; change before running elsewhere
OUTPUT_FILE = os.path.join(OUTPUT_DIR, 'task_2_result.csv')  # combined result for all reports
SUMMARY_PAGE = 10  # used as a 0-based index into pdf.pages, i.e. the 11th page of each report


def log(msg):
    """Print ``msg`` to standard output, prefixed with "[DEBUG] "."""
    print("[DEBUG] {}".format(msg))


def clean_text(text):
    """
    Tidy up a fragment of extracted text.

    Falsy input (``None`` or "") yields "". Otherwise whitespace runs are
    collapsed to one space, '.', ':' and ',' are followed by exactly one
    space when whitespace follows them, spaces around '-' are removed, and
    the result is stripped.
    """
    if not text:
        return ""

    # Applied in order; each step works on the previous step's output.
    substitutions = (
        (r'\s+', ' '),            # collapse whitespace
        (r'([.:,])\s+', r'\1 '),  # fix punctuation spacing
        (r'\s*-\s*', '-'),        # normalize dashes
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


def extract_zaa_code(pdf):
    """
    Find the report's ZAA code, searching from the last page backwards.

    Pages without extractable text are skipped. The first page (counting
    from the end) that contains at least one ``ZAA<digits>`` token wins, and
    the last such token on that page is returned. Returns "" when no page
    contains one.
    """
    zaa_pattern = re.compile(r'\bZAA\d+\b')
    page_texts = (page.extract_text() for page in reversed(pdf.pages))
    for content in page_texts:
        codes = zaa_pattern.findall(content) if content else []
        if codes:
            return codes[-1]
    return ""


def get_zaf_entries(pdf):
    """
    List the ZAF codes named on the summary page with their detail pages.

    The codes are read from page index SUMMARY_PAGE (duplicates removed,
    first-seen order kept). The document is then scanned once, front to
    back, to find the first page on which each code starts a line.

    Args:
        pdf: An open PDF object exposing ``pages``, each page providing
            ``extract_text()``.

    Returns:
        A list of ``{'zaf': code, 'page': page_index}`` dicts in summary
        order. Codes that never start a line on any page are logged and
        left out. Returns an empty list if the summary page has no text or
        if anything fails while reading the document (the error is logged).
    """
    try:
        summary_text = pdf.pages[SUMMARY_PAGE].extract_text()
        if not summary_text:
            log("Summary page has no text.")
            return []

        found_codes = re.findall(r'\bZAF[-\w]*\b', summary_text, re.I)
        zaf_codes = list(dict.fromkeys(found_codes))  # preserve order, remove duplicates

        # One compiled "code at start of a line" pattern per code.
        patterns = {
            code: re.compile(fr'^\s*{re.escape(code)}\b', re.I | re.M)
            for code in zaf_codes
        }

        # Single pass over the pages: each page's text is extracted at most
        # once instead of once per code, and the scan stops as soon as every
        # code has been located.
        first_page = {}
        for idx, page in enumerate(pdf.pages):
            if len(first_page) == len(patterns):
                break
            page_text = page.extract_text()
            if not page_text:
                continue
            for code, pattern in patterns.items():
                if code not in first_page and pattern.search(page_text):
                    first_page[code] = idx

        entries = []
        for code in zaf_codes:
            if code in first_page:
                entries.append({'zaf': code, 'page': first_page[code]})
            else:
                log(f"Could not locate ZAF code: {code}")
        return entries

    except Exception as err:
        # Best-effort boundary: a report that cannot be read (including one
        # with too few pages for SUMMARY_PAGE) yields no entries.
        log(f"Failed to extract ZAF entries from summary page: {err}")
        return []


def parse_zaf_page(text):
    """
    Parses the page text of a ZAF entry and extracts relevant structured fields.

    Args:
        text: Extracted text of the page holding one ZAF entry.

    Returns:
        An OrderedDict keyed by the field names in FIELD_ALIASES. Fields that
        cannot be found are set to "-"; 'Issue title' can instead be left as
        an empty string when nothing follows its header.
    """
    # Canonical field name -> header spellings that may introduce it.
    # The order matters: for the generic multi-line extraction below, the
    # headers of all LATER fields act as stop markers for a field's value.
    FIELD_ALIASES = OrderedDict([
        ('Code area', ['Code area']),
        ('Workplace requirement', ['Workplace requirement']),
        ('Issue title', ['Issue title']),
        ('Area of non-compliance/non-conformance', ['Area of non-compliance/non-conformance']),
        ('Local law reference', ['Local law reference']),
        ('Description', ['Description']),
        ('Corrective and preventative actions', [
            'Corrective and preventative actions', 'Corrective and preventative'
        ]),
        ('Time given to resolve', ['Time given to resolve']),
        ('Verification method', ['Verification method']),
        ('Status', ['Status']),
        ('Due', ['Due']),
    ])

    data = OrderedDict((field, '') for field in FIELD_ALIASES)

    # Normalize layout issues
    # A newline between two non-space characters becomes a single space, so
    # wrapped lines are rejoined. Note this inserts a space: a word broken
    # mid-token across lines stays split in two.
    text = re.sub(r'(\S)\n(\S)', r'\1 \2', text)
    text = re.sub(r'\n+', '\n', text)  # collapse newlines

    # Code area & Status
    # Capture what follows the "Code area" header up to the next known header
    # (or end of input).
    code_area_match = re.search(
        r"(?i)Code\s+area\s*[:•]?\s*(.+?)(?=\s*(Workplace requirement|Issue title|Area of|$))", text
    )
    raw_code = clean_text(code_area_match.group(1)) if code_area_match else ""
    # Keep only the part starting at the first digit.
    code_line_match = re.search(r"\b\d+.*", raw_code)
    if code_line_match:
        line = clean_text(code_line_match.group())
        # A trailing "Closed (...)" or "Open" (optionally starred) is the
        # status; everything before it is the code area.
        status_match = re.search(r"(.*?)\s*(Closed\s*\([^)]+\)\*?|Open\*?)$", line, re.I)
        data['Code area'] = clean_text(status_match.group(1)) if status_match else line
        data['Status'] = clean_text(status_match.group(2)) if status_match else "-"
    else:
        data['Code area'] = "-"
        data['Status'] = "-"

    # Workplace requirement
    # First grab everything between the header and the next known header.
    wr_match = re.search(
        r"(?i)Workplace\s+requirement\s*[:•]?\s*([\s\S]*?)(?=\s*(Issue title|Area of non-compliance|Description|Local law reference|$))",
        text, re.DOTALL
    )
    if wr_match:
        wr_text = clean_text(wr_match.group(1))
        # Prefer the portion that starts with a numeric prefix (digits,
        # optionally followed by ".<uppercase letter>" groups).
        wr_extract = re.search(
            r"(\d+(?:\.[A-Z])*\s+[\s\S]*?)(?=\s*(Issue title|Area of non-compliance|Description|Local law reference|$))",
            wr_text, re.DOTALL
        )
        if wr_extract:
            wr_clean = clean_text(wr_extract.group(1))
            # Cut at any header name that still slipped into the value.
            wr_clean = re.split(
                r'(?i)\b(Issue title|Area of non-compliance|Description|Local law reference|Verification method)\b',
                wr_clean
            )[0]
            data['Workplace requirement'] = wr_clean.strip()
        else:
            # No numeric prefix: fall back to the raw text between headers.
            data['Workplace requirement'] = wr_text
    else:
        data['Workplace requirement'] = "-"

    # Due date: "Due" followed by an ISO-style date (case-sensitive).
    due_match = re.search(r"\bDue\s+(\d{4}-\d{2}-\d{2})", text)
    data['Due'] = due_match.group(1) if due_match else "-"

    # Area of non-compliance/non-conformance
    # First attempt: the labels directly after the header.
    area_value = ''
    area_match = re.search(r"Area of non-compliance/non-conformance\s*[:•]?\s*(.*)", text, re.IGNORECASE)
    if area_match:
        after_heading = area_match.group(1)
        # Both groups are optional, so this can match an empty string; the
        # fallback below handles that case.
        match = re.search(r'\b(Local law)?\s*(Base code)?\b', after_heading, re.IGNORECASE)
        if match:
            vals = []
            if match.group(1): vals.append('Local law')
            if match.group(2): vals.append('Base code')
            area_value = " ".join(vals)

    # Fallback: look for the labels anywhere after the header, or in the
    # whole page when the header itself was not found.
    if not area_value:
        search_area = area_match.group(1) if area_match else text
        vals = []
        if re.search(r'\bLocal law\b', search_area, re.IGNORECASE): vals.append('Local law')
        if re.search(r'\bBase code\b', search_area, re.IGNORECASE): vals.append('Base code')
        area_value = " ".join(vals).strip()

    # NOTE(review): when both labels were found this turns
    # "Local law Base code" into "Local law Local law/Base code" — confirm
    # that the doubled "Local law" is intended.
    if 'Base code' in area_value:
        area_value = area_value.replace("Base code", "Local law/Base code")
    data['Area of non-compliance/non-conformance'] = area_value or "-"

    # Other multiline fields
    def extract_multiline_field(start_patterns, stop_patterns):
        """Return the cleaned text after any start header, up to the first stop header."""
        start_regex = "|".join(map(re.escape, start_patterns))
        stop_regex = "|".join(map(re.escape, stop_patterns)) if stop_patterns else "$"
        # Consume one character at a time (newlines included via DOTALL) as
        # long as no stop header begins at the current position.
        pattern = fr"(?i)(?:{start_regex})\s*[:•]?\s*((?:(?!{stop_regex}).)+)"
        match = re.search(pattern, text, re.DOTALL)
        return clean_text(match.group(1)) if match else ""

    keys = list(FIELD_ALIASES.keys())
    for i, field in enumerate(keys):
        # These fields have dedicated extraction logic elsewhere in this function.
        if field in ['Code area', 'Workplace requirement', 'Area of non-compliance/non-conformance', 'Status', 'Due', 'Time given to resolve']:
            continue
        aliases = FIELD_ALIASES[field]
        # Every header belonging to a later field ends this field's value.
        stop_patterns = [s for f in keys[i+1:] for s in FIELD_ALIASES[f]]
        val = extract_multiline_field(aliases, stop_patterns)

        if field == 'Issue title':
            # Keep the part starting at the first digit and drop any
            # trailing header text.
            issue_match = re.search(r'(\d+[A-Z\-\.]*.*?)\s*(?=Area of non-compliance|Verification method|$)', val, re.I)
            data[field] = clean_text(issue_match.group(1)) if issue_match else clean_text(val)
        else:
            data[field] = val or "-"

    # Time given to resolve: first "<1-3 digits> days" anywhere on the page.
    time_match = re.search(r"\b(\d{1,3}\s+days)\b", text)
    data['Time given to resolve'] = time_match.group(1) if time_match else "-"

    # --- Verification method ---
    # If one of the known phrases occurs anywhere on the page
    # (case-insensitive), it replaces the value found by the generic
    # extraction above; otherwise that value is kept.
    for method in ["Follow up audit", "Collaborative action required", "Desktop audit"]:
        if re.search(re.escape(method), text, re.I):
            data['Verification method'] = method
            break

    return data


def process_pdf(file_path):
    """
    Extract one row per ZAF entry from a single PDF report.

    Each row is the dict produced by ``parse_zaf_page`` for the entry's page,
    extended with the entry's 'ZAF' code and the report's 'ZAA' code. Entries
    whose page has no extractable text are skipped. Any exception is logged
    and whatever rows were collected up to that point are returned.
    """
    rows = []
    try:
        with pdfplumber.open(file_path) as document:
            report_code = extract_zaa_code(document)
            for zaf_entry in get_zaf_entries(document):
                page_text = document.pages[zaf_entry['page']].extract_text()
                if page_text:
                    record = parse_zaf_page(page_text)
                    record['ZAF'] = zaf_entry['zaf']
                    record['ZAA'] = report_code
                    rows.append(record)
    except Exception as err:
        log(f"PDF processing failed for {file_path}: {err}")
    return rows


def main():
    """
    Process every report in PDF_FILES and write the combined rows to OUTPUT_FILE.

    Missing or blank cells are written as "-", runs of '*' in 'Status' are
    collapsed to a single '*', and 'ZAA' values are stripped of surrounding
    whitespace. The output directory is created when it does not exist, so a
    missing folder cannot discard the parsing work at the final write.
    """
    all_rows = []
    for pdf_path in PDF_FILES:
        log(f"\n{'=' * 30}\nProcessing file: {pdf_path}\n{'=' * 30}")
        all_rows.extend(process_pdf(pdf_path))

    final_columns = [
        'ZAA', 'ZAF', 'Due', 'Status', 'Code area', 'Workplace requirement',
        'Issue title', 'Area of non-compliance/non-conformance', 'Local law reference',
        'Description', 'Corrective and preventative actions', 'Time given to resolve',
        'Verification method'
    ]

    # Passing columns= fixes the column order and creates any column that no
    # row provided.
    df = pd.DataFrame(all_rows, columns=final_columns)

    # Normalize and clean up final output: blanks and NaN become "-".
    for col in final_columns:
        df[col] = df[col].apply(lambda val: '-' if pd.isna(val) or str(val).strip() == "" else val)

    df['Status'] = df['Status'].str.replace(r'\*+', '*', regex=True)
    df['ZAA'] = df['ZAA'].str.strip().fillna('-')

    # Mirror process_file: make sure the destination folder exists before
    # writing, instead of failing after all PDFs have been parsed.
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(OUTPUT_FILE, index=False, encoding='utf-8-sig')
    log(f"\nFinished processing {len(df)} rows")
    log(f"Saved output to: {OUTPUT_FILE}")


# Run the Task 2 pipeline only when this code is executed directly
# (as a script or notebook cell), not when it is imported.
if __name__ == '__main__':
    main()


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[DEBUG] 
Processing file: task_2_pdf_1.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[DEBUG] 
Processing file: task_2_pdf_2.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[DEBUG] 
Finished processing 7 rows
[DEBUG] Saved output to: C:\Users\omerm\Downloads\task_2_result.csv


By using  Regular Expressions:
Find sections by headers  -   (?i) makes it case-insensitive

                               It matches headers like Code area: value, even if the colon or bullet is missing

                               Then uses a non-greedy match (.+?) to capture the value until it hits the next field.
Multiline field extraction -   Start at a header (like Corrective and preventative actions)

                                Stop when the next header starts (like Time given to resolve)

                               Capture everything in between, even over multiple lines
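 Text normalization   -     This handles layout issues where text is split across lines: a newline between two non-space characters is replaced with a single space, so corrective\nactions becomes corrective actions. (A word broken mid-token, e.g. cor\nrective, becomes cor rective, not corrective.)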
Status extraction  -        This identifies if the line ends in something like Closed (minor)*, and separates the status from the main value.
ZAF & ZAA code detection -   These look for the standard codes in summary and footer pages respectively.

    


Handling Two Report Layouts / Alignments:

The script is built to handle two types of page layouts:

Type 1: Well-structured layout
          Fields are clearly separated by headers like Workplace requirement:

          Fields follow each other with standard spacing

          This is handled easily by re.search using field names as anchors.
          
Type 2: Poorly-structured or misaligned layout
          
           Lines break mid-sentence
           Fields overlap each other or appear out of order
           Using re.DOTALL, so the regex captures newlines (. matches \n)
           Stop-pattern logic: It anticipates the start of the next field and uses that to stop capturing
           Fallback matching: For example, if Workplace requirement isn't found with \d+, it tries to grab whatever is between headers.
          
          

| Field                                 | Parsing Logic                                                                    |
| ------------------------------------- | -------------------------------------------------------------------------------- |
| `Code area`                           | Extracts using header, splits out trailing status like `Closed (major)`          |
| `Workplace requirement`               | Matches start using regex, ends at next known field, validates with `\d+` prefix |
| `Status`                              | Extracted from end of `Code area` line                                           |
| `Due`                                 | Simple date regex: `Due YYYY-MM-DD`                                              |
| `Area of non-compliance`              | Looks for "Local law" and/or "Base code" keywords                                |
| `Corrective and preventative actions` | Captures large multiline blocks between two field headers                        |
| `Time given to resolve`               | Looks for pattern like `30 days`                                                 |
| `Verification method`                 | Case-insensitive search for one of the known phrases anywhere on the page        |
| `Issue title`                         | Sometimes requires its own nested regex to exclude overlaps                      |
