In [12]:
import os                   # File and directory operations
import re                   # Regular expressions for text splitting & pattern matching
import json                 # Read/write JSON for storing the “universe” of topics & policies
import csv                  # Read CSV metadata mapping report numbers to project info
import spacy                # NLP library for tokenization, lemmatization, stop-word removal
from typing import Dict     # Type hinting for dictionaries


In [13]:
# Define global constants

# Root folder of the project; the CSV, universe and report paths below are all built from it.
# NOTE(review): machine-specific absolute path — adjust before running on another machine.
BASE_PATH     = '/Users/pastudilloe/Library/CloudStorage/Dropbox/01 CONSULTING/WB_PriorActions_Poverty'
SUMMARY_RATIO = 0.95        # Default fraction of sentences kept by compress_text_extractive()
MAX_LINES     = 900        # Maximum number of lines read from the start of each raw report

In [14]:
# Load the small English spaCy model, and bump its maximum document length

# Shared spaCy pipeline; preprocess_text() runs every piece of text through this object.
nlp = spacy.load("en_core_web_sm")  
# Raise the pipeline's max_length setting so long report texts are accepted.
nlp.max_length = 5_000_000   # Allow processing very large text without truncate errors

In [15]:
# Build a lookup table (report_map) from the CSV of World Bank docs

# Location of the metadata CSV (one row per World Bank document).
csv_path = os.path.join(BASE_PATH, 'Documents', 'world_bank_documents_urls_missing.csv')

# report_map: Report No. -> {"Project Name": ..., "Link": ...}; missing values become "UNKNOWN".
report_map: Dict[str, Dict[str, str]] = {}

if os.path.exists(csv_path):
    # 'utf-8-sig' reads plain UTF-8 unchanged but also strips a leading BOM, which
    # would otherwise be glued to the first column name and break the lookups below.
    with open(csv_path, newline='', encoding='utf-8-sig') as csvfile:
        for row in csv.DictReader(csvfile):
            # DictReader stores None for columns missing from a short row, so a
            # default passed to .get() is not enough to guarantee a string here.
            rpt = (row.get('Report No.') or '').strip()            # Unique report identifier
            if not rpt:
                continue  # Rows without a report number cannot be looked up later
            report_map[rpt] = {
                "Project Name": (row.get('Project Name') or '').strip() or "UNKNOWN",
                "Link":         (row.get('Link') or '').strip() or "UNKNOWN",
            }
else:
    # Make the degraded mode visible: every report will get "UNKNOWN" metadata.
    print(f"Metadata CSV not found, report_map is empty: {csv_path}")
# At this point, report_map maps each Report No. to its project name and URL

In [16]:
# Inspect the lookup table built in the previous cell (keyed by report number).
report_map

{'87083': {'Project Name': 'India - Second Development Policy Loan to Promote Inclusive Green Growth and Sustainable Development in Himachal Pradesh Program (English)',
  'Link': 'https://documents.worldbank.org/en/publication/documents-reports/documentdetail/333461468042844051'},
 '87865': {'Project Name': 'Vietnam - Second Economic Management Competitiveness Credit Program (English)',
  'Link': 'https://documents.worldbank.org/en/publication/documents-reports/documentdetail/483031468338947656'},
 '105825': {'Project Name': 'Pakistan - Competitiveness and Growth Development Policy Financing Project (English)',
  'Link': 'https://documents.worldbank.org/en/publication/documents-reports/documentdetail/764471468195835551'},
 'PGD373': {'Project Name': 'Sierra Leone - Second Inclusive and Sustainable Growth Development Policy Financing (English)',
  'Link': 'https://documents.worldbank.org/en/publication/documents-reports/documentdetail/099130011102276038'},
 '62267': {'Project Name': 'Br

In [17]:
# Simple extractive compressor: keep only the first SUMMARY_RATIO of sentences

def compress_text_extractive(text: str, ratio: float = SUMMARY_RATIO) -> str:
    """
    Keep the leading portion of a text, measured in sentences.

    The stripped text is cut after every '.', '!' or '?' that is followed by
    whitespace. When that yields fewer than two pieces the input is handed
    back untouched (not stripped). Otherwise the first
    ``int(n_sentences * ratio)`` pieces -- at least one -- are re-joined
    with single spaces.

    Args:
        text:  Raw text to shorten.
        ratio: Fraction of sentences to keep, counted from the start.

    Returns:
        The shortened text, or ``text`` itself when it is too short to cut.
    """
    parts = re.split(r'(?<=[.!?])\s+', text.strip())
    total = len(parts)

    if total >= 2:
        cutoff = int(total * ratio)
        if cutoff < 1:
            cutoff = 1  # never return an empty summary
        return ' '.join(parts[:cutoff])

    # A single sentence (or empty text): nothing to compress
    return text

In [18]:
# Preprocess text: lemmatize, lowercase, and remove stopwords/punctuation

def preprocess_text(text: str) -> str:
    """
    Normalize text for matching: lemmatize, lowercase, drop noise tokens.

    The text is run through the module-level spaCy pipeline ``nlp``. Stop
    words, whitespace tokens and punctuation tokens are discarded; every
    remaining token contributes its lemma, lowercased. The previous
    implementation kept punctuation and original casing even though the
    step is described as lowercasing and removing punctuation.

    Args:
        text: Raw text to normalize.

    Returns:
        The surviving lowercased lemmas joined by single spaces (an empty
        string when no token survives).
    """
    doc = nlp(text)
    tokens = [
        token.lemma_.lower()
        for token in doc
        if not (token.is_stop or token.is_space or token.is_punct)
    ]
    return " ".join(tokens)


In [19]:
# Extract the Operation or Project ID from the raw text using regex

def extract_operation_id(text: str) -> str:
    """
    Find the operation / project identifier mentioned in a report.

    Strategy:
      1. Look for a label such as "Project ID", "Operation ID No." or
         "Project ID Number", optionally followed by ':' or '-', and take the
         run of alphanumeric chunks (separated by spaces, tabs or hyphens)
         that follows on the same line. Spaces and tabs are removed from the
         result; hyphens are kept.
      2. Otherwise fall back to the first standalone "P" + six digits.
      3. Otherwise return the sentinel "P_UNKNOWN".

    The label match is case-insensitive, so "ID" must not be the start of a
    longer word: without that guard ordinary prose such as
    "the project identified ..." was read as a label and "entified..." was
    returned as the ID.

    Args:
        text: Raw report text.

    Returns:
        The extracted identifier, or "P_UNKNOWN" when none is found.
    """
    label_re = re.compile(
        r"(?:Operation|Project)\s*ID"
        # Either an explicit "No."/"Number" suffix, or "ID" must not run into a letter.
        r"(?:\s*(?:No\.?|Number)|(?![A-Za-z]))"
        r"\s*[:\-]?\s*"
        r"([A-Za-z0-9]+(?:[ \t\-]+[A-Za-z0-9]+)*)",
        re.IGNORECASE,
    )
    m = label_re.search(text)
    if m:
        # Remove any spaces/tabs from the matched ID
        return re.sub(r"[ \t]+", "", m.group(1))

    # Fallback: look for a standalone pattern like "P123456"
    fall_re = re.compile(r"\bP\d{6}\b", re.IGNORECASE)
    fm = fall_re.search(text)
    return fm.group(0) if fm else "P_UNKNOWN"

In [20]:
# Parse the “universe” description file into a nested dict of topics → policy areas → text

def parse_universe_text(fp: str) -> Dict[str, Dict[str, str]]:
    """
    Parse the "universe" description file into topics -> policy areas -> text.

    The file is read line by line (blank lines are skipped):
      * "Topic <n>: <name>" opens a topic,
      * "Policy Area <n[.m...]>: <name>" opens a policy area in the current topic,
      * any other line is body text.

    Body text is attached only to the policy area it appears under. Lines
    found before the first policy area of a topic (file preamble, topic
    descriptions) are discarded; previously they stayed in the buffer and
    were prepended to the next policy area's text. A topic header that
    repeats an earlier name keeps the policies already collected for it.

    Each policy area's text is joined with spaces and passed through
    preprocess_text() before being stored.

    Args:
        fp: Path to the UTF-8 description file (undecodable bytes are replaced).

    Returns:
        ``{topic name: {policy area name: preprocessed text}}``.
    """
    universe: Dict[str, Dict[str, str]] = {}
    current_topic = None
    current_policy = None
    buffer_lines = []

    # Regex for lines starting "Topic X: ..." and "Policy Area Y: ..."
    topic_re  = re.compile(r"^Topic\s*\d*:\s*(.+)$", re.IGNORECASE)
    policy_re = re.compile(r"^Policy\s+Area\s*\d+(?:\.\d+)*:\s*(.+)$", re.IGNORECASE)

    def flush() -> None:
        # Store the buffered body text for the open policy area (if any), then
        # always reset the buffer so text never crosses a section boundary.
        if current_topic and current_policy:
            universe[current_topic][current_policy] = preprocess_text(" ".join(buffer_lines))
        buffer_lines.clear()

    with open(fp, 'r', encoding='utf-8', errors='replace') as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue

            t = topic_re.match(line)
            if t:
                flush()
                current_topic = t.group(1).strip()
                # setdefault: a repeated topic header must not wipe earlier policies
                universe.setdefault(current_topic, {})
                current_policy = None
                continue

            p = policy_re.match(line)
            if p:
                flush()
                current_policy = p.group(1).strip()
                continue

            # Body text; it is only kept if a policy area is open when flushed
            buffer_lines.append(line)

        # At EOF, flush the last policy section
        flush()

    return universe

In [21]:
# Same inspection as the earlier `report_map` cell; report_map is not modified in between.
report_map

{'87083': {'Project Name': 'India - Second Development Policy Loan to Promote Inclusive Green Growth and Sustainable Development in Himachal Pradesh Program (English)',
  'Link': 'https://documents.worldbank.org/en/publication/documents-reports/documentdetail/333461468042844051'},
 '87865': {'Project Name': 'Vietnam - Second Economic Management Competitiveness Credit Program (English)',
  'Link': 'https://documents.worldbank.org/en/publication/documents-reports/documentdetail/483031468338947656'},
 '105825': {'Project Name': 'Pakistan - Competitiveness and Growth Development Policy Financing Project (English)',
  'Link': 'https://documents.worldbank.org/en/publication/documents-reports/documentdetail/764471468195835551'},
 'PGD373': {'Project Name': 'Sierra Leone - Second Inclusive and Sustainable Growth Development Policy Financing (English)',
  'Link': 'https://documents.worldbank.org/en/publication/documents-reports/documentdetail/099130011102276038'},
 '62267': {'Project Name': 'Br

In [22]:
# Main execution block

if __name__ == "__main__":
    # Paths for universe input, universe output, raw and processed reports
    uni_in   = os.path.join(BASE_PATH, "Helpers", "Prior_Actions_DESCRIPTION.txt")
    uni_out  = os.path.join(BASE_PATH, "Helpers", "Prior_Actions_PROCESSED.txt")
    raw_dir  = os.path.join(BASE_PATH, "Datasets", "Raw", "policy_reports_test")
    proc_dir = os.path.join(BASE_PATH, "Datasets", "Processed", "policy_reports_test")

    # 1) Parse the universe file & write it out as JSON
    universe = parse_universe_text(uni_in)
    with open(uni_out, 'w', encoding='utf-8') as uf:
        json.dump(universe, uf, ensure_ascii=False, indent=2)

    # 2) Ensure raw reports exist and create processed directory
    if not os.path.exists(raw_dir):
        print(f"Raw folder missing: {raw_dir}")
        # Raise explicitly instead of calling the interactive `exit` helper, which
        # is not a reliable way to abort a notebook cell.
        raise SystemExit(1)
    os.makedirs(proc_dir, exist_ok=True)

    # 3) Loop over each .txt report, process, and save.
    #    sorted() gives a stable order; os.listdir() makes no ordering promise.
    count = 0
    for fn in sorted(os.listdir(raw_dir)):
        if not fn.lower().endswith('.txt'):
            continue

        # Read at most MAX_LINES lines; zip() stops at whichever runs out first,
        # so short files are not padded with empty strings.
        with open(os.path.join(raw_dir, fn), 'r', encoding='utf-8', errors='replace') as rf:
            raw_text = "".join(line for _, line in zip(range(MAX_LINES), rf))

        # Extract metadata: the ID comes from the text, the rest from the CSV
        # lookup keyed by the file name without its extension.
        op_id  = extract_operation_id(raw_text)
        rpt_no = os.path.splitext(fn)[0]
        meta   = report_map.get(rpt_no, {"Project Name": "UNKNOWN", "Link": "UNKNOWN"})
        pname  = meta["Project Name"]
        plink  = meta["Link"]

        print("Operation ID:", op_id)
        print("Report No.:", rpt_no)
        print("Project Name:", pname)
        print("Link:", plink)

        # Compress and preprocess the text
        brief     = compress_text_extractive(raw_text)
        processed = preprocess_text(brief)

        # Build a header with ID, report number, project name, and link
        header = (
            f"ID {op_id} {fn}\n"
            f"Report: {rpt_no}\n"
            f"Project Name: {pname}\n"
            f"Link: {plink}\n\n"
        )
        content = header + processed

        # Output file name = first header line with whitespace runs turned into "_"
        # (e.g. "ID P113638 50149.txt" -> "ID_P113638_50149.txt")
        safe_name = re.sub(r"\s+", "_", header.splitlines()[0].strip())
        out_path  = os.path.join(proc_dir, safe_name)

        # Write the processed content to disk
        with open(out_path, 'w', encoding='utf-8') as wf:
            wf.write(content)

        print(f"Saved: {safe_name}")
        count += 1

    print(f"\nProcessed {count} reports.")

Operation ID: P113638
Report No.: 50149
Project Name: Indonesia - Sixth Development Policy Loan Program (English)
Link: https://documents.worldbank.org/en/publication/documents-reports/documentdetail/879631468040542643
Saved: ID_P113638_50149.txt
Operation ID: PE-P106724-LEN-BB
Report No.: 44841
Project Name: Uruguay - Second Programmatic Reform Implementation Development Policy Program (English)
Link: https://documents.worldbank.org/en/publication/documents-reports/documentdetail/664071468130484889
Saved: ID_PE-P106724-LEN-BB_44841.txt
Operation ID: P115145
Report No.: 49236
Project Name: Dominican Republic - Public Finance and Social Sector Development Policy Loan Program (English)
Link: https://documents.worldbank.org/en/publication/documents-reports/documentdetail/250081468023080336
Saved: ID_P115145_49236.txt
Operation ID: P117282
Report No.: 52831
Project Name: Togo - Third Economic Recovery and Governance Grant Program (English)
Link: https://documents.worldbank.org/en/publicati