In [8]:
import os
import re

folder_path = "/Users/shairamanandhar/Desktop/DSDA Senior Seminar/Case Study 2/data/essilorluxottica/"

# ---------- Generic helpers ----------

def is_spaced_out_caps(line: str) -> bool:
    """
    Detect lines like 'E S S I L O R L U X O T T I C A ...'
    which are almost all single capital letters separated by spaces.
    Those are usually big title art, not useful text.
    """
    # Count patterns like 'A ' 'B ' etc.
    return bool(re.search(r'(?:\b[A-Z]\b\s+){4,}', line))


def clean_generic_text(raw: str) -> str:
    # Normalize whitespace first
    text = raw.replace("\ufeff", " ")
    lines = text.splitlines()

    clean_lines = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue

        # Remove pure page numbers, "Page 3", "3 of 10", "3/10"
        if re.fullmatch(r"\d+", stripped):
            continue
        if re.search(r"\bPage\s*\d+\b", stripped, flags=re.IGNORECASE):
            continue
        if re.search(r"\b\d+\s*of\s*\d+\b", stripped):
            continue
        if re.search(r"\b\d+\s*/\s*\d+\b", stripped):
            continue

        # Remove stretched all-caps titles like "E S S I L O R L U X O T T I C A ..."
        if is_spaced_out_caps(stripped):
            continue

        # Remove very obvious report headers that repeat on every page
        if "EssilorLuxottica 2024 Annual Report" in stripped:
            continue
        if "Sustainability Report" in stripped and "General Disclosures" in stripped:
            continue

        clean_lines.append(stripped)

    clean_text = " ".join(clean_lines)
    clean_text = re.sub(r"\s+", " ", clean_text).strip()
    return clean_text


def clean_factset_text(raw: str) -> str:
    """
    Keep all metrics & table rows, but remove page footers and boilerplate.
    """
    text = raw.replace("\ufeff", " ")
    lines = text.splitlines()

    clean_lines = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue

        # Remove copyright / footer lines
        if "FactSet Research Systems Inc." in stripped:
            continue
        # Remove the little note line
        if "Contact your FactSet account" in stripped:
            continue
        # Remove the "ESG Report - EL-FR ..." footer header
        if stripped.startswith("ESG Report - EL-FR"):
            continue

        # Remove standalone page markers like "1 of 9", "2 of 9" etc.
        if re.search(r"\b\d+\s*of\s*\d+\b", stripped):
            continue

        clean_lines.append(stripped)

    clean_text = " ".join(clean_lines)
    clean_text = re.sub(r"\s+", " ", clean_text).strip()
    return clean_text


# ---------- Files to clean ----------

txt_files = [
    "annual_report_excerpt.txt",
    "esg_excerpt.txt",
    "external_summary.txt",
    "factset_esg.txt",
    "factset_financials.txt",
]

for fname in txt_files:
    src_path = os.path.join(folder_path, fname)
    dst_path = os.path.join(folder_path, fname.replace(".txt", "_clean.txt"))

    with open(src_path, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()

    if "factset" in fname.lower():
        cleaned = clean_factset_text(raw)
    else:
        cleaned = clean_generic_text(raw)

    with open(dst_path, "w", encoding="utf-8") as f:
        f.write(cleaned)

    print(f"Created cleaned file: {dst_path}")



Created cleaned file: /Users/shairamanandhar/Desktop/DSDA Senior Seminar/Case Study 2/data/essilorluxottica/annual_report_excerpt_clean.txt
Created cleaned file: /Users/shairamanandhar/Desktop/DSDA Senior Seminar/Case Study 2/data/essilorluxottica/esg_excerpt_clean.txt
Created cleaned file: /Users/shairamanandhar/Desktop/DSDA Senior Seminar/Case Study 2/data/essilorluxottica/external_summary_clean.txt
Created cleaned file: /Users/shairamanandhar/Desktop/DSDA Senior Seminar/Case Study 2/data/essilorluxottica/factset_esg_clean.txt
Created cleaned file: /Users/shairamanandhar/Desktop/DSDA Senior Seminar/Case Study 2/data/essilorluxottica/factset_financials_clean.txt


In [10]:
import os
import pandas as pd

folder_path = "/Users/shairamanandhar/Desktop/DSDA Senior Seminar/Case Study 2/data/essilorluxottica/"
company_name = "EssilorLuxottica"

def compute_chunk_sizes(num_words, min_words=500, max_words=1000):
    if num_words < min_words:
        return [num_words]

    sizes = []
    remaining = num_words

    while remaining > (min_words + max_words):
        sizes.append(max_words)
        remaining -= max_words

    if remaining <= max_words:
        sizes.append(remaining)
    else:
        a = remaining // 2
        b = remaining - a
        sizes.extend([a, b])

    return sizes


def chunk_text_range(text, min_words=500, max_words=1000):
    words = text.split()
    n = len(words)

    if n < min_words:
        return [" ".join(words)]

    sizes = compute_chunk_sizes(n, min_words=min_words, max_words=max_words)

    chunks = []
    idx = 0
    for sz in sizes:
        chunk_words = words[idx:idx + sz]
        chunks.append(" ".join(chunk_words))
        idx += sz

    return chunks


all_chunks = []

for file in os.listdir(folder_path):
    if file.endswith("_clean.txt"):
        txt_path = os.path.join(folder_path, file)

        with open(txt_path, "r", encoding="utf-8") as f:
            text = f.read()

        chunks = chunk_text_range(text)

        for j, chunk in enumerate(chunks):
            all_chunks.append({
                "chunk_id": j,
                "company": company_name,
                "source_file": file,
                "chunk_text": chunk,
                "word_count": len(chunk.split())
            })

df = pd.DataFrame(all_chunks)
print(df[["source_file", "chunk_id", "word_count"]].head(20))

df.to_csv("essilor_chunks_clean.csv", index=False)
print("Saved essilor_chunks_clean.csv")



                        source_file  chunk_id  word_count
0             factset_esg_clean.txt         0         544
1             factset_esg_clean.txt         1         544
2      factset_financials_clean.txt         0         561
3      factset_financials_clean.txt         1         561
4             esg_excerpt_clean.txt         0        1000
5             esg_excerpt_clean.txt         1        1000
6             esg_excerpt_clean.txt         2        1000
7             esg_excerpt_clean.txt         3        1000
8             esg_excerpt_clean.txt         4        1000
9             esg_excerpt_clean.txt         5         674
10       external_summary_clean.txt         0         838
11  annual_report_excerpt_clean.txt         0        1000
12  annual_report_excerpt_clean.txt         1        1000
13  annual_report_excerpt_clean.txt         2        1000
14  annual_report_excerpt_clean.txt         3        1000
15  annual_report_excerpt_clean.txt         4         655
16  annual_rep