In [15]:
import os
import pandas as pd
import unicodedata

In [16]:
# Kaggle input directories: the original audit opinions and their paraphrased counterparts
ORIGINAL_ROOT = "/kaggle/input/all-audit-opinions/AUDIT_OPINION_ITEM_7"
PARAPHRASE_ROOT = "/kaggle/input/me-poiotiko-elegxo-augmented-paraphrases/ME_POIOTIKO_ELEGXO"

# Row accumulators (one dict per audit opinion); converted to DataFrames after loading
original_data, paraphrase_data = [], []

# --------- LOAD ORIGINAL AUDIT OPINIONS ---------
# Directory layout walked below:
#   <ORIGINAL_ROOT>/<year>/<YES_BANKRUPTCY|NO_BANKRUPTCY>/<firm_id>/audit_opinion.txt
# Folder name -> numeric label (insertion order fixes the traversal order).
label_dirs = {"YES_BANKRUPTCY": 1, "NO_BANKRUPTCY": 0}

for year in os.listdir(ORIGINAL_ROOT):
    # Ignore anything that is not a purely numeric year folder
    if not year.isdigit():
        continue
    year_int = int(year)
    # Only the 1998-2021 window (inclusive) is loaded
    if not 1998 <= year_int <= 2021:
        continue

    year_path = os.path.join(ORIGINAL_ROOT, year)

    for label, label_value in label_dirs.items():
        label_path = os.path.join(year_path, label)
        if not os.path.exists(label_path):
            continue

        for firm_id in os.listdir(label_path):
            firm_path = os.path.join(label_path, firm_id, "audit_opinion.txt")
            if os.path.exists(firm_path):
                try:
                    with open(firm_path, "r", encoding="utf-8") as f:
                        text = f.read().strip()
                    # Originals carry no LLM tag, hence llm=None
                    record = {
                        "year": year_int,
                        "firm_id": firm_id,
                        "text": text,
                        "label": label_value,
                        "source": "original",
                        "llm": None
                    }
                    original_data.append(record)
                except Exception as e:
                    # Best effort: report the unreadable file and keep going
                    print(f"[ERROR] Could not read original file: {firm_path}\n--> {e}")

# --------- LOAD PARAPHRASED AUDIT OPINIONS ---------
# Directory layout walked below:
#   <PARAPHRASE_ROOT>/<year>_<llm_tag>/<firm_id>/paraphrase_*.txt
for dir_name in os.listdir(PARAPHRASE_ROOT):
    if "_" not in dir_name:
        continue
    # Split on the first underscore only: the LLM tag may itself contain underscores
    year_str, llm_tag = dir_name.split("_", 1)
    try:
        year = int(year_str)
    except ValueError:
        # Prefix is not a year -> not a paraphrase directory
        continue
    if year < 1998 or year > 2020:
        continue

    year_llm_path = os.path.join(PARAPHRASE_ROOT, dir_name)
    # A stray file with a matching name would make os.listdir() raise
    if not os.path.isdir(year_llm_path):
        continue

    for firm_id in os.listdir(year_llm_path):
        firm_dir = os.path.join(year_llm_path, firm_id)
        # Skip stray files sitting next to the firm folders
        if not os.path.isdir(firm_dir):
            continue

        for filename in os.listdir(firm_dir):
            if not filename.startswith("paraphrase_"):
                continue

            file_path = os.path.join(firm_dir, filename)
            text = None

            # Try UTF-8 first, then fall back to ISO-8859-1
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    text = f.read().strip()
            except UnicodeDecodeError:
                # ISO-8859-1 maps every byte to a code point, so decoding itself cannot
                # fail here; bytes 0x80-0x9F come out as C1 control characters.
                try:
                    with open(file_path, "r", encoding="ISO-8859-1") as f:
                        text = f.read().strip()
                    print(f"[WARNING] Used ISO-8859-1 fallback: year={year}, llm={llm_tag}, firm_id={firm_id}, file={filename}")
                except Exception as e:
                    print(f"[ERROR] Could not decode paraphrase file: year={year}, llm={llm_tag}, firm_id={firm_id}, file={filename}\n--> {e}")
                    continue  # skip if both attempts fail
            except OSError as e:
                # Same best-effort policy as the originals loader: report and move on
                print(f"[ERROR] Could not read paraphrase file: {file_path}\n--> {e}")
                continue

            # Only add non-empty texts
            if text:
                paraphrase_data.append({
                    "year": year,
                    "firm_id": firm_id,
                    "text": text,
                    # NOTE(review): label is hard-coded to 1 - presumably only bankrupt
                    # firms were paraphrased; confirm against the dataset description.
                    "label": 1,
                    "source": "paraphrased",
                    "llm": llm_tag.lower()
                })

# --------- COMBINE INTO ONE DATAFRAME ---------
# One frame per source, then stacked row-wise under a fresh 0..n-1 index
df_original = pd.DataFrame(original_data)
df_paraphrased = pd.DataFrame(paraphrase_data)
df_all = pd.concat(
    [df_original, df_paraphrased],
    axis=0,
    ignore_index=True,
)

# --------- OPTIONAL: Normalize Smart Characters ---------
# Windows-1252 punctuation bytes surface as C1 control characters (U+0080-U+009F)
# when a file is decoded as ISO-8859-1. Map the common ones to plain ASCII.
_CP1252_C1_TO_ASCII = str.maketrans({
    "\x85": "...",  # horizontal ellipsis
    "\x91": "'",    # left single quotation mark
    "\x92": "'",    # right single quotation mark / apostrophe
    "\x93": '"',    # left double quotation mark
    "\x94": '"',    # right double quotation mark
    "\x96": "-",    # en dash
    "\x97": "-",    # em dash
})


def clean_text(text):
    """Normalize an audit-opinion string and repair mis-decoded smart punctuation.

    The text is first NFKC-normalized, then the C1 control characters that
    correspond to Windows-1252 quotes, dashes and ellipsis are replaced with
    their ASCII equivalents.

    Args:
        text: The string to clean. Non-string values (e.g. None or NaN) are
            returned unchanged instead of raising.

    Returns:
        The cleaned string, or ``text`` itself when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text
    normalized = unicodedata.normalize("NFKC", text)
    return normalized.translate(_CP1252_C1_TO_ASCII)

# Run the cleaner over every opinion (the 'text' column is overwritten)
df_all['text'] = df_all['text'].map(clean_text)

# --------- BASIC SANITY CHECK ---------
# Overall size, class balance, per-source row counts, and a random peek at 5 rows
label_counts = df_all['label'].value_counts()
source_counts = df_all['source'].value_counts()

print("✅ Full Dataset Shape:", df_all.shape)
print("\nLabel Distribution:\n", label_counts)
print("\nSource Distribution:\n", source_counts)
print("\nSample rows:")
display(df_all.sample(5))

✅ Full Dataset Shape: (77586, 6)

Label Distribution:
 label
0    67672
1     9914
Name: count, dtype: int64

Source Distribution:
 source
original       68218
paraphrased     9368
Name: count, dtype: int64

Sample rows:


Unnamed: 0,year,firm_id,text,label,source,llm
12066,2011,717605,Report of Independent Registered Public Accoun...,0,original,
18390,2004,765258,REPORT OF INDEPENDENT REGISTERED PUBLIC ACCOUN...,0,original,
32697,2005,1012771,REPORT OF INDEPENDENT REGISTERED PUBLIC ACCOUN...,0,original,
9820,2011,921557,REPORT OF INDEPENDENT REGISTERED PUBLIC ACCOUN...,0,original,
50016,2016,834071,REPORT OF INDEPENDENT REGISTERED PUBLIC ACCOUN...,0,original,


In [17]:
import pandas as pd

# STEP 1: firm/year/llm/file info taken from the ISO-8859-1 fallback warnings.
# All listed files share one year and one LLM directory tag, so they are stored
# per firm as the N of each paraphrase_N.txt, in the order they were listed.
_iso_warn_year = "2019"
_iso_warn_llm = "YI_ALL_PARAPHRASED"
_iso_warn_files = {
    "1255474": (3, 4, 5, 2),
    "1722287": (6,),
    "1326089": (3, 1, 2),
    "1346980": (3, 4, 5, 1),
    "1655020": (6, 3, 4, 1, 2),
    "314808": (6, 3, 4, 5, 2),
    "1579252": (3, 4, 2),
}

# Expand to (year, llm, firm_id, file) tuples
iso_warn_entries = [
    (_iso_warn_year, _iso_warn_llm, firm, f"paraphrase_{number}.txt")
    for firm, numbers in _iso_warn_files.items()
    for number in numbers
]

# STEP 2: Build DataFrame from those entries
df_fallback_refs = pd.DataFrame(iso_warn_entries, columns=["year", "llm", "firm_id", "file"])
df_fallback_refs["year"] = df_fallback_refs["year"].astype(int)
df_fallback_refs["firm_id"] = df_fallback_refs["firm_id"].astype(str)
# df_all stores the LLM tag lower-cased, so match that form
df_fallback_refs["llm"] = df_fallback_refs["llm"].str.lower()

# STEP 3: Keep the paraphrased rows whose (year, llm, firm_id) triple appears in the
# warning list. Matching the triple as one composite key avoids the cross-product
# over-selection that three independent isin() filters produce.
# df_all has no file-name column, so every paraphrase of an affected firm matches,
# not only the specific files that needed the fallback.
fallback_key_cols = ["year", "llm", "firm_id"]
fallback_keys = pd.MultiIndex.from_frame(
    df_fallback_refs[fallback_key_cols].drop_duplicates()
)
df_paraphrased_rows = df_all[df_all["source"] == "paraphrased"]
is_fallback_firm = pd.MultiIndex.from_frame(
    df_paraphrased_rows[fallback_key_cols]
).isin(fallback_keys)
df_fallback_aos = df_paraphrased_rows[is_fallback_firm]

# Optional: deduplicate by text or show unique audit opinions
df_fallback_aos = df_fallback_aos.drop_duplicates(subset=["text"])

# STEP 4: Display them (all rows, shuffled). frac=1 works for any row count,
# whereas a hard-coded n raises ValueError when fewer rows are available.
print(f"\n⚠️ Number of unique paraphrased AOs for firms affected by ISO-8859-1 fallback: {len(df_fallback_aos)}")
display(df_fallback_aos[["year", "firm_id", "llm", "text"]].sample(frac=1))


⚠️ Number of AO entries from ISO-8859-1 fallback: 41


Unnamed: 0,year,firm_id,llm,text
68338,2019,1579252,yi_all_paraphrased,Independent Registered Public Accounting Firm'...
68337,2019,1579252,yi_all_paraphrased,Independent Registered Public Accounting Firm'...
68321,2019,1346980,yi_all_paraphrased,Independent Registered Public Accounting Firm'...
68312,2019,1326089,yi_all_paraphrased,Independent Registered Public Accounting Firm ...
68307,2019,1722287,yi_all_paraphrased,Independent Registered Public Accounting Firm'...
68314,2019,1326089,yi_all_paraphrased,Independent Registered Public Accounting Firm ...
68326,2019,1655020,yi_all_paraphrased,Independent Registered Public Accounting Firm ...
68319,2019,1346980,yi_all_paraphrased,Report by an Independent Authorized Public Acc...
68302,2019,1255474,yi_all_paraphrased,Report of Independent Registered Public Accoun...
68335,2019,1579252,yi_all_paraphrased,Independent Registered Public Accounting Firm'...


In [18]:
# Persist the combined frame as Parquet in the Kaggle working directory,
# leaving the row index out of the file
df_all.to_parquet(
    path="/kaggle/working/df_audit_opinions.parquet",
    index=False,
)
print("Saved to /kaggle/working/df_audit_opinions.parquet")

Saved to /kaggle/working/df_audit_opinions.parquet


In [19]:
# Original (non-paraphrased) opinions only ...
p = df_all.loc[df_all['source'].eq('original')]
# ... and, as the cell output, those labelled 1 (from the YES_BANKRUPTCY folders)
p.loc[p['label'].eq(1)]

Unnamed: 0,year,firm_id,text,label,source,llm
0,2020,884940,Report of Independent Registered Public Accoun...,1,original,
1,2020,1648428,REPORT OF INDEPENDENT REGISTERED PUBLIC ACCOUN...,1,original,
2,2020,1109189,Report of Independent Registered Public Accoun...,1,original,
3,2020,1211351,REPORT OF INDEPENDENT REGISTERED PUBLIC ACCOUN...,1,original,
4,2020,1166126,REPORT OF INDEPENDENT REGISTERED PUBLIC ACCOUN...,1,original,
...,...,...,...,...,...,...
64804,2000,1054422,REPORT OF INDEPENDENT PUBLIC ACCOUNTANTS<p><p>...,1,original,
64805,2000,918275,Independent Auditors' Report<p>The Board of Di...,1,original,
64806,2000,927417,"REPORT OF ERNST & YOUNG LLP, INDEPENDENT AUDIT...",1,original,
64807,2000,1076732,Report of Independent Auditors The Board of ...,1,original,
