In [6]:
# 0. Fallback Responses Removal

import os
import re
import pandas as pd
from pathlib import Path

# Root folder; every direct subfolder is scanned for the files named in FILE_TYPES.
BASE_DIR = Path("/Users/phoebeeeee/ongoing/LLM_AED/generation/llama_70b_generation_raw")
# Destination of the aggregated CSV (written inside BASE_DIR).
OUTPUT_CSV = BASE_DIR / "manual_check.csv"
# Extension-less file names looked up inside each subfolder.
FILE_TYPES = ["E", "N", "C"]
# Value written to the "validation" column of every exported row.
VALIDATION = True 
# When True, a leading "<digits>." marker is stripped from each exported line.
STRIP_NUMBERING = True 

def read_file_lines(file_path: Path, strip_numbering: bool = True):
    """Return the non-blank lines of ``file_path`` as stripped strings.

    Args:
        file_path: File to read (UTF-8).
        strip_numbering: When true, remove a leading ``<digits>.`` marker
            (and the whitespace around it) from every line.

    Returns:
        A list of strings in file order. A path that is not an existing
        regular file yields an empty list.
    """
    if not file_path.is_file():
        return []

    numbering_prefix = re.compile(r"^\s*\d+\.\s*")
    collected = []
    with file_path.open("r", encoding="utf-8") as handle:
        for text in map(str.strip, handle):
            if text == "":
                continue  # skip blank lines
            if strip_numbering:
                text = numbering_prefix.sub("", text)
            collected.append(text)
    return collected

def aggregate_to_csv(base_dir: Path, output_csv: Path):
    """Collect every line of every E/N/C file under ``base_dir`` into one CSV.

    Each direct subfolder of ``base_dir`` is visited in sorted order. For every
    name in ``FILE_TYPES`` that exists in the subfolder, one row per non-blank
    line is produced with the columns ``validation``, ``subfolder``,
    ``file_type``, ``line_no`` (1-based among the non-blank lines) and
    ``content``. If nothing is collected, a message is printed and no file is
    written.
    """
    column_order = ["validation", "subfolder", "file_type", "line_no", "content"]
    subfolders = sorted(entry for entry in base_dir.iterdir() if entry.is_dir())

    records = []
    for folder in subfolders:
        for ft in FILE_TYPES:
            candidate = folder / ft  # the file is expected to be named exactly 'E' / 'N' / 'C'
            if candidate.exists():
                texts = read_file_lines(candidate, strip_numbering=STRIP_NUMBERING)
                records.extend(
                    {
                        "validation": VALIDATION,
                        "subfolder": folder.name,
                        "file_type": ft,
                        "line_no": number,
                        "content": text,
                    }
                    for number, text in enumerate(texts, start=1)
                )

    if len(records) == 0:
        print("未收集到任何记录，请检查目录与文件命名是否正确。")
        return

    df = pd.DataFrame.from_records(records, columns=column_order)
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"已保存：{output_csv}（共 {len(df)} 行）")

# Run the aggregation with the paths configured at the top of this cell.
if __name__ == "__main__":
    aggregate_to_csv(BASE_DIR, OUTPUT_CSV)


已保存：/Users/phoebeeeee/ongoing/LLM_AED/generation/llama_70b_generation_raw/manual_check.csv（共 4039 行）


In [9]:
# 0

import os
import re
import shutil
from pathlib import Path
import pandas as pd

# Root folder; every direct subfolder is scanned for the files named in FILE_TYPES.
BASE_DIR = Path("/Users/phoebeeeee/ongoing/Beyond-noise-MA-Zuo/EACL/qwen_7b_generation_raw")
INPUT_CSV = BASE_DIR / "0_exp_all.csv"   # change this to the actual path of your annotated-results CSV
# Extension-less source file names; the cleaned copy is written next to each as "<name>_0.txt".
FILE_TYPES = ["E", "N", "C"]
# When True, surviving lines are written back with a fresh "1.", "2.", ... numbering.
RENUMBER_ON_WRITE = True  

def is_false_mark(v) -> bool:
    """Tell whether an annotation cell marks its line as invalid (to be removed).

    Recognised "false" marks are the boolean ``False``, the text ``false`` in
    any letter case, and any value that parses as the number zero (``0``,
    ``0.0``, ``"0"``, ``"0.0"``). Missing values (``None`` / NaN) and
    everything else are treated as "keep".

    Args:
        v: A single cell value from the ``validation`` column.

    Returns:
        True if the value denotes "false", otherwise False.
    """
    if isinstance(v, bool):
        return v is False
    if pd.isna(v):
        return False

    s = str(v).strip().lower()
    if s == "false":
        return True

    # A 0/1 column that also contains blanks is read by pandas as float, so the
    # zero arrives as 0.0 / "0.0" rather than "0". Compare numerically instead
    # of matching a single spelling.
    try:
        return float(s) == 0.0
    except ValueError:
        return False

def read_logical_lines(file_path: Path):
    """Read ``file_path`` (UTF-8) and return its non-blank lines.

    Every line is stripped of surrounding whitespace, blank lines are skipped,
    and a leading ``<digits>.`` marker is removed from what remains.
    """
    with file_path.open("r", encoding="utf-8") as handle:
        stripped = (raw.strip() for raw in handle)
        return [re.sub(r"^\s*\d+\.\s*", "", text) for text in stripped if text]

def write_numbered_lines(file_path: Path, lines):
    """Overwrite ``file_path`` (UTF-8) with ``lines``, one per row, prefixed "1. ", "2. ", ..."""
    numbered = (f"{position}. {text}\n" for position, text in enumerate(lines, start=1))
    with file_path.open("w", encoding="utf-8") as handle:
        handle.writelines(numbered)

def main():
    """Apply the manual annotations in ``INPUT_CSV`` and write ``<type>_0.txt`` files.

    Rows whose ``validation`` value is a false mark name a (subfolder,
    file_type, line_no) to delete. For every existing source file under
    ``BASE_DIR``:

    * if nothing is marked for it, the file is copied unchanged;
    * otherwise its logical (non-blank, de-numbered) lines are read, the marked
      line numbers are removed, and the rest is written out - renumbered when
      ``RENUMBER_ON_WRITE`` is set.

    Line numbers outside the file are reported and ignored.

    Raises:
        FileNotFoundError: if ``INPUT_CSV`` does not exist.
    """
    if not INPUT_CSV.is_file():
        raise FileNotFoundError(f"找不到标注 CSV：{INPUT_CSV}")

    annotations = pd.read_csv(INPUT_CSV, dtype={"subfolder": str, "file_type": str, "line_no": int})
    annotations = annotations[["validation", "subfolder", "file_type", "line_no"]].copy()
    annotations["file_type"] = annotations["file_type"].str.upper().str.strip()
    annotations["subfolder"] = annotations["subfolder"].astype(str).str.strip()

    # (subfolder, file_type) -> set of 1-based line numbers to delete
    remove_map = {}
    row_fields = zip(
        annotations["validation"],
        annotations["subfolder"],
        annotations["file_type"],
        annotations["line_no"],
    )
    for mark, folder_name, kind, line_no in row_fields:
        if not is_false_mark(mark):
            continue
        remove_map.setdefault((folder_name, kind), set()).add(int(line_no))

    deleted_count = 0
    written_count = 0

    subfolders = sorted(entry for entry in BASE_DIR.iterdir() if entry.is_dir())
    for folder in subfolders:
        for ft in FILE_TYPES:
            src = folder / ft
            if not src.exists():
                continue

            dst = folder / f"{ft}_0.txt"
            requested = remove_map.get((folder.name, ft), set())

            if requested:
                lines = read_logical_lines(src)
                line_total = len(lines)

                # Only line numbers that actually exist in the file can be removed.
                in_range = {number for number in requested if 1 <= number <= line_total}
                if len(in_range) < len(requested):
                    out_of_range = sorted(requested - in_range)
                    print(f"[警告] {folder.name}/{ft} 有越界行号被忽略：{out_of_range}（总行数={line_total}）")

                survivors = [
                    text
                    for number, text in enumerate(lines, start=1)
                    if number not in in_range
                ]
                deleted_count += len(in_range)

                if RENUMBER_ON_WRITE:
                    write_numbered_lines(dst, survivors)
                else:
                    with dst.open("w", encoding="utf-8") as handle:
                        handle.writelines(text + "\n" for text in survivors)
            else:
                # Nothing marked for this file: keep a byte-for-byte copy.
                shutil.copyfile(src, dst)

            written_count += 1

    print(f"完成：写出 {written_count} 个 *_0.txt 文件，删除 {deleted_count} 行。")

# Apply the annotations using the paths configured at the top of this cell.
if __name__ == "__main__":
    main()


完成：写出 1500 个 *_0.txt 文件，删除 69 行。


In [37]:
# 1. word n-gram

import os
import re
import shutil
import pandas as pd

# Root folder; every direct subfolder is scanned for E_0.txt / N_0.txt / C_0.txt.
ROOT_FOLDER = "/Users/phoebeeeee/ongoing/Beyond-noise-MA-Zuo/EACL/qwen_7b_generation_raw"
# An explanation is dropped when its diversity against a kept one is below this for every n.
THRESHOLD = 0.5
# Word n-gram orders used for the diversity comparison.
N_GRAMS = [1, 2, 3]

def extract_explanations(file_path):
    """Return the non-blank lines of ``file_path`` with a leading "<digits>." marker removed.

    Lines are stripped of surrounding whitespace first; blank lines are skipped.
    """
    explanations = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text:
                explanations.append(re.sub(r"^\d+\.\s*", "", text))
    return explanations

def get_ngrams(text, n):
    """Return the set of word ``n``-grams (tuples) of ``text``.

    The text is lower-cased and split on whitespace. A text with fewer than
    ``n`` tokens yields an empty set.
    """
    words = text.lower().split()
    if len(words) < n:
        return set()
    # Zipping the token list against its own shifted copies yields the sliding windows.
    shifted = (words[offset:] for offset in range(n))
    return set(zip(*shifted))

def compute_lexical_diversity(a, b, n):
    """Return the Jaccard distance between the ``n``-gram sets of ``a`` and ``b``.

    The value is ``1 - |A & B| / |A | B|``. When neither text has any
    ``n``-gram, 0.0 is returned.
    """
    grams_a, grams_b = get_ngrams(a, n), get_ngrams(b, n)
    combined = grams_a.union(grams_b)
    if len(combined) == 0:
        return 0.0
    shared = grams_a.intersection(grams_b)
    return 1 - len(shared) / len(combined)

def remove_low_lexical_diversity(explanations, n_gram_list, threshold):
    """Greedily drop explanations that are too close to an already kept one.

    Explanations are visited in order. One is dropped when, for some previously
    kept explanation, the diversity is below ``threshold`` for every ``n`` in
    ``n_gram_list``; the first such kept explanation is recorded as the reason.

    Returns:
        ``(kept, records)`` where ``kept`` is the list of surviving texts and
        ``records`` has one dict per input explanation (1-based indices, the
        per-n diversities for dropped items, and an ``is_removed`` flag).
    """
    survivors = []  # (0-based position in the input, text)
    removed_records = []

    for position, candidate in enumerate(explanations):
        hit = None
        for anchor_position, anchor_text in survivors:
            diversities = {
                n: compute_lexical_diversity(candidate, anchor_text, n) for n in n_gram_list
            }
            if all(score < threshold for score in diversities.values()):
                hit = (anchor_position, anchor_text, diversities)
                break

        if hit is None:
            survivors.append((position, candidate))
            record = {
                "removed_index": position + 1,
                "removed_text": candidate,
                "compared_to_index": None,
                "compared_to_text": None,
                **{f"diversity_{n}gram": None for n in n_gram_list},
                "is_removed": False
            }
        else:
            anchor_position, anchor_text, diversities = hit
            record = {
                "removed_index": position + 1,
                "removed_text": candidate,
                "compared_to_index": anchor_position + 1,
                "compared_to_text": anchor_text,
                **{f"diversity_{n}gram": diversities[n] for n in n_gram_list},
                "is_removed": True
            }
        removed_records.append(record)

    return [text for _, text in survivors], removed_records

def process_all():
    """Run the word n-gram de-duplication over every ``*_0.txt`` under ``ROOT_FOLDER``.

    For each subfolder and each of ``E_0.txt`` / ``N_0.txt`` / ``C_0.txt`` that
    exists, the filtered explanations are written (renumbered) to
    ``<label>_first.txt``; files with fewer than two explanations are copied
    unchanged. A per-file summary and the per-explanation decision log are
    saved as CSV files in ``ROOT_FOLDER``.
    """
    summary = []
    all_removed_records = []

    for subfolder in os.listdir(ROOT_FOLDER):
        sub_path = os.path.join(ROOT_FOLDER, subfolder)
        if not os.path.isdir(sub_path):
            continue

        for file_name in ("E_0.txt", "N_0.txt", "C_0.txt"):
            file_path = os.path.join(sub_path, file_name)
            if not os.path.isfile(file_path):
                continue

            explanations = extract_explanations(file_path)
            original_count = len(explanations)

            label = file_name.partition("_")[0]
            output_file = os.path.join(sub_path, f"{label}_first.txt")

            if original_count >= 2:
                filtered, removed_records = remove_low_lexical_diversity(
                    explanations, N_GRAMS, THRESHOLD
                )

                with open(output_file, "w", encoding="utf-8") as out:
                    out.writelines(
                        f"{idx}. {line}\n" for idx, line in enumerate(filtered, 1)
                    )

                # Tag each decision with the file it belongs to before pooling.
                for record in removed_records:
                    record["folder"] = subfolder
                    record["file"] = file_name
                all_removed_records += removed_records

                retained_count = len(filtered)
                reduction_rate = 1 - retained_count / original_count
            else:
                # Nothing to compare against: pass the file through untouched.
                shutil.copyfile(file_path, output_file)
                retained_count = original_count
                reduction_rate = 0.0

            summary.append({
                "folder": subfolder,
                "file": file_name,
                "original_count": original_count,
                "retained_count": retained_count,
                "reduction_rate": reduction_rate
            })

    pd.DataFrame(summary).to_csv(os.path.join(ROOT_FOLDER, "summary_lexical_n123.csv"), index=False)
    pd.DataFrame(all_removed_records).to_csv(os.path.join(ROOT_FOLDER, "removed_details_lexical_n123.csv"), index=False)
    print("Processed all files and saved summaries.")

# Run the word n-gram stage with the settings configured at the top of this cell.
if __name__ == "__main__":
    process_all()


Processed all files and saved summaries.


In [54]:
# 2. pos-tag n-gram

import os
import re
import pandas as pd
import spacy
import shutil

# spaCy pipeline used by get_ngrams to obtain the POS tag of every token.
nlp = spacy.load("en_core_web_md")

# Root folder; every direct subfolder is scanned for E_first.txt / N_first.txt / C_first.txt.
ROOT_FOLDER = "/Users/phoebeeeee/ongoing/Beyond-noise-MA-Zuo/EACL/qwen_7b_generation_raw"
# An explanation is dropped when its diversity against a kept one is below this for every n.
THRESHOLD = 0.5
# POS-tag n-gram orders used for the diversity comparison.
N_GRAMS = [1, 2, 3]


def extract_explanations(file_path):
    """Load ``file_path`` and return its explanations, one per non-blank line.

    Each line is stripped of surrounding whitespace and a leading "<digits>."
    marker (plus following whitespace) is removed.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = map(str.strip, handle.readlines())
        return [re.sub(r"^\d+\.\s*", "", text) for text in stripped_lines if text]

# def get_ngrams(text, n):
#     tokens = text.lower().split()
#     return set(zip(*[tokens[i:] for i in range(n)])) if len(tokens) >= n else set()


# POS-tag sequence per text, filled lazily by get_ngrams. The de-duplication
# loop asks for the same text once per comparison partner and once per n, so
# without this cache the spaCy pipeline is re-run on identical input many times.
_POS_TAG_CACHE = {}


def get_ngrams(text, n):
    """Return the set of POS-tag ``n``-grams (tuples of ``token.pos_``) of ``text``.

    The text is parsed with the module-level ``nlp`` pipeline at most once; the
    resulting tag sequence is cached by text for the lifetime of the cell. A
    text with fewer than ``n`` tokens yields an empty set.

    Args:
        text: The explanation to analyse.
        n: The n-gram order.

    Returns:
        A set of ``n``-tuples of coarse POS tags.
    """
    pos_tags = _POS_TAG_CACHE.get(text)
    if pos_tags is None:
        pos_tags = tuple(token.pos_ for token in nlp(text))
        _POS_TAG_CACHE[text] = pos_tags
    return set(zip(*[pos_tags[i:] for i in range(n)])) if len(pos_tags) >= n else set()


def compute_lexical_diversity(a, b, n):
    """Jaccard distance between the ``n``-gram sets that ``get_ngrams`` yields for ``a`` and ``b``.

    Returns 0.0 when both sets are empty, otherwise
    ``1 - (shared n-grams) / (all distinct n-grams)``.
    """
    first = get_ngrams(a, n)
    second = get_ngrams(b, n)
    everything = first | second
    if everything:
        overlap = first & second
        return 1 - len(overlap) / len(everything)
    return 0.0

def remove_low_lexical_diversity(explanations, n_gram_list, threshold):
    """Keep the first of every group of explanations that are too alike.

    Walking the input in order, an explanation is discarded as soon as one
    already kept explanation has a diversity below ``threshold`` for every
    ``n`` in ``n_gram_list``; otherwise it is kept.

    Returns:
        A pair ``(kept_texts, log)``. ``log`` holds one dict per input
        explanation with its 1-based index, the kept explanation it was
        matched against (if any), the per-n diversities and ``is_removed``.
    """
    kept_texts = []
    kept_positions = []  # 1-based positions of the kept explanations in the input
    log = []

    for position, text in enumerate(explanations, start=1):
        for anchor_position, anchor_text in zip(kept_positions, kept_texts):
            scores = {
                n: compute_lexical_diversity(text, anchor_text, n) for n in n_gram_list
            }
            if all(score < threshold for score in scores.values()):
                log.append({
                    "removed_index": position,
                    "removed_text": text,
                    "compared_to_index": anchor_position,
                    "compared_to_text": anchor_text,
                    **{f"diversity_{n}gram": scores[n] for n in n_gram_list},
                    "is_removed": True
                })
                break
        else:
            # No kept explanation was close enough: this one survives.
            kept_texts.append(text)
            kept_positions.append(position)
            log.append({
                "removed_index": position,
                "removed_text": text,
                "compared_to_index": None,
                "compared_to_text": None,
                **{f"diversity_{n}gram": None for n in n_gram_list},
                "is_removed": False
            })

    return kept_texts, log

def process_all():
    """Run the POS-tag n-gram de-duplication over every ``*_first.txt`` under ``ROOT_FOLDER``.

    For each subfolder and each of ``E_first.txt`` / ``N_first.txt`` /
    ``C_first.txt``, the filtered explanations are written (renumbered) to
    ``<label>_second.txt``. Missing inputs are reported and skipped; inputs
    with fewer than two explanations are copied unchanged. A per-file summary
    and the per-explanation decision log are saved as CSV files in
    ``ROOT_FOLDER``.
    """
    summary_rows = []
    detail_rows = []

    for subfolder in os.listdir(ROOT_FOLDER):
        sub_path = os.path.join(ROOT_FOLDER, subfolder)
        if not os.path.isdir(sub_path):
            continue

        for file_name in ["E_first.txt", "N_first.txt", "C_first.txt"]:
            file_path = os.path.join(sub_path, file_name)
            if not os.path.isfile(file_path):
                print(f"File not found: {file_path}")
                continue

            label_prefix = file_name.split("_")[0]
            output_file = os.path.join(sub_path, f"{label_prefix}_second.txt")

            explanations = extract_explanations(file_path)
            count_before = len(explanations)
            stats = {
                "folder": subfolder,
                "file": file_name,
                "original_count": count_before,
            }

            if count_before < 2:
                # A single (or no) explanation cannot be a duplicate of anything.
                shutil.copyfile(file_path, output_file)
                stats["retained_count"] = count_before
                stats["reduction_rate"] = 0.0
                summary_rows.append(stats)
                continue

            filtered, decisions = remove_low_lexical_diversity(
                explanations, N_GRAMS, THRESHOLD
            )

            numbered = [f"{idx}. {line}\n" for idx, line in enumerate(filtered, 1)]
            with open(output_file, "w", encoding="utf-8") as out:
                out.write("".join(numbered))

            for decision in decisions:
                decision.update({"folder": subfolder, "file": file_name})
            detail_rows.extend(decisions)

            stats["retained_count"] = len(filtered)
            stats["reduction_rate"] = 1 - len(filtered) / count_before
            summary_rows.append(stats)

    pd.DataFrame(summary_rows).to_csv(os.path.join(ROOT_FOLDER, "summary_postag_n123.csv"), index=False)
    pd.DataFrame(detail_rows).to_csv(os.path.join(ROOT_FOLDER, "removed_details_postag_n123.csv"), index=False)
    print("Done deduplication")

# Run the POS-tag stage with the settings configured at the top of this cell.
if __name__ == "__main__":
    process_all()


Done deduplication


In [None]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [55]:
# 3. sentence_embedding

import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer, util
from sklearn.decomposition import PCA

# Sentence encoder shared by the de-duplication and the PCA plot.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# A later explanation whose cosine similarity to an earlier one exceeds this is dropped.
SIM_THRESHOLD = 0.9
# Root folder; every direct subfolder is scanned for E_second.txt / N_second.txt / C_second.txt.
ROOT_FOLDER = "/Users/phoebeeeee/ongoing/Beyond-noise-MA-Zuo/EACL/qwen_7b_generation_raw" 

def extract_explanations(file_path):
    """Read ``file_path`` and return one de-numbered explanation per non-blank line.

    Surrounding whitespace is stripped, blank lines are ignored, and a leading
    "<digits>." marker with its trailing whitespace is removed.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        non_blank = [raw.strip() for raw in handle if raw.strip()]
    leading_number = re.compile(r"^\d+\.\s*")
    return [leading_number.sub("", text) for text in non_blank]

def mean_pairwise_similarity(sim_matrix):
    """Average of the off-diagonal entries of a square similarity matrix.

    Returns 0.0 for a matrix with fewer than two rows, since there is no pair
    to average over.
    """
    size = sim_matrix.shape[0]
    if size < 2:
        return 0.0
    # Everything except the self-similarities on the diagonal.
    off_diagonal_total = np.sum(sim_matrix) - np.trace(sim_matrix)
    pair_count = size * (size - 1)
    return off_diagonal_total / pair_count

def remove_similar_explanations(explanations, threshold=None):
    """Greedily drop explanations that are near-duplicates of an earlier kept one.

    All explanations are embedded with the module-level ``model`` and compared
    by cosine similarity. Walking the list in order, an explanation is kept
    unless its similarity to some *already kept* explanation exceeds the
    threshold.

    Only kept explanations act as comparison anchors. Previously an
    explanation that had itself been discarded could still cause later ones to
    be discarded (A~B, B~C, A!~C removed both B and C), so items could vanish
    without being similar to anything that survived.

    Args:
        explanations: List of explanation strings.
        threshold: Similarity above which a later explanation is dropped.
            Defaults to the module-level ``SIM_THRESHOLD``.

    Returns:
        ``(filtered, embeddings, sim_matrix, keep_indices)``: the surviving
        texts, the embeddings of all inputs, the full pairwise similarity
        matrix, and the ascending list of kept positions.
    """
    if threshold is None:
        threshold = SIM_THRESHOLD

    embeddings = model.encode(explanations)
    sim_matrix = util.cos_sim(embeddings, embeddings).cpu().numpy()

    keep_indices = []
    for candidate in range(len(explanations)):
        # Compare against survivors only; they all precede `candidate`.
        if all(sim_matrix[anchor][candidate] <= threshold for anchor in keep_indices):
            keep_indices.append(candidate)

    filtered = [explanations[i] for i in keep_indices]
    return filtered, embeddings, sim_matrix, keep_indices

def plot_pca_original_vs_final(original_explanations, final_explanations, out_path, title="PCA: Original vs Final"):
    """Save a 2-D PCA scatter of the original explanations, highlighting the kept ones.

    All original explanations are embedded with the module-level ``model`` and
    projected onto two principal components. Every point is drawn in light
    gray and labelled with its 1-based position; points whose text also occurs
    in ``final_explanations`` are overdrawn in blue. Nothing is plotted when
    fewer than two original explanations are given.
    """
    total = len(original_explanations)
    if total < 2:
        print(f"Not enough original explanations to plot PCA: {total}")
        return

    embeddings = model.encode(original_explanations)
    kept_texts = set(final_explanations)
    # True for every original explanation whose text survived to the final set.
    kept_mask = np.array([text in kept_texts for text in original_explanations])

    coords = PCA(n_components=2).fit_transform(embeddings)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(coords[:, 0], coords[:, 1], c='lightgray', alpha=0.4, label="Original (All)")
    ax.scatter(coords[kept_mask, 0], coords[kept_mask, 1], c='blue', alpha=0.8, label="Final (Kept)")

    for number, (x, y) in enumerate(coords, start=1):
        ax.text(x + 0.01, y + 0.01, str(number), fontsize=7, color='black')

    ax.set_title(title)
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)

def plot_similarity_matrix(sim_matrix, out_path, title="Similarity Matrix"):
    """Save an annotated heatmap of ``sim_matrix`` to ``out_path``.

    Rows and columns are labelled with 1-based indices and each cell shows its
    value with two decimals.
    """
    tick_labels = np.arange(1, sim_matrix.shape[0] + 1)

    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        sim_matrix,
        annot=True,
        fmt=".2f",
        cmap="coolwarm",
        square=True,
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel("Index")
    ax.set_ylabel("Index")
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)

def process_all_files(root_folder):
    """Run the sentence-embedding de-duplication over every ``*_second.txt``.

    For each subfolder of ``root_folder`` and each label in E / N / C:

    * ``<label>_second.txt`` is read; inputs with fewer than two explanations
      are copied unchanged to ``<label>_third.txt``;
    * otherwise near-duplicates are removed and the survivors are written
      (renumbered) to ``<label>_third.txt``;
    * a similarity heatmap of the input is saved, and - if ``<label>.txt``
      exists - a PCA plot of that file against the survivors;
    * a summary row with counts and mean pairwise similarities is collected.

    The summary is saved as ``summary_sentence_embedding.csv`` in
    ``root_folder``.

    Args:
        root_folder: Directory whose direct subfolders hold the stage files.
    """
    # This cell never imports shutil at the top, so the call below used to rely
    # on an earlier cell having imported it. Import it here so the function
    # also works on a fresh kernel.
    import shutil

    summary = []

    for subfolder in os.listdir(root_folder):
        sub_path = os.path.join(root_folder, subfolder)
        if not os.path.isdir(sub_path):
            continue
        for label in ["E", "N", "C"]:
            input_file = os.path.join(sub_path, f"{label}_second.txt")
            output_file = os.path.join(sub_path, f"{label}_third.txt")
            if not os.path.isfile(input_file):
                continue

            explanations = extract_explanations(input_file)
            if len(explanations) < 2:
                # Nothing to compare: pass the file through untouched.
                shutil.copyfile(input_file, output_file)
                summary.append({
                    "folder": subfolder,
                    "file": f"{label}_second.txt",
                    "original_count": len(explanations),
                    "retained_count": len(explanations),
                    "reduction_rate": 0.0,
                    "mean_sim_original": None,
                    "mean_sim_filtered": None,
                    "similarity_drop": None,
                })
                continue

            filtered, embeddings, sim_matrix, filtered_indices = remove_similar_explanations(explanations)
            # The similarities among the survivors are already contained in the
            # full matrix; slice them out instead of encoding the texts again.
            filtered_sim_matrix = sim_matrix[np.ix_(filtered_indices, filtered_indices)]

            mean_sim_orig = mean_pairwise_similarity(sim_matrix)
            mean_sim_filt = mean_pairwise_similarity(filtered_sim_matrix)
            similarity_drop = mean_sim_orig - mean_sim_filt

            with open(output_file, 'w', encoding='utf-8') as f:
                for idx, line in enumerate(filtered, 1):
                    f.write(f"{idx}. {line}\n")

            # NOTE(review): the earlier stages in this notebook read the
            # extension-less "<label>" file and write "<label>_0.txt"; nothing
            # here creates "<label>.txt". If that file does not exist the PCA
            # plot is silently skipped - confirm the intended file name.
            original_file = os.path.join(sub_path, f"{label}.txt")
            if os.path.isfile(original_file):
                original_expl = extract_explanations(original_file)
                pca_out = os.path.join(sub_path, f"{label}_third_pca_from_original.png")
                plot_pca_original_vs_final(original_expl, filtered, pca_out, title=f"{subfolder}_{label} Original vs Final PCA")

            sim_out = os.path.join(sub_path, f"{label}_third_sim_matrix.png")
            plot_similarity_matrix(sim_matrix, sim_out, title=f"{subfolder}_{label}_third Similarity Matrix")

            summary.append({
                "folder": subfolder,
                "file": f"{label}_second.txt",
                "original_count": len(explanations),
                "retained_count": len(filtered),
                "reduction_rate": 1 - len(filtered) / len(explanations),
                "mean_sim_original": mean_sim_orig,
                "mean_sim_filtered": mean_sim_filt,
                "similarity_drop": similarity_drop,
            })

    df = pd.DataFrame(summary)
    summary_path = os.path.join(root_folder, "summary_sentence_embedding.csv")
    df.to_csv(summary_path, index=False)
    # Report the path that was actually written (the old message said "summary.csv").
    print(f"Summary saved to {summary_path}")
# Run the sentence-embedding stage on the configured root folder.
process_all_files(ROOT_FOLDER)


Summary saved to summary.csv


In [56]:
# collect dedup information
 
import os
import re
import pandas as pd

# Root folder; every direct subfolder is inspected for the per-stage files.
ROOT_FOLDER = "/Users/phoebeeeee/ongoing/Beyond-noise-MA-Zuo/EACL/qwen_7b_generation_raw"

# File-name prefixes of the three explanation files kept in each subfolder.
LABELS = ["E", "N", "C"]
# (summary column, source-file suffix, target-file suffix) for each stage transition.
# An empty source suffix stands for the un-suffixed "<label>.txt" file.
STAGE_PATHS = [
    ("0-1", "", "first"),
    ("1-2", "first", "second"),
    ("2-3", "second", "third"),
]

def extract_indexed_explanations(file_path):
    """Return ``(number, text)`` for every line of ``file_path`` that starts with "<digits>.".

    Lines are stripped of surrounding whitespace before matching; lines without
    a leading number are ignored. ``number`` is an int and ``text`` is the rest
    of the line after the marker and any whitespace following it.
    """
    numbered_line = re.compile(r"^(\d+)\.\s*(.*)")
    with open(file_path, 'r', encoding='utf-8') as handle:
        matches = (numbered_line.match(raw.strip()) for raw in handle)
        return [(int(found.group(1)), found.group(2)) for found in matches if found]

def compare_files(from_list, to_list):
    """Find the entries of ``from_list`` that did not survive into ``to_list``.

    Entries are matched by *text*, not by line number: every stage rewrites
    its output with a fresh 1..k numbering, so comparing numbers would always
    report the last lines of the source file as dropped, whichever lines were
    really removed. Texts are matched as a multiset, earliest occurrence
    first, so an exact duplicate that was removed is still reported.

    Args:
        from_list: ``(index, text)`` pairs of the source file.
        to_list: ``(index, text)`` pairs of the target file.

    Returns:
        ``(dropped_indices, dropped_texts)``: the source-file indices of the
        dropped entries in ascending order, and their texts in the same order.
    """
    from collections import Counter

    remaining = Counter(text for _, text in to_list)
    dropped = []
    for idx, text in from_list:
        if remaining[text] > 0:
            remaining[text] -= 1  # this source line is accounted for in the target
        else:
            dropped.append((idx, text))

    dropped.sort(key=lambda pair: pair[0])
    dropped_indices = [idx for idx, _ in dropped]
    dropped_texts = [text for _, text in dropped]
    return dropped_indices, dropped_texts

def compare_all_stages():
    """Summarise, per subfolder and label, what each de-duplication stage removed.

    For every stage transition in ``STAGE_PATHS`` the source and target files
    are diffed with ``compare_files``; the number and texts of the dropped
    explanations are stored under the stage name. ``dropped_count`` is the
    total over all stages. Transitions whose source or target file is missing
    are left at zero. The result is written to
    ``explanation_diff_summary_structured.csv`` in ``ROOT_FOLDER``.
    """
    all_records = []

    # Sorted so the CSV row order does not depend on the file system.
    for subfolder in sorted(os.listdir(ROOT_FOLDER)):
        sub_path = os.path.join(ROOT_FOLDER, subfolder)
        if not os.path.isdir(sub_path):
            continue

        for label in LABELS:
            row = {
                "folder": subfolder,
                "label": label,
                "0-1": {"num": 0, "text": []},
                "1-2": {"num": 0, "text": []},
                "2-3": {"num": 0, "text": []},
                "dropped_count": 0,
            }

            # Line numbers restart in every stage file, so indices from
            # different stages cannot be merged into one set; add up the
            # per-stage counts instead.
            total_dropped = 0

            for stage_name, from_tag, to_tag in STAGE_PATHS:
                if from_tag == "":
                    # The first word n-gram stage reads "<label>_0.txt"; fall
                    # back to it when the un-suffixed "<label>.txt" is absent.
                    candidates = [f"{label}.txt", f"{label}_0.txt"]
                else:
                    candidates = [f"{label}_{from_tag}.txt"]

                from_path = None
                for candidate in candidates:
                    candidate_path = os.path.join(sub_path, candidate)
                    if os.path.isfile(candidate_path):
                        from_path = candidate_path
                        break

                to_path = os.path.join(sub_path, f"{label}_{to_tag}.txt")

                if from_path is None or not os.path.isfile(to_path):
                    continue

                from_data = extract_indexed_explanations(from_path)
                to_data = extract_indexed_explanations(to_path)

                dropped_indices, dropped_texts = compare_files(from_data, to_data)

                row[stage_name]["num"] = len(dropped_indices)
                row[stage_name]["text"] = dropped_texts
                total_dropped += len(dropped_indices)

            row["dropped_count"] = total_dropped

            all_records.append(row)

    df = pd.DataFrame(all_records)
    output_csv = os.path.join(ROOT_FOLDER, "explanation_diff_summary_structured.csv")
    df.to_csv(output_csv, index=False)
    print(f"✅ Summary saved to {output_csv}")

# Build the per-stage drop summary for the configured root folder.
compare_all_stages()


✅ Summary saved to /Users/phoebeeeee/ongoing/Beyond-noise-MA-Zuo/EACL/qwen_7b_generation_raw/explanation_diff_summary_structured.csv
