In [41]:
import functools
import hashlib
import json
import logging
import time
from dataclasses import dataclass
from pathlib import Path

import psutil
from tqdm.auto import tqdm

@dataclass()
class ExperimentConfig:
    """Settings for one SimHash + LSH de-duplication experiment.

    The six algorithm parameters determine ``param_hash``, which in turn names
    the per-experiment directory and the artefacts stored inside it. The two
    path fields do not take part in the hash.

    Raises:
        ValueError: from ``__post_init__`` when ``num_bands * band_size`` is
            not ``simhash_bits`` or ``threshold`` lies outside ``(0, 1]``.
    """

    # Algorithm parameters
    num_bands: int = 4
    band_size: int = 16
    threshold: float = 0.8
    simhash_bits: int = 64
    ngram_range: tuple = (1, 3)
    chunk_size: int = 10000

    # Filesystem locations
    raw_data_root: str = "D:/spring2025/UCUG2011-Discrete-Math/project 1/discrete-math-project-1/src/data"
    processed_dir: str = "D:/spring2025/UCUG2011-Discrete-Math/project 1/discrete-math-project-1/src/processeddata"

    # Plain class attribute (unannotated, hence not a dataclass field). It is
    # computed once from the *default* processed_dir when the class body runs,
    # so it is shared by all instances and does not follow a per-instance
    # processed_dir override.
    preprocessed_path = Path(processed_dir) / "preprocessed.csv"

    def __post_init__(self):
        """Reject inconsistent parameter combinations."""
        if self.simhash_bits != self.num_bands * self.band_size:
            raise ValueError("num_bands * band_size must equal simhash_bits")
        threshold_in_range = 0 < self.threshold <= 1
        if not threshold_in_range:
            raise ValueError("Threshold must be in (0.0, 1.0]")

    @property
    def param_hash(self) -> str:
        """First 10 hex digits of the MD5 of the algorithm parameters.

        The threshold is rounded to two decimals before hashing.
        """
        hashed_fields = dict(
            num_bands=self.num_bands,
            band_size=self.band_size,
            threshold=round(self.threshold, 2),
            simhash_bits=self.simhash_bits,
            ngram_range=self.ngram_range,
            chunk_size=self.chunk_size,
        )
        canonical = json.dumps(hashed_fields, sort_keys=True)
        digest = hashlib.md5(canonical.encode()).hexdigest()
        return digest[:10]

    @property
    def params_dict(self) -> dict:
        """Algorithm parameters as a flat dict; ``ngram_range`` is stringified."""
        ordered_names = (
            'num_bands', 'band_size', 'threshold',
            'simhash_bits', 'ngram_range', 'chunk_size',
        )
        summary = {name: getattr(self, name) for name in ordered_names}
        # Overwriting an existing key keeps its position in the dict.
        summary['ngram_range'] = str(self.ngram_range)
        return summary

    @property
    def experiment_dir(self) -> Path:
        """Directory of this experiment; created on every access if missing."""
        target = Path(self.processed_dir, f"exp_{self.param_hash}")
        target.mkdir(parents=True, exist_ok=True)
        return target

    @property
    def signature_path(self) -> Path:
        """Parquet file holding the SimHash signatures of this experiment."""
        file_name = f"signatures_{self.param_hash}.parquet"
        return self.experiment_dir.joinpath(file_name)

    @property
    def candidates_path(self) -> Path:
        """CSV file holding the candidate pairs of this experiment."""
        file_name = f"candidates_{self.param_hash}.csv"
        return self.experiment_dir.joinpath(file_name)

    @property
    def performance_log_path(self) -> Path:
        """JSON file holding the per-stage performance records."""
        return self.experiment_dir.joinpath("performance.json")

    
# Initialise logging: every record goes to a log file and to the notebook output.
# FileHandler opens its target as soon as it is constructed, so the directory
# has to exist beforehand; otherwise this cell fails on a fresh checkout.
Path(ExperimentConfig.processed_dir).mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        # Log messages in this notebook contain non-ASCII text; pin the file
        # encoding instead of relying on the platform default.
        logging.FileHandler(
            Path(ExperimentConfig.processed_dir)/'process.log',
            encoding='utf-8'
        ),
        logging.StreamHandler()
    ]
)

In [42]:
def monitor_performance(func):
    """Decorator recording wall-clock time and memory growth of ``func``.

    After ``func`` returns, the first ``ExperimentConfig`` found among the
    positional or keyword arguments decides where the record is appended
    (``config.performance_log_path``, a JSON list). When no config is passed
    the call is simply forwarded and nothing is recorded.

    The ``peak_memory_mb`` value is the difference in resident set size (RSS)
    between the start and the end of the call, in whole MB. It is not a true
    peak: memory allocated and released inside the call is not captured, and
    the value can be negative.

    Bookkeeping failures are logged and never discard the wrapped function's
    return value. Exceptions raised by ``func`` itself propagate unchanged and
    no record is written for that call.

    :param func: callable to wrap
    :return: wrapper with the same signature and return value as ``func``
    """
    @functools.wraps(func)  # keep __name__/__doc__ of the wrapped stage
    def wrapper(*args, **kwargs):
        start_time = time.monotonic()
        process = psutil.Process()
        start_mem = process.memory_info().rss

        result = func(*args, **kwargs)

        # Resource usage of the call
        elapsed = time.monotonic() - start_time
        end_mem = process.memory_info().rss
        peak_mem = (end_mem - start_mem) // 1024 // 1024  # bytes -> MB

        # Pick up the config object wherever the caller passed it
        config = next(
            (a for a in (*args, *kwargs.values()) if isinstance(a, ExperimentConfig)),
            None
        )
        if config is None:
            return result

        performance_data = {
            "stage": func.__name__,
            "time_sec": round(elapsed, 2),
            "peak_memory_mb": peak_mem,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
        }

        # performance_log_path goes through config.experiment_dir, which
        # creates the experiment directory when it is missing.
        log_path = config.performance_log_path

        # Merge with earlier records; anything unreadable or not a JSON list
        # is treated as corrupt and replaced.
        existing_logs = []
        if log_path.exists():
            try:
                with open(log_path, "r", encoding="utf-8") as f:
                    loaded = json.load(f)
            except (ValueError, OSError):
                loaded = None
            if isinstance(loaded, list):
                existing_logs = loaded
            else:
                logging.warning(f"性能日志 {log_path} 损坏，重置文件")

        existing_logs.append(performance_data)

        try:
            with open(log_path, "w", encoding="utf-8") as f:
                json.dump(existing_logs, f, indent=2)
        except OSError as e:
            logging.error(f"写入性能日志失败: {str(e)}")

        return result
    return wrapper

In [43]:
import re
import pandas as pd
from datasets import load_dataset

def preprocess_document(text):
    """Normalise one raw document to lowercase word tokens joined by single spaces.

    :param text: raw document string
    :return: cleaned string; empty when nothing but markers, punctuation or
        whitespace was present
    """
    # Strip structural markers of the form _START_xxx together with the
    # whitespace that follows them; the content after the marker is kept.
    without_markers = re.sub(
        r'_START[＿_]\w+\b\s*',
        '',
        text,
        flags=re.MULTILINE
    )

    lowered = without_markers.lower()

    # Drop everything that is neither a word character nor whitespace.
    # Underscores count as word characters and therefore survive.
    words_only = re.sub(r'[^\w\s]', '', lowered)

    # split() without arguments discards leading/trailing whitespace and
    # collapses runs of whitespace.
    tokens = words_only.split()
    return ' '.join(tokens)

def preprocess_data(config: ExperimentConfig):
    """Load the raw validation and test shards, clean the text and save one CSV.

    Reads every ``*.arrow`` file under ``<raw_data_root>/validation`` and
    ``<raw_data_root>/test``, renames ``wikidata_id`` to ``doc_id``, tags each
    row with its ``source`` (``'val'`` / ``'test'``), runs
    ``preprocess_document`` on the text and writes the columns
    ``doc_id, source, clean_text`` to ``config.preprocessed_path``.

    :param config: experiment configuration (paths and chunk size)
    :raises FileNotFoundError: when one of the two split directories contains
        no ``*.arrow`` file
    """
    logging.info("Starting Step 1: Data Preprocessing")

    frames = []
    # Validation rows come first, then test rows.
    for split_dir, source_label in (("validation", "val"), ("test", "test")):
        # glob() order is filesystem dependent; sorting keeps the row order,
        # and with it any chunk-wise processing downstream, reproducible.
        shard_files = sorted(
            str(p) for p in Path(f"{config.raw_data_root}/{split_dir}").glob("*.arrow")
        )
        if not shard_files:
            raise FileNotFoundError(
                f"No .arrow files found in {config.raw_data_root}/{split_dir}"
            )

        dataset = load_dataset("arrow", data_files=shard_files, split="train")
        frame = dataset.data.table.to_pandas().rename(columns={'wikidata_id': 'doc_id'})
        frame['source'] = source_label
        frames.append(frame[['doc_id', 'text', 'source']])

    merged_df = pd.concat(frames)

    # Clean the text chunk by chunk so the raw column of each chunk can be
    # dropped as soon as it has been processed. The chunk size only affects
    # memory use, not the result.
    chunk_size = config.chunk_size
    chunks = []

    with tqdm(total=len(merged_df), desc="Preprocessing Texts") as pbar:
        for start in range(0, len(merged_df), chunk_size):
            chunk = merged_df.iloc[start:start + chunk_size].copy()
            chunk['clean_text'] = chunk['text'].apply(
                lambda x: preprocess_document(str(x))
            )
            chunk = chunk.drop(columns=['text'])
            chunks.append(chunk)
            pbar.update(len(chunk))

    processed_df = pd.concat(chunks)

    # to_csv does not create missing directories.
    output_path = Path(config.preprocessed_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    processed_df[['doc_id', 'source', 'clean_text']].to_csv(
        output_path,
        index=False,
        encoding='utf-8'
    )
    logging.info(f"Saved preprocessed data to {output_path}")

In [44]:
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline
from joblib import Parallel, delayed

class OptimizedTFIDFSimHasher:
    """Weighted SimHash over sparse feature vectors.

    Every feature index is mapped to a fixed +1/-1 vector derived from its
    SHA-1 digest. A document signature is the sign pattern of the
    weight-scaled sum of those vectors.
    """

    # SHA-1 yields 160 bits; asking for more would silently produce constant
    # (always -1) components, so it is rejected up front.
    _MAX_BITS = 160

    def __init__(self, num_bits=64):
        """
        :param num_bits: signature length in bits, between 1 and 160
        :raises ValueError: when ``num_bits`` is outside that range
        """
        if not 1 <= num_bits <= self._MAX_BITS:
            raise ValueError(
                f"num_bits must be between 1 and {self._MAX_BITS}, got {num_bits}"
            )
        self.num_bits = num_bits
        self._feature_cache = {}  # feature index -> +1/-1 vector

    def _get_feature_hash(self, feature_id: int) -> np.ndarray:
        """Return the cached +1/-1 vector of a feature index.

        Component ``i`` is +1 when bit ``i`` (counted from the least
        significant end) of the SHA-1 digest of ``str(feature_id)`` is set,
        otherwise -1. SHA-1 keeps the mapping stable across processes.
        """
        cached = self._feature_cache.get(feature_id)
        if cached is None:
            hash_bytes = hashlib.sha1(str(feature_id).encode()).digest()
            hash_int = int.from_bytes(hash_bytes, byteorder='big')
            cached = np.array(
                [1 if (hash_int >> i) & 1 else -1 for i in range(self.num_bits)],
                dtype=np.int8
            )
            self._feature_cache[feature_id] = cached
        return cached

    def generate_signature(self, tfidf_vector):
        """Compute the SimHash signature of one sparse row.

        :param tfidf_vector: object exposing ``indices`` (non-zero feature
            indices) and ``data`` (their weights), as a single-row SciPy CSR
            matrix does
        :return: the signature as a non-negative int; component 0 becomes the
            most significant bit. A row without non-zero features yields 0.
        """
        hash_vector = np.zeros(self.num_bits, dtype=np.float32)

        # Only the non-zero features contribute
        for feature_id, weight in zip(tfidf_vector.indices, tfidf_vector.data):
            hash_vector += self._get_feature_hash(feature_id) * weight

        return int(''.join('1' if x > 0 else '0' for x in hash_vector), 2)


In [45]:
import csv
import numpy as np
from pathlib import Path
from collections import defaultdict
from typing import List, Union

class SimHashLSHProcessor:
    """Banded LSH over binary SimHash signature strings.

    Signatures are split into ``num_bands`` consecutive bands of ``band_size``
    bits. Two documents become a candidate pair when they agree on at least
    one band at the same position; candidates are then verified with the
    Hamming similarity of the full signatures.
    """

    def __init__(self, config):
        """
        :param config: object providing ``num_bands``, ``band_size``,
            ``threshold`` and ``simhash_bits``
        :raises ValueError: when the configuration is inconsistent
            (see ``_validate_config``)
        """
        self.config = config
        self._validate_config()
        self.num_bands = config.num_bands
        self.band_size = config.band_size
        self.threshold = config.threshold
        self.simhash_bits = config.simhash_bits

        self.inverted_index = defaultdict(set)  # band key -> doc ids
        self.signatures = {}                    # doc id -> bit string
        self.doc_metadata = {}                  # doc id -> {'source': ...}

    def _validate_config(self):
        """Check band layout and threshold range of ``self.config``."""
        if self.config.num_bands * self.config.band_size != self.config.simhash_bits:
            raise ValueError(
                f"num_bands({self.config.num_bands}) * band_size({self.config.band_size}) "
                f"must equal to simhash_bits({self.config.simhash_bits})"
            )
        if not (0 < self.config.threshold <= 1):
            raise ValueError(f"Threshold must be in (0.0, 1.0], got {self.config.threshold}")

    def load_signatures(self, parquet_path: Union[str, Path]):
        """
        Load signatures from a Parquet file into memory.

        The file must contain the columns ``doc_id``, ``source`` and
        ``simhash``. When a ``doc_id`` occurs more than once the last row wins.

        :param parquet_path: path of the signature file
        :raises KeyError: when a required column is missing
        :raises ValueError: when a signature does not have ``simhash_bits``
            characters
        """
        try:
            data = pd.read_parquet(parquet_path)
        except FileNotFoundError:
            logging.error(f"签名文件不存在: {parquet_path}")
            raise
        except Exception as e:
            logging.error(f"加载Parquet失败: {str(e)}")
            raise

        # Schema check
        required_fields = {'doc_id', 'source', 'simhash'}
        if not required_fields.issubset(data.columns):
            missing = required_fields - set(data.columns)
            raise KeyError(f"Missing required fields: {missing}")

        # Copy into the in-memory lookup tables
        for row in data.itertuples(index=False):
            if len(row.simhash) != self.config.simhash_bits:
                raise ValueError(
                    f"Length of the signature of the file {row.doc_id} is not equal to {self.config.simhash_bits}. "
                    f"Expected: {self.config.simhash_bits}."
                )

            self.signatures[row.doc_id] = row.simhash
            self.doc_metadata[row.doc_id] = {'source': row.source}

    def _band_signature(self, binary_str: str) -> List[str]:
        """Return one bucket key per band of a signature string.

        The band position is part of the hashed text. Without it, identical
        bit patterns found in different bands would land in the same bucket,
        which yields candidates LSH is not supposed to produce and, for
        narrow bands, collapses the index into a handful of huge buckets.
        """
        return [
            hashlib.sha1(
                f"{i}:{binary_str[i*self.band_size : (i+1)*self.band_size]}".encode()
            ).hexdigest()
            for i in range(self.num_bands)
        ]

    def _hamming_similarity(self, sig1: str, sig2: str) -> float:
        """Fraction of equal positions between two '0'/'1' strings.

        Both strings are expected to have ``simhash_bits`` characters.
        """
        arr1 = np.frombuffer(sig1.encode(), 'u1') - ord('0')
        arr2 = np.frombuffer(sig2.encode(), 'u1') - ord('0')
        return 1 - np.count_nonzero(arr1 != arr2) / self.simhash_bits

    def generate_candidates(self, output_path: str):
        """
        Build the inverted index, verify candidate pairs and write them to CSV.

        Each unordered pair is examined once (when the smaller id is the
        outer document), so the ids must be mutually comparable. Rows are
        written for pairs whose similarity reaches ``config.threshold``.

        :param output_path: destination CSV with the columns
            ``doc_id1, source1, doc_id2, source2, similarity``
        :return: dict with ``total_candidates`` (distinct pairs sharing at
            least one band) and ``valid_candidates`` (pairs written)
        """
        # Build the inverted index
        self.inverted_index.clear()
        for doc_id, sig in tqdm(self.signatures.items(), desc="Building Inverted Index"):
            for band_hash in self._band_signature(sig):
                self.inverted_index[band_hash].add(doc_id)

        stats = {'total': 0, 'valid': 0}

        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(["doc_id1", "source1", "doc_id2", "source2", "similarity"])

            for doc_id1 in tqdm(self.signatures, desc="生成候选对"):
                # Union of all buckets this document falls into
                candidate_ids = set()
                for band_hash in self._band_signature(self.signatures[doc_id1]):
                    candidate_ids.update(self.inverted_index[band_hash])

                for doc_id2 in candidate_ids:
                    # Skips the document itself and the mirrored pair; since
                    # candidate_ids is a set, every pair is seen exactly once
                    # and no extra bookkeeping of visited pairs is needed.
                    if doc_id1 >= doc_id2:
                        continue

                    stats['total'] += 1

                    similarity = self._hamming_similarity(
                        self.signatures[doc_id1],
                        self.signatures[doc_id2]
                    )

                    if similarity >= self.config.threshold:
                        stats['valid'] += 1

                        source1 = self.doc_metadata[doc_id1]['source']
                        source2 = self.doc_metadata[doc_id2]['source']
                        writer.writerow([
                            doc_id1, source1,
                            doc_id2, source2,
                            f"{similarity:.4f}"
                        ])

        logging.info(f"生成候选对完成: 总候选 {stats['total']}, 有效 {stats['valid']}")
        return {
            'total_candidates': stats['total'],
            'valid_candidates': stats['valid'],
        }


In [46]:
class UnionFind:
    """Disjoint-set forest with path compression and union by rank."""

    def __init__(self, elements):
        """Start with every element in its own set (rank 1)."""
        self.parent = dict((item, item) for item in elements)
        self.rank = dict.fromkeys(elements, 1)

    def find(self, x):
        """Return the representative of ``x`` and flatten the path to it."""
        # First pass: walk up to the root.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]

        # Second pass: point every node on the path directly at the root.
        node = x
        while node != root:
            next_node = self.parent[node]
            self.parent[node] = root
            node = next_node
        return root

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y``."""
        root_x, root_y = self.find(x), self.find(y)
        if root_x == root_y:
            return

        if self.rank[root_x] > self.rank[root_y]:
            self.parent[root_y] = root_x
            return

        # root_y has the higher or equal rank and becomes the new root;
        # a tie increases its rank.
        self.parent[root_x] = root_y
        if self.rank[root_x] == self.rank[root_y]:
            self.rank[root_y] += 1

    def get_components(self):
        """Return a mapping from representative to the list of its members."""
        groups = defaultdict(list)
        for member in self.parent:
            groups[self.find(member)].append(member)
        return groups


In [None]:
class ExperimentRunner:
    """Runs preprocessing, SimHash and LSH for one configuration and appends
    the resulting metrics to a shared summary CSV.
    """

    def __init__(self, base_dir=ExperimentConfig.processed_dir):
        """
        :param base_dir: directory in which ``experiment_summary.csv`` is kept
        """
        self.base_dir = Path(base_dir)
        self.current_results = {}
        self.summary_path = self.base_dir / "experiment_summary.csv"
        self._init_summary_file()

    def _init_summary_file(self):
        """Create a header-only summary CSV unless one already exists."""
        if self.summary_path.exists():
            return

        # to_csv does not create missing directories.
        self.base_dir.mkdir(parents=True, exist_ok=True)
        columns = [
            'experiment_id', 'timestamp',
            *ExperimentConfig().params_dict.keys(),
            'simhash_time_sec', 'lsh_time_sec',
            'total_time_sec', 'peak_memory_mb',
            'total_candidates', 'valid_candidates',
            'total_duplicate_ratio', 'cross_set_ratio',
            'intra_val_ratio', 'intra_test_ratio',
        ]
        pd.DataFrame(columns=columns).to_csv(self.summary_path, index=False)

    def _update_summary(self, config: ExperimentConfig, metrics: dict):
        """Append one row (parameters + metrics) to the summary CSV."""
        if self.summary_path.exists():
            existing_df = pd.read_csv(self.summary_path)
        else:
            existing_df = pd.DataFrame()

        new_row = {
            'experiment_id': config.param_hash,
            'timestamp': pd.Timestamp.now().isoformat(),
            **config.params_dict,
            **metrics
        }
        updated_df = pd.concat([existing_df, pd.DataFrame([new_row])], ignore_index=True)
        updated_df.to_csv(self.summary_path, index=False)

    def run_experiment(self, config: ExperimentConfig):
        """Run the whole pipeline for ``config`` and record the outcome.

        Preprocessing is skipped when the shared preprocessed CSV already
        exists. Exceptions raised by a stage propagate to the caller; in that
        case no summary row is written.
        """
        self.current_results = {}

        # Artefact paths are derived from the config; accessing them creates
        # the experiment directory.
        logging.info(f"签名文件路径: {config.signature_path}")
        logging.info(f"候选对路径: {config.candidates_path}")

        if config.preprocessed_path.exists():
            logging.info(f"使用现有预处理文件: {config.preprocessed_path}")
        else:
            logging.info(f"开始预处理数据: {config.preprocessed_path}")
            self._run_preprocessing(config)

        simhash_start = time.monotonic()
        self._run_simhash(config)
        simhash_time = time.monotonic() - simhash_start
        logging.info(f"SimHash签名生成耗时: {simhash_time:.2f}秒")

        lsh_start = time.monotonic()
        candidate_path = self._run_lsh(config)
        lsh_time = time.monotonic() - lsh_start

        dup_metrics = self._calculate_duplication_metrics(candidate_path)

        # Largest memory figure over all records in this experiment's
        # performance log (the log accumulates across runs).
        try:
            with open(config.performance_log_path, encoding="utf-8") as f:
                perf_data = json.load(f)
            peak_memory = max(log['peak_memory_mb'] for log in perf_data)
        except FileNotFoundError:
            logging.error("性能日志未生成，请检查预处理阶段")
            peak_memory = -1  # sentinel for "unknown"
        except Exception as e:
            logging.error(f"读取性能日志失败: {str(e)}")
            peak_memory = -1

        summary_metrics = {
            **config.params_dict,
            'simhash_time_sec': round(simhash_time, 2),
            'lsh_time_sec': round(lsh_time, 2),
            'total_time_sec': round(simhash_time + lsh_time, 2),
            'peak_memory_mb': peak_memory,
            **self.current_results,
            **dup_metrics,
        }
        self._update_summary(config, summary_metrics)

    def _run_preprocessing(self, config):
        """Produce the shared preprocessed CSV unless it already exists."""
        Path(config.processed_dir).mkdir(parents=True, exist_ok=True)

        if config.preprocessed_path.exists():
            logging.info(f"使用现有预处理文件: {config.preprocessed_path}")
            return

        preprocess_data(config)

    @monitor_performance
    def _run_simhash(self, config):
        """Compute a SimHash signature for every preprocessed document and
        store them in ``config.signature_path``.

        :raises FileNotFoundError: when the preprocessed CSV is missing
        """
        preprocessed_path = Path(config.preprocessed_path)
        if not preprocessed_path.exists():
            raise FileNotFoundError(f"预处理文件不存在: {preprocessed_path}")
        preprocessed_data = pd.read_csv(preprocessed_path)

        # A document whose cleaned text is empty is read back from CSV as
        # NaN; the vectorizer needs strings, so map those back to ''.
        texts = preprocessed_data['clean_text'].fillna('').astype(str).tolist()
        doc_ids = preprocessed_data['doc_id'].tolist()
        sources = preprocessed_data['source'].tolist()
        assert len(doc_ids) == len(sources) == len(texts), \
            f"数据长度不一致: doc_ids({len(doc_ids)}) vs sources({len(sources)}) vs texts({len(texts)})"

        # Hashing keeps the feature space fixed without storing a vocabulary
        pipeline = make_pipeline(
            HashingVectorizer(
                ngram_range=config.ngram_range,
                stop_words='english',
                n_features=2**20  # large hash space to limit collisions
            ),
            TfidfTransformer()
        )

        chunk_size = config.chunk_size
        total_chunks = (len(texts)-1) // chunk_size + 1
        signatures = []

        # The feature -> +/-1 mapping is deterministic, so a single hasher
        # (and its cache) can serve every chunk.
        hasher = OptimizedTFIDFSimHasher(num_bits=config.simhash_bits)

        with tqdm(total=total_chunks, desc="Generating SimHash Signatures") as pbar:
            for chunk_start in range(0, len(texts), chunk_size):
                chunk_texts = texts[chunk_start:chunk_start + chunk_size]

                # NOTE: the pipeline is re-fitted on every chunk, so the
                # TF-IDF weights depend on which documents share a chunk.
                tfidf_chunk = pipeline.fit_transform(chunk_texts)

                chunk_sigs = Parallel(n_jobs=-1, prefer="threads")(
                    delayed(hasher.generate_signature)(tfidf_chunk[j])
                    for j in range(tfidf_chunk.shape[0])
                )

                signatures.extend(chunk_sigs)
                pbar.update(1)

        # Signatures are stored as zero-padded bit strings
        signature_df = pd.DataFrame({
            'doc_id': doc_ids,
            'source': sources,
            'simhash': [format(sig, f'0{config.simhash_bits}b') for sig in signatures]
        })

        assert all(len(s) == config.simhash_bits
            for s in signature_df['simhash']), "签名长度异常"

        config.signature_path.parent.mkdir(parents=True, exist_ok=True)
        signature_df.to_parquet(
            config.signature_path,
            engine='pyarrow',
            compression='snappy'
        )
        logging.info(f"SimHash签名已保存至: {config.signature_path}")

    @monitor_performance
    def _run_lsh(self, config):
        """Generate candidate pairs and remember the candidate counts.

        :return: path of the candidate CSV
        """
        processor = SimHashLSHProcessor(config=config)
        try:
            processor.load_signatures(config.signature_path)
        except Exception as e:
            logging.error(f"加载签名失败: {str(e)}")
            raise
        result = processor.generate_candidates(config.candidates_path)
        self.current_results.update({
            'total_candidates': result['total_candidates'],
            'valid_candidates': result['valid_candidates']
        })

        return config.candidates_path

    def _calculate_duplication_metrics(self, candidates_path: Path) -> dict:
        """Derive duplicate ratios from the candidate pairs.

        Candidate pairs are merged into connected components. Every member of
        a component with at least two documents counts as a duplicate (the
        "original" is not subtracted). A component is classified as cross-set
        when it contains both ``val`` and ``test`` documents, otherwise as
        intra-val or intra-test. All ratios use the number of rows of the
        preprocessed CSV as denominator.

        :param candidates_path: CSV written by the LSH stage
        :return: dict with ``total_docs`` and the four ratios
        """
        candidates = pd.read_csv(candidates_path)
        # Only ids and sources are needed here; skip the large text column.
        preprocessed = pd.read_csv(
            Path(ExperimentConfig.preprocessed_path),
            usecols=['doc_id', 'source']
        )
        total_docs = len(preprocessed)
        doc_ids = preprocessed['doc_id'].tolist()

        # doc id -> set of sources, built once instead of scanning the whole
        # frame for every component
        sources_by_doc = defaultdict(set)
        for doc_id, source in zip(doc_ids, preprocessed['source'].tolist()):
            sources_by_doc[doc_id].add(source)

        uf = UnionFind(doc_ids)
        for id1, id2 in zip(candidates['doc_id1'].tolist(), candidates['doc_id2'].tolist()):
            uf.union(id1, id2)

        cross_set_duplicates = 0    # documents in components spanning both sets
        intra_val_duplicates = 0    # documents in validation-only components
        intra_test_duplicates = 0   # documents in test-only components
        duplicate_count = 0         # documents in any component of size >= 2

        for component in uf.get_components().values():
            component_size = len(component)
            if component_size < 2:
                continue  # singletons are not duplicates

            component_sources = set()
            for doc_id in component:
                component_sources |= sources_by_doc[doc_id]
            has_val = 'val' in component_sources
            has_test = 'test' in component_sources

            duplicate_count += component_size

            if has_val and has_test:
                cross_set_duplicates += component_size
            elif has_val:
                intra_val_duplicates += component_size
            elif has_test:
                intra_test_duplicates += component_size

        return {
            'total_docs': total_docs,
            'total_duplicate_ratio': duplicate_count / total_docs if total_docs else 0,
            'cross_set_ratio': cross_set_duplicates / total_docs if total_docs else 0,
            'intra_val_ratio': intra_val_duplicates / total_docs if total_docs else 0,
            'intra_test_ratio': intra_test_duplicates / total_docs if total_docs else 0
        }


In [48]:
from sklearn.model_selection import ParameterGrid
def main():
    """Run every valid parameter combination of the grid, one after another.

    A combination is valid when ``num_bands * band_size == simhash_bits``.
    The first failing experiment prints some debugging information and then
    re-raises, which stops the remaining runs.
    """
    runner = ExperimentRunner()

    # Parameter search space
    param_grid = {
        'num_bands': [4, 8, 16],
        'band_size': [16, 8, 4],  # make sure num_bands * band_size = simhash_bits
        'threshold': [0.7, 0.75, 0.8],
        'simhash_bits': [64],
        'ngram_range': [(1,3), (2,4)],
        'chunk_size': [10000, 50000]
    }
    # param_grid = {
    #     'num_bands': [8, 16, 4],
    #     'band_size': [16, 8, 32],  # make sure num_bands * band_size = simhash_bits
    #     'threshold': [0.7, 0.75, 0.8],
    #     'simhash_bits': [128],
    #     'ngram_range': [(1,3), (2,4)],
    #     'chunk_size': [10000, 50000]
    # }

    valid_params = [
        params for params in ParameterGrid(param_grid)
        if params['num_bands'] * params['band_size'] == params['simhash_bits']
    ]

    print(f"总实验数: {len(valid_params)}")

    for params in tqdm(valid_params, desc="Running Experiments"):
        # Reset per iteration so the error handler never reports a config
        # left over from a previous run (or an unbound name on the first one).
        config = None
        try:
            config = ExperimentConfig(
                num_bands=params['num_bands'],
                band_size=params['band_size'],
                threshold=params['threshold'],
                simhash_bits=params['simhash_bits'],
                ngram_range=params['ngram_range'],
                chunk_size=params['chunk_size']
            )
            print("=== 实验配置 ===")
            print(f"参数哈希: {config.param_hash}")
            print(f"实验目录: {config.experiment_dir}")
            print(f"预处理文件: {config.preprocessed_path}")
            print(f"签名文件: {config.signature_path}")
            print(f"候选对文件: {config.candidates_path}\n")

            print("=== 开始执行 ===")
            runner.run_experiment(config)

            print("\n=== 详细性能 ===")
            perf_log = load_performance_log(config.performance_log_path)
            for log in perf_log:
                print(f"[{log['timestamp']}] {log['stage']}: "
                    f"{log['time_sec']}s | 内存峰值: {log['peak_memory_mb']}MB")

        except Exception as e:
            print(f"\n!!! 实验失败: {str(e)}")
            # The config itself may be what failed to build; only inspect the
            # experiment directory when there is one.
            if config is not None:
                print("\n调试信息:")
                print(f"当前目录内容: {list(config.experiment_dir.glob('*'))}")
                if config.performance_log_path.exists():
                    print(f"性能日志大小: {config.performance_log_path.stat().st_size} bytes")
            raise
            


def load_performance_log(log_path: Path) -> list:
    """Read the performance log without ever raising.

    :param log_path: JSON file written by the performance decorator
    :return: the parsed JSON content, or an empty list when the file is
        missing, malformed or unreadable (a message is printed in the last
        two cases)
    """
    if log_path.exists():
        try:
            with open(log_path, "r") as handle:
                entries = json.load(handle)
        except json.JSONDecodeError:
            print(f"警告: 性能日志 {log_path} 格式错误")
        except Exception as e:
            print(f"加载日志失败: {str(e)}")
        else:
            return entries
    return []

# Entry point: starts the full parameter grid defined in main().
# NOTE(review): the recorded output of this cell shows the grid being started
# when the cell was executed, so running it is a long-running operation.
if __name__ == "__main__":
    main()

总实验数: 36


Running Experiments:   0%|          | 0/36 [00:00<?, ?it/s]

2025-04-18 14:56:10,862 [INFO] 签名文件路径: D:\spring2025\UCUG2011-Discrete-Math\project 1\discrete-math-project-1\src\processeddata\exp_799da7c2a6\signatures_799da7c2a6.parquet
2025-04-18 14:56:10,863 [INFO] 候选对路径: D:\spring2025\UCUG2011-Discrete-Math\project 1\discrete-math-project-1\src\processeddata\exp_799da7c2a6\candidates_799da7c2a6.csv
2025-04-18 14:56:10,864 [INFO] 开始预处理数据: D:\spring2025\UCUG2011-Discrete-Math\project 1\discrete-math-project-1\src\processeddata\preprocessed.csv
2025-04-18 14:56:10,866 [INFO] Starting Step 1: Data Preprocessing


=== 实验配置 ===
参数哈希: 799da7c2a6
实验目录: D:\spring2025\UCUG2011-Discrete-Math\project 1\discrete-math-project-1\src\processeddata\exp_799da7c2a6
预处理文件: D:\spring2025\UCUG2011-Discrete-Math\project 1\discrete-math-project-1\src\processeddata\preprocessed.csv
签名文件: D:\spring2025\UCUG2011-Discrete-Math\project 1\discrete-math-project-1\src\processeddata\exp_799da7c2a6\signatures_799da7c2a6.parquet
候选对文件: D:\spring2025\UCUG2011-Discrete-Math\project 1\discrete-math-project-1\src\processeddata\exp_799da7c2a6\candidates_799da7c2a6.csv

=== 开始执行 ===


Preprocessing Texts:   0%|          | 0/325871 [00:00<?, ?it/s]

2025-04-18 14:57:00,996 [INFO] Saved preprocessed data to D:\spring2025\UCUG2011-Discrete-Math\project 1\discrete-math-project-1\src\processeddata\preprocessed.csv


Generating SimHash Signatures:   0%|          | 0/33 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# import time
# from sklearn.model_selection import ParameterGrid
# from tqdm.auto import tqdm
# import pandas as pd

# def main():
#     """主函数：执行多组参数实验"""
#     # 初始化实验运行器
#     runner = ExperimentRunner(base_dir="experiments")
    
#     # 定义参数搜索空间
#     param_grid = {
#         'num_bands': [4, 8, 16],
#         'band_size': [16, 8, 4],  # 需确保 num_bands * band_size = simhash_bits
#         'threshold': [0.7, 0.75, 0.8],
#         'simhash_bits': [64],     # 固定参数示例
#         'ngram_range': [(1,3), (2,4)],
#         'chunk_size': [10000, 50000]
#     }

#     # 生成有效参数组合
    # valid_params = []
    # for params in ParameterGrid(param_grid):
    #     # 跳过无效组合（例如 num_bands * band_size != simhash_bits）
    #     if params['num_bands'] * params['band_size'] != params['simhash_bits']:
    #         continue
    #     valid_params.append(params)
    
    # print(f"总实验数: {len(valid_params)}")
    
#     # 准备结果表格
#     result_df = pd.DataFrame(columns=[
#         'param_hash', 'num_bands', 'band_size', 'threshold',
#         'simhash_bits', 'ngram_range', 'chunk_size',
#         'total_candidates', 'valid_candidates', 'duplicate_rate',
#         'time_cost', 'memory_peak'
#     ])
    
#     # 执行实验
#     for i, params in enumerate(tqdm(valid_params, desc="Running Experiments")):
#         try:
#             # 创建配置对象
#             config = ExperimentConfig(
#                 num_bands=params['num_bands'],
#                 band_size=params['band_size'],
#                 threshold=params['threshold'],
#                 simhash_bits=params['simhash_bits'],
#                 ngram_range=params['ngram_range'],
#                 chunk_size=params['chunk_size']
#             )
            
#             # 运行实验
#             start_time = time.time()
#             runner.run_experiment(config)
#             elapsed = time.time() - start_time
            
#             # 收集结果
#             result_row = {
#                 'param_hash': config.param_hash,
#                 'num_bands': config.num_bands,
#                 'band_size': config.band_size,
#                 'threshold': config.threshold,
#                 'simhash_bits': config.simhash_bits,
#                 'ngram_range': str(config.ngram_range),  # 转换为字符串方便存储
#                 'chunk_size': config.chunk_size,
#                 'total_candidates': runner.current_results['total_candidates'],
#                 'valid_candidates': runner.current_results['valid_candidates'],
#                 'duplicate_rate': runner.current_results['duplicate_rate'],
#                 'time_cost': elapsed,
#                 'memory_peak': max([log['peak_memory_mb'] for log in config.performance_log])
#             }
            
#             # 保存到表格
#             result_df = pd.concat([result_df, pd.DataFrame([result_row])], ignore_index=True)
            
#             # 实时保存进度
#             result_df.to_csv("experiments/summary.csv", index=False)
            
#         except Exception as e:
#             print(f"\n参数组合 {params} 执行失败: {str(e)}")
#             continue
    
#     # 分析最佳参数
#     print("\n===== 实验结果分析 =====")
#     print("耗时最短配置:")
#     print(result_df.loc[result_df['time_cost'].idxmin()][['num_bands', 'band_size', 'time_cost']])
    
#     print("\n重复率最高配置:")
#     print(result_df.loc[result_df['duplicate_rate'].idxmax()][['threshold', 'duplicate_rate']])
    
#     # 生成可视化报告
#     generate_report(result_df)

# def generate_report(df):
#     """生成可视化报告（示例）"""
#     import matplotlib.pyplot as plt
    
#     # 耗时与分块大小关系
#     plt.figure(figsize=(10,6))
#     for ngram in df['ngram_range'].unique():
#         subset = df[df['ngram_range'] == ngram]
#         plt.scatter(subset['chunk_size'], subset['time_cost'], label=ngram)
#     plt.xlabel('Chunk Size')
#     plt.ylabel('Time Cost (s)')
#     plt.legend()
#     plt.savefig("experiments/time_vs_chunk.png")
    
#     # 内存使用分布
#     plt.figure(figsize=(10,6))
#     df['memory_peak'].plot(kind='hist', bins=20)
#     plt.xlabel('Peak Memory Usage (MB)')
#     plt.savefig("experiments/memory_dist.png")

# if __name__ == "__main__":
#     main()