In [4]:
import os
import json
import random
import logging
import shutil
import stanza
from tqdm import tqdm

In [5]:
# Project root directory. Every path used below is built relative to it.
# The CUSANUS_BASE_DIR environment variable overrides the default so the
# notebook can run on another machine without editing this cell.
BASE_DIR = os.environ.get(
    'CUSANUS_BASE_DIR',
    '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling',
)

In [6]:
# Source directories: the preprocessed documents of each corpus.
spacy_dir, cusanus_dir = (
    os.path.join(BASE_DIR, relative_path)
    for relative_path in (
        'experiments/lda/spacy/preprocessed',
        'experiments/lda/cusanus/preprocessed',
    )
)

# Destination directories: one test set and one train set per corpus.
(
    spacy_test_dir,
    spacy_train_dir,
    cusanus_test_dir,
    cusanus_train_dir,
) = (
    os.path.join(BASE_DIR, relative_path)
    for relative_path in (
        'experiments/lda/spacy/test_set',
        'experiments/lda/spacy/train_set',
        'experiments/lda/cusanus/test_set',
        'experiments/lda/cusanus/train_set',
    )
)

# Create every destination directory up front; existing ones are left as-is.
for directory in (spacy_test_dir, spacy_train_dir, cusanus_test_dir, cusanus_train_dir):
    os.makedirs(directory, exist_ok=True)

def split_train_test_files(source_dir, test_dir, train_dir, test_ratio=0.2, seed=None):
    """Randomly split the ``.txt`` files of ``source_dir`` into a test and a train set.

    Each selected file is copied (with ``shutil.copy2``) into ``test_dir`` or
    ``train_dir``; the files in ``source_dir`` are left untouched.

    Args:
        source_dir: Directory whose ``.txt`` files are split. Other entries
            are ignored.
        test_dir: Directory that receives the test files. Created if missing.
        train_dir: Directory that receives the train files. Created if missing.
        test_ratio: Fraction of the files to put in the test set, between 0
            and 1. The test-set size is ``int(n_files * test_ratio)``, i.e.
            it is rounded down.
        seed: Optional seed for a private random generator. With the default
            ``None`` the module-level ``random`` generator is used, so a
            caller-side ``random.seed(...)`` is still honoured.

    Returns:
        A ``(test_files, train_files)`` tuple of file-name lists. ``test_files``
        is in sampling order, ``train_files`` in sorted order.

    Raises:
        ValueError: If ``test_ratio`` is not within ``[0, 1]``.

    Note:
        Files already present in ``test_dir`` / ``train_dir`` are not removed.
        Calling this again with a different random draw can therefore leave
        the same file name in both directories; use a fixed seed or empty the
        destination directories first.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # depend only on the file names and the RNG state.
    all_files = sorted(f for f in os.listdir(source_dir) if f.endswith('.txt'))

    # Number of test files, rounded down.
    test_size = int(len(all_files) * test_ratio)

    # Private generator when a seed is given, global generator otherwise.
    rng = random if seed is None else random.Random(seed)
    test_files = rng.sample(all_files, test_size)

    # Set lookup keeps the train-set filter linear in the number of files.
    test_lookup = set(test_files)
    train_files = [f for f in all_files if f not in test_lookup]

    # Copy each partition into its destination directory.
    for dest_dir, names in ((test_dir, test_files), (train_dir, train_files)):
        os.makedirs(dest_dir, exist_ok=True)
        for name in names:
            shutil.copy2(os.path.join(source_dir, name), os.path.join(dest_dir, name))

    return test_files, train_files

# Seed the global random generator before each split. Without a seed every run
# of this cell draws a different test set, and because earlier copies are not
# removed from the train/test directories, a file could end up in both.
# The selection is repeatable as long as the source directory lists its files
# in the same order.
SPLIT_SEED = 42

# NOTE(review): the two corpora are split independently, so their test sets
# are not guaranteed to contain the same documents — confirm this is intended.

# Split the spaCy-preprocessed files
print("处理 Spacy 预处理文件...")
random.seed(SPLIT_SEED)
spacy_test_files, spacy_train_files = split_train_test_files(spacy_dir, spacy_test_dir, spacy_train_dir)
print(f"已选择 {len(spacy_test_files)} 个Spacy测试文件")
print(f"已选择 {len(spacy_train_files)} 个Spacy训练文件")

# Split the Cusanus-preprocessed files
print("\n处理 Cusanus 预处理文件...")
random.seed(SPLIT_SEED)
cusanus_test_files, cusanus_train_files = split_train_test_files(cusanus_dir, cusanus_test_dir, cusanus_train_dir)
print(f"已选择 {len(cusanus_test_files)} 个Cusanus测试文件")
print(f"已选择 {len(cusanus_train_files)} 个Cusanus训练文件")

# Record which file went into which set, so the split can be inspected later
with open(os.path.join(BASE_DIR, 'train_test_files_list.json'), 'w', encoding='utf-8') as f:
    json.dump({
        'spacy': {
            'test_files': spacy_test_files,
            'train_files': spacy_train_files
        },
        'cusanus': {
            'test_files': cusanus_test_files,
            'train_files': cusanus_train_files
        }
    }, f, indent=2, ensure_ascii=False)

处理 Spacy 预处理文件...
已选择 61 个Spacy测试文件
已选择 245 个Spacy训练文件

处理 Cusanus 预处理文件...
已选择 61 个Cusanus测试文件
已选择 247 个Cusanus训练文件
