# 说明
### 本部分代码包括从Huggingface根据特定关键词获取相关数据集信息，并最终整理成规范格式的完整流程

## 根据关键词构建数据集列表

定义爬虫函数

In [42]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
import csv
import json
import logging
import re
import os
from urllib.parse import urljoin

# 设置日志
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class HuggingFaceSeleniumCrawler:
    def __init__(self, headless=True, delay=2):
        """
        初始化爬虫
        :param headless: 是否使用无头模式
        :param delay: 页面加载等待时间
        """
        self.delay = delay
        self.datasets = []
        self.setup_driver(headless)
        
    def setup_driver(self, headless=True):
        """
        设置Chrome浏览器驱动
        """
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
        
        try:
            self.driver = webdriver.Chrome(options=chrome_options)
            self.wait = WebDriverWait(self.driver, 10)
            logger.info("Chrome浏览器驱动初始化成功")
        except Exception as e:
            logger.error(f"浏览器驱动初始化失败: {e}")
            raise
    
    def load_page(self, url):
        """
        加载页面
        :param url: 页面URL
        :return: 是否成功加载
        """
        try:
            logger.info(f"正在加载页面: {url}")
            self.driver.get(url)
            time.sleep(self.delay)
            
            # 等待页面主要内容加载
            self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "main")))
            return True
        except TimeoutException:
            logger.error(f"页面加载超时: {url}")
            return False
        except Exception as e:
            logger.error(f"页面加载失败: {url}, 错误: {e}")
            return False
    
    def extract_datasets_from_current_page(self):
        """
        从当前页面提取数据集信息
        :return: 数据集列表
        """
        datasets = []
        
        try:
            # 等待数据集列表加载
            time.sleep(2)
            
            # 多种选择器策略来查找数据集元素
            selectors = [
                "a[href*='/datasets/']",  # 包含datasets路径的链接
                ".overview-card a",       # 概览卡片中的链接
                "[data-testid*='dataset']", # 测试ID包含dataset的元素
                ".grid a[href^='/datasets/']", # 网格中的数据集链接
            ]
            
            dataset_elements = []
            for selector in selectors:
                try:
                    elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                    if elements:
                        dataset_elements.extend(elements)
                        logger.info(f"使用选择器 '{selector}' 找到 {len(elements)} 个元素")
                except Exception as e:
                    logger.debug(f"选择器 '{selector}' 未找到元素: {e}")
                    continue
            
            # 如果上述方法都不行，尝试查找所有链接
            if not dataset_elements:
                all_links = self.driver.find_elements(By.TAG_NAME, "a")
                dataset_elements = [link for link in all_links 
                                  if link.get_attribute("href") and "/datasets/" in link.get_attribute("href")]
                logger.info(f"通过所有链接查找，找到 {len(dataset_elements)} 个数据集链接")
            
            # 处理找到的元素
            processed_names = set()
            
            for element in dataset_elements:
                try:
                    href = element.get_attribute("href")
                    if not href or "/datasets/" not in href:
                        continue
                    
                    # 提取数据集名称
                    dataset_name = self.extract_dataset_name_from_url(href)
                    
                    if dataset_name and dataset_name not in processed_names:
                        # 尝试获取显示的文本作为额外信息
                        display_text = element.text.strip()
                        
                        dataset_info = {
                            'name': dataset_name,
                            'url': f"https://huggingface.co/datasets/{dataset_name}",
                            'display_text': display_text[:100] if display_text else "",
                            'href': href
                        }
                        
                        datasets.append(dataset_info)
                        processed_names.add(dataset_name)
                        logger.info(f"找到数据集: {dataset_name}")
                        
                except Exception as e:
                    logger.debug(f"处理元素时出错: {e}")
                    continue
            
            # 如果仍然没有找到，尝试使用正则表达式从页面源代码中提取
            if not datasets:
                datasets = self.extract_from_page_source()
            
            return datasets
            
        except Exception as e:
            logger.error(f"提取数据集时出错: {e}")
            return []
    
    def extract_dataset_name_from_url(self, url):
        """
        从URL中提取数据集名称
        :param url: 数据集URL
        :return: 数据集名称 (用户名/数据集名)
        """
        try:
            if "/datasets/" in url:
                # 提取 /datasets/ 后面的部分
                parts = url.split("/datasets/", 1)
                if len(parts) > 1:
                    dataset_path = parts[1].strip("/")
                    # 移除查询参数和锚点
                    dataset_path = dataset_path.split("?")[0].split("#")[0]
                    
                    # 确保格式为 用户名/数据集名
                    path_parts = dataset_path.split("/")
                    if len(path_parts) >= 2:
                        return f"{path_parts[0]}/{path_parts[1]}"
            return None
        except Exception as e:
            logger.debug(f"从URL提取数据集名称时出错: {e}")
            return None
    
    def extract_from_page_source(self):
        """
        从页面源代码中使用正则表达式提取数据集信息
        :return: 数据集列表
        """
        datasets = []
        try:
            page_source = self.driver.page_source
            
            # 使用正则表达式查找数据集路径
            pattern = r'/datasets/([a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+)'
            matches = re.findall(pattern, page_source)
            
            processed_names = set()
            for match in matches:
                if match not in processed_names:
                    dataset_info = {
                        'name': match,
                        'url': f"https://huggingface.co/datasets/{match}",
                        'display_text': "",
                        'href': f"/datasets/{match}"
                    }
                    datasets.append(dataset_info)
                    processed_names.add(match)
                    logger.info(f"从源代码中找到数据集: {match}")
            
            return datasets
            
        except Exception as e:
            logger.error(f"从页面源代码提取数据集时出错: {e}")
            return []
    
    def find_next_page_button(self):
        """
        查找下一页按钮
        :return: 下一页按钮元素或None
        """
        # 多种策略查找下一页按钮
        next_button_selectors = [
            "a[aria-label='Next page']",
            "a:contains('Next')",
            "button[aria-label='Next page']",
            "a[href*='p=']",  # 包含页码参数的链接
            ".pagination a:last-child",
            "[data-testid='next-page']"
        ]
        
        for selector in next_button_selectors:
            try:
                if ":contains(" in selector:
                    # 对于包含文本的选择器，使用XPath
                    xpath = "//a[contains(text(), 'Next')] | //button[contains(text(), 'Next')]"
                    elements = self.driver.find_elements(By.XPATH, xpath)
                else:
                    elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                
                for element in elements:
                    if element.is_displayed() and element.is_enabled():
                        return element
                        
            except Exception as e:
                logger.debug(f"查找下一页按钮时出错 ({selector}): {e}")
                continue
        
        # 尝试通过文本查找
        try:
            xpath_selectors = [
                "//a[contains(text(), 'Next')]",
                "//button[contains(text(), 'Next')]",
                "//a[contains(text(), '>')]",
                "//a[text()='>']"
            ]
            
            for xpath in xpath_selectors:
                elements = self.driver.find_elements(By.XPATH, xpath)
                for element in elements:
                    if element.is_displayed() and element.is_enabled():
                        return element
                        
        except Exception as e:
            logger.debug(f"通过XPath查找下一页按钮时出错: {e}")
        
        return None
    
    def go_to_next_page(self):
        """
        跳转到下一页
        :return: 是否成功跳转
        """
        try:
            next_button = self.find_next_page_button()
            
            if next_button:
                # 滚动到按钮位置
                self.driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                time.sleep(1)
                
                # 点击按钮
                next_button.click()
                
                # 等待页面加载
                time.sleep(self.delay)
                
                # 验证页面是否发生了变化
                self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "main")))
                
                logger.info("成功跳转到下一页")
                return True
            else:
                logger.info("未找到下一页按钮，可能已到最后一页")
                return False
                
        except Exception as e:
            logger.error(f"跳转下一页时出错: {e}")
            return False
    
    def crawl_all_pages(self, start_url, max_pages=None):
        """
        爬取所有页面
        :param start_url: 起始URL
        :param max_pages: 最大页面数（None表示无限制）
        :return: 所有数据集列表
        """
        all_datasets = []
        page_count = 1
        
        # 加载第一页
        if not self.load_page(start_url):
            logger.error("无法加载起始页面")
            return []
        
        while True:
            logger.info(f"正在处理第 {page_count} 页")
            
            # 提取当前页面的数据集
            page_datasets = self.extract_datasets_from_current_page()
            
            if page_datasets:
                all_datasets.extend(page_datasets)
                logger.info(f"第 {page_count} 页找到 {len(page_datasets)} 个数据集")
            else:
                logger.warning(f"第 {page_count} 页没有找到数据集")
            
            # 检查是否达到最大页面数
            if max_pages and page_count >= max_pages:
                logger.info(f"已达到最大页面数 {max_pages}")
                break
            
            # 尝试跳转到下一页
            if not self.go_to_next_page():
                logger.info("没有更多页面")
                break
            
            page_count += 1
        
        logger.info(f"爬取完成，总共处理 {page_count} 页，找到 {len(all_datasets)} 个数据集")
        return all_datasets
    
    def extract_category_from_url(self, url):
        """
        从URL中提取分类参数
        :param url: URL字符串
        :return: 分类名称
        """
        try:
            from urllib.parse import urlparse, parse_qs
            
            parsed_url = urlparse(url)
            query_params = parse_qs(parsed_url.query)
            
            # 检查other参数
            if 'other' in query_params:
                return query_params['other'][0]
            
            # 检查其他可能的分类参数
            category_params = ['category', 'type', 'tag']
            for param in category_params:
                if param in query_params:
                    return query_params[param][0]
                    
            return ""
            
        except Exception as e:
            logger.debug(f"从URL提取分类时出错: {e}")
            return ""
    
    def save_results(self, datasets, filename_prefix="huggingface_datasets_selenium", category=""):
        """
        保存结果到文件
        :param datasets: 数据集列表
        :param filename_prefix: 文件名前缀
        :param category: 分类名称（从URL参数中提取）
        """
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        
        # 如果有分类信息，添加到文件名中
        if category:
            csv_filename = f"{filename_prefix}_{category}_{timestamp}.csv"
        else:
            csv_filename = f"{filename_prefix}_{timestamp}.csv"
            
        with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['name', 'url', 'display_text']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for dataset in datasets:
                writer.writerow({
                    'name': dataset['name'],
                    'url': dataset['url'],
                    'display_text': dataset.get('display_text', '')
                })
        
        logger.info(f"结果已保存到:")
        logger.info(f"  CSV: {csv_filename}")
    
    def save_results_to_path(self, datasets, file_path, category=""):
        """
        保存结果到指定路径
        :param datasets: 数据集列表
        :param file_path: 完整的文件路径（不包含扩展名）
        :param category: 分类名称（从URL参数中提取）
        """
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        
        # 确保目录存在
        directory = os.path.dirname(file_path)
        if directory and not os.path.exists(directory):
            os.makedirs(directory, exist_ok=True)
        
        # 构建完整的文件名
        if category:
            csv_filename = f"{file_path}.csv"
        else:
            csv_filename = f"{file_path}.csv"
            
        with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['name', 'url', 'display_text']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for dataset in datasets:
                writer.writerow({
                    'name': dataset['name'],
                    'url': dataset['url'],
                    'display_text': dataset.get('display_text', '')
                })
        
        logger.info(f"结果已保存到: {csv_filename}")
        return csv_filename
    
    def close(self):
        """
        关闭浏览器
        """
        if hasattr(self, 'driver'):
            self.driver.quit()
            logger.info("浏览器已关闭")

In [40]:
import pandas as pd
from typing import List, Dict, Optional
import os

def crawl_multiple_keywords(keywords: List[str], 
                          save_path: Optional[str] = None,
                          max_pages_per_keyword: Optional[int] = None,
                          base_url: str = "https://huggingface.co/datasets",
                          sort_by: str = "trending",
                          headless: bool = True,
                          delay: int = 3) -> pd.DataFrame:
    """
    处理多个关键词的Hugging Face数据集爬取函数
    
    Parameters:
    -----------
    keywords : List[str]
        要爬取的关键词列表，例如 ['climate', 'medical', 'chemistry']
    save_path : Optional[str]
        保存结果的文件路径，支持 .csv, .xlsx, .json 格式
        如果为 None，则不保存文件，只返回 DataFrame
    max_pages_per_keyword : Optional[int]
        每个关键词最大爬取页面数，None 表示无限制
    base_url : str
        基础URL，默认为 Hugging Face 数据集页面
    sort_by : str
        排序方式，默认为 "trending"
    headless : bool
        是否使用无头模式运行浏览器
    delay : int
        页面加载等待时间（秒）
    
    Returns:
    --------
    pd.DataFrame
        包含所有关键词数据集的综合DataFrame
    """
    
    all_results = []
    failed_keywords = []
    
    print(f"开始爬取 {len(keywords)} 个关键词的数据集...")
    
    for i, keyword in enumerate(keywords, 1):
        print(f"\n[{i}/{len(keywords)}] 正在处理关键词: '{keyword}'")
        
        # 构建URL
        url = f"{base_url}?other={keyword}&sort={sort_by}"
        
        # 创建爬虫实例
        crawler = HuggingFaceSeleniumCrawler(headless=headless, delay=delay)
        
        try:
            # 爬取当前关键词的数据集
            datasets = crawler.crawl_all_pages(url, max_pages=max_pages_per_keyword)
            
            if datasets:
                # 为每个数据集添加关键词标记
                for dataset in datasets:
                    dataset['keyword'] = keyword
                    dataset['category'] = keyword  # 添加分类字段
                
                all_results.extend(datasets)
                print(f"✓ 关键词 '{keyword}' 完成，获得 {len(datasets)} 个数据集")
            else:
                print(f"✗ 关键词 '{keyword}' 未获得任何数据集")
                failed_keywords.append(keyword)
                
        except Exception as e:
            print(f"✗ 关键词 '{keyword}' 处理失败: {e}")
            failed_keywords.append(keyword)
            logger.error(f"处理关键词 '{keyword}' 时出错: {e}")
            
        finally:
            # 确保关闭浏览器
            crawler.close()
    
    # 创建DataFrame
    if all_results:
        df = pd.DataFrame(all_results)
        
        # 数据清理和去重
        print(f"\n数据处理中...")
        print(f"原始数据: {len(df)} 条记录")
        
        # 去重（基于name和keyword的组合）
        df = df.drop_duplicates(subset=['name', 'keyword'], keep='first')
        print(f"去重后: {len(df)} 条记录")
        
        # 重新排列列的顺序
        column_order = ['keyword', 'category', 'name', 'url', 'display_text']
        df = df.reindex(columns=column_order)
        
        # 按关键词和名称排序
        df = df.sort_values(['keyword', 'name']).reset_index(drop=True)
        
        # 统计信息
        print(f"\n=== 爬取结果统计 ===")
        keyword_stats = df['keyword'].value_counts()
        for keyword, count in keyword_stats.items():
            print(f"  {keyword}: {count} 个数据集")
        
        if failed_keywords:
            print(f"\n失败的关键词: {failed_keywords}")
        
        # 保存文件
        if save_path:
            save_dataframe_to_file(df, save_path)
        
        return df
    
    else:
        print("\n没有获取到任何数据集！")
        return pd.DataFrame()


def save_dataframe_to_file(df: pd.DataFrame, file_path: str) -> None:
    """
    将DataFrame保存到指定格式的文件
    
    Parameters:
    -----------
    df : pd.DataFrame
        要保存的DataFrame
    file_path : str
        保存路径，根据后缀名确定格式
    """
    try:
        # 确保目录存在
        os.makedirs(os.path.dirname(file_path) if os.path.dirname(file_path) else '.', exist_ok=True)
        
        file_extension = os.path.splitext(file_path)[1].lower()
        
        if file_extension == '.csv':
            df.to_csv(file_path, index=False, encoding='utf-8')
            print(f"✓ 结果已保存为CSV: {file_path}")
            
        elif file_extension in ['.xlsx', '.xls']:
            df.to_excel(file_path, index=False, engine='openpyxl')
            print(f"✓ 结果已保存为Excel: {file_path}")
            
        elif file_extension == '.json':
            df.to_json(file_path, orient='records', indent=2, force_ascii=False)
            print(f"✓ 结果已保存为JSON: {file_path}")
            
        else:
            # 默认保存为CSV
            csv_path = file_path + '.csv'
            df.to_csv(csv_path, index=False, encoding='utf-8')
            print(f"✓ 未识别的文件格式，已保存为CSV: {csv_path}")
            
    except Exception as e:
        print(f"✗ 保存文件时出错: {e}")
        logger.error(f"保存文件时出错: {e}")


def get_keyword_urls_batch(keywords: List[str], 
                          base_url: str = "https://huggingface.co/datasets",
                          sort_by: str = "trending") -> Dict[str, str]:
    """
    批量生成关键词对应的URL
    
    Parameters:
    -----------
    keywords : List[str]
        关键词列表
    base_url : str
        基础URL
    sort_by : str
        排序方式
    
    Returns:
    --------
    Dict[str, str]
        关键词到URL的映射
    """
    return {keyword: f"{base_url}?other={keyword}&sort={sort_by}" for keyword in keywords}

执行爬虫函数获取数据。

根据关键词选择特定学科

In [None]:
# 实际使用示例
# 定义要爬取的关键词列表
target_keywords = ['climate', 'forecast']

# 执行多关键词爬取
# 可以自定义保存路径和文件格式
save_location = "备份_中间流程结果/datasets_list.csv"  # 可以是 .csv, .xlsx, .json

print("开始执行多关键词数据集爬取...")

# 执行爬取
df = crawl_multiple_keywords(
    keywords=target_keywords,
    save_path=save_location,
    max_pages_per_keyword=None,  # None表示爬取所有页面，也可以设置数字限制
    headless=True,  # True表示不显示浏览器窗口
    delay=3  # 页面加载等待时间
)

# 显示结果
if not df.empty:
    print(f"\n爬取完成！共获得 {len(df)} 个数据集")
    print(f"文件已保存至: {save_location}")
    
    # 显示前几条数据作为预览
    print("\n数据预览:")
    print(df.head())
    
    # 显示各关键词的数据集数量统计
    print("\n各关键词数据集数量:")
    print(df['keyword'].value_counts())
else:
    print("未获取到任何数据！")

爬取指定url页面下的所有数据集

In [44]:
def main(output_file_path="备份_中间流程结果/datasets_list"):
    """
    主函数
    :param output_file_path: 指定保存结果的文件路径（不包含扩展名）
    """
    # 目标URL
    start_url = "https://huggingface.co/datasets?size_categories=or:%28size_categories:n%3E1T%29&sort=trending"
    
    # 创建爬虫实例
    crawler = HuggingFaceSeleniumCrawler(headless=True, delay=3)
    
    try:
        # 从URL中提取分类信息
        category = crawler.extract_category_from_url(start_url)
        logger.info(f"检测到分类: {category}")
        
        # 开始爬取（可以设置max_pages限制页面数，用于测试）
        all_datasets = crawler.crawl_all_pages(start_url, max_pages=None)
        
        # 去重
        unique_datasets = []
        seen_names = set()
        for dataset in all_datasets:
            if dataset['name'] not in seen_names:
                unique_datasets.append(dataset)
                seen_names.add(dataset['name'])
        
        logger.info(f"去重后共有 {len(unique_datasets)} 个唯一数据集")
        
        # 打印前10个结果作为示例
        print("\n前10个数据集:")
        for i, dataset in enumerate(unique_datasets[:10], 1):
            print(f"{i}. {dataset['name']} -> {dataset['url']}")
        
        if len(unique_datasets) > 10:
            print(f"... 还有 {len(unique_datasets) - 10} 个数据集")
        
        # 保存结果到指定路径
        crawler.save_results_to_path(unique_datasets, output_file_path, category=category)
        
    except KeyboardInterrupt:
        logger.info("爬取被用户中断")
    except Exception as e:
        logger.error(f"爬取过程中出现错误: {e}")
    finally:
        # 确保关闭浏览器
        crawler.close()


if __name__ == "__main__":
    # 你可以这样调用：
    # main()  # 使用默认路径
    # main("my_custom_datasets")  # 使用自定义路径
    main()

2025-08-26 11:36:15,478 - INFO - Chrome浏览器驱动初始化成功
2025-08-26 11:36:15,484 - INFO - 检测到分类: 
2025-08-26 11:36:15,487 - INFO - 正在加载页面: https://huggingface.co/datasets?size_categories=or:%28size_categories:n%3E1T%29&sort=trending
2025-08-26 11:36:20,907 - INFO - 正在处理第 1 页
2025-08-26 11:36:22,919 - INFO - 使用选择器 'a[href*='/datasets/']' 找到 30 个元素
2025-08-26 11:36:22,946 - INFO - 使用选择器 '.grid a[href^='/datasets/']' 找到 30 个元素
2025-08-26 11:36:22,965 - INFO - 找到数据集: allenai/dolma
2025-08-26 11:36:22,974 - INFO - 找到数据集: nasa-ibm-ai4science/SDO_training
2025-08-26 11:36:22,983 - INFO - 找到数据集: 3DTopia/4DNeX-10M
2025-08-26 11:36:22,991 - INFO - 找到数据集: allenai/MADLAD-400
2025-08-26 11:36:23,000 - INFO - 找到数据集: urbanaudiosensing/ASPED
2025-08-26 11:36:23,008 - INFO - 找到数据集: x-humanoid-robomind/RoboMIND
2025-08-26 11:36:23,017 - INFO - 找到数据集: InternRobotics/InternData-N1
2025-08-26 11:36:23,025 - INFO - 找到数据集: DeliberatorArchiver/asmr-archive-data-01
2025-08-26 11:36:23,034 - INFO - 找到数据集: DeliberatorA


前10个数据集:
1. allenai/dolma -> https://huggingface.co/datasets/allenai/dolma
2. nasa-ibm-ai4science/SDO_training -> https://huggingface.co/datasets/nasa-ibm-ai4science/SDO_training
3. 3DTopia/4DNeX-10M -> https://huggingface.co/datasets/3DTopia/4DNeX-10M
4. allenai/MADLAD-400 -> https://huggingface.co/datasets/allenai/MADLAD-400
5. urbanaudiosensing/ASPED -> https://huggingface.co/datasets/urbanaudiosensing/ASPED
6. x-humanoid-robomind/RoboMIND -> https://huggingface.co/datasets/x-humanoid-robomind/RoboMIND
7. InternRobotics/InternData-N1 -> https://huggingface.co/datasets/InternRobotics/InternData-N1
8. DeliberatorArchiver/asmr-archive-data-01 -> https://huggingface.co/datasets/DeliberatorArchiver/asmr-archive-data-01
9. DeliberatorArchiver/asmr-archive-data-02 -> https://huggingface.co/datasets/DeliberatorArchiver/asmr-archive-data-02
10. oscar-corpus/OSCAR-2109 -> https://huggingface.co/datasets/oscar-corpus/OSCAR-2109
... 还有 91 个数据集


In [45]:
df=pd.read_csv('备份_中间流程结果/datasets_list.csv')

In [46]:
# 2. 对两个表格的name列进行处理：根据/分割左侧为作者，右侧为数据集/模型名称

def split_name_column(df, df_type):
    """
    分割name列为author和item_name列
    """
    print(f"\n处理{df_type}的name列...")
    
    # 创建副本避免修改原数据
    df_copy = df.copy()
    
    # 查看一些name列的示例
    print(f"name列示例 (前5个):")
    for i, name in enumerate(df_copy['name'].head()):
        print(f"  {i+1}. {name}")
    
    # 分割name列
    split_names = df_copy['name'].str.split('/', n=1, expand=True)
    
    # 添加author和item_name列
    df_copy['author'] = split_names[0] if len(split_names.columns) > 0 else ''
    df_copy['item_name'] = split_names[1] if len(split_names.columns) > 1 else ''
    
    # 处理没有/的情况（整个name作为item_name，author为空）
    mask_no_slash = df_copy['item_name'].isna()
    df_copy.loc[mask_no_slash, 'item_name'] = df_copy.loc[mask_no_slash, 'author']
    df_copy.loc[mask_no_slash, 'author'] = ''
    
    # 清理空值
    df_copy['author'] = df_copy['author'].fillna('')
    df_copy['item_name'] = df_copy['item_name'].fillna('')
    
    print(f"分割完成:")
    print(f"  - 有作者信息的条目: {(df_copy['author'] != '').sum()}")
    print(f"  - 没有作者信息的条目: {(df_copy['author'] == '').sum()}")
    
    return df_copy

# 处理datasets_df
datasets_df_processed = split_name_column(df, "数据集")


处理数据集的name列...
name列示例 (前5个):
  1. allenai/dolma
  2. nasa-ibm-ai4science/SDO_training
  3. 3DTopia/4DNeX-10M
  4. allenai/MADLAD-400
  5. urbanaudiosensing/ASPED
分割完成:
  - 有作者信息的条目: 101
  - 没有作者信息的条目: 0


In [47]:
import re
from datetime import datetime, timedelta

# 3. 处理display_text列，提取更新日期
def extract_update_date(display_text):
    """
    从display_text中提取更新日期
    """
    if pd.isna(display_text) or display_text == '':
        return None
    
    # 按•分割
    parts = display_text.split('•')
    
    # 查找包含Updated的部分
    update_part = None
    for part in parts:
        if 'Updated' in part:
            update_part = part.strip()
            break
    
    if not update_part:
        return None
    
    # 提取Updated后的日期部分
    # 使用正则表达式匹配Updated后的内容
    match = re.search(r'Updated\s+(.+)', update_part)
    if not match:
        return None
    
    date_str = match.group(1).strip()
    
    try:
        # 1. 处理完整年月日格式，如 "Jun 5, 2024"
        if re.match(r'^[A-Za-z]{3}\s+\d{1,2},\s+\d{4}$', date_str):
            return datetime.strptime(date_str, '%b %d, %Y').strftime('%Y-%m-%d')
        
        # 2. 处理没有年份的格式，如 "Mar 8"，默认年份为2025
        elif re.match(r'^[A-Za-z]{3}\s+\d{1,2}$', date_str):
            date_with_year = f"{date_str}, 2025"
            return datetime.strptime(date_with_year, '%b %d, %Y').strftime('%Y-%m-%d')
        
        # 3. 处理相对时间格式，如 "9 days ago"
        elif 'ago' in date_str.lower():
            base_date = datetime(2025, 8, 22)  # 基准日期
            
            # 匹配数字和时间单位
            time_match = re.search(r'(\d+)\s+(day|month|year|week|hour|minute)s?\s+ago', date_str.lower())
            if time_match:
                number = int(time_match.group(1))
                unit = time_match.group(2)
                
                if unit == 'day':
                    result_date = base_date - timedelta(days=number)
                elif unit == 'week':
                    result_date = base_date - timedelta(weeks=number)
                elif unit == 'month':
                    # 简单处理，假设一个月30天
                    result_date = base_date - timedelta(days=number * 30)
                elif unit == 'year':
                    # 简单处理，假设一年365天
                    result_date = base_date - timedelta(days=number * 365)
                elif unit == 'hour':
                    result_date = base_date - timedelta(hours=number)
                elif unit == 'minute':
                    result_date = base_date - timedelta(minutes=number)
                else:
                    return None
                
                return result_date.strftime('%Y-%m-%d')
        
        # 其他格式尝试直接解析
        else:
            return None
            
    except Exception as e:
        return None

print("\n" + "="*50)
print("开始处理display_text列，提取更新日期...")

# 查看一些display_text的样例
datasets_df_processed['update_date'] = datasets_df_processed['display_text'].apply(extract_update_date)
print("\n提取的更新日期示例 (前10个):")
for i, date in enumerate(datasets_df_processed['update_date'].head(10)):
    print(f"  {i+1}. {date}")


开始处理display_text列，提取更新日期...

提取的更新日期示例 (前10个):
  1. 2024-04-17
  2. 2025-05-21
  3. 2025-08-16
  4. 2024-09-10
  5. 2024-01-24
  6. 2025-07-09
  7. 2025-08-15
  8. 2025-02-01
  9. 2025-03-31
  10. 2025-08-02


In [48]:
datasets_df_processed.to_csv("备份_中间流程结果/datasets_list.csv", index=False, encoding='utf-8')

# 进一步整理数据内容

读取数据

In [49]:
import pandas as pd

df=pd.read_csv("备份_中间流程结果/datasets_list.csv")

定义数据集详细信息爬虫

In [50]:
import requests
from bs4 import BeautifulSoup
import re
import json
from typing import Dict, List, Optional

class HuggingFaceDatasetScraper:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
    
    def extract_dataset_info(self, url: str) -> Dict:
        """
        从 Hugging Face dataset 页面提取关键信息
        
        Args:
            url: dataset 页面的 URL
            
        Returns:
            包含提取信息的字典
        """
        try:
            response = self.session.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # 提取数据集名称
            dataset_name = self._extract_dataset_name(soup, url)
            
            # 提取标签信息
            metadata = self._extract_metadata_tags(soup)
            
            # 提取文字描述（前两段）
            description = self._extract_description(soup)
            
            result = {
                "dataset_name": dataset_name,
                "url": url,
                "metadata": metadata,
                "description": description,
                "extraction_status": "success"
            }
            
            return result
            
        except Exception as e:
            return {
                "dataset_name": None,
                "url": url,
                "metadata": {},
                "description": [],
                "extraction_status": f"error: {str(e)}"
            }
    
    def _extract_dataset_name(self, soup: BeautifulSoup, url: str) -> str:
        """提取数据集名称"""
        # 尝试从页面标题提取
        title_element = soup.find('h1')
        if title_element:
            return title_element.get_text().strip()
        
        # 从 URL 提取
        url_parts = url.rstrip('/').split('/')
        if len(url_parts) >= 2:
            return url_parts[-1]
        
        return "Unknown"
    
    def _extract_metadata_tags(self, soup: BeautifulSoup) -> Dict:
        """提取上方标签中的元数据信息"""
        metadata = {
            "tasks": [],
            "modalities": [],
            "formats": [],
            "languages": [],
            "size": "",
            "tags": [],
            "libraries": [],
            "license": "",
            "arxiv": ""
        }
        
        # 查找包含标签信息的容器
        # 方法1: 通过标签文本查找
        self._extract_by_label_text(soup, metadata)
        
        # 方法2: 通过特定的 class 或 data 属性查找
        self._extract_by_attributes(soup, metadata)
        
        # 方法3: 通过正则表达式在文本中查找
        self._extract_by_regex(soup, metadata)
        
        return metadata
    
    def _extract_by_label_text(self, soup: BeautifulSoup, metadata: Dict):
        """通过标签文本查找信息"""
        labels_to_find = {
            "modalities": ["Modalities:", "modalities"],
            "formats": ["Formats:", "formats"],
            "size": ["Size:", "size"],
            "tags": ["Tags:", "tags"],
            "libraries": ["Libraries:", "libraries"],
            "license": ["License:", "license"],
            "languages": ["Languages:", "languages"],
            "tasks": ["Tasks:", "tasks"],
            "arxiv": ["ArXiv:", "arxiv", "Paper:"]
        }
        
        for key, possible_labels in labels_to_find.items():
            for label in possible_labels:
                # 查找包含标签的元素
                label_elements = soup.find_all(string=re.compile(label, re.IGNORECASE))
                for label_element in label_elements:
                    parent = label_element.parent
                    if parent:
                        # 查找同级或相邻的元素获取值
                        value_elements = self._find_value_elements(parent)
                        if value_elements:
                            if key in ["size", "license", "arxiv"]:
                                metadata[key] = " ".join([elem.get_text().strip() 
                                                        for elem in value_elements])
                            else:
                                metadata[key] = [elem.get_text().strip() 
                                               for elem in value_elements if elem.get_text().strip()]
                            break
    
    def _extract_by_attributes(self, soup: BeautifulSoup, metadata: Dict):
        """通过HTML属性查找信息"""
        # 查找带有特定 data 属性的元素
        data_attrs = soup.find_all(attrs={"data-testid": True})
        for element in data_attrs:
            testid = element.get('data-testid', '')
            if 'tag' in testid.lower():
                text = element.get_text().strip()
                if text and text not in metadata["tags"]:
                    metadata["tags"].append(text)
        
        # 查找许可证链接
        license_links = soup.find_all('a', href=re.compile(r'license|cc-by', re.IGNORECASE))
        for link in license_links:
            license_text = link.get_text().strip()
            if license_text and not metadata["license"]:
                metadata["license"] = license_text
    
    def _extract_by_regex(self, soup: BeautifulSoup, metadata: Dict):
        """通过正则表达式在页面文本中查找信息"""
        page_text = soup.get_text()
        
        # 查找文件大小
        size_patterns = [
            r'(\d+(?:\.\d+)?[KMGT]?B?)',
            r'(\d+K?-\d+[KMG])',
            r'Size:\s*([^\n\r]+)'
        ]
        for pattern in size_patterns:
            match = re.search(pattern, page_text, re.IGNORECASE)
            if match and not metadata["size"]:
                metadata["size"] = match.group(1).strip()
                break
        
        # 查找 ArXiv ID
        arxiv_pattern = r'arxiv[:\s]*(\d+\.\d+)'
        arxiv_match = re.search(arxiv_pattern, page_text, re.IGNORECASE)
        if arxiv_match and not metadata["arxiv"]:
            metadata["arxiv"] = arxiv_match.group(1)
    
    def _find_value_elements(self, parent_element):
        """在父元素中查找值元素"""
        value_elements = []
        
        # 查找同级元素
        for sibling in parent_element.find_next_siblings():
            if sibling.name in ['span', 'div', 'a', 'code']:
                value_elements.append(sibling)
            if len(value_elements) >= 10:  # 限制数量
                break
        
        # 查找子元素
        if not value_elements:
            value_elements = parent_element.find_all(['span', 'div', 'a', 'code'])
        
        return value_elements[:10]  # 限制返回数量
    
    def _extract_description(self, soup: BeautifulSoup) -> List[str]:
        """提取文字描述的前两段"""
        descriptions = []
        
        # 先尝试查找具体的描述容器
        description_selectors = [
            'div[class*="prose"]',
            'div[class*="dataset-description"]',
            'div[class*="readme"]',
            'section[class*="dataset-description"]',
            'div[class*="content"]',
            'article',
            'div:has(h1, h2, h3)',
            'main',
        ]
        
        description_container = None
        for selector in description_selectors:
            try:
                container = soup.select_one(selector)
                if container:
                    description_container = container
                    break
            except:
                continue
        
        # 如果没找到特定容器，使用整个页面
        if not description_container:
            description_container = soup
        
        # 查找段落标签
        paragraphs = description_container.find_all('p')
        for p in paragraphs:
            text = p.get_text().strip()
            # 过滤条件：足够长，不是导航文本，包含有意义的内容
            if (len(text) > 50 and 
                not self._is_navigation_text(text) and
                self._is_meaningful_description(text)):
                descriptions.append(text)
                if len(descriptions) >= 2:
                    return descriptions
        
        return descriptions[:2]
    
    def _is_meaningful_description(self, text: str) -> bool:
        """判断文本是否是有意义的描述内容"""
        # 检查是否包含描述性的关键词
        descriptive_patterns = [
            r'\b(dataset|data|contains|provides|includes|designed|used|application)\b',
            r'\b(machine learning|deep learning|AI|artificial intelligence)\b',
            r'\b(research|study|analysis|model|algorithm)\b',
            r'\b(images|text|video|audio|data)\b'
        ]
        
        text_lower = text.lower()
        has_descriptive_content = any(re.search(pattern, text_lower) for pattern in descriptive_patterns)
        
        # 检查是否有足够的句子结构
        sentence_count = len([s for s in text.split('.') if len(s.strip()) > 10])
        
        # 避免纯技术标签或元数据
        avoid_patterns = [
            r'^\s*\d+(\.\d+)?\s*(GB|MB|KB|TB)\s*$',  # 纯文件大小
            r'^\s*[a-z-]+:[a-z-]+\s*$',  # 纯标签格式
            r'^\s*(true|false|yes|no)\s*$',  # 纯布尔值
            r'^\s*(download|view|edit|fork|clone)\s*$',  # 纯操作按钮
            r'^\s*\d+\s*(downloads?|views?|likes?)\s*$',  # 纯统计数字
            r'^\s*(last updated|created|modified):\s*',  # 纯时间戳
            r'^\s*size:\s*\d+.*$',  # 纯大小信息
            r'^\s*(cc-by|mit|apache|gpl).*license\s*$'  # 纯许可证
        ]
        
        is_metadata = any(re.match(pattern, text.strip(), re.IGNORECASE) for pattern in avoid_patterns)
        
        return (has_descriptive_content and 
                sentence_count >= 1 and 
                not is_metadata and
                len(text.split()) >= 10)  # 至少10个单词
    
    def _is_navigation_text(self, text: str) -> bool:
        """判断是否为导航或菜单文本"""
        if len(text) > 300:  # 长文本不太可能是导航
            return False
            
        nav_keywords = [
            'home', 'datasets', 'models', 'spaces', 'docs', 'pricing', 
            'login', 'sign up', 'menu', 'navigation', 'download', 'view',
            'edit', 'fork', 'clone', 'settings', 'discussions', 'files',
            'community', 'license', 'paper', 'leaderboard'
        ]
        
        text_lower = text.lower().strip()
        
        # 检查是否主要由导航关键词组成
        words = text_lower.split()
        if len(words) <= 5:  # 短文本更可能是导航
            nav_word_count = sum(1 for word in words if any(kw in word for kw in nav_keywords))
            if nav_word_count >= len(words) * 0.6:  # 60%以上是导航词汇
                return True
        
        # 检查特定的导航模式
        nav_patterns = [
            r'^\s*(home|datasets|models|spaces|docs)\s*$',
            r'^\s*(download|view|edit|fork|clone)\s*$'
        ]
        
        return any(re.match(pattern, text_lower) for pattern in nav_patterns)

In [51]:
import time
import pandas as pd
from tqdm import tqdm

def extract_huggingface_dataset_info(df, url_column='url', delay=1, batch_size=None):
    """
    简化的Hugging Face数据集信息批量提取函数
    
    Parameters:
    -----------
    df : pd.DataFrame
        包含Hugging Face URL的数据框
    url_column : str
        包含URL的列名，默认为'url'
    delay : float
        请求间隔时间（秒），默认为1秒
    batch_size : int or None
        批次大小，None表示处理所有数据
    
    Returns:
    --------
    pd.DataFrame
        添加了新列的数据框，包含：
        - modalities: 模态信息
        - formats: 数据格式
        - size: 数据集大小
        - tags: 标签
        - libraries: 相关库
        - license: 许可证
        - description: 描述
        - extraction_status: 提取状态
    """
    
    # 创建scraper实例
    scraper = HuggingFaceDatasetScraper()
    
    # 复制数据框避免修改原始数据
    result_df = df.copy()
    
    # 初始化新列
    new_columns = ['modalities', 'formats', 'size', 'tags', 'libraries', 'license', 'description', 'extraction_status']
    for col in new_columns:
        if col not in result_df.columns:
            result_df[col] = None
    
    # 检查URL列是否存在
    if url_column not in result_df.columns:
        raise ValueError(f"列 '{url_column}' 不存在于数据框中")
    
    # 确定要处理的行
    total_rows = len(result_df)
    if batch_size is not None:
        process_rows = min(batch_size, total_rows)
        process_indices = result_df.index[:process_rows]
    else:
        process_rows = total_rows
        process_indices = result_df.index
    
    print(f"开始处理 {process_rows} 个Hugging Face数据集URL...")
    
    # 统计变量
    success_count = 0
    failed_count = 0
    error_count = 0
    
    # 使用tqdm显示进度条
    for idx in tqdm(process_indices, desc="提取数据集信息"):
        try:
            url = result_df.loc[idx, url_column]
            
            # 跳过空URL
            if pd.isna(url) or url == '':
                result_df.loc[idx, 'extraction_status'] = 'EMPTY_URL'
                continue
            
            # 确保URL是字符串格式
            url = str(url).strip()
            
            # 验证是否为Hugging Face数据集URL
            if 'huggingface.co/datasets/' not in url:
                result_df.loc[idx, 'extraction_status'] = 'INVALID_URL'
                continue
            
            # 提取信息
            result = scraper.extract_dataset_info(url)
            
            if result["extraction_status"] == "success":
                metadata = result['metadata']
                
                # 更新数据框
                result_df.loc[idx, 'modalities'] = str(metadata.get('modalities', [])) if metadata.get('modalities') else None
                result_df.loc[idx, 'formats'] = str(metadata.get('formats', [])) if metadata.get('formats') else None
                result_df.loc[idx, 'size'] = metadata.get('size', '') if metadata.get('size') else None
                result_df.loc[idx, 'tags'] = str(metadata.get('tags', [])) if metadata.get('tags') else None
                result_df.loc[idx, 'libraries'] = str(metadata.get('libraries', [])) if metadata.get('libraries') else None
                result_df.loc[idx, 'license'] = metadata.get('license', '') if metadata.get('license') else None
                result_df.loc[idx, 'description'] = str(result.get('description', [])) if result.get('description') else None
                result_df.loc[idx, 'extraction_status'] = 'SUCCESS'
                
                success_count += 1
            else:
                result_df.loc[idx, 'extraction_status'] = f'FAILED: {result["extraction_status"]}'
                failed_count += 1
            
            # 请求间隔
            if delay > 0:
                time.sleep(delay)
                
        except Exception as e:
            result_df.loc[idx, 'extraction_status'] = f'ERROR: {str(e)}'
            error_count += 1
            continue
    
    # 显示最终统计
    print(f"\n{'='*60}")
    print("处理完成！统计结果:")
    print(f"总处理数量: {process_rows}")
    print(f"✅ 成功: {success_count} ({success_count/process_rows*100:.1f}%)")
    print(f"❌ 失败: {failed_count} ({failed_count/process_rows*100:.1f}%)")
    print(f"⚠️  错误: {error_count} ({error_count/process_rows*100:.1f}%)")
    
    # 显示成功示例
    successful_rows = result_df[result_df['extraction_status'] == 'SUCCESS']
    if len(successful_rows) > 0:
        print(f"\n成功提取信息的示例 (前3个):")
        for idx, row in successful_rows.head(3).iterrows():
            dataset_name = row.get('name', row.get('item_name', 'Unknown'))
            print(f"- {dataset_name}")
            print(f"  大小: {row['size'] or 'N/A'}")
            print(f"  许可证: {row['license'] or 'N/A'}")
            print(f"  标签数量: {len(eval(row['tags'])) if row['tags'] and row['tags'] != 'None' else 0}")
    
    return result_df

In [52]:
# 使用函数处理数据集信息提取
# 请根据实际情况修改以下参数

# 参数设置
input_dataframe = df  # 替换为你的数据框变量名
url_column_name = 'url'  # 替换为包含URL的列名
request_delay = 1  # 请求间隔时间（秒）
process_batch_size = None  # 批次大小，None表示处理全部，也可以设置如10, 50等

# 执行提取
result_df = extract_huggingface_dataset_info(
    df=input_dataframe,
    url_column=url_column_name,
    delay=request_delay,
    batch_size=process_batch_size
)

# 显示结果
print("处理完成！")
print(f"原始数据行数: {len(input_dataframe)}")
print(f"处理后数据行数: {len(result_df)}")
print(f"新增列: {[col for col in result_df.columns if col not in input_dataframe.columns]}")

# 保存结果（可选）
# result_df.to_excel('dataset_info_extracted.xlsx', index=False)
# result_df.to_csv('dataset_info_extracted.csv', index=False)

开始处理 101 个Hugging Face数据集URL...


提取数据集信息: 100%|██████████| 101/101 [02:47<00:00,  1.66s/it]


处理完成！统计结果:
总处理数量: 101
✅ 成功: 101 (100.0%)
❌ 失败: 0 (0.0%)
⚠️  错误: 0 (0.0%)

成功提取信息的示例 (前3个):
- allenai/dolma
  大小: n>1T
  许可证: odc-by
  标签数量: 3
- nasa-ibm-ai4science/SDO_training
  大小: n>1T
  许可证: mit
  标签数量: 3
- 3DTopia/4DNeX-10M
  大小: n>1T
  许可证: apache-2.0
  标签数量: 4
处理完成！
原始数据行数: 101
处理后数据行数: 101
新增列: ['modalities', 'formats', 'size', 'tags', 'libraries', 'license', 'description', 'extraction_status']





使用huggingface_hub库进行数据集信息提取：文件格式与大小

In [53]:
from huggingface_hub import HfApi
def print_dataset_file_sizes(repo_id):
    api = HfApi()
    dataset_info = api.dataset_info(repo_id=repo_id, files_metadata=True)

    total_size_bytes = 0  
    print(f"File sizes for dataset '{repo_id}':\n")  
    for sibling in dataset_info.siblings:  
        filename = sibling.rfilename  
        size_in_bytes = sibling.size or 0  
        total_size_bytes += size_in_bytes  
        size_mb = size_in_bytes / (1024 * 1024)  
        print(f"  {filename}: {size_mb:.2f} MB")  

    total_size_mb = total_size_bytes / (1024 * 1024)  
    print(f"\nTotal size: {total_size_mb:.2f} MB")  

print_dataset_file_sizes('isaaccorley/AlphaEarth-EuroSAT')

File sizes for dataset 'isaaccorley/AlphaEarth-EuroSAT':

  .gitattributes: 0.00 MB
  README.md: 0.00 MB
  eurosat-aef-embeddings.npz: 5.44 MB
  eurosat-aef.tar.gz: 5710.42 MB

Total size: 5715.86 MB


In [54]:
import pandas as pd
from collections import defaultdict
import os
from tqdm import tqdm
import time
import signal

class TimeoutError(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutError("操作超时")

def get_dataset_file_info(dataset_id, timeout_seconds=120):
    """
    获取数据集的文件信息，包括格式和总大小（修复版本）
    使用dataset_info来获取更详细的文件大小信息
    """
    try:
        # 设置超时信号
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(timeout_seconds)
        
        start_time = time.time()
        api = HfApi()
        
        # 使用dataset_info获取详细信息，包括文件元数据
        dataset_info = api.dataset_info(repo_id=dataset_id, files_metadata=True)
        
        # 统计文件格式和大小
        formats = set()
        total_size = 0
        file_count = 0
        
        # 从siblings中获取文件信息
        if hasattr(dataset_info, 'siblings') and dataset_info.siblings:
            for sibling in dataset_info.siblings:
                # 获取文件扩展名
                if hasattr(sibling, 'rfilename') and sibling.rfilename:
                    file_path = sibling.rfilename
                    if '.' in file_path:
                        extension = os.path.splitext(file_path)[1].lower()
                        if extension:  # 确保扩展名不为空
                            formats.add(extension)
                
                # 获取文件大小
                if hasattr(sibling, 'size') and sibling.size:
                    total_size += sibling.size
                    file_count += 1
        
        # 如果没有通过siblings获取到大小，尝试其他方法
        if total_size == 0:
            try:
                # 尝试使用repo_info
                repo_info = api.repo_info(repo_id=dataset_id, repo_type="dataset")
                if hasattr(repo_info, 'size') and repo_info.size:
                    total_size = repo_info.size
            except:
                pass
        
        # 取消超时
        signal.alarm(0)
        
        elapsed_time = time.time() - start_time
        return {
            'formats': list(formats) if formats else [],
            'size': total_size if total_size > 0 else None,
            'file_count': file_count,
            'status': 'success',
            'elapsed_time': elapsed_time
        }
        
    except TimeoutError:
        signal.alarm(0)  # 取消超时
        return {
            'formats': [],
            'size': None,
            'file_count': 0,
            'status': f'timeout: 处理超过{timeout_seconds}秒',
            'elapsed_time': timeout_seconds
        }
    except Exception as e:
        signal.alarm(0)  # 取消超时
        return {
            'formats': [],
            'size': None,
            'file_count': 0,
            'status': f'error: {str(e)}',
            'elapsed_time': 0
        }

def format_size(size_bytes):
    """
    将字节大小转换为易读格式
    """
    if size_bytes is None or size_bytes == 0:
        return None
    
    units = ['B', 'KB', 'MB', 'GB', 'TB']
    size = float(size_bytes)
    unit_index = 0
    
    while size >= 1024 and unit_index < len(units) - 1:
        size /= 1024
        unit_index += 1
    
    return f"{size:.2f} {units[unit_index]}"

print("函数定义完成（已添加2分钟超时机制）")

函数定义完成（已添加2分钟超时机制）


In [55]:
# 优化版本：处理result_df，更新formats和size列（支持2分钟超时）
print("开始处理数据集文件信息...")

# 设置处理数量限制（可以调整）
process_count = len(result_df)

# 创建副本以避免修改原始数据
processed_df = result_df.copy()

# 统计处理结果
success_count = 0
error_count = 0
timeout_count = 0
errors_log = []
timeout_log = []

# 只处理前几个数据集
for idx in tqdm(range(process_count), desc="处理数据集"):
    row = processed_df.iloc[idx]
    dataset_id = row['name']
    
    if pd.isna(dataset_id) or dataset_id == '':
        continue
    
    try:
        print(f"\n🔄 正在处理: {dataset_id}")
        start_time = time.time()
        
        # 获取文件信息（2分钟超时）
        file_info = get_dataset_file_info(dataset_id, timeout_seconds=120)
        
        processing_time = time.time() - start_time
        
        if file_info['status'] == 'success':
            # 更新formats列 - 将格式列表转换为字符串
            if file_info['formats']:
                processed_df.loc[processed_df.index[idx], 'formats'] = ', '.join(sorted(file_info['formats']))
            else:
                processed_df.loc[processed_df.index[idx], 'formats'] = None
            
            # 更新size列 - 格式化大小
            if file_info['size']:
                formatted_size = format_size(file_info['size'])
                processed_df.loc[processed_df.index[idx], 'size'] = formatted_size
            else:
                processed_df.loc[processed_df.index[idx], 'size'] = None
                
            success_count += 1
            print(f"✅ {dataset_id}: 成功处理 (耗时: {processing_time:.1f}s)")
            
        elif 'timeout' in file_info['status']:
            timeout_count += 1
            timeout_log.append(f"{dataset_id}: {file_info['status']}")
            processed_df.loc[processed_df.index[idx], 'formats'] = None
            processed_df.loc[processed_df.index[idx], 'size'] = None
            print(f"⏰ {dataset_id}: {file_info['status']}")
            
        else:
            error_count += 1
            errors_log.append(f"{dataset_id}: {file_info['status']}")
            processed_df.loc[processed_df.index[idx], 'formats'] = None
            processed_df.loc[processed_df.index[idx], 'size'] = None
            print(f"❌ {dataset_id}: {file_info['status']}")
            
    except Exception as e:
        error_count += 1
        error_msg = str(e)
        errors_log.append(f"{dataset_id}: {error_msg}")
        processed_df.loc[processed_df.index[idx], 'formats'] = None
        processed_df.loc[processed_df.index[idx], 'size'] = None
        print(f"❌ {dataset_id}: {error_msg}")

print(f"\n{'='*60}")
print("处理完成!")
print(f"✅ 成功处理: {success_count}")
print(f"⏰ 超时跳过: {timeout_count}")
print(f"❌ 处理失败: {error_count}")
print(f"📊 总计: {success_count + timeout_count + error_count}")

if timeout_log:
    print(f"\n⏰ 超时列表:")
    for timeout in timeout_log:
        print(f"  - {timeout}")

if errors_log:
    print(f"\n❌ 错误列表:")
    for error in errors_log:
        print(f"  - {error}")

# 更新原始的result_df
result_df = processed_df.copy()
print(f"\n✅ result_df已更新")

开始处理数据集文件信息...


处理数据集:   0%|          | 0/101 [00:00<?, ?it/s]


🔄 正在处理: allenai/dolma


处理数据集:   1%|          | 1/101 [00:00<00:55,  1.80it/s]

✅ allenai/dolma: 成功处理 (耗时: 0.6s)

🔄 正在处理: nasa-ibm-ai4science/SDO_training


处理数据集:   2%|▏         | 2/101 [00:01<00:57,  1.71it/s]

✅ nasa-ibm-ai4science/SDO_training: 成功处理 (耗时: 0.6s)

🔄 正在处理: 3DTopia/4DNeX-10M


处理数据集:   3%|▎         | 3/101 [00:02<01:33,  1.05it/s]

✅ 3DTopia/4DNeX-10M: 成功处理 (耗时: 1.4s)

🔄 正在处理: allenai/MADLAD-400


处理数据集:   4%|▍         | 4/101 [00:37<22:59, 14.22s/it]

✅ allenai/MADLAD-400: 成功处理 (耗时: 34.6s)

🔄 正在处理: urbanaudiosensing/ASPED


处理数据集:   5%|▍         | 5/101 [00:38<15:17,  9.55s/it]

✅ urbanaudiosensing/ASPED: 成功处理 (耗时: 1.3s)

🔄 正在处理: x-humanoid-robomind/RoboMIND


处理数据集:   6%|▌         | 6/101 [00:39<10:31,  6.64s/it]

✅ x-humanoid-robomind/RoboMIND: 成功处理 (耗时: 1.0s)

🔄 正在处理: InternRobotics/InternData-N1


处理数据集:   7%|▋         | 7/101 [00:41<07:52,  5.03s/it]

✅ InternRobotics/InternData-N1: 成功处理 (耗时: 1.7s)

🔄 正在处理: DeliberatorArchiver/asmr-archive-data-01


处理数据集:   8%|▊         | 8/101 [01:07<18:28, 11.92s/it]

✅ DeliberatorArchiver/asmr-archive-data-01: 成功处理 (耗时: 26.7s)

🔄 正在处理: DeliberatorArchiver/asmr-archive-data-02


处理数据集:   9%|▉         | 9/101 [01:35<25:45, 16.80s/it]

✅ DeliberatorArchiver/asmr-archive-data-02: 成功处理 (耗时: 27.5s)

🔄 正在处理: oscar-corpus/OSCAR-2109


处理数据集:  10%|▉         | 10/101 [01:42<21:01, 13.87s/it]

✅ oscar-corpus/OSCAR-2109: 成功处理 (耗时: 7.3s)

🔄 正在处理: poloclub/diffusiondb


处理数据集:  11%|█         | 11/101 [01:46<16:20, 10.89s/it]

✅ poloclub/diffusiondb: 成功处理 (耗时: 4.1s)

🔄 正在处理: CarperAI/pilev2-dev


处理数据集:  12%|█▏        | 12/101 [01:48<12:07,  8.18s/it]

✅ CarperAI/pilev2-dev: 成功处理 (耗时: 2.0s)

🔄 正在处理: oscar-corpus/OSCAR-2301


处理数据集:  13%|█▎        | 13/101 [01:50<08:59,  6.13s/it]

✅ oscar-corpus/OSCAR-2301: 成功处理 (耗时: 1.4s)

🔄 正在处理: oscar-corpus/oscar-2301-hpc


处理数据集:  14%|█▍        | 14/101 [01:50<06:26,  4.44s/it]

✅ oscar-corpus/oscar-2301-hpc: 成功处理 (耗时: 0.5s)

🔄 正在处理: ColtonAi/Oi


处理数据集:  15%|█▍        | 15/101 [01:51<04:43,  3.30s/it]

✅ ColtonAi/Oi: 成功处理 (耗时: 0.6s)

🔄 正在处理: NOABOL35631y/T


处理数据集:  16%|█▌        | 16/101 [01:51<03:30,  2.47s/it]

✅ NOABOL35631y/T: 成功处理 (耗时: 0.5s)

🔄 正在处理: oscar-corpus/colossal-oscar-1.0


处理数据集:  17%|█▋        | 17/101 [01:54<03:24,  2.44s/it]

✅ oscar-corpus/colossal-oscar-1.0: 成功处理 (耗时: 2.4s)

🔄 正在处理: min06/Hscb


处理数据集:  18%|█▊        | 18/101 [01:54<02:36,  1.89s/it]

✅ min06/Hscb: 成功处理 (耗时: 0.6s)

🔄 正在处理: CropNet/CropNet


处理数据集:  19%|█▉        | 19/101 [01:59<03:39,  2.67s/it]

✅ CropNet/CropNet: 成功处理 (耗时: 4.5s)

🔄 正在处理: DL3DV/DL3DV-Benchmark


处理数据集:  20%|█▉        | 20/101 [02:38<18:27, 13.67s/it]

✅ DL3DV/DL3DV-Benchmark: 成功处理 (耗时: 39.3s)

🔄 正在处理: Pytagora/Datalab


处理数据集:  21%|██        | 21/101 [02:39<13:06,  9.84s/it]

✅ Pytagora/Datalab: 成功处理 (耗时: 0.9s)

🔄 正在处理: DL3DV/DL3DV-ALL-4K


处理数据集:  22%|██▏       | 22/101 [02:42<10:18,  7.82s/it]

✅ DL3DV/DL3DV-ALL-4K: 成功处理 (耗时: 3.1s)

🔄 正在处理: shoyimobloqulov/text-to-speech-tts


处理数据集:  23%|██▎       | 23/101 [02:43<07:17,  5.61s/it]

✅ shoyimobloqulov/text-to-speech-tts: 成功处理 (耗时: 0.5s)

🔄 正在处理: DL3DV/DL3DV-ALL-960P


处理数据集:  24%|██▍       | 24/101 [02:46<06:16,  4.90s/it]

✅ DL3DV/DL3DV-ALL-960P: 成功处理 (耗时: 3.2s)

🔄 正在处理: NikkoIGuess/NikkoDoesRandom_Ai


处理数据集:  25%|██▍       | 25/101 [02:46<04:33,  3.60s/it]

✅ NikkoIGuess/NikkoDoesRandom_Ai: 成功处理 (耗时: 0.6s)

🔄 正在处理: DL3DV/DL3DV-ALL-2K


处理数据集:  26%|██▌       | 26/101 [02:50<04:31,  3.62s/it]

✅ DL3DV/DL3DV-ALL-2K: 成功处理 (耗时: 3.7s)

🔄 正在处理: DL3DV/DL3DV-ALL-video


处理数据集:  27%|██▋       | 27/101 [02:54<04:29,  3.65s/it]

✅ DL3DV/DL3DV-ALL-video: 成功处理 (耗时: 3.7s)

🔄 正在处理: HPLT/hplt_monolingual_v1_2


处理数据集:  28%|██▊       | 28/101 [02:54<03:17,  2.71s/it]

✅ HPLT/hplt_monolingual_v1_2: 成功处理 (耗时: 0.5s)

🔄 正在处理: DL3DV/DL3DV-ALL-ColmapCache


处理数据集:  29%|██▊       | 29/101 [02:58<03:35,  2.99s/it]

✅ DL3DV/DL3DV-ALL-ColmapCache: 成功处理 (耗时: 3.7s)

🔄 正在处理: Pragadishwaran/manual


处理数据集:  30%|██▉       | 30/101 [02:59<02:39,  2.25s/it]

✅ Pragadishwaran/manual: 成功处理 (耗时: 0.5s)

🔄 正在处理: psoldunov/testset


处理数据集:  31%|███       | 31/101 [02:59<02:01,  1.73s/it]

✅ psoldunov/testset: 成功处理 (耗时: 0.5s)

🔄 正在处理: meg/dolma-v1_6-sample


处理数据集:  32%|███▏      | 32/101 [03:00<01:35,  1.38s/it]

✅ meg/dolma-v1_6-sample: 成功处理 (耗时: 0.5s)

🔄 正在处理: akhilhsingh/homeo-dataset


处理数据集:  33%|███▎      | 33/101 [03:00<01:20,  1.19s/it]

✅ akhilhsingh/homeo-dataset: 成功处理 (耗时: 0.7s)

🔄 正在处理: oscar-corpus/community-oscar


处理数据集:  34%|███▎      | 34/101 [03:28<10:15,  9.19s/it]

✅ oscar-corpus/community-oscar: 成功处理 (耗时: 27.9s)

🔄 正在处理: Mohamedfadil369/BrainData


处理数据集:  35%|███▍      | 35/101 [03:29<07:13,  6.57s/it]

✅ Mohamedfadil369/BrainData: 成功处理 (耗时: 0.5s)

🔄 正在处理: nkandpa2/cccc_all_domains


处理数据集:  36%|███▌      | 36/101 [03:29<05:15,  4.85s/it]

✅ nkandpa2/cccc_all_domains: 成功处理 (耗时: 0.8s)

🔄 正在处理: artanekrem/SDFSDF


处理数据集:  37%|███▋      | 37/101 [03:30<03:46,  3.55s/it]

✅ artanekrem/SDFSDF: 成功处理 (耗时: 0.5s)

🔄 正在处理: liuqingquan/test2


处理数据集:  38%|███▊      | 38/101 [03:31<02:51,  2.72s/it]

✅ liuqingquan/test2: 成功处理 (耗时: 0.8s)

🔄 正在处理: Artificial-superintelligence/Athtest


处理数据集:  39%|███▊      | 39/101 [03:31<02:07,  2.05s/it]

✅ Artificial-superintelligence/Athtest: 成功处理 (耗时: 0.5s)

🔄 正在处理: DataoceanAI/Lip_reading_Speech_Video_Corpus


处理数据集:  40%|███▉      | 40/101 [03:32<01:37,  1.60s/it]

✅ DataoceanAI/Lip_reading_Speech_Video_Corpus: 成功处理 (耗时: 0.5s)

🔄 正在处理: shuxunoo/NFT-Net


处理数据集:  41%|████      | 41/101 [03:33<01:22,  1.38s/it]

✅ shuxunoo/NFT-Net: 成功处理 (耗时: 0.9s)

🔄 正在处理: BAAI/IndustryCorpus_programming


处理数据集:  42%|████▏     | 42/101 [03:33<01:04,  1.09s/it]

✅ BAAI/IndustryCorpus_programming: 成功处理 (耗时: 0.4s)

🔄 正在处理: BAAI/IndustryCorpus_emotion


处理数据集:  43%|████▎     | 43/101 [03:34<00:56,  1.03it/s]

✅ BAAI/IndustryCorpus_emotion: 成功处理 (耗时: 0.7s)

🔄 正在处理: BAAI/IndustryCorpus_mathematics


处理数据集:  44%|████▎     | 44/101 [03:34<00:46,  1.23it/s]

✅ BAAI/IndustryCorpus_mathematics: 成功处理 (耗时: 0.5s)

🔄 正在处理: BAAI/IndustryCorpus_ai


处理数据集:  45%|████▍     | 45/101 [03:35<00:41,  1.33it/s]

✅ BAAI/IndustryCorpus_ai: 成功处理 (耗时: 0.6s)

🔄 正在处理: Yuqi1997/DrivingDojo


处理数据集:  46%|████▌     | 46/101 [03:35<00:35,  1.53it/s]

✅ Yuqi1997/DrivingDojo: 成功处理 (耗时: 0.4s)

🔄 正在处理: Zyphra/Zyda-2


处理数据集:  47%|████▋     | 47/101 [04:12<10:18, 11.45s/it]

✅ Zyphra/Zyda-2: 成功处理 (耗时: 36.6s)

🔄 正在处理: Maple728/Time-300B


处理数据集:  48%|████▊     | 48/101 [04:13<07:16,  8.23s/it]

✅ Maple728/Time-300B: 成功处理 (耗时: 0.7s)

🔄 正在处理: DL3DV/DL3DV-Drone


处理数据集:  49%|████▊     | 49/101 [04:13<05:07,  5.92s/it]

✅ DL3DV/DL3DV-Drone: 成功处理 (耗时: 0.5s)

🔄 正在处理: Lightarmortech/Bot.com


处理数据集:  50%|████▉     | 50/101 [04:14<03:38,  4.28s/it]

✅ Lightarmortech/Bot.com: 成功处理 (耗时: 0.5s)

🔄 正在处理: LLM360/TxT360


处理数据集:  50%|█████     | 51/101 [04:59<13:55, 16.72s/it]

✅ LLM360/TxT360: 成功处理 (耗时: 45.7s)

🔄 正在处理: avalab/Allo-AVA


处理数据集:  51%|█████▏    | 52/101 [05:02<10:15, 12.56s/it]

✅ avalab/Allo-AVA: 成功处理 (耗时: 2.9s)

🔄 正在处理: anchovy/maple728-time_300B


处理数据集:  52%|█████▏    | 53/101 [05:03<07:10,  8.98s/it]

✅ anchovy/maple728-time_300B: 成功处理 (耗时: 0.6s)

🔄 正在处理: Khole1234/Chloe


处理数据集:  53%|█████▎    | 54/101 [05:03<05:03,  6.45s/it]

✅ Khole1234/Chloe: 成功处理 (耗时: 0.5s)

🔄 正在处理: Moonlightsonata/vso


处理数据集:  54%|█████▍    | 55/101 [05:04<03:34,  4.66s/it]

✅ Moonlightsonata/vso: 成功处理 (耗时: 0.5s)

🔄 正在处理: haggs/test


处理数据集:  55%|█████▌    | 56/101 [05:04<02:35,  3.46s/it]

✅ haggs/test: 成功处理 (耗时: 0.6s)

🔄 正在处理: Prabhuram/Medicine-List


处理数据集:  56%|█████▋    | 57/101 [05:06<02:03,  2.80s/it]

✅ Prabhuram/Medicine-List: 成功处理 (耗时: 1.3s)

🔄 正在处理: dijihax/Dataset


处理数据集:  57%|█████▋    | 58/101 [05:06<01:32,  2.15s/it]

✅ dijihax/Dataset: 成功处理 (耗时: 0.7s)

🔄 正在处理: prasenjeet5/DeLTa-Zz-Core


处理数据集:  58%|█████▊    | 59/101 [05:07<01:14,  1.76s/it]

✅ prasenjeet5/DeLTa-Zz-Core: 成功处理 (耗时: 0.8s)

🔄 正在处理: coan/bot_claiton


处理数据集:  59%|█████▉    | 60/101 [05:08<00:58,  1.42s/it]

✅ coan/bot_claiton: 成功处理 (耗时: 0.6s)

🔄 正在处理: neis-lab/mmcows


处理数据集:  60%|██████    | 61/101 [05:09<00:47,  1.19s/it]

✅ neis-lab/mmcows: 成功处理 (耗时: 0.7s)

🔄 正在处理: m-a-p/FineFineWeb-fasttext-seeddata


处理数据集:  61%|██████▏   | 62/101 [05:09<00:40,  1.04s/it]

✅ m-a-p/FineFineWeb-fasttext-seeddata: 成功处理 (耗时: 0.7s)

🔄 正在处理: 1989shack/Database


处理数据集:  62%|██████▏   | 63/101 [05:10<00:33,  1.13it/s]

✅ 1989shack/Database: 成功处理 (耗时: 0.5s)

🔄 正在处理: sarahgillet/BrainstormingDataset


处理数据集:  63%|██████▎   | 64/101 [05:10<00:29,  1.25it/s]

✅ sarahgillet/BrainstormingDataset: 成功处理 (耗时: 0.6s)

🔄 正在处理: BIOMEDICA/biomedica_webdataset_24M


处理数据集:  64%|██████▍   | 65/101 [05:11<00:32,  1.11it/s]

✅ BIOMEDICA/biomedica_webdataset_24M: 成功处理 (耗时: 1.1s)

🔄 正在处理: Majidu/Ohr


处理数据集:  65%|██████▌   | 66/101 [05:12<00:26,  1.32it/s]

✅ Majidu/Ohr: 成功处理 (耗时: 0.4s)

🔄 正在处理: GraspClutter6D/GraspClutter6D


处理数据集:  66%|██████▋   | 67/101 [05:12<00:23,  1.45it/s]

✅ GraspClutter6D/GraspClutter6D: 成功处理 (耗时: 0.5s)

🔄 正在处理: Johnnyboystar/Images


处理数据集:  67%|██████▋   | 68/101 [05:13<00:20,  1.60it/s]

✅ Johnnyboystar/Images: 成功处理 (耗时: 0.5s)

🔄 正在处理: agibot-world/AgiBotDigitalWorld


处理数据集:  68%|██████▊   | 69/101 [05:28<02:39,  4.99s/it]

✅ agibot-world/AgiBotDigitalWorld: 成功处理 (耗时: 15.2s)

🔄 正在处理: ZeppelinCorp/Eclipse_Corpuz


处理数据集:  69%|██████▉   | 70/101 [05:29<01:52,  3.63s/it]

✅ ZeppelinCorp/Eclipse_Corpuz: 成功处理 (耗时: 0.4s)

🔄 正在处理: IntelLabs/BlueLens


处理数据集:  70%|███████   | 71/101 [05:31<01:38,  3.28s/it]

✅ IntelLabs/BlueLens: 成功处理 (耗时: 2.5s)

🔄 正在处理: jobs-git/Zyda-2


处理数据集:  71%|███████▏  | 72/101 [06:04<05:50, 12.09s/it]

✅ jobs-git/Zyda-2: 成功处理 (耗时: 32.7s)

🔄 正在处理: jobs-git/HPLT2.0_cleaned


处理数据集:  72%|███████▏  | 73/101 [06:36<08:24, 18.03s/it]

✅ jobs-git/HPLT2.0_cleaned: 成功处理 (耗时: 31.9s)

🔄 正在处理: embed2scale/SSL4EO-S12-v1.1


处理数据集:  73%|███████▎  | 74/101 [06:40<06:17, 13.97s/it]

✅ embed2scale/SSL4EO-S12-v1.1: 成功处理 (耗时: 4.5s)

🔄 正在处理: torchgeo/CropClimateX


处理数据集:  74%|███████▍  | 75/101 [06:47<05:06, 11.78s/it]

✅ torchgeo/CropClimateX: 成功处理 (耗时: 6.7s)

🔄 正在处理: everrer/121212


处理数据集:  75%|███████▌  | 76/101 [06:47<03:29,  8.39s/it]

✅ everrer/121212: 成功处理 (耗时: 0.5s)

🔄 正在处理: maxkaufmann/allenai_dolma_test_set


处理数据集:  76%|███████▌  | 77/101 [06:48<02:25,  6.06s/it]

✅ maxkaufmann/allenai_dolma_test_set: 成功处理 (耗时: 0.6s)

🔄 正在处理: BIOMEDICA/biomedica_microscopy_subset_webdataset


处理数据集:  77%|███████▋  | 78/101 [06:49<01:43,  4.49s/it]

✅ BIOMEDICA/biomedica_microscopy_subset_webdataset: 成功处理 (耗时: 0.8s)

🔄 正在处理: BIOMEDICA/biomedica_dermatology_subset_webdataset


处理数据集:  78%|███████▊  | 79/101 [06:49<01:13,  3.35s/it]

✅ BIOMEDICA/biomedica_dermatology_subset_webdataset: 成功处理 (耗时: 0.7s)

🔄 正在处理: BIOMEDICA/biomedica_surgery_subset_webdataset


处理数据集:  79%|███████▉  | 80/101 [06:50<00:53,  2.57s/it]

✅ BIOMEDICA/biomedica_surgery_subset_webdataset: 成功处理 (耗时: 0.7s)

🔄 正在处理: trillarmybewm/trillium


处理数据集:  80%|████████  | 81/101 [06:51<00:39,  1.95s/it]

✅ trillarmybewm/trillium: 成功处理 (耗时: 0.5s)

🔄 正在处理: DL3DV/DL3DV-GS-960P


处理数据集:  81%|████████  | 82/101 [06:55<00:53,  2.80s/it]

✅ DL3DV/DL3DV-GS-960P: 成功处理 (耗时: 4.8s)

🔄 正在处理: criteo/CriteoClickLogs


处理数据集:  82%|████████▏ | 83/101 [06:56<00:37,  2.09s/it]

✅ criteo/CriteoClickLogs: 成功处理 (耗时: 0.4s)

🔄 正在处理: jobs-git/diffusiondb


处理数据集:  83%|████████▎ | 84/101 [07:03<00:59,  3.50s/it]

✅ jobs-git/diffusiondb: 成功处理 (耗时: 6.8s)

🔄 正在处理: introspector/solfunmeme


处理数据集:  84%|████████▍ | 85/101 [07:06<00:53,  3.35s/it]

✅ introspector/solfunmeme: 成功处理 (耗时: 3.0s)

🔄 正在处理: JMaeen25/Mycollection


处理数据集:  85%|████████▌ | 86/101 [07:06<00:37,  2.47s/it]

✅ JMaeen25/Mycollection: 成功处理 (耗时: 0.4s)

🔄 正在处理: tugrul93/TaCarla


处理数据集:  86%|████████▌ | 87/101 [07:09<00:36,  2.58s/it]

✅ tugrul93/TaCarla: 成功处理 (耗时: 2.8s)

🔄 正在处理: cambrain/DigiFakeAV


处理数据集:  87%|████████▋ | 88/101 [07:10<00:28,  2.16s/it]

✅ cambrain/DigiFakeAV: 成功处理 (耗时: 1.2s)

🔄 正在处理: nasa-ibm-ai4science/ar_emergence


处理数据集:  88%|████████▊ | 89/101 [07:11<00:20,  1.73s/it]

✅ nasa-ibm-ai4science/ar_emergence: 成功处理 (耗时: 0.7s)

🔄 正在处理: Groovy-123/deep-think


处理数据集:  89%|████████▉ | 90/101 [07:11<00:14,  1.35s/it]

✅ Groovy-123/deep-think: 成功处理 (耗时: 0.5s)

🔄 正在处理: hiepp2/tvp4


处理数据集:  90%|█████████ | 91/101 [07:12<00:11,  1.16s/it]

✅ hiepp2/tvp4: 成功处理 (耗时: 0.7s)

🔄 正在处理: Groovy-123/Advance


处理数据集:  91%|█████████ | 92/101 [07:13<00:09,  1.06s/it]

✅ Groovy-123/Advance: 成功处理 (耗时: 0.8s)

🔄 正在处理: nvidia/PhysicalAI-Autonomous-Vehicle-Cosmos-Drive-Dreams


处理数据集:  92%|█████████▏| 93/101 [07:46<01:25, 10.67s/it]

✅ nvidia/PhysicalAI-Autonomous-Vehicle-Cosmos-Drive-Dreams: 成功处理 (耗时: 33.1s)

🔄 正在处理: Prompthumanizer/jain_architecture


处理数据集:  93%|█████████▎| 94/101 [07:46<00:53,  7.61s/it]

✅ Prompthumanizer/jain_architecture: 成功处理 (耗时: 0.5s)

🔄 正在处理: Mfonkown/Zark-Dataset_1.0


处理数据集:  94%|█████████▍| 95/101 [07:47<00:32,  5.48s/it]

✅ Mfonkown/Zark-Dataset_1.0: 成功处理 (耗时: 0.5s)

🔄 正在处理: 1il7tw6n/makevideo


处理数据集:  95%|█████████▌| 96/101 [07:47<00:19,  3.98s/it]

✅ 1il7tw6n/makevideo: 成功处理 (耗时: 0.5s)

🔄 正在处理: JQL-AI/fw2_embeddings


处理数据集:  96%|█████████▌| 97/101 [07:48<00:12,  3.05s/it]

✅ JQL-AI/fw2_embeddings: 成功处理 (耗时: 0.9s)

🔄 正在处理: JQL-AI/hplt2_embeddings


处理数据集:  97%|█████████▋| 98/101 [07:49<00:07,  2.33s/it]

✅ JQL-AI/hplt2_embeddings: 成功处理 (耗时: 0.7s)

🔄 正在处理: PLM-Team/Pretrain-Dataset


处理数据集:  98%|█████████▊| 99/101 [07:50<00:04,  2.08s/it]

✅ PLM-Team/Pretrain-Dataset: 成功处理 (耗时: 1.5s)

🔄 正在处理: Groovy-123/QuantumAI


处理数据集:  99%|█████████▉| 100/101 [07:51<00:01,  1.63s/it]

✅ Groovy-123/QuantumAI: 成功处理 (耗时: 0.6s)

🔄 正在处理: theairlabcmu/tartanair


处理数据集: 100%|██████████| 101/101 [07:52<00:00,  4.68s/it]

✅ theairlabcmu/tartanair: 成功处理 (耗时: 0.8s)

处理完成!
✅ 成功处理: 101
⏰ 超时跳过: 0
❌ 处理失败: 0
📊 总计: 101

✅ result_df已更新





数据清洗：模态

In [56]:
import ast
import re

print("开始数据清洗...")
print("="*60)

# 1. 清洗modalities列
print("1. 处理modalities列...")

def clean_modalities(modalities_str):
    """
    清洗并分类modalities
    Text、Tabular -> 文本
    Image -> 图片
    Video -> 视频
    多个不同类型 -> 多模态
    """
    if pd.isna(modalities_str) or modalities_str == '' or modalities_str == 'None':
        return None
    
    try:
        # 尝试解析字符串为列表
        if isinstance(modalities_str, str):
            # 去除外层引号和空格
            modalities_str = modalities_str.strip().strip("'\"")
            
            # 如果是字符串形式的列表，解析它
            if modalities_str.startswith('[') and modalities_str.endswith(']'):
                modalities_list = ast.literal_eval(modalities_str)
            else:
                # 如果不是列表格式，按逗号分割
                modalities_list = [item.strip().strip("'\"") for item in modalities_str.split(',')]
        else:
            modalities_list = [modalities_str]
            
        # 去除空值和None
        modalities_list = [item for item in modalities_list if item and item != 'None' and item.strip()]
        
        if not modalities_list:
            return None
            
        # 转换为小写进行比较
        modalities_lower = [item.lower() for item in modalities_list]
        
        # 分类逻辑
        has_text = any(mod in ['text', 'tabular'] for mod in modalities_lower)
        has_image = any(mod in ['image'] for mod in modalities_lower)
        has_video = any(mod in ['video'] for mod in modalities_lower)
        
        # 计算不同类型的数量
        type_count = sum([has_text, has_image, has_video])
        
        if type_count > 1:
            return '多模态'
        elif has_text:
            return '文本'
        elif has_image:
            return '图片'
        elif has_video:
            return '视频'
        else:
            # 其他未知类型
            return '其他'
            
    except Exception as e:
        print(f"处理modalities时出错: {modalities_str}, 错误: {e}")
        return None

# 应用modalities清洗
result_df['modalities'] = result_df['modalities'].apply(clean_modalities)

# 统计modalities清洗结果
modalities_counts = result_df['modalities'].value_counts()
print("modalities清洗结果:")
for category, count in modalities_counts.items():
    print(f"  {category}: {count}")

开始数据清洗...
1. 处理modalities列...
modalities清洗结果:
  其他: 3
  多模态: 1


数据清洗：学科

In [57]:
# 3. 处理tags列并创建学科分类
print("\n3. 处理tags列并创建学科分类...")

def classify_subject(tags_str):
    """
    根据tags创建学科分类：
    1. 如果包含biology或medical -> '生命科学'
    2. 如果包含chemistry -> '物质科学'  
    3. 如果包含climate -> '大气海洋'
    4. 其他 -> None
    """
    if pd.isna(tags_str) or tags_str == '' or tags_str == 'None':
        return None
    
    try:
        # 转换为字符串并转为小写以便比较
        tags_str_lower = str(tags_str).lower()
        
        # 检查是否包含生命科学相关标签
        if any(keyword in tags_str_lower for keyword in ['biology', 'medical', 'bio-', 'medicine', 'health', 'genomics', 'protein']):
            return '生命科学'
        
        # 检查是否包含物质科学相关标签
        elif any(keyword in tags_str_lower for keyword in ['chemistry', 'chemical', 'molecule', 'compound', 'material']):
            return '物质科学'
        
        # 检查是否包含大气海洋相关标签
        elif any(keyword in tags_str_lower for keyword in ['climate', 'weather', 'ocean', 'atmospheric', 'meteorology', 'environmental']):
            return '大气海洋'
        
        # 其他情况返回None
        else:
            return None
            
    except Exception as e:
        print(f"处理tags时出错: {tags_str}, 错误: {e}")
        return None

def clean_tags(tags_str):
    """
    清洗tags列，去除JSON格式的[]和''，只保留单词
    """
    if pd.isna(tags_str) or tags_str == '' or tags_str == 'None':
        return None
    
    try:
        # 转换为字符串
        tags_str = str(tags_str).strip()
        
        # 去除外层引号
        tags_str = tags_str.strip("'\"")
        
        # 如果是列表格式，尝试解析
        if tags_str.startswith('[') and tags_str.endswith(']'):
            try:
                tags_list = ast.literal_eval(tags_str)
                if isinstance(tags_list, list):
                    # 清理列表中的每个标签
                    clean_list = []
                    for tag in tags_list:
                        tag = str(tag).strip().strip("'\"")
                        if tag and tag.lower() != 'none':
                            clean_list.append(tag)
                    return ', '.join(clean_list)
            except:
                # 如果解析失败，直接去除括号和引号
                tags_str = tags_str.strip('[]')
        
        # 去除各种引号
        tags_str = re.sub(r"['\"]", '', tags_str)
        
        # 分割并清理
        if ',' in tags_str:
            tags_parts = [part.strip() for part in tags_str.split(',')]
        else:
            tags_parts = [tags_str.strip()]
        
        # 过滤并清理标签
        valid_tags = []
        for tag in tags_parts:
            tag = tag.strip()
            if tag and tag.lower() not in ['none', 'null', '']:
                # 移除特殊字符，但保留连字符和下划线
                clean_tag = re.sub(r'[^\w\-]', ' ', tag).strip()
                clean_tag = re.sub(r'\s+', '-', clean_tag)  # 将空格替换为连字符
                if clean_tag:
                    valid_tags.append(clean_tag.lower())
        
        # 去重并排序
        valid_tags = sorted(list(set(valid_tags)))
        
        if valid_tags:
            return ', '.join(valid_tags)
        else:
            return None
            
    except Exception as e:
        print(f"处理tags时出错: {tags_str}, 错误: {e}")
        return None

# 应用tags清洗和学科分类
print("应用tags清洗和学科分类...")
result_df['tags_cleaned'] = result_df['tags'].apply(clean_tags)
result_df['学科'] = result_df['tags'].apply(classify_subject)

# 统计学科分类结果
print("\n学科分类结果:")
subject_counts = result_df['学科'].value_counts()
for subject, count in subject_counts.items():
    print(f"  {subject}: {count}")


3. 处理tags列并创建学科分类...
应用tags清洗和学科分类...

学科分类结果:
  生命科学: 12
  物质科学: 4
  大气海洋: 2


In [58]:
import pandas as pd
license_df=pd.read_excel('规范协议名称.xlsx')

In [59]:
import re
from difflib import get_close_matches

def normalize_license(license_name, standard_licenses):
    """
    规范化协议名称
    """
    if pd.isna(license_name) or license_name == '':
        return license_name
    
    # 转换为字符串并去除首尾空格
    license_str = str(license_name).strip()
    
    # 如果已经是标准格式，直接返回
    if license_str in standard_licenses:
        return license_str
    
    # 处理常见的大小写和格式问题
    license_upper = license_str.upper()
    
    # 创建映射字典处理常见的变体
    mapping = {
        'APACHE-2.0': 'Apache-2.0',
        'APACHE 2.0': 'Apache-2.0',
        'APACHE2.0': 'Apache-2.0',
        'APACHE': 'Apache-2.0',
        'MIT': 'MIT',
        'MIT LICENSE': 'MIT',
        'CC-BY-4.0': 'CC-BY-4.0',
        'CC BY 4.0': 'CC-BY-4.0',
        'CC-BY-SA-4.0': 'CC-BY-SA-4.0',
        'CC BY SA 4.0': 'CC-BY-SA-4.0',
        'CC-BY-NC-SA-4.0': 'CC-BY-NC-SA-4.0',
        'CC BY NC SA 4.0': 'CC-BY-NC-SA-4.0',
        'CC0-1.0': 'CC0-1.0',
        'CC0 1.0': 'CC0-1.0',
        'BSD-3-CLAUSE': 'BSD-3-Clause',
        'BSD 3 CLAUSE': 'BSD-3-Clause',
        'BSD-2-CLAUSE': 'BSD-2-Clause',
        'BSD 2 CLAUSE': 'BSD-2-Clause',
        'GPL-3.0': 'GPL-3.0',
        'GPL V3': 'GPL-3.0',
        'LGPL-3.0': 'LGPL-3.0',
        'AGPL-3.0': 'AGPL-3.0',
        'UNLICENSE': 'Unlicense',
        '公开': '公开',
        'PUBLIC': '公开',
        'OPENRAIL': 'OPENRAIL',
        'CUSTOM': 'CUSTOM'
    }
    
    # 先检查精确映射
    if license_upper in mapping:
        return mapping[license_upper]
    
    # 使用模糊匹配找到最相近的标准协议
    # 首先尝试在标准协议列表中找到最相似的
    close_matches = get_close_matches(license_str, standard_licenses, n=1, cutoff=0.8)
    if close_matches:
        return close_matches[0]
    
    # 如果没找到匹配，尝试大小写不敏感的匹配
    for std_license in standard_licenses:
        if license_upper == std_license.upper():
            return std_license
    
    # 如果仍然没找到，返回原值
    return license_str

# 获取标准协议列表
standard_licenses_list = license_df['开源协议'].tolist()

print("协议名称规范化函数已创建")

协议名称规范化函数已创建


In [60]:
# 对models_df进行协议名称规范化
print("正在规范化 models_df 的协议名称...")

# 创建备份
models_df_backup = result_df.copy()

# 应用规范化函数
result_df['开源协议'] = result_df['license'].apply(
    lambda x: normalize_license(x, standard_licenses_list)
)

# 显示规范化前后的对比
print("Models DataFrame 协议规范化对比:")
comparison_models = pd.DataFrame({
    '原始协议': result_df['license'],
    '规范化协议': result_df['开源协议']
})
comparison_models = pd.DataFrame({
    '原始协议': result_df['license'],
    '规范化协议': result_df['开源协议']
})

# 显示发生变化的记录
changed_models = comparison_models[comparison_models['原始协议'] != comparison_models['规范化协议']]
if len(changed_models) > 0:
    print(f"共有 {len(changed_models)} 条记录发生了变化:")
    print(changed_models.drop_duplicates().head(20))
else:
    print("没有记录发生变化")

# 显示规范化后的唯一值统计
print(f"\n规范化后的协议唯一值统计:")
print(result_df['开源协议'].value_counts().head(20))

正在规范化 models_df 的协议名称...
Models DataFrame 协议规范化对比:
共有 87 条记录发生了变化:
                原始协议            规范化协议
1                mit              MIT
2         apache-2.0       Apache-2.0
4          cc-by-4.0        CC-BY-4.0
6       cc-by-sa-4.0     CC-BY-SA-4.0
7           agpl-3.0         AGPL-3.0
9            cc0-1.0          CC0-1.0
11              None             None
15          openrail         OPENRAIL
20           afl-3.0          AFL-3.0
40      cc-by-nc-4.0     CC-BY-NC-4.0
60   cc-by-nc-sa-4.0  CC-BY-NC-SA-4.0
68   CC BY-NC-SA 4.0  CC-BY-NC-SA-4.0
80             wtfpl            WTFPL
100     bsd-3-clause     BSD-3-Clause

规范化后的协议唯一值统计:
开源协议
Apache-2.0                            23
MIT                                   12
odc-by                                 9
CC0-1.0                                9
CC-BY-4.0                              6
CC-BY-NC-SA-4.0                        4
AGPL-3.0                               3
CC-BY-SA-4.0                           2
pddl           

识别企业与机构

In [61]:
# 重新设计更严格的AI公司识别逻辑
def create_strict_ai_company_mapping():
    """
    创建更严格的AI公司映射，只包含确认的知名公司和机构官方账号
    """
    strict_mapping = {
        # 大型科技公司（精确匹配官方账号）
        'microsoft': 'Microsoft',
        'meta': 'Meta',
        'facebook': 'Meta',
        'google': 'Google',
        'googleai': 'Google',
        'google-research': 'Google',
        'google-deepmind': 'Google DeepMind',
        'deepmind': 'Google DeepMind',
        'openai': 'OpenAI',
        'anthropic': 'Anthropic',
        'apple': 'Apple',
        'nvidia': 'NVIDIA',
        'amazon': 'Amazon',
        'aws': 'Amazon',
        'ibm': 'IBM',
        'ibm-nasa-geospatial': 'IBM',
        'ibm-research': 'IBM',
        'salesforce': 'Salesforce',
        'adobe': 'Adobe',
        'intel': 'Intel',
        
        # AI公司和研究机构（精确匹配）
        'huggingface': 'Hugging Face',
        'eleutherai': 'EleutherAI',
        'stabilityai': 'Stability AI',
        'cohere': 'Cohere',
        'coherelabs': 'Cohere',
        'ai21labs': 'AI21 Labs',
        'allenai': 'Allen Institute for AI',
        
        # 生物技术公司
        'basf-ai': 'BASF',
        'basf': 'BASF',
        
        # 研究机构和开源项目（只包含明确的官方账号）
        'openclimatefix': 'Open Climate Fix',
        'torchgeo': 'TorchGeo',
        'imageomics': 'Imageomics',
        'biomap-research': 'BioMap Research',
        'proteinglm': 'ProteinGLM',
        'autobio-bench': 'AutoBio-Bench',
        'freedomintelligence': 'FreedomIntelligence',
        'hpai-bsc': 'HPAI-BSC',
        'scikit-fingerprints': 'Scikit-Fingerprints',
        'oxai4science': 'OxAI4Science',
        'opendatalab': 'OpenDataLab',
        
        # 知名大学（只包含明确的官方机构账号，不包含个人用户名）
        'stanford-nlp': 'Stanford University',
        'stanfordaimlab': 'Stanford University',
        'stanfordaimi': 'Stanford University',
        'mit-csail': 'MIT',
        'fair-forward': 'Meta FAIR',
        'epfl-eceo': 'EPFL',
        'jingwei-sjtu': 'Shanghai Jiao Tong University',
        'aim-harvard': 'Harvard University',
        
        # 中国知名公司
        'alibaba': 'Alibaba',
        'tencent': 'Tencent',
        'baidu': 'Baidu',
        'bytedance': 'ByteDance',
        'xiaomi': 'Xiaomi',
        'sensetime': 'SenseTime',
        'opendatalab': 'OpenDataLab',

        # 数据服务公司
        'trainingdatapro': 'TrainingDataPro',
        'datatonic': 'DataTonic',
    }
    return strict_mapping

def strict_identify_ai_company(author_name):
    """
    更严格的AI公司识别函数，只识别明确的官方账号
    """
    if pd.isna(author_name) or author_name == '':
        return None
    
    author_lower = str(author_name).lower().strip()
    strict_mapping = create_strict_ai_company_mapping()
    
    # 只进行精确匹配，绝不进行模糊匹配或包含匹配
    if author_lower in strict_mapping:
        return strict_mapping[author_lower]
    
    return None


# 使用新的严格识别逻辑
result_df['识别出的AI公司'] = result_df['author'].apply(strict_identify_ai_company)

# 显示新的识别结果
new_identified = result_df[result_df['识别出的AI公司'].notna()]
print(f"\n严格识别后的AI公司/机构账号数量: {len(new_identified)}")
print(f"识别比例: {len(new_identified)/len(result_df)*100:.1f}%")

print(f"\n严格识别后的AI公司/机构分布:")
new_company_counts = new_identified['识别出的AI公司'].value_counts()
print(new_company_counts)

print(f"\n新的具体映射例子:")
new_examples = new_identified[['author', '识别出的AI公司']].drop_duplicates()
for _, row in new_examples.iterrows():
    print(f"  {row['author']} -> {row['识别出的AI公司']}")


严格识别后的AI公司/机构账号数量: 4
识别比例: 4.0%

严格识别后的AI公司/机构分布:
识别出的AI公司
Allen Institute for AI    2
TorchGeo                  1
NVIDIA                    1
Name: count, dtype: int64

新的具体映射例子:
  allenai -> Allen Institute for AI
  torchgeo -> TorchGeo
  nvidia -> NVIDIA


In [62]:
# 创建"数据原始发布机构"列
# 如果"识别出的AI公司"列为空或NaN，则标注为"Huggingface"，否则使用"识别出的AI公司"的值
result_df['数据原始发布机构'] = result_df['识别出的AI公司'].fillna('Huggingface')

# 处理空字符串的情况
result_df['数据原始发布机构'] = result_df['数据原始发布机构'].replace('', 'Huggingface')

# 显示结果
print("创建'数据原始发布机构'列完成！")
print(f"数据形状: {result_df.shape}")
print("\n各数据原始发布机构的分布:")
print(result_df['数据原始发布机构'].value_counts())
print("\n前5行数据预览:")
print(result_df[['识别出的AI公司', '数据原始发布机构']].head())

创建'数据原始发布机构'列完成！
数据形状: (101, 19)

各数据原始发布机构的分布:
数据原始发布机构
Huggingface               97
Allen Institute for AI     2
TorchGeo                   1
NVIDIA                     1
Name: count, dtype: int64

前5行数据预览:
                 识别出的AI公司                数据原始发布机构
0  Allen Institute for AI  Allen Institute for AI
1                    None             Huggingface
2                    None             Huggingface
3  Allen Institute for AI  Allen Institute for AI
4                    None             Huggingface


# 调用AI翻译英文简介

In [63]:
prompt = """请你学习这个关于数据集的介绍，并用简洁的中文对进行总结，说明这个数据集的内容和用途。以JSON格式输出，严格遵循如下格式：```json{"介绍":"你的总结的内容"}``` """

In [64]:
import pandas as pd
import os
import json
import time
import logging
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from openai import OpenAI

# -------------------------------
# JSON解析模块（独立模块）
# -------------------------------
def default_json_parser(content, idx=None):
    """
    默认的 JSON 解析器：
    清理输入内容后尝试解析 JSON，
    若成功则返回完整的字典，若失败返回空字典。
    """
    try:
        # 去除代码块标记，清理内容
        cleaned_content = content.replace('```json\n', '').replace('```', '').strip()
        parsed_result = json.loads(cleaned_content)
        return parsed_result
    except json.JSONDecodeError:
        if idx is not None:
            logging.warning(f"警告: 第 {idx} 行解析 JSON 失败")
        return {}
    except Exception as e:
        if idx is not None:
            logging.error(f"错误: 第 {idx} 行解析失败 - {str(e)}")
        return {}

# -------------------------------
# 限流处理器（控制请求频率）
# -------------------------------
class RateLimitedProcessor:
    def __init__(self):
        self.request_timestamps = []
        self.MAX_RPM = 500
        self.window_size = 60  # 60秒窗口

    def _clean_old_records(self, current_time):
        cutoff_time = current_time - timedelta(seconds=self.window_size)
        self.request_timestamps = [ts for ts in self.request_timestamps if ts > cutoff_time]

    def can_make_request(self):
        """检查是否可以发起新请求"""
        current_time = datetime.now()
        self._clean_old_records(current_time)
        if len(self.request_timestamps) >= self.MAX_RPM:
            return False
        self.request_timestamps.append(current_time)
        return True

# -------------------------------
# OpenAI文本处理器
# -------------------------------
class OpenAITextProcessor:
    def __init__(self, api_key=None, model=None, base_url=None, json_parser=None):
        self.client = OpenAI(api_key=api_key,base_url=base_url)
        self.model = model
        self.rate_limiter = RateLimitedProcessor()
        self.n_workers = 14  # 优化后的线程数
        # 如果未提供自定义解析器，则使用默认解析器
        self.json_parser = json_parser if json_parser is not None else default_json_parser

    def process_batch(self, df, text_column, prompt, batch_size=20, delay=1, json_parser=None):
        """
        批量处理文本，支持灵活的 JSON 解析。
        
        参数:
            df: 包含文本数据的 DataFrame
            text_column: 文本所在的列名
            prompt: 系统提示，用于 API 调用
            batch_size: 每个批次处理的文本条数
            delay: 每次请求后的延迟（秒）
            json_parser: 可选的自定义 JSON 解析器，若不传入则使用实例内的解析器
        
        返回:
            新的 DataFrame，包含原始数据及 API 返回结果（通过 JSON 解析获得的各字段）
        """
        parser = json_parser if json_parser is not None else self.json_parser
        results = []  # 保存每次请求解析后的结果（字典形式）

        def process_chunk(chunk_data):
            chunk_results = []
            for idx, text in chunk_data:
                # 限流检测：等待直到可以发送请求
                while not self.rate_limiter.can_make_request():
                    time.sleep(0.1)
                try:
                    response = self.client.chat.completions.create(
                        model=self.model,
                        messages=[
                            {"role": "system", "content": prompt},
                            {"role": "user", "content": text}
                        ],
                        temperature=1,
                        max_tokens=500
                    )
                    # 使用解析器处理响应内容，得到字典格式结果
                    parsed_result = parser(response.choices[0].message.content, idx)
                    chunk_results.append(parsed_result)
                    time.sleep(delay)
                except Exception as e:
                    logging.error(f"错误: 处理第 {idx} 行时发生异常: {str(e)}")
                    chunk_results.append({})
            return chunk_results

        # 将数据分成批次，保留行号信息
        chunks = [
            list(enumerate(df[text_column][i:i+batch_size]))
            for i in range(0, len(df), batch_size)
        ]

        with ThreadPoolExecutor(max_workers=self.n_workers) as executor:
            futures = list(tqdm(
                executor.map(process_chunk, chunks),
                total=len(chunks),
                desc="Processing batches"
            ))
            for chunk_results in futures:
                results.extend(chunk_results)

        # 将解析结果列表转为 DataFrame，并与原 DataFrame 合并
        df_result = df.copy().reset_index(drop=True)
        results_df = pd.json_normalize(results)
        df_result = pd.concat([df_result, results_df], axis=1)

        # 统计处理情况
        success_count = sum(1 for r in results if r)
        total_count = len(results)
        success_rate = (success_count / total_count) * 100 if total_count > 0 else 0
        logging.info(f"处理完成: 总数 {total_count}, 成功 {success_count}, 成功率 {success_rate:.2f}%")
        
        return df_result


In [None]:
processor = OpenAITextProcessor(api_key="此处填入你的deepseek api key", base_url="https://api.deepseek.com",model="deepseek-chat")
df_result = processor.process_batch(
    df=result_df,
    text_column="description",
    prompt=prompt,
    batch_size=5,
)

Processing batches:   0%|          | 0/21 [00:00<?, ?it/s]2025-08-26 11:48:02,503 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-08-26 11:48:02,537 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 422 Unprocessable Entity"
2025-08-26 11:48:02,538 - ERROR - 错误: 处理第 0 行时发生异常: Failed to deserialize the JSON body into the target type: messages[1]: data did not match any variant of untagged enum ChatCompletionRequestContent at line 1 column 299
2025-08-26 11:48:02,542 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 422 Unprocessable Entity"
2025-08-26 11:48:02,543 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 422 Unprocessable Entity"
2025-08-26 11:48:02,544 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 422 Unprocessable Entity"
2025-08-26 11:48:02,544 - INFO - HTTP Request: POST https://api.deepseek.com/chat/complet

In [66]:
df_result.to_csv("备份_中间流程结果/dataset_translated.csv", index=False)

# 打标签

In [67]:
import re
import json
from typing import Dict, List, Set, Tuple
from collections import defaultdict

class ScienceCorpusTagger:
    """
    科学语料数据自动标签分配系统（三层级整合版）
    输出格式：学科/功能导向/应用领域/数据类型
    """
    
    def __init__(self):
        self.tag_hierarchy = self._init_tag_hierarchy()
        self.keyword_mapping = self._init_keyword_mapping()
        
    def _init_tag_hierarchy(self) -> Dict:
        """初始化三层标签层次结构"""
        return {
            "生命科学": {
                "功能导向": [
                    "疾病诊断分类", "治疗预测建模", "基因功能注释", "药物筛选发现", 
                    "生物标志物识别", "病理图像分析", "分子结构预测", "进化关联分析",
                    "临床预后评估", "健康状态监测"
                ],
                "应用领域": [
                    "医学影像", "基因组学", "蛋白质组学", "神经科学", "临床医学",
                    "药物学", "免疫学", "病理学", "生物信息学", "分子生物学"
                ],
                "数据类型": [
                    "序列数据", "影像数据", "信号数据", "结构数据", "表型数据",
                    "光谱数据", "时间序列数据"
                ]
            },
            "工程技术": {
                "功能导向": [
                    "系统设计优化", "性能预测评估", "故障诊断监测", "过程控制仿真",
                    "参数调优验证", "流体动力学分析", "热传导建模", "结构力学计算",
                    "自动化控制", "实验数据处理"
                ],
                "应用领域": [
                    "流体力学", "热传递工程", "结构工程", "自动化工程", "计算工程",
                    "机械工程", "化工工程", "航空航天", "材料工程", "控制工程"
                ],
                "数据类型": [
                    "仿真数据", "实验数据", "传感器数据", "时序数据", "几何数据",
                    "网格数据", "图像数据"
                ]
            },
            "物质科学": {
                "功能导向": [
                    "分子性质预测", "化学反应预测", "材料结构设计", "催化活性筛选",
                    "药物分子生成", "合成路径规划", "晶体结构优化", "光谱特征分析",
                    "热力学性质计算", "毒性安全评估"
                ],
                "应用领域": [
                    "药物化学", "催化科学", "材料科学", "有机化学", "无机化学",
                    "物理化学", "计算化学", "高分子科学", "纳米材料", "能源材料"
                ],
                "数据类型": [
                    "量子化学数据", "分子描述符数据", "反应数据", "光谱数据",
                    "热力学数据", "动力学数据", "结构数据"
                ]
            },
            "空间信息": {
                "功能导向": [
                    "天体目标识别", "轨道参数确定", "光度测量分析", "光谱特征提取",
                    "时间序列分析", "位置导航定位", "空间环境监测", "图像处理识别",
                    "射电信号分析", "变源监测预警"
                ],
                "应用领域": [
                    "天体物理学", "行星科学", "恒星物理学", "星系天文学", "空间技术",
                    "射电天文学", "光学天文学", "高能天体物理", "太阳物理学", "宇宙学"
                ],
                "数据类型": [
                    "光学数据", "射电数据", "光谱数据", "测距数据", "图像数据",
                    "时序数据", "位置数据"
                ]
            },
            "大气海洋": {
                "功能导向": [
                    "天气数值预报", "气候变化检测", "极端事件预警", "环境质量评估",
                    "数据同化分析", "海洋动力学建模", "大气成分监测", "趋势模式识别",
                    "灾害风险评估", "生态环境预测"
                ],
                "应用领域": [
                    "数值天气预报", "气候变化研究", "海洋学", "大气化学", "环境科学",
                    "气象学", "水文学", "极地科学", "遥感应用", "生态气象学"
                ],
                "数据类型": [
                    "观测数据", "再分析数据", "卫星数据", "模式数据", "代理数据",
                    "时序数据", "网格数据"
                ]
            },
            "其他": {
                "功能导向": [
                    "知识图谱构建", "信息检索排序", "文本挖掘分析", "多语言处理",
                    "社会网络分析", "统计建模预测", "可视化呈现", "情感态度分析",
                    "档案数字化管理", "跨学科数据融合"
                ],
                "应用领域": [
                    "数字人文", "计算社会科学", "语言学", "历史学", "文化研究",
                    "政治学", "经济学", "人口学", "教育学", "科学计量学"
                ],
                "数据类型": [
                    "文本数据", "调查数据", "档案数据", "多媒体数据", "网络数据",
                    "语音数据", "图像数据"
                ]
            }
        }
    
    def _init_keyword_mapping(self) -> Dict:
        """初始化完善的关键词映射体系"""
        return {
            "生命科学": {
                "功能导向": {
                    "疾病诊断分类": [
                        # 核心词汇
                        "诊断", "疾病", "分类", "识别", "检测", "病理", "临床", "诊疗",
                        # 技术方法
                        "分类器", "模式识别", "特征提取", "机器学习", "深度学习", "神经网络",
                        "支持向量机", "随机森林", "卷积神经网络", "决策树", "朴素贝叶斯",
                        # 应用场景
                        "早期诊断", "鉴别诊断", "辅助诊断", "自动诊断", "智能诊断",
                        "影像诊断", "实验室诊断", "临床决策", "病理分析",
                        # 疾病类型
                        "癌症", "肿瘤", "糖尿病", "心血管", "神经系统", "感染性疾病",
                        "遗传病", "罕见病", "慢性病", "急性病",
                        # 英文词汇
                        "diagnosis", "disease", "classification", "detection", "pathology",
                        "clinical", "diagnostic", "screening", "biomarker", "phenotype"
                    ],
                    "治疗预测建模": [
                        # 核心词汇
                        "治疗", "预测", "建模", "预后", "疗效", "康复", "预测模型",
                        # 治疗方法
                        "药物治疗", "手术治疗", "放疗", "化疗", "免疫治疗", "基因治疗",
                        "靶向治疗", "个性化治疗", "精准医疗", "联合治疗",
                        # 预测指标
                        "生存率", "复发率", "治愈率", "缓解率", "副作用", "耐药性",
                        "治疗反应", "预后评估", "风险评估", "生存分析",
                        # 技术方法
                        "回归分析", "生存模型", "Cox回归", "随机效应", "贝叶斯模型",
                        "时间序列", "纵向数据", "队列研究", "临床试验",
                        # 英文词汇
                        "treatment", "therapy", "prognosis", "efficacy", "outcome",
                        "survival", "response", "toxicity", "adverse", "recovery"
                    ],
                    "基因功能注释": [
                        # 核心词汇
                        "基因", "功能", "注释", "转录", "表达", "调控", "基因组",
                        # 分子机制
                        "转录调控", "转录因子", "启动子", "增强子", "表观遗传",
                        "甲基化", "组蛋白修饰", "染色质", "非编码RNA", "microRNA",
                        "lncRNA", "siRNA", "piRNA", "circRNA",
                        # 功能类别
                        "信号通路", "代谢通路", "细胞周期", "细胞凋亡", "细胞分化",
                        "免疫应答", "DNA修复", "蛋白质合成", "能量代谢",
                        # 技术方法
                        "RNA-seq", "ChIP-seq", "ATAC-seq", "单细胞测序", "全基因组",
                        "功能富集", "通路分析", "共表达网络", "基因本体",
                        # 英文词汇
                        "gene", "annotation", "transcription", "expression", "regulation",
                        "functional", "pathway", "ontology", "enrichment", "network"
                    ],
                    "药物筛选发现": [
                        # 核心词汇
                        "药物", "筛选", "发现", "化合物", "活性", "毒性", "药效",
                        # 药物类型
                        "小分子", "大分子", "生物药", "抗体", "疫苗", "基因药物",
                        "纳米药物", "天然产物", "合成化合物", "先导化合物",
                        # 筛选方法
                        "高通量筛选", "虚拟筛选", "分子对接", "结构筛选", "表型筛选",
                        "细胞筛选", "生化筛选", "靶点筛选", "片段筛选",
                        # 评价指标
                        "IC50", "EC50", "ADMET", "溶解度", "渗透性", "代谢稳定性",
                        "细胞毒性", "选择性", "特异性", "亲和力",
                        # 英文词汇
                        "drug", "compound", "screening", "discovery", "activity",
                        "toxicity", "pharmacology", "medicinal", "therapeutic", "binding"
                    ],
                    "生物标志物识别": [
                        # 核心词汇
                        "标志物", "生物标记", "分子标记", "诊断标记", "预后标记",
                        # 标志物类型
                        "蛋白质标志物", "基因标志物", "代谢标志物", "影像标志物",
                        "液体活检", "循环肿瘤细胞", "循环DNA", "外泌体", "miRNA",
                        # 应用场景
                        "早期发现", "风险预测", "治疗监测", "药物反应", "疾病分型",
                        "预后判断", "复发监测", "伴随诊断", "个体化医疗",
                        # 技术方法
                        "质谱", "蛋白组学", "代谢组学", "基因组学", "多组学",
                        "机器学习", "统计分析", "验证研究", "队列分析",
                        # 英文词汇
                        "biomarker", "marker", "indicator", "signature", "panel",
                        "proteomics", "metabolomics", "genomics", "omics", "validation"
                    ],
                    "病理图像分析": [
                        # 核心词汇
                        "病理", "切片", "组织", "细胞", "形态学", "病理学",
                        # 图像类型
                        "HE染色", "免疫组化", "荧光染色", "特殊染色", "显微镜",
                        "数字病理", "全切片图像", "细胞病理", "组织病理",
                        # 分析内容
                        "细胞计数", "形态分析", "结构分析", "染色定量", "空间分析",
                        "核型分析", "细胞分割", "组织分割", "病变检测",
                        # 技术方法
                        "图像处理", "计算机视觉", "深度学习", "卷积网络", "分割算法",
                        "特征提取", "模式识别", "自动分析", "人工智能",
                        # 英文词汇
                        "pathology", "histology", "cytology", "microscopy", "morphology",
                        "digital", "image", "segmentation", "quantification", "automated"
                    ],
                    "分子结构预测": [
                        # 核心词汇
                        "结构", "预测", "折叠", "构象", "三维", "分子结构",
                        # 结构类型
                        "蛋白质结构", "DNA结构", "RNA结构", "复合物结构", "膜蛋白",
                        "二级结构", "三级结构", "四级结构", "动态结构", "柔性结构",
                        # 预测方法
                        "同源建模", "从头预测", "穿线法", "分子动力学", "蒙特卡洛",
                        "神经网络", "深度学习", "AlphaFold", "结构比对", "能量最小化",
                        # 应用领域
                        "药物设计", "功能预测", "相互作用", "结构域", "活性位点",
                        "分子对接", "虚拟筛选", "结构生物学", "计算生物学",
                        # 英文词汇
                        "structure", "folding", "conformation", "prediction", "modeling",
                        "homology", "template", "dynamics", "simulation", "computational"
                    ],
                    "进化关联分析": [
                        # 核心词汇
                        "进化", "系统发育", "比较", "同源", "保守", "进化树",
                        # 进化概念
                        "自然选择", "遗传漂变", "基因流", "物种形成", "适应性进化",
                        "中性进化", "分子进化", "共同祖先", "分歧时间", "进化速率",
                        # 分析方法
                        "系统发育分析", "比较基因组", "同源性分析", "保守性分析",
                        "正选择检验", "分子钟", "贝叶斯分析", "最大似然", "邻接法",
                        # 应用对象
                        "物种进化", "基因进化", "蛋白质进化", "代谢通路进化",
                        "病毒进化", "细菌进化", "真核进化", "功能进化",
                        # 英文词汇
                        "evolution", "phylogeny", "comparative", "homology", "conservation",
                        "selection", "divergence", "ancestral", "molecular", "adaptive"
                    ],
                    "临床预后评估": [
                        # 核心词汇
                        "临床", "预后", "评估", "生存", "风险", "预后因子",
                        # 评估指标
                        "生存率", "无病生存", "总生存", "复发风险", "死亡率",
                        "生活质量", "功能状态", "并发症", "治疗反应", "缓解率",
                        # 预后因子
                        "临床分期", "病理分级", "分子分型", "基因表达", "蛋白标志物",
                        "年龄", "性别", "合并症", "治疗史", "家族史",
                        # 分析方法
                        "生存分析", "Cox回归", "Kaplan-Meier", "风险评分", "预测模型",
                        "机器学习", "深度学习", "多变量分析", "时间依赖", "竞争风险",
                        # 英文词汇
                        "prognosis", "survival", "outcome", "risk", "prognostic",
                        "clinical", "factor", "prediction", "assessment", "stratification"
                    ],
                    "健康状态监测": [
                        # 核心词汇
                        "健康", "监测", "体征", "生理", "状态", "健康监护",
                        # 监测参数
                        "心率", "血压", "体温", "呼吸", "血氧", "血糖",
                        "心电图", "脑电图", "睡眠", "运动", "压力", "情绪",
                        # 监测技术
                        "可穿戴设备", "传感器", "物联网", "远程监测", "实时监测",
                        "连续监测", "移动健康", "数字健康", "智能设备", "生物传感",
                        # 数据分析
                        "异常检测", "趋势分析", "预警系统", "健康评估", "风险预测",
                        "个性化监测", "数据融合", "信号处理", "模式识别",
                        # 英文词汇
                        "health", "monitoring", "vital", "physiological", "wearable",
                        "sensor", "remote", "continuous", "digital", "wellness"
                    ]
                },
                "应用领域": {
                    "医学影像": [
                        # 影像技术
                        "CT", "MRI", "X射线", "超声", "影像", "扫描", "PET", "SPECT",
                        "DSA", "OCT", "内镜", "病理影像", "分子影像", "功能影像",
                        # 解剖部位
                        "头颅", "胸部", "腹部", "心脏", "肺部", "肝脏", "肾脏",
                        "骨骼", "关节", "血管", "神经", "眼科", "妇科", "儿科",
                        # 影像诊断
                        "影像诊断", "影像分析", "三维重建", "图像配准", "图像融合",
                        "对比度增强", "影像组学", "定量影像", "计算机辅助诊断",
                        # 英文词汇
                        "imaging", "radiology", "radiography", "tomography", "ultrasound",
                        "magnetic", "resonance", "computed", "positron", "emission"
                    ],
                    "基因组学": [
                        # 核心概念
                        "基因组", "DNA", "测序", "变异", "SNP", "CNV", "INDEL",
                        "基因型", "表型", "遗传", "突变", "多态性", "单倍型",
                        # 测序技术
                        "二代测序", "三代测序", "全基因组", "全外显子", "靶向测序",
                        "单细胞测序", "宏基因组", "转录组", "表观基因组", "染色质",
                        # 分析方法
                        "序列比对", "变异检测", "基因注释", "功能预测", "通路分析",
                        "全基因组关联", "连锁分析", "家系分析", "群体遗传学",
                        # 英文词汇
                        "genomics", "sequencing", "variant", "mutation", "polymorphism",
                        "genotype", "phenotype", "genome", "exome", "transcriptome"
                    ],
                    "蛋白质组学": [
                        # 核心概念
                        "蛋白质", "蛋白", "酶", "多肽", "蛋白质组", "翻译后修饰",
                        "蛋白质相互作用", "蛋白质复合物", "蛋白质网络", "酶活性",
                        # 技术方法
                        "质谱", "二维电泳", "Western blot", "免疫印迹", "ELISA",
                        "色谱", "蛋白质纯化", "蛋白质鉴定", "定量蛋白质组学",
                        # 功能分析
                        "功能域", "蛋白质折叠", "结构预测", "活性位点", "底物特异性",
                        "酶动力学", "蛋白质稳定性", "降解", "修饰", "磷酸化",
                        # 英文词汇
                        "proteomics", "protein", "enzyme", "peptide", "mass",
                        "spectrometry", "interaction", "modification", "folding", "activity"
                    ],
                    "神经科学": [
                        # 核心概念
                        "神经", "大脑", "脑电", "EEG", "fMRI", "认知", "神经元",
                        "突触", "神经网络", "神经递质", "神经回路", "神经可塑性",
                        # 脑区功能
                        "前额叶", "海马", "杏仁核", "小脑", "脑干", "丘脑",
                        "皮层", "白质", "灰质", "神经纤维", "神经束", "脑区连接",
                        # 认知功能
                        "记忆", "学习", "注意", "语言", "运动", "感觉", "情绪",
                        "执行功能", "工作记忆", "决策", "意识", "睡眠", "觉醒",
                        # 技术方法
                        "脑成像", "电生理", "光遗传学", "钙成像", "单细胞记录",
                        "脑电图", "脑磁图", "近红外", "深部脑刺激", "经颅刺激",
                        # 英文词汇
                        "neuroscience", "brain", "neural", "cognitive", "synaptic",
                        "neuronal", "cortical", "hippocampus", "amygdala", "plasticity"
                    ],
                    "临床医学": [
                        # 医疗实践
                        "临床", "患者", "医院", "诊疗", "医学", "治疗", "护理",
                        "急诊", "ICU", "手术", "门诊", "住院", "康复", "预防",
                        # 临床科室
                        "内科", "外科", "儿科", "妇产科", "神经科", "心内科",
                        "肿瘤科", "感染科", "急诊科", "麻醉科", "放射科", "检验科",
                        # 临床研究
                        "临床试验", "随机对照", "队列研究", "病例对照", "横断面",
                        "临床指南", "循证医学", "临床路径", "质量控制", "安全性",
                        # 英文词汇
                        "clinical", "patient", "hospital", "treatment", "therapy",
                        "medical", "healthcare", "diagnosis", "medicine", "care"
                    ],
                    "药物学": [
                        # 药物概念
                        "药物", "药理", "药代", "药效", "药动学", "药效学",
                        "药物相互作用", "不良反应", "毒理学", "药物安全性",
                        # 药物类型
                        "处方药", "非处方药", "生物药", "化学药", "中药", "疫苗",
                        "抗生素", "抗病毒", "抗肿瘤", "心血管药", "神经药物",
                        # 药物研发
                        "药物发现", "临床前", "临床试验", "药物审批", "上市后监测",
                        "制剂工艺", "质量控制", "药物递送", "靶向给药", "个体化用药",
                        # 英文词汇
                        "pharmacology", "drug", "pharmaceutical", "medication", "therapeutic",
                        "pharmacokinetics", "pharmacodynamics", "toxicology", "clinical", "trial"
                    ],
                    "免疫学": [
                        # 免疫细胞
                        "免疫", "抗体", "T细胞", "B细胞", "疫苗", "NK细胞",
                        "巨噬细胞", "树突细胞", "中性粒细胞", "嗜酸性粒细胞",
                        # 免疫机制
                        "先天免疫", "获得性免疫", "细胞免疫", "体液免疫", "免疫应答",
                        "免疫耐受", "自身免疫", "过敏反应", "炎症反应", "补体系统",
                        # 免疫分子
                        "细胞因子", "白介素", "干扰素", "肿瘤坏死因子", "趋化因子",
                        "HLA", "MHC", "TCR", "BCR", "免疫球蛋白", "IgG", "IgM",
                        # 英文词汇
                        "immunology", "immune", "antibody", "lymphocyte", "vaccination",
                        "immunotherapy", "autoimmune", "inflammation", "cytokine", "antigen"
                    ],
                    "病理学": [
                        # 病理概念
                        "病理", "肿瘤", "癌症", "组织病理", "细胞病理", "分子病理",
                        "病理诊断", "病理分级", "病理分期", "恶性", "良性", "转移",
                        # 病理技术
                        "组织切片", "石蜡切片", "冰冻切片", "HE染色", "免疫组化",
                        "原位杂交", "分子诊断", "基因检测", "突变分析", "融合基因",
                        # 肿瘤类型
                        "上皮肿瘤", "间质肿瘤", "血液肿瘤", "神经肿瘤", "软组织肿瘤",
                        "骨肿瘤", "皮肤肿瘤", "乳腺癌", "肺癌", "胃癌", "肝癌",
                        # 英文词汇
                        "pathology", "tumor", "cancer", "carcinoma", "sarcoma",
                        "malignant", "benign", "metastasis", "histology", "cytology"
                    ],
                    "生物信息学": [
                        # 核心概念
                        "生物信息", "计算生物", "数据库", "算法", "序列分析",
                        "结构分析", "系统生物学", "网络生物学", "进化分析",
                        # 数据类型
                        "基因序列", "蛋白序列", "结构数据", "表达数据", "变异数据",
                        "相互作用数据", "通路数据", "表型数据", "临床数据",
                        # 分析方法
                        "序列比对", "同源性搜索", "基因预测", "功能注释", "通路分析",
                        "网络分析", "机器学习", "深度学习", "统计分析", "可视化",
                        # 英文词汇
                        "bioinformatics", "computational", "biology", "database", "algorithm",
                        "sequence", "alignment", "annotation", "phylogenetics", "genomics"
                    ],
                    "分子生物学": [
                        # 核心概念
                        "分子", "细胞", "RNA", "转录", "翻译", "基因表达",
                        "基因调控", "信号转导", "细胞周期", "细胞分裂", "细胞凋亡",
                        # 分子技术
                        "PCR", "RT-PCR", "qPCR", "Western blot", "Northern blot",
                        "Southern blot", "免疫荧光", "流式细胞术", "CRISPR", "基因敲除",
                        # 细胞过程
                        "DNA复制", "DNA修复", "RNA剪接", "蛋白质合成", "蛋白质修饰",
                        "细胞代谢", "能量代谢", "信号传递", "基因重组", "表观遗传",
                        # 英文词汇
                        "molecular", "biology", "cellular", "transcription", "translation",
                        "replication", "expression", "regulation", "signaling", "metabolism"
                    ]
                },
                "数据类型": [
                    # 基础数据类型
                    "序列数据", "影像数据", "信号数据", "结构数据", "表型数据",
                    "光谱数据", "时间序列数据", "空间数据", "网络数据", "图数据",
                    # 生物数据类型
                    "基因组数据", "转录组数据", "蛋白质组数据", "代谢组数据",
                    "表观基因组数据", "微生物组数据", "单细胞数据", "多组学数据",
                    # 临床数据类型
                    "临床数据", "电子病历", "影像数据", "检验数据", "药物数据",
                    "流行病学数据", "队列数据", "登记数据", "监测数据",
                    # 实验数据类型
                    "实验数据", "测量数据", "观察数据", "调查数据", "传感器数据",
                    "生理数据", "行为数据", "环境数据", "地理数据"
                ]
            },
            
            "工程技术": {
                "功能导向": {
                    "系统设计优化": [
                        # 核心概念
                        "设计", "优化", "系统", "架构", "方案", "设计方法", "系统工程",
                        # 优化方法
                        "遗传算法", "粒子群", "模拟退火", "多目标优化", "约束优化",
                        "全局优化", "局部优化", "启发式算法", "智能优化", "进化算法",
                        # 设计理念
                        "模块化设计", "集成设计", "并行设计", "协同设计", "绿色设计",
                        "可靠性设计", "鲁棒设计", "创新设计", "逆向设计", "正向设计",
                        # 系统类型
                        "机械系统", "电气系统", "控制系统", "信息系统", "网络系统",
                        "分布式系统", "嵌入式系统", "复杂系统", "智能系统",
                        # 英文词汇
                        "design", "optimization", "system", "architecture", "engineering",
                        "algorithm", "genetic", "particle", "swarm", "robust", "modular"
                    ],
                    "性能预测评估": [
                        # 核心概念
                        "性能", "预测", "评估", "效率", "指标", "性能分析", "效率评价",
                        # 性能指标
                        "吞吐量", "响应时间", "可靠性", "可用性", "稳定性", "精度",
                        "功耗", "速度", "承载能力", "处理能力", "传输速率",
                        # 预测方法
                        "数学建模", "仿真预测", "回归分析", "机器学习", "深度学习",
                        "时间序列", "统计预测", "经验模型", "物理模型", "混合模型",
                        # 评估技术
                        "基准测试", "压力测试", "负载测试", "性能监控", "性能调优",
                        "瓶颈分析", "容量规划", "性能建模", "风险评估",
                        # 英文词汇
                        "performance", "prediction", "evaluation", "efficiency", "throughput",
                        "reliability", "modeling", "simulation", "benchmark", "monitoring"
                    ],
                    "故障诊断监测": [
                        # 核心概念
                        "故障", "诊断", "监测", "检测", "异常", "故障分析", "健康监测",
                        # 故障类型
                        "硬件故障", "软件故障", "系统故障", "网络故障", "传感器故障",
                        "执行器故障", "间歇性故障", "永久性故障", "渐进性故障",
                        # 诊断方法
                        "模式识别", "信号处理", "频谱分析", "小波分析", "神经网络",
                        "专家系统", "决策树", "支持向量机", "深度学习", "集成学习",
                        # 监测技术
                        "在线监测", "离线监测", "实时监测", "连续监测", "周期监测",
                        "振动监测", "温度监测", "压力监测", "声学监测", "光学监测",
                        # 英文词汇
                        "fault", "diagnosis", "monitoring", "detection", "anomaly",
                        "failure", "condition", "health", "vibration", "acoustic"
                    ],
                    "过程控制仿真": [
                        # 核心概念
                        "控制", "仿真", "过程", "调节", "稳定", "过程控制", "自动控制",
                        # 控制理论
                        "PID控制", "自适应控制", "鲁棒控制", "预测控制", "智能控制",
                        "模糊控制", "神经控制", "滑模控制", "最优控制", "非线性控制",
                        # 仿真技术
                        "数值仿真", "离散仿真", "连续仿真", "混合仿真", "实时仿真",
                        "蒙特卡洛", "分子动力学", "有限元", "数字孪生", "虚拟现实",
                        # 过程类型
                        "化工过程", "制造过程", "生产过程", "工艺过程", "传输过程",
                        "反应过程", "分离过程", "传热过程", "传质过程", "流动过程",
                        # 英文词汇
                        "control", "simulation", "process", "regulation", "stability",
                        "adaptive", "robust", "predictive", "fuzzy", "optimal"
                    ],
                    "参数调优验证": [
                        # 核心概念
                        "参数", "调优", "验证", "校准", "标定", "参数优化", "性能调优",
                        # 调优方法
                        "网格搜索", "随机搜索", "贝叶斯优化", "进化算法", "梯度下降",
                        "启发式搜索", "多目标优化", "约束优化", "全局搜索", "局部搜索",
                        # 验证技术
                        "交叉验证", "留一验证", "自助法", "A/B测试", "统计检验",
                        "假设检验", "置信区间", "显著性检验", "效应量", "功效分析",
                        # 调优对象
                        "控制参数", "算法参数", "模型参数", "系统参数", "工艺参数",
                        "运行参数", "配置参数", "超参数", "结构参数",
                        # 英文词汇
                        "parameter", "tuning", "validation", "calibration", "optimization",
                        "grid", "search", "bayesian", "cross", "bootstrap"
                    ],
                    "流体动力学分析": [
                        # 核心概念
                        "流体", "湍流", "流动", "动力学", "CFD", "流体力学", "流场分析",
                        # 流动类型
                        "层流", "湍流", "过渡流", "可压缩流", "不可压缩流", "粘性流",
                        "无粘流", "定常流", "非定常流", "多相流", "两相流",
                        # 分析方法
                        "有限差分", "有限元", "有限体积", "谱方法", "格子玻尔兹曼",
                        "大涡模拟", "直接数值模拟", "雷诺平均", "湍流模型",
                        # 应用领域
                        "空气动力学", "水动力学", "传热传质", "燃烧", "多相流",
                        "生物流体", "地球物理流体", "天体物理流体", "微流体",
                        # 英文词汇
                        "fluid", "dynamics", "turbulence", "flow", "computational",
                        "viscous", "compressible", "simulation", "reynolds", "navier"
                    ],
                    "热传导建模": [
                        # 核心概念
                        "传热", "热传导", "温度", "热量", "散热", "热分析", "传热学",
                        # 传热方式
                        "导热", "对流", "辐射", "相变传热", "沸腾", "凝结",
                        "自然对流", "强制对流", "混合对流", "层流传热", "湍流传热",
                        # 建模方法
                        "有限差分", "有限元", "边界元", "热网络", "集总参数",
                        "分布参数", "数值传热", "解析解", "近似解", "格林函数",
                        # 应用场景
                        "电子散热", "建筑节能", "工业炉窑", "换热器", "制冷系统",
                        "航空航天", "核反应堆", "太阳能", "地热", "生物传热",
                        # 英文词汇
                        "heat", "transfer", "conduction", "convection", "radiation",
                        "thermal", "temperature", "cooling", "heating", "exchange"
                    ],
                    "结构力学计算": [
                        # 核心概念
                        "结构", "力学", "应力", "变形", "强度", "结构分析", "固体力学",
                        # 力学概念
                        "弹性力学", "塑性力学", "断裂力学", "疲劳", "蠕变", "松弛",
                        "屈曲", "振动", "动力学", "静力学", "非线性", "大变形",
                        # 计算方法
                        "有限元", "边界元", "差分法", "虚功原理", "变分法",
                        "矩阵法", "传递矩阵", "子结构", "超单元", "多尺度",
                        # 结构类型
                        "梁结构", "板结构", "壳结构", "框架", "桁架", "拱结构",
                        "悬索", "薄膜", "复合材料", "智能结构", "生物结构",
                        # 英文词汇
                        "structural", "mechanics", "stress", "strain", "deformation",
                        "elastic", "plastic", "finite", "element", "buckling"
                    ],
                    "自动化控制": [
                        # 核心概念
                        "自动化", "控制", "PID", "反馈", "调节", "控制系统", "自动控制",
                        # 控制类型
                        "开环控制", "闭环控制", "前馈控制", "反馈控制", "复合控制",
                        "集中控制", "分散控制", "分布式控制", "层次控制", "智能控制",
                        # 控制算法
                        "PID", "模糊控制", "神经网络控制", "自适应控制", "预测控制",
                        "滑模控制", "鲁棒控制", "最优控制", "线性二次", "卡尔曼滤波",
                        # 应用领域
                        "工业自动化", "过程控制", "运动控制", "机器人控制", "电力系统",
                        "交通控制", "楼宇自动化", "家庭自动化", "农业自动化",
                        # 英文词汇
                        "automation", "control", "feedback", "regulation", "servo",
                        "pid", "fuzzy", "adaptive", "predictive", "robust"
                    ],
                    "实验数据处理": [
                        # 核心概念
                        "实验", "数据处理", "测量", "采集", "分析", "实验设计", "数据分析",
                        # 数据采集
                        "传感器", "数据采集", "信号调理", "A/D转换", "实时采集",
                        "多通道", "高速采集", "同步采集", "远程采集", "无线采集",
                        # 处理方法
                        "滤波", "去噪", "平滑", "插值", "拟合", "变换", "压缩",
                        "特征提取", "模式识别", "异常检测", "趋势分析", "相关分析",
                        # 统计分析
                        "描述统计", "推断统计", "方差分析", "回归分析", "聚类分析",
                        "主成分分析", "因子分析", "判别分析", "时间序列", "可靠性分析",
                        # 英文词汇
                        "experimental", "data", "processing", "measurement", "acquisition",
                        "sensor", "filtering", "analysis", "statistical", "regression"
                    ]
                },
                "应用领域": {
                    "流体力学": [
                        "流体", "湍流", "流动", "粘性", "雷诺", "层流", "压缩性",
                        "伯努利", "纳维斯托克斯", "边界层", "分离", "涡", "射流",
                        "尾流", "激波", "声学", "空化", "多相", "非牛顿流体",
                        "fluid", "turbulent", "viscous", "reynolds", "laminar",
                        "compressible", "boundary", "layer", "vortex", "shock"
                    ],
                    "热传递工程": [
                        "传热", "热交换", "热传导", "对流", "辐射", "换热器",
                        "散热", "冷却", "加热", "保温", "绝热", "相变", "沸腾",
                        "凝结", "热泵", "制冷", "空调", "供暖", "节能", "余热",
                        "heat", "transfer", "conduction", "convection", "radiation",
                        "thermal", "cooling", "heating", "exchanger", "refrigeration"
                    ],
                    "结构工程": [
                        "结构", "建筑", "桥梁", "力学", "材料", "混凝土", "钢结构",
                        "地震", "抗震", "风荷载", "基础", "框架", "剪力墙", "预应力",
                        "加固", "检测", "监测", "安全", "耐久性", "施工", "设计",
                        "structural", "construction", "concrete", "steel", "seismic",
                        "foundation", "building", "bridge", "earthquake", "safety"
                    ],
                    "自动化工程": [
                        "自动化", "控制", "机器人", "传感器", "执行器", "PLC",
                        "SCADA", "DCS", "现场总线", "工业以太网", "人机界面",
                        "组态", "编程", "调试", "维护", "故障诊断", "系统集成",
                        "automation", "control", "robot", "sensor", "actuator",
                        "programmable", "logic", "controller", "scada", "fieldbus"
                    ],
                    "计算工程": [
                        "计算", "数值", "仿真", "模拟", "求解", "有限元", "有限差分",
                        "边界元", "网格", "离散", "迭代", "收敛", "精度", "误差",
                        "并行计算", "高性能", "算法", "软件", "建模", "验证",
                        "computational", "numerical", "simulation", "finite", "element",
                        "mesh", "solver", "algorithm", "parallel", "verification"
                    ],
                    "机械工程": [
                        "机械", "机器", "零件", "制造", "加工", "设计", "材料",
                        "强度", "刚度", "疲劳", "磨损", "润滑", "振动", "噪声",
                        "精度", "公差", "装配", "维修", "可靠性", "寿命", "优化",
                        "mechanical", "machine", "manufacturing", "machining", "design",
                        "material", "strength", "fatigue", "vibration", "precision"
                    ],
                    "化工工程": [
                        "化工", "反应器", "分离", "精馏", "催化", "传质", "传热",
                        "流体", "工艺", "设备", "管道", "泵", "压缩机", "换热器",
                        "塔器", "反应", "动力学", "热力学", "相平衡", "安全",
                        "chemical", "reactor", "separation", "distillation", "catalyst",
                        "mass", "transfer", "process", "equipment", "thermodynamics"
                    ],
                    "航空航天": [
                        "航空", "航天", "飞行", "火箭", "卫星", "飞机", "发动机",
                        "推进", "空气动力学", "轨道", "制导", "导航", "控制",
                        "材料", "结构", "系统", "可靠性", "安全", "测试", "验证",
                        "aerospace", "aircraft", "rocket", "satellite", "propulsion",
                        "aerodynamics", "orbit", "guidance", "navigation", "flight"
                    ],
                    "材料工程": [
                        "材料", "金属", "复合材料", "陶瓷", "聚合物", "性能",
                        "结构", "相变", "热处理", "表面", "涂层", "腐蚀", "防护",
                        "制备", "合成", "表征", "测试", "分析", "设计", "应用",
                        "material", "metal", "composite", "ceramic", "polymer",
                        "properties", "structure", "surface", "corrosion", "synthesis"
                    ],
                    "控制工程": [
                        "控制", "系统", "反馈", "稳定性", "调节", "控制器", "传递函数",
                        "状态空间", "频域", "时域", "根轨迹", "奈奎斯特", "伯德图",
                        "最优", "鲁棒", "自适应", "预测", "模糊", "神经", "智能",
                        "control", "system", "feedback", "stability", "controller",
                        "transfer", "function", "optimal", "robust", "adaptive"
                    ]
                },
                "数据类型": [
                    "仿真数据", "实验数据", "传感器数据", "时序数据", "几何数据",
                    "网格数据", "图像数据", "信号数据", "测量数据", "监测数据",
                    "控制数据", "运行数据", "故障数据", "性能数据", "设计数据",
                    "工艺数据", "制造数据", "测试数据", "校准数据", "参数数据"
                ]

            },

        "物质科学": {
            "功能导向": {
                "分子性质预测": [
                    # 核心词汇
                    "性质", "预测", "分子性质", "物化性质", "理化性质", "性质预测",
                    # QSAR相关
                    "QSAR", "QSPR", "构效关系", "结构活性", "定量构效", "机器学习",
                    # 物理化学性质
                    "溶解度", "沸点", "熔点", "密度", "粘度", "表面张力", "蒸汽压",
                    "分配系数", "辛醇水分配", "logP", "pKa", "极性", "疏水性",
                    # 分子描述符
                    "分子描述符", "指纹", "拓扑指数", "电子性质", "几何性质", "立体性质",
                    "原子贡献", "基团贡献", "片段", "药效团", "原子坐标",
                    # 计算方法
                    "密度泛函", "DFT", "半经验", "分子动力学", "蒙特卡洛", "量子化学",
                    "神经网络", "支持向量机", "随机森林", "深度学习", "集成学习",
                    # 英文词汇
                    "property", "prediction", "QSAR", "descriptor", "molecular",
                    "solubility", "LogP", "fingerprint", "machine", "learning"
                ],
                "化学反应预测": [
                    # 核心词汇
                    "反应", "预测", "机理", "反应机理", "反应路径", "反应预测",
                    # 反应类型
                    "有机反应", "无机反应", "催化反应", "酶反应", "光化学反应",
                    "电化学反应", "聚合反应", "偶联反应", "环化反应", "开环反应",
                    # 反应特征
                    "产物", "收率", "选择性", "立体选择性", "区域选择性", "化学选择性",
                    "反应条件", "温度", "压力", "溶剂", "pH", "反应时间",
                    # 动力学热力学
                    "反应速率", "活化能", "反应焓", "反应熵", "吉布斯自由能",
                    "平衡常数", "速率常数", "阿伦尼乌斯", "过渡态", "中间体",
                    # 计算方法
                    "反应数据库", "反应模板", "图神经网络", "变换器", "注意机制",
                    "逆合成", "正向合成", "多步反应", "反应规划", "合成树",
                    # 英文词汇
                    "reaction", "mechanism", "yield", "selectivity", "catalysis",
                    "synthesis", "kinetics", "thermodynamics", "activation", "energy"
                ],
                "材料结构设计": [
                    # 核心词汇
                    "材料", "设计", "结构", "结构设计", "材料设计", "材料工程",
                    # 材料类型
                    "晶体", "非晶", "多晶", "单晶", "纳米材料", "二维材料",
                    "复合材料", "金属材料", "陶瓷材料", "聚合物材料", "功能材料",
                    # 结构特征
                    "晶格", "对称", "空间群", "点群", "缺陷", "位错", "界面",
                    "孔隙", "多孔", "介孔", "微孔", "表面", "形貌", "织构",
                    # 性能关系
                    "结构性能", "构效关系", "性能优化", "多目标优化", "设计准则",
                    "性能预测", "力学性能", "电学性能", "磁学性能", "光学性能",
                    # 设计方法
                    "计算材料学", "高通量计算", "材料基因组", "机器学习", "人工智能",
                    "遗传算法", "进化算法", "拓扑优化", "逆向设计", "数据驱动",
                    # 英文词汇
                    "material", "design", "structure", "crystal", "lattice",
                    "property", "optimization", "computational", "high-throughput", "genome"
                ],
                "催化活性筛选": [
                    # 核心词汇
                    "催化", "活性", "筛选", "催化剂", "催化活性", "活性筛选",
                    # 催化剂类型
                    "均相催化", "多相催化", "酶催化", "光催化", "电催化",
                    "单原子催化", "纳米催化", "负载催化", "分子催化", "仿生催化",
                    # 活性指标
                    "选择性", "转化率", "活性", "稳定性", "TOF", "TON",
                    "催化效率", "反应活性", "立体选择性", "化学选择性", "区域选择性",
                    # 反应类型
                    "氧化反应", "还原反应", "氢化反应", "脱氢反应", "异构化",
                    "聚合反应", "裂解反应", "重整反应", "烷基化", "酰化反应",
                    # 催化机理
                    "活性中心", "反应位点", "吸附", "活化", "脱附", "毒化",
                    "载体效应", "尺寸效应", "电子效应", "配体效应", "协同效应",
                    # 筛选方法
                    "高通量筛选", "虚拟筛选", "组合化学", "并行合成", "自动化",
                    "机器学习", "数据挖掘", "活性预测", "描述符", "特征工程",
                    # 英文词汇
                    "catalysis", "catalyst", "activity", "selectivity", "screening",
                    "conversion", "turnover", "heterogeneous", "homogeneous", "enzyme"
                ],
                "药物分子生成": [
                    # 核心词汇
                    "药物", "分子", "生成", "药物设计", "分子设计", "药物发现",
                    # 药物类型
                    "小分子", "大分子", "天然产物", "合成化合物", "先导化合物",
                    "候选药物", "原创药", "仿制药", "生物药", "抗体药物",
                    # 设计方法
                    "基于结构", "基于配体", "分子对接", "药效团", "虚拟筛选",
                    "从头设计", "片段组装", "支架跳跃", "分子优化", "先导优化",
                    # 生成算法
                    "深度学习", "生成对抗网络", "变分自编码器", "强化学习",
                    "图神经网络", "序列模型", "SMILES", "分子图", "原子贡献",
                    # 药物性质
                    "ADMET", "药代动力学", "药效学", "毒性", "安全性",
                    "生物利用度", "血脑屏障", "代谢稳定性", "溶解度", "渗透性",
                    # 靶点相关
                    "靶点", "结合亲和力", "选择性", "抑制剂", "激动剂",
                    "拮抗剂", "变构调节", "多靶点", "靶点验证", "脱靶效应",
                    # 英文词汇
                    "drug", "molecule", "generation", "design", "discovery",
                    "lead", "compound", "ADMET", "target", "binding", "affinity"
                ],
                "合成路径规划": [
                    # 核心词汇
                    "合成", "路径", "路线", "合成路径", "合成路线", "路径规划",
                    # 合成策略
                    "逆合成", "正向合成", "汇聚合成", "线性合成", "多组分反应",
                    "一锅反应", "串联反应", "多米诺反应", "级联反应", "协同合成",
                    # 规划方法
                    "步骤", "策略", "合成策略", "反应序列", "合成树", "反应网络",
                    "路径搜索", "最短路径", "最优路径", "成本分析", "风险评估",
                    # 起始原料
                    "起始原料", "商业可得", "建构单元", "合成砌块", "手性池",
                    "天然产物", "廉价原料", "可再生原料", "绿色原料", "工业原料",
                    # 反应条件
                    "反应条件", "试剂", "催化剂", "溶剂", "温度", "时间",
                    "收率", "纯度", "工艺", "放大", "工业化", "经济性",
                    # 计算工具
                    "反应数据库", "合成软件", "人工智能", "机器学习", "专家系统",
                    "算法", "启发式", "蒙特卡洛", "树搜索", "神经网络",
                    # 英文词汇
                    "synthesis", "route", "pathway", "retrosynthesis", "planning",
                    "strategy", "reaction", "sequence", "tree", "search", "algorithm"
                ],
                "晶体结构优化": [
                    # 核心词汇
                    "晶体", "优化", "结构优化", "晶体结构", "晶格优化", "结构弛豫",
                    # 晶体学基础
                    "晶格", "晶胞", "原胞", "布拉维格子", "对称", "空间群",
                    "点群", "晶系", "晶族", "米勒指数", "倒格子", "布里渊区",
                    # 结构参数
                    "晶格参数", "晶格常数", "键长", "键角", "配位数", "原子坐标",
                    "占据度", "热参数", "位移参数", "结构因子", "衍射强度",
                    # 相变与多晶型
                    "相", "相变", "多晶型", "同质多晶", "构象多晶", "溶剂化合物",
                    "水合物", "共晶", "固溶体", "超分子", "主客体化合物",
                    # 能量与稳定性
                    "晶格能", "内聚能", "形成能", "表面能", "缺陷形成能",
                    "热力学稳定性", "动力学稳定性", "亚稳态", "能量地貌", "全局最小",
                    # 优化方法
                    "结构预测", "晶体结构预测", "从头预测", "全局优化", "局部优化",
                    "遗传算法", "模拟退火", "粒子群", "差分进化", "盆地跳跃",
                    # 英文词汇
                    "crystal", "optimization", "lattice", "symmetry", "space",
                    "group", "polymorphism", "prediction", "energy", "minimization"
                ],
                "光谱特征分析": [
                    # 核心词汇
                    "光谱", "特征", "分析", "光谱分析", "光谱特征", "谱学分析",
                    # 光谱类型
                    "红外光谱", "拉曼光谱", "紫外可见", "核磁共振", "质谱",
                    "X射线", "电子能谱", "荧光光谱", "发光光谱", "圆二色谱",
                    # 光谱特征
                    "峰", "谱峰", "吸收", "发射", "振动", "转动", "电子跃迁",
                    "化学位移", "耦合常数", "峰强度", "峰面积", "峰位", "峰形",
                    # 振动光谱
                    "伸缩振动", "弯曲振动", "面内振动", "面外振动", "骨架振动",
                    "特征频率", "指纹区", "泛频", "组频", "费米共振", "倍频",
                    # NMR相关
                    "1H NMR", "13C NMR", "二维NMR", "多维NMR", "化学交换",
                    "NOE", "COSY", "HSQC", "HMBC", "弛豫时间", "自旋耦合",
                    # 质谱相关
                    "分子离子峰", "基峰", "碎片离子", "同位素峰", "准分子离子",
                    "离子化", "电喷雾", "MALDI", "质量精度", "分辨率", "碰撞解离",
                    # 分析应用
                    "结构鉴定", "定性分析", "定量分析", "纯度分析", "构型确定",
                    "构象分析", "相互作用", "动力学", "反应监测", "过程分析",
                    # 英文词汇
                    "spectroscopy", "spectrum", "infrared", "raman", "NMR",
                    "mass", "peak", "vibration", "chemical", "shift", "analysis"
                ],
                "热力学性质计算": [
                    # 核心词汇
                    "热力学", "性质", "计算", "热力学性质", "热力学计算", "热化学",
                    # 基本性质
                    "焓", "熵", "自由能", "内能", "热容", "吉布斯自由能",
                    "亥姆霍兹自由能", "化学势", "活度", "活度系数", "逸度",
                    # 反应热力学
                    "形成焓", "燃烧焓", "升华焓", "熔化焓", "汽化焓", "溶解焓",
                    "稀释焓", "混合焓", "反应焓", "活化焓", "结合焓", "解离焓",
                    # 相平衡
                    "平衡", "相平衡", "化学平衡", "平衡常数", "分配系数",
                    "溶解度", "饱和蒸汽压", "沸点", "熔点", "临界点", "三相点",
                    # 溶液热力学
                    "溶液", "理想溶液", "正规溶液", "活度模型", "状态方程",
                    "混合规则", "过量性质", "部分摩尔性质", "表观摩尔性质",
                    # 统计热力学
                    "配分函数", "玻尔兹曼分布", "分子配分函数", "振动配分函数",
                    "转动配分函数", "平动配分函数", "电子配分函数", "统计权重",
                    # 计算方法
                    "量子化学", "密度泛函", "从头计算", "半经验", "分子动力学",
                    "蒙特卡洛", "群贡献法", "Benson方法", "Joback方法", "UNIFAC",
                    # 英文词汇
                    "thermodynamics", "enthalpy", "entropy", "gibbs", "free",
                    "energy", "equilibrium", "partition", "function", "calculation"
                ],
                "毒性安全评估": [
                    # 核心词汇
                    "毒性", "安全", "评估", "毒理学", "安全评估", "风险评估",
                    # 毒性类型
                    "急性毒性", "慢性毒性", "亚急性毒性", "亚慢性毒性", "遗传毒性",
                    "致癌性", "致畸性", "致突变性", "生殖毒性", "发育毒性",
                    # ADMET性质
                    "ADMET", "吸收", "分布", "代谢", "排泄", "毒性",
                    "生物利用度", "血浆蛋白结合", "血脑屏障", "肝毒性", "肾毒性",
                    # 毒性终点
                    "LD50", "LC50", "NOEL", "NOAEL", "LOEL", "LOAEL",
                    "基准剂量", "安全剂量", "每日允许摄入量", "参考剂量", "阈值",
                    # 评估方法
                    "副作用", "不良反应", "药物相互作用", "禁忌症", "警告",
                    "体外试验", "体内试验", "细胞毒性", "器官毒性", "系统毒性",
                    # 替代方法
                    "QSAR", "Read-across", "体外替代", "计算毒理学", "系统毒理学",
                    "毒性预测", "机器学习", "人工智能", "专家系统", "决策树",
                    # 法规标准
                    "GHS", "REACH", "OECD", "ICH", "FDA", "EMA",
                    "毒理学指导原则", "安全性评价", "风险管理", "风险控制", "监管",
                    # 英文词汇
                    "toxicity", "safety", "assessment", "ADMET", "adverse",
                    "effect", "risk", "evaluation", "regulatory", "guidelines"
                ]
            },
            "应用领域": {
                "药物化学": [
                    # 核心概念
                    "药物", "制药", "医药", "药化", "药物化学", "医药化学",
                    # 药物类型
                    "活性", "药效", "药理", "药物活性", "生物活性", "药效学",
                    "小分子药物", "大分子药物", "生物药", "化学药", "天然药物",
                    # 先导化合物
                    "先导", "先导化合物", "候选药物", "活性化合物", "药物前体",
                    "原型药物", "母体化合物", "活性分子", "药物分子", "化合物库",
                    # 药物发现
                    "药物发现", "药物设计", "药物开发", "新药研发", "创新药",
                    "靶点发现", "靶点验证", "高通量筛选", "虚拟筛选", "表型筛选",
                    # 药物优化
                    "先导优化", "结构优化", "活性优化", "选择性优化", "ADMET优化",
                    "构效关系", "SAR", "QSAR", "分子修饰", "结构改造",
                    # 药理作用
                    "受体", "酶抑制剂", "激动剂", "拮抗剂", "变构调节",
                    "信号通路", "作用机制", "药理机制", "分子机制", "靶点结合",
                    # 英文词汇
                    "medicinal", "chemistry", "pharmaceutical", "drug", "discovery",
                    "lead", "compound", "bioactive", "pharmacology", "receptor"
                ],
                "催化科学": [
                    # 催化基础
                    "催化", "催化剂", "催化反应", "催化科学", "催化化学",
                    # 催化类型
                    "反应", "机理", "催化机理", "反应机理", "催化循环",
                    "均相催化", "多相催化", "酶催化", "仿生催化", "光催化",
                    # 催化性能
                    "活性", "催化活性", "反应活性", "催化效率", "催化性能",
                    "选择性", "立体选择性", "化学选择性", "区域选择性", "尺寸选择性",
                    # 催化剂设计
                    "催化剂设计", "活性中心", "反应位点", "载体", "负载",
                    "修饰", "改性", "掺杂", "合金", "单原子催化剂",
                    # 催化应用
                    "石油化工", "精细化工", "环境催化", "能源催化", "绿色催化",
                    "工业催化", "有机合成", "聚合催化", "电催化", "光电催化",
                    # 表征分析
                    "表征", "结构表征", "性能评价", "原位表征", "催化剂失活",
                    "再生", "稳定性", "寿命", "中毒", "烧结", "积碳",
                    # 英文词汇
                    "catalysis", "catalyst", "catalytic", "reaction", "mechanism",
                    "selectivity", "activity", "heterogeneous", "homogeneous", "enzyme"
                ],
                "材料科学": [
                    # 材料基础
                    "材料", "材料科学", "材料工程", "材料物理", "材料化学",
                    # 材料分类
                    "金属", "金属材料", "合金", "钢铁", "有色金属",
                    "陶瓷", "陶瓷材料", "氧化物陶瓷", "非氧化物陶瓷", "功能陶瓷",
                    "聚合物", "高分子", "塑料", "橡胶", "纤维", "涂料",
                    "复合", "复合材料", "纤维增强", "颗粒增强", "层状复合",
                    # 纳米材料
                    "纳米", "纳米材料", "纳米粒子", "纳米结构", "纳米技术",
                    "量子点", "纳米管", "纳米线", "纳米片", "二维材料",
                    # 功能材料
                    "功能材料", "智能材料", "形状记忆", "超导材料", "磁性材料",
                    "压电材料", "铁电材料", "光电材料", "热电材料", "储能材料",
                    # 材料性能
                    "力学性能", "电学性能", "磁学性能", "光学性能", "热学性能",
                    "强度", "韧性", "硬度", "弹性", "塑性", "导电性", "绝缘性",
                    # 英文词汇
                    "material", "science", "metal", "ceramic", "polymer",
                    "composite", "nanomaterial", "functional", "properties", "structure"
                ],
                "有机化学": [
                    # 有机基础
                    "有机", "有机化学", "有机合成", "有机分子", "有机反应",
                    # 碳化学
                    "碳", "碳化合物", "烃", "脂肪烃", "芳香烃", "杂环化合物",
                    "碳链", "碳骨架", "碳环", "苯环", "杂环", "稠环",
                    # 官能团
                    "官能团", "烷基", "烯基", "炔基", "苯基", "羟基",
                    "羰基", "羧基", "氨基", "硝基", "卤代", "醚键",
                    # 有机反应
                    "反应", "加成反应", "消除反应", "取代反应", "重排反应",
                    "氧化反应", "还原反应", "偶联反应", "环化反应", "开环反应",
                    # 立体化学
                    "立体化学", "手性", "对映体", "非对映体", "构型", "构象",
                    "顺反异构", "光学活性", "旋光性", "绝对构型", "相对构型",
                    # 合成策略
                    "合成", "全合成", "不对称合成", "立体选择性合成", "区域选择性",
                    "保护基", "脱保护", "官能团转化", "碳碳键形成", "杂原子引入",
                    # 英文词汇
                    "organic", "chemistry", "carbon", "functional", "group",
                    "reaction", "synthesis", "stereochemistry", "chiral", "aromatic"
                ],
                "无机化学": [
                    # 无机基础
                    "无机", "无机化学", "无机物", "无机材料", "无机合成",
                    # 金属化学
                    "金属", "过渡金属", "主族金属", "稀土金属", "贵金属",
                    "金属离子", "金属原子", "金属簇", "金属键", "金属性",
                    # 配位化学
                    "配合物", "配位化合物", "配体", "螯合物", "络合物",
                    "配位数", "配位几何", "晶体场", "配体场", "电子结构",
                    # 离子化合物
                    "离子", "阳离子", "阴离子", "离子键", "离子晶体",
                    "盐", "酸", "碱", "氧化物", "氢氧化物", "硫化物",
                    # 固体化学
                    "晶体", "晶体结构", "晶格", "点缺陷", "线缺陷", "面缺陷",
                    "固溶体", "相变", "超导", "磁性", "铁电性", "压电性",
                    # 材料应用
                    "陶瓷", "玻璃", "水泥", "耐火材料", "半导体", "超导体",
                    "磁性材料", "发光材料", "催化材料", "储氢材料", "电池材料",
                    # 英文词汇
                    "inorganic", "chemistry", "metal", "complex", "coordination",
                    "ion", "crystal", "solid", "state", "material", "ceramic"
                ],
                "物理化学": [
                    # 基础概念
                    "物理化学", "物化", "理论化学", "化学物理", "分子物理",
                    # 热力学
                    "热力学", "热化学", "化学热力学", "统计热力学", "热力学函数",
                    "状态函数", "过程函数", "热力学定律", "可逆过程", "不可逆过程",
                    # 动力学
                    "动力学", "反应动力学", "化学动力学", "反应速率", "速率方程",
                    "反应级数", "速率常数", "活化能", "阿伦尼乌斯方程", "反应机理",
                    # 量子化学
                    "量子", "量子化学", "量子力学", "分子轨道", "原子轨道",
                    "薛定谔方程", "波函数", "哈密顿算符", "本征值", "本征函数",
                    # 电化学
                    "电化学", "电极", "电解", "腐蚀", "电池", "燃料电池",
                    "电极电位", "能斯特方程", "法拉第定律", "电导", "电迁移",
                    # 表面化学
                    "表面", "界面", "吸附", "脱附", "表面张力", "润湿",
                    "胶体", "乳液", "泡沫", "凝胶", "自组装", "LB膜",
                    # 英文词汇
                    "physical", "chemistry", "thermodynamics", "kinetics", "quantum",
                    "electrochemistry", "surface", "interface", "adsorption", "colloid"
                ],
                "计算化学": [
                    # 基础概念
                    "计算", "计算化学", "理论化学", "计算模拟", "分子模拟",
                    # 量子化学方法
                    "量子", "量子化学", "从头计算", "ab initio", "密度泛函",
                    "DFT", "Hartree-Fock", "HF", "MP2", "CCSD", "CI",
                    # 分子动力学
                    "分子动力学", "MD", "蒙特卡洛", "MC", "分子力学", "力场",
                    "经典动力学", "量子动力学", "Car-Parrinello", "BOMD", "AIMD",
                    # 软件工具
                    "Gaussian", "VASP", "CASTEP", "Materials Studio", "LAMMPS",
                    "GROMACS", "AMBER", "CHARMM", "NAMD", "Quantum Espresso",
                    # 计算内容
                    "几何优化", "频率计算", "能量计算", "轨道分析", "电荷分析",
                    "振动分析", "光谱计算", "反应路径", "过渡态", "IRC",
                    # 应用领域
                    "药物设计", "材料设计", "催化机理", "反应机理", "性质预测",
                    "构效关系", "分子识别", "超分子", "生物大分子", "蛋白质折叠",
                    # 英文词汇
                    "computational", "chemistry", "quantum", "DFT", "molecular",
                    "dynamics", "simulation", "calculation", "optimization", "theory"
                ],
                "高分子科学": [
                    # 高分子基础
                    "高分子", "聚合物", "大分子", "高分子化学", "高分子物理",
                    # 聚合物类型
                    "塑料", "合成树脂", "热塑性", "热固性", "弹性体",
                    "橡胶", "天然橡胶", "合成橡胶", "硅橡胶", "丁苯橡胶",
                    "纤维", "合成纤维", "尼龙", "聚酯", "丙纶", "腈纶",
                    # 聚合反应
                    "聚合", "聚合反应", "加聚", "缩聚", "开环聚合", "配位聚合",
                    "自由基聚合", "离子聚合", "活性聚合", "可控聚合", "ATRP", "RAFT",
                    # 分子结构
                    "分子量", "分子量分布", "重均分子量", "数均分子量", "多分散性",
                    "构型", "立构规整性", "等规", "间规", "无规", "序列分布",
                    # 物理性质
                    "玻璃化转变", "熔点", "结晶", "结晶度", "取向", "缠结",
                    "粘弹性", "蠕变", "应力松弛", "动态力学", "流变", "加工性能",
                    # 改性与应用
                    "改性", "共混", "复合", "增强", "增韧", "阻燃",
                    "功能高分子", "智能材料", "生物材料", "医用材料", "包装材料",
                    # 英文词汇
                    "polymer", "macromolecule", "plastic", "rubber", "fiber",
                    "polymerization", "molecular", "weight", "crystallization", "glass"
                ],
                "纳米材料": [
                    # 纳米基础
                    "纳米", "纳米材料", "纳米科学", "纳米技术", "纳米结构",
                    # 维度分类
                    "纳米粒子", "零维", "一维", "二维", "三维", "量子点",
                    "纳米线", "纳米管", "纳米棒", "纳米片", "纳米带", "纳米环",
                    # 二维材料
                    "纳米材料", "石墨烯", "氧化石墨烯", "MXene", "过渡金属硫化物",
                    "黑磷", "硼烯", "氮化硼", "单层", "双层", "多层", "范德华层状",
                    # 制备方法
                    "自上而下", "自下而上", "物理制备", "化学制备", "生物制备",
                    "溅射", "蒸发", "CVD", "ALD", "溶胶凝胶", "水热", "微波",
                    # 表征手段
                    "扫描电镜", "透射电镜", "原子力显微镜", "X射线衍射", "拉曼光谱",
                    "X射线光电子能谱", "比表面积", "孔径分布", "粒径分析", "Zeta电位",
                    # 特殊效应
                    "表面", "界面", "尺寸效应", "量子效应", "表面等离激元",
                    "超疏水", "超亲水", "光催化", "电催化", "传感", "药物载体",
                    # 英文词汇
                    "nanomaterial", "nanoparticle", "nanotechnology", "graphene", "quantum",
                    "dot", "nanotube", "nanowire", "surface", "interface", "size", "effect"
                ],
                "能源材料": [
                    # 能源基础
                    "能源", "新能源", "可再生能源", "清洁能源", "能源材料",
                    # 电池材料
                    "电池", "锂电池", "钠电池", "锂离子电池", "固态电池",
                    "正极材料", "负极材料", "电解质", "隔膜", "集流体", "粘结剂",
                    # 太阳能材料
                    "太阳能", "光伏", "太阳能电池", "硅电池", "薄膜电池",
                    "钙钛矿电池", "有机光伏", "染料敏化", "光电转换", "能量转换效率",
                    # 燃料电池
                    "燃料电池", "质子交换膜", "固体氧化物", "催化剂", "载体",
                    "膜电极", "双极板", "气体扩散层", "电解质膜", "离子导体",
                    # 储能材料
                    "储能", "超级电容器", "储氢", "储氢材料", "相变材料",
                    "热电材料", "压电材料", "摩擦电", "能量收集", "能量存储",
                    # 功能特性
                    "能量密度", "功率密度", "循环稳定性", "倍率性能", "库伦效率",
                    "充放电", "电化学性能", "离子传导", "电子传导", "界面阻抗",
                    # 英文词汇
                    "energy", "material", "battery", "solar", "cell", "fuel",
                    "storage", "electrode", "electrolyte", "conversion", "efficiency"
                ]
            },
            "数据类型": [
                # 化学数据类型
                "量子化学数据", "分子描述符数据", "反应数据", "光谱数据",
                "热力学数据", "动力学数据", "结构数据", "电化学数据",
                # 材料数据类型
                "晶体结构数据", "物性数据", "力学性能数据", "电学性能数据",
                "磁学性能数据", "光学性能数据", "热学性能数据", "表面数据",
                # 表征数据类型
                "X射线衍射数据", "电子显微镜数据", "核磁共振数据", "红外光谱数据",
                "拉曼光谱数据", "质谱数据", "热分析数据", "元素分析数据",
                # 实验数据类型
                "合成条件数据", "催化性能数据", "反应条件数据", "分离纯化数据",
                "稳定性数据", "毒性数据", "安全性数据", "环境数据"
            ]
        },
        
        "空间信息": {
            "功能导向": {
                "天体目标识别": [
                    # 核心概念
                    "识别", "分类", "天体", "星体", "目标", "天体识别", "目标检测",
                    # 天体类型
                    "恒星", "行星", "小行星", "彗星", "流星", "星系", "星云",
                    "类星体", "脉冲星", "中子星", "白矮星", "红巨星", "超新星",
                    # 观测技术
                    "望远镜", "CCD", "CMOS", "光学观测", "射电观测", "红外观测",
                    "紫外观测", "X射线观测", "γ射线观测", "多波段观测", "全天观测",
                    # 图像处理
                    "图像识别", "模式识别", "特征提取", "目标分割", "背景扣除",
                    "噪声滤除", "图像增强", "形态学操作", "边缘检测", "轮廓提取",
                    # 机器学习
                    "机器学习", "深度学习", "神经网络", "卷积网络", "支持向量机",
                    "随机森林", "聚类分析", "分类算法", "目标检测算法", "语义分割",
                    # 英文词汇
                    "identification", "classification", "celestial", "object", "detection",
                    "star", "galaxy", "nebula", "telescope", "astronomical", "survey"
                ],
                "轨道参数确定": [
                    # 核心概念
                    "轨道", "参数", "确定", "轨道参数", "轨道要素", "开普勒要素",
                    # 轨道要素
                    "半长轴", "偏心率", "倾角", "升交点", "近点角", "真近点角",
                    "平近点角", "偏近点角", "轨道周期", "轨道速度", "角速度",
                    # 轨道类型
                    "椭圆", "椭圆轨道", "圆轨道", "抛物线轨道", "双曲线轨道",
                    "地心轨道", "日心轨道", "月球轨道", "行星轨道", "小行星轨道",
                    # 计算方法
                    "计算", "数值计算", "分析解", "数值积分", "摄动理论",
                    "最小二乘", "卡尔曼滤波", "统计轨道确定", "初轨确定", "精密定轨",
                    # 观测数据
                    "位置观测", "速度观测", "测距", "测角", "径向速度",
                    "视差", "自行", "光行时", "相对论修正", "大气折射",
                    # 应用领域
                    "天体力学", "航天动力学", "空间探测", "卫星导航", "空间监视",
                    "近地天体", "空间碎片", "交会对接", "轨道转移", "轨道机动",
                    # 英文词汇
                    "orbital", "parameter", "determination", "kepler", "elements",
                    "ellipse", "eccentricity", "inclination", "calculation", "trajectory"
                ],
                "光度测量分析": [
                    # 核心概念
                    "光度", "测量", "分析", "光度学", "测光", "光度测量",
                    # 光度系统
                    "亮度", "星等", "视星等", "绝对星等", "色指数", "颜色",
                    "UBV系统", "约翰逊系统", "斯特伦姆格林系统", "2MASS", "SDSS",
                    # 光度量
                    "流量", "光度", "发光度", "光通量", "辐射流量", "表面亮度",
                    "中心亮度", "积分星等", "孔径测光", "轮廓测光", "表面测光",
                    # 测量技术
                    "CCD测光", "光电测光", "照相测光", "红外测光", "空间测光",
                    "地基测光", "多色测光", "宽带测光", "窄带测光", "中带测光",
                    # 数据处理
                    "定标", "流量定标", "大气消光", "仪器响应", "系统误差",
                    "统计误差", "标准星", "次级标准", "测光精度", "测光标准系统",
                    # 变星观测
                    "变星", "光变", "光变曲线", "周期", "振幅", "相位",
                    "脉动变星", "食变星", "爆发变星", "新星", "超新星", "激变变星",
                    # 英文词汇
                    "photometry", "magnitude", "brightness", "flux", "luminosity",
                    "color", "index", "measurement", "calibration", "photometric"
                ],
                "光谱特征提取": [
                    # 核心概念
                    "光谱", "特征", "提取", "光谱学", "光谱分析", "谱线分析",
                    # 光谱类型
                    "谱线", "吸收谱", "发射谱", "连续谱", "线谱", "带谱",
                    "恒星光谱", "星系光谱", "行星光谱", "太阳光谱", "星际介质光谱",
                    # 光谱特征
                    "红移", "蓝移", "多普勒效应", "谱线轮廓", "等值宽度", "线强",
                    "中心深度", "半高全宽", "不对称性", "翼部", "核心", "吸收线",
                    # 光谱仪器
                    "分析", "光谱仪", "色散", "分辨率", "光谱分辨率", "波长定标",
                    "流量定标", "响应函数", "狭缝", "光纤", "积分视场", "回声光栅",
                    # 数据处理
                    "光谱提取", "背景扣除", "宇宙线去除", "波长定标", "流量定标",
                    "大气谱线", "视向速度", "径向速度", "光谱型", "光谱分类",
                    # 物理信息
                    "温度", "表面重力", "金属丰度", "化学丰度", "元素丰度",
                    "电离度", "激发温度", "湍动速度", "磁场", "压力致宽",
                    # 英文词汇
                    "spectroscopy", "spectrum", "spectral", "line", "feature",
                    "redshift", "doppler", "wavelength", "resolution", "analysis"
                ],
                "时间序列分析": [
                    # 核心概念
                    "时间序列", "时序", "时变", "时域分析", "变化", "演化",
                    # 变化类型
                    "周期", "周期性", "准周期", "非周期", "趋势", "长期变化",
                    "短期变化", "突发变化", "渐变", "阶跃变化", "随机变化",
                    # 分析方法
                    "傅里叶分析", "功率谱", "周期图", "自相关", "交叉相关",
                    "小波分析", "希尔伯特变换", "经验模态分解", "主成分分析",
                    # 统计方法
                    "统计分析", "概率分布", "方差分析", "回归分析", "时间序列建模",
                    "ARIMA", "状态空间模型", "卡尔曼滤波", "贝叶斯分析",
                    # 天体应用
                    "变星", "脉冲星", "活动星系核", "γ射线暴", "超新星",
                    "太阳活动", "黑洞", "吸积盘", "喷流", "耀斑", "爆发",
                    # 信号处理
                    "噪声", "信噪比", "滤波", "去趋势", "插值", "重采样",
                    "数据质量", "异常值", "缺失数据", "采样率", "奈奎斯特频率",
                    # 英文词汇
                    "time", "series", "temporal", "variability", "periodic",
                    "trend", "fourier", "power", "spectrum", "correlation"
                ],
                "位置导航定位": [
                    # 核心概念
                    "位置", "导航", "定位", "坐标", "定位系统", "导航系统",
                    # 全球导航系统
                    "GPS", "GNSS", "北斗", "GLONASS", "Galileo", "QZSS",
                    "卫星导航", "伪距", "载波相位", "差分GPS", "RTK", "PPP",
                    # 坐标系统
                    "坐标", "天球坐标", "地心坐标", "大地坐标", "投影坐标",
                    "赤道坐标", "银道坐标", "黄道坐标", "地平坐标", "时角坐标",
                    # 参考系
                    "参考系", "参考框架", "国际天球参考系", "国际地球参考系",
                    "WGS84", "ITRF", "J2000", "岁差", "章动", "极移",
                    # 测量技术
                    "VLBI", "激光测距", "卫星测距", "多普勒测量", "干涉测量",
                    "天体测量", "位置天文学", "视差", "自行", "三角视差",
                    # 精度评估
                    "精度", "准确度", "误差", "系统误差", "随机误差", "不确定度",
                    "协方差", "误差椭圆", "GDOP", "PDOP", "置信区间",
                    # 英文词汇
                    "position", "navigation", "positioning", "coordinate", "GPS",
                    "GNSS", "satellite", "accuracy", "precision", "reference"
                ],
                "空间环境监测": [
                    # 核心概念
                    "空间", "环境", "监测", "空间环境", "空间天气", "日地关系",
                    # 监测对象
                    "辐射", "粒子辐射", "电磁辐射", "宇宙线", "太阳粒子",
                    "范艾伦带", "辐射带", "等离子体", "电离层", "磁层",
                    # 磁场环境
                    "磁场", "地磁场", "行星际磁场", "磁暴", "磁亚暴",
                    "磁重联", "磁层顶", "激波", "等离子体片", "环电流",
                    # 太阳活动
                    "太阳风", "日冕物质抛射", "太阳耀斑", "质子事件",
                    "太阳周期", "太阳黑子", "日冕洞", "太阳射电爆发",
                    # 监测手段
                    "卫星监测", "地面监测", "雷达监测", "光学监测",
                    "粒子探测器", "磁强计", "等离子体分析仪", "射电望远镜",
                    # 空间效应
                    "单粒子效应", "总剂量效应", "位移损伤", "表面充电",
                    "深层充电", "大气阻尼", "轨道衰减", "通信中断",
                    # 英文词汇
                    "space", "environment", "monitoring", "radiation", "magnetic",
                    "field", "solar", "wind", "plasma", "ionosphere", "magnetosphere"
                ],
                "图像处理识别": [
                    # 核心概念
                    "图像", "处理", "识别", "图像处理", "图像识别", "计算机视觉",
                    # 预处理
                    "增强", "滤波", "去噪", "锐化", "平滑", "对比度增强",
                    "直方图均衡", "伽马校正", "几何校正", "辐射校正", "大气校正",
                    # 特征提取
                    "特征提取", "边缘检测", "角点检测", "纹理分析", "形状分析",
                    "SIFT", "SURF", "ORB", "HOG", "LBP", "Gabor滤波器",
                    # 图像分割
                    "分割", "阈值分割", "区域生长", "边缘分割", "聚类分割",
                    "水平集", "活动轮廓", "图割", "语义分割", "实例分割",
                    # 模式识别
                    "模式识别", "分类", "聚类", "支持向量机", "神经网络",
                    "深度学习", "卷积神经网络", "循环神经网络", "注意机制",
                    # 目标检测
                    "目标检测", "目标跟踪", "YOLO", "R-CNN", "SSD",
                    "候选区域", "非极大值抑制", "边界框", "锚点", "特征金字塔",
                    # 应用场景
                    "天体识别", "表面特征", "撞击坑", "地形分析", "变化检测",
                    "运动目标", "遥感图像", "多光谱", "高光谱", "合成孔径雷达",
                    # 英文词汇
                    "image", "processing", "recognition", "enhancement", "filtering",
                    "segmentation", "feature", "detection", "classification", "CNN"
                ],
                "射电信号分析": [
                    # 核心概念
                    "射电", "信号", "分析", "射电天文", "射电望远镜", "无线电",
                    # 射电源
                    "频谱", "射电源", "脉冲星", "类星体", "射电星系",
                    "超新星遗迹", "HII区", "分子云", "同步辐射", "自由自由辐射",
                    # 观测技术
                    "干涉", "干涉测量", "综合孔径", "VLBI", "阵列", "相关器",
                    "波束形成", "校准", "成像", "频率", "带宽", "时间分辨率",
                    # 信号处理
                    "数字信号处理", "快速傅里叶变换", "滤波", "去色散",
                    "相位校准", "幅度校准", "RFI抑制", "基线减除", "去卷积",
                    # 脉冲星
                    "脉冲", "脉冲星", "脉冲轮廓", "色散", "色散测量", "周期",
                    "周期导数", "计时", "脉冲到达时间", "残差", "引力波",
                    # 数据分析
                    "时频分析", "动态频谱", "功率谱", "自相关", "交叉相关",
                    "相干分析", "偏振分析", "法拉第旋转", "同步辐射", "热辐射",
                    # 英文词汇
                    "radio", "signal", "analysis", "pulsar", "interferometry",
                    "spectrum", "correlation", "calibration", "imaging", "dispersion"
                ],
                "变源监测预警": [
                    # 核心概念
                    "变源", "监测", "预警", "变源监测", "瞬变源", "爆发源",
                    # 变源类型
                    "爆发", "γ射线暴", "超新星", "新星", "矮新星", "耀斑星",
                    "激变变星", "X射线暴", "软γ重复暴", "快速射电暴", "引力波源",
                    # 监测系统
                    "闪烁", "光变", "全天监测", "巡天", "实时监测", "多信使",
                    "早期预警", "快速跟进", "后随观测", "多波段", "多望远镜",
                    # 探测方法
                    "阈值检测", "异常检测", "模式识别", "机器学习", "神经网络",
                    "统计检验", "假阳性控制", "灵敏度", "完备性", "污染率",
                    # 数据处理
                    "实时处理", "流水线", "自动化", "事件识别", "分类",
                    "参数估计", "光变曲线", "能谱分析", "时间分析", "空间定位",
                    # 预警系统
                    "快速通知", "警报", "GCN", "ATel", "TNS", "VOEvent",
                    "坐标分发", "多信使协调", "国际合作", "数据共享", "开放获取",
                    # 英文词汇
                    "transient", "monitoring", "alert", "burst", "flare",
                    "detection", "real-time", "follow-up", "multi-messenger", "GRB"
                ]
            },
            "应用领域": {
                "天体物理学": [
                    # 基础概念
                    "天体物理", "天体物理学", "理论天体物理", "观测天体物理",
                    # 天体类型
                    "恒星", "恒星物理", "恒星演化", "恒星形成", "恒星死亡",
                    "星系", "星系物理", "星系演化", "星系形成", "星系团",
                    "宇宙", "宇宙学", "宇宙演化", "大爆炸", "暗物质", "暗能量",
                    # 致密天体
                    "白矮星", "中子星", "黑洞", "脉冲星", "磁星",
                    "吸积盘", "喷流", "X射线双星", "伽马射线暴", "引力波",
                    # 星际介质
                    "星际介质", "星际尘埃"],
                    "行星科学": ["行星", "火星", "木星", "土星", "金星"],
                    "恒星物理学": ["恒星", "太阳", "恒星演化", "超新星"],
                    "星系天文学": ["星系", "银河系", "星系团", "暗物质"],
                    "空间技术": ["卫星", "航天", "遥感", "通信", "导航"],
                    "射电天文学": ["射电", "射电望远镜", "脉冲星", "类星体"],
                    "光学天文学": ["光学", "望远镜", "CCD", "光度", "测光"],
                    "高能天体物理": ["高能", "X射线", "伽马射线", "中子星"],
                    "太阳物理学": ["太阳", "日冕", "太阳风", "磁场"],
                    "宇宙学": ["宇宙学", "大爆炸", "暗能量", "微波背景"]
                },
                "数据类型": ["光学", "射电", "光谱", "测距", "图像", "时序", "位置"]
            },
            "大气海洋": {
                "功能导向": {
                    "天气数值预报": ["天气", "预报", "数值", "模式", "预测"],
                    "气候变化检测": ["气候", "变化", "检测", "趋势", "异常"],
                    "极端事件预警": ["极端", "事件", "预警", "灾害", "台风"],
                    "环境质量评估": ["环境", "质量", "评估", "污染", "监测"],
                    "数据同化分析": ["同化", "分析", "融合", "观测", "模式"],
                    "海洋动力学建模": ["海洋", "动力学", "建模", "环流", "波浪"],
                    "大气成分监测": ["大气", "成分", "监测", "温室气体", "臭氧"],
                    "趋势模式识别": ["趋势", "模式", "识别", "周期", "振荡"],
                    "灾害风险评估": ["灾害", "风险", "评估", "洪水", "干旱"],
                    "生态环境预测": ["生态", "环境", "预测", "生物", "栖息地"]
                },
                "应用领域": {
                    "数值天气预报": ["数值预报", "天气模式", "预报", "气象"],
                    "气候变化研究": ["气候变化", "全球变暖", "温室效应"],
                    "海洋学": ["海洋", "海流", "海温", "盐度", "波浪"],
                    "大气化学": ["大气化学", "臭氧", "气溶胶", "污染"],
                    "环境科学": ["环境", "生态", "污染", "保护"],
                    "气象学": ["气象", "天气", "降水", "温度", "风"],
                    "水文学": ["水文", "径流", "蒸发", "降水", "水循环"],
                    "极地科学": ["极地", "南极", "北极", "冰川", "海冰"],
                    "遥感应用": ["遥感", "卫星", "观测", "监测"],
                    "生态气象学": ["生态气象", "农业", "植被", "生长"]
                },
                "数据类型": ["观测", "再分析", "卫星", "模式", "代理", "时序", "网格"]
            },
            "其他": {
                "功能导向": {
                    "知识图谱构建": ["知识图谱", "本体", "关系", "实体", "语义"],
                    "信息检索排序": ["检索", "排序", "搜索", "相关性", "索引"],
                    "文本挖掘分析": ["文本挖掘", "自然语言", "语义", "主题"],
                    "多语言处理": ["多语言", "翻译", "语言模型", "词汇"],
                    "社会网络分析": ["社会网络", "网络分析", "图论", "关系"],
                    "统计建模预测": ["统计", "建模", "回归", "预测", "概率"],
                    "可视化呈现": ["可视化", "图表", "展示", "界面", "交互"],
                    "情感态度分析": ["情感", "态度", "情绪", "观点", "倾向"],
                    "档案数字化管理": ["档案", "数字化", "管理", "保存", "整理"],
                    "跨学科数据融合": ["跨学科", "融合", "整合", "多源", "综合"]
                },
                "应用领域": {
                    "数字人文": ["数字人文", "文化", "文学", "艺术"],
                    "计算社会科学": ["计算社会科学", "社会", "行为", "群体"],
                    "语言学": ["语言学", "语言", "语法", "语音", "方言"],
                    "历史学": ["历史", "史学", "古代", "近代", "文献"],
                    "文化研究": ["文化", "民俗", "传统", "习俗"],
                    "政治学": ["政治", "政府", "政策", "治理"],
                    "经济学": ["经济", "市场", "金融", "贸易"],
                    "人口学": ["人口", "人口统计", "生育", "迁移"],
                    "教育学": ["教育", "学习", "教学", "课程"],
                    "科学计量学": ["科学计量", "引文", "期刊", "学术"]
                },
                "数据类型": ["文本", "调查", "档案", "多媒体", "网络", "语音", "图像"]
            }
        }
    
    def extract_keywords(self, text: str) -> Set[str]:
        """从文本中提取关键词"""
        # 转换为小写并移除标点符号
        clean_text = re.sub(r'[^\w\s\u4e00-\u9fff]', ' ', text.lower())
        words = clean_text.split()
        
        # 提取有意义的词汇（长度大于1）
        keywords = set()
        for word in words:
            if len(word) > 1:
                keywords.add(word)
        
        return keywords
    
    def calculate_discipline_scores(self, text: str) -> Dict[str, float]:
        """计算各学科的匹配分数"""
        keywords = self.extract_keywords(text)
        scores = {}
        
        for discipline, mapping in self.keyword_mapping.items():
            score = 0
            total_weight = 0
            
            # 功能导向匹配
            for function, func_keywords in mapping["功能导向"].items():
                for keyword in func_keywords:
                    total_weight += 3
                    if any(kw in keyword.lower() or keyword.lower() in kw for kw in keywords):
                        score += 3
            
            # 应用领域匹配
            for domain, domain_keywords in mapping["应用领域"].items():
                for keyword in domain_keywords:
                    total_weight += 2
                    if any(kw in keyword.lower() or keyword.lower() in kw for kw in keywords):
                        score += 2
            
            # 数据类型匹配
            for data_type in mapping["数据类型"]:
                total_weight += 1
                if any(kw in data_type.lower() or data_type.lower() in kw for kw in keywords):
                    score += 1
            
            # 计算归一化分数
            scores[discipline] = score / max(total_weight, 1) if total_weight > 0 else 0
        
        return scores
    
    def assign_primary_discipline(self, text: str) -> str:
        """分配主要学科标签"""
        scores = self.calculate_discipline_scores(text)
        if not scores:
            return "其他"
        
        max_score = max(scores.values())
        if max_score == 0:
            return "其他"
        
        return max(scores, key=scores.get)
    
    def assign_detailed_tags(self, text: str, discipline: str) -> Tuple[str, str, str]:
        """为特定学科分配详细标签，返回(功能导向, 应用领域, 数据类型)"""
        if discipline not in self.tag_hierarchy:
            return ("跨学科数据融合", "数字人文", "文本数据")
        
        keywords = self.extract_keywords(text)
        hierarchy = self.tag_hierarchy[discipline]
        mapping = self.keyword_mapping[discipline]
        
        # 分配功能导向标签
        function_scores = {}
        for function in hierarchy["功能导向"]:
            score = 0
            if function in mapping["功能导向"]:
                for keyword in mapping["功能导向"][function]:
                    if any(kw in keyword.lower() or keyword.lower() in kw for kw in keywords):
                        score += 1
            function_scores[function] = score
        
        best_function = max(function_scores, key=function_scores.get) if max(function_scores.values()) > 0 else hierarchy["功能导向"][0]
        
        # 分配应用领域标签
        domain_scores = {}
        for domain in hierarchy["应用领域"]:
            score = 0
            if domain in mapping["应用领域"]:
                for keyword in mapping["应用领域"][domain]:
                    if any(kw in keyword.lower() or keyword.lower() in kw for kw in keywords):
                        score += 1
            domain_scores[domain] = score
        
        best_domain = max(domain_scores, key=domain_scores.get) if max(domain_scores.values()) > 0 else hierarchy["应用领域"][0]
        
        # 分配数据类型标签
        data_scores = {}
        for data_type in hierarchy["数据类型"]:
            score = 0
            if any(kw in data_type.lower() or data_type.lower() in kw for kw in keywords):
                score += 1
            data_scores[data_type] = score
        
        best_data_type = max(data_scores, key=data_scores.get) if max(data_scores.values()) > 0 else hierarchy["数据类型"][0]
        
        return (best_function, best_domain, best_data_type)
    
    def tag_corpus(self, text: str, title: str = "") -> Dict:
        """对语料进行完整标签分配"""
        # 合并标题和内容进行分析
        full_text = f"{title} {text}" if title else text
        
        # 分配主要学科
        primary_discipline = self.assign_primary_discipline(full_text)
        
        # 分配详细标签
        function, domain, data_type = self.assign_detailed_tags(full_text, primary_discipline)
        
        # 生成标签字符串
        tag_string = f"{primary_discipline}/{function}/{domain}/{data_type}"
        
        # 计算置信度分数
        scores = self.calculate_discipline_scores(full_text)
        confidence = scores.get(primary_discipline, 0)
        
        return {
            "标签": tag_string,
            "主要学科": primary_discipline,
            "功能导向": function,
            "应用领域": domain,
            "数据类型": data_type,
            "置信度": round(confidence, 3),
            "学科分数": {k: round(v, 3) for k, v in scores.items()},
            "文本长度": len(full_text),
            "关键词数量": len(self.extract_keywords(full_text))
        }
    
    def batch_tag_corpus(self, corpus_list: List[Dict]) -> List[Dict]:
        """批量标签分配"""
        results = []
        for i, item in enumerate(corpus_list):
            text = item.get("content", "")
            title = item.get("title", "")
            
            tags = self.tag_corpus(text, title)
            
            result = {
                "id": item.get("id", i),
                "title": title,
                "标签": tags["标签"],
                "置信度": tags["置信度"],
                "详细信息": {
                    "主要学科": tags["主要学科"],
                    "功能导向": tags["功能导向"],
                    "应用领域": tags["应用领域"],
                    "数据类型": tags["数据类型"],
                    "学科分数": tags["学科分数"]
                }
            }
            results.append(result)
        
        return results
    
    def export_tags_to_json(self, results: List[Dict], filename: str):
        """导出标签结果为JSON文件"""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
    
    def export_simple_format(self, results: List[Dict], filename: str):
        """导出简化格式（ID, 标题, 标签）"""
        simple_results = []
        for result in results:
            simple_results.append({
                "id": result["id"],
                "title": result["title"],
                "tag": result["标签"]
            })
        
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(simple_results, f, ensure_ascii=False, indent=2)
    
    def generate_tag_statistics(self, results: List[Dict]) -> Dict:
        """生成标签统计信息"""
        stats = {
            "总数据量": len(results),
            "学科分布": defaultdict(int),
            "置信度分布": {"高(>0.3)": 0, "中(0.1-0.3)": 0, "低(<0.1)": 0},
            "功能导向分布": defaultdict(int),
            "应用领域分布": defaultdict(int),
            "数据类型分布": defaultdict(int)
        }
        
        for result in results:
            # 学科分布
            discipline = result["详细信息"]["主要学科"]
            stats["学科分布"][discipline] += 1
            
            # 置信度分布
            confidence = result["置信度"]
            if confidence > 0.3:
                stats["置信度分布"]["高(>0.3)"] += 1
            elif confidence > 0.1:
                stats["置信度分布"]["中(0.1-0.3)"] += 1
            else:
                stats["置信度分布"]["低(<0.1)"] += 1
            
            # 各层级分布
            stats["功能导向分布"][result["详细信息"]["功能导向"]] += 1
            stats["应用领域分布"][result["详细信息"]["应用领域"]] += 1
            stats["数据类型分布"][result["详细信息"]["数据类型"]] += 1
        
        return dict(stats)


# 使用示例
def main():
    """主函数示例"""
    
    # 初始化标签系统
    tagger = ScienceCorpusTagger()
    
    # 示例数据
    sample_corpus = [
        {
            "id": 1,
            "title": "MammalNet哺乳动物行为识别数据集",
            "content": "MammalNet视频数据集围绕涵盖17个目、69个科和173个哺乳动物类别的生物哺乳动物分类学构建，并包括12种常见的高级哺乳动物行为（例如狩猎、梳理行为）。用于动物和行为识别研究。"
        },
        {
            "id": 2,
            "title": "湍流流动DNS数据库",
            "content": ""
        },
        {
            "id": 3,
            "title": "分子机器学习基准MoleculeNet",
            "content": "MoleculeNet是分子机器学习的大规模基准。管理多个公共数据集，建立评估指标，并提供多个先前提出的分子特征化和学习算法的高质量开源实现。用于分子性质预测和药物发现。"
        },
        {
            "id": 4,
            "title": "超高清月面地图",
            "content": "超高清月面地图第三版，具有4.5亿像素，有1294个月面地形被标注，包括655个主环形坑，348个卫星坑，19个月海，17个月湖等15种不同类型地标。"
        },
        {
            "id": 5,
            "title": "全球海洋再分析数据GLORYS12V1",
            "content": "GLORYS12V1产品是CMEMS全球海洋涡旋分辨率再分析产品，涵盖了高度计数据。该产品主要基于当前实时全球预报CMEMS系统，用于海洋动力学研究和环境监测。"
        }
    ]
    
    # 批量标签分配
    results = tagger.batch_tag_corpus(sample_corpus)
    
    # 打印结果
    print("=== 标签分配结果 ===\n")
    for result in results:
        print(f"ID: {result['id']}")
        print(f"标题: {result['title']}")
        print(f"标签: {result['标签']}")
        print(f"置信度: {result['置信度']}")
        print("-" * 50)
    
    # 生成统计信息
    stats = tagger.generate_tag_statistics(results)
    print(f"\n=== 标签统计信息 ===")
    print(json.dumps(stats, ensure_ascii=False, indent=2))

if __name__ == "__main__":
    main()

=== 标签分配结果 ===

ID: 1
标题: MammalNet哺乳动物行为识别数据集
标签: 大气海洋/趋势模式识别/数值天气预报/观测数据
置信度: 0.025
--------------------------------------------------
ID: 2
标题: 湍流流动DNS数据库
标签: 工程技术/流体动力学分析/流体力学/仿真数据
置信度: 0.006
--------------------------------------------------
ID: 3
标题: 分子机器学习基准MoleculeNet
标签: 大气海洋/环境质量评估/数值天气预报/观测数据
置信度: 0.062
--------------------------------------------------
ID: 4
标题: 超高清月面地图
标签: 大气海洋/天气数值预报/遥感应用/观测数据
置信度: 0.012
--------------------------------------------------
ID: 5
标题: 全球海洋再分析数据GLORYS12V1
标签: 大气海洋/环境质量评估/数值天气预报/再分析数据
置信度: 0.136
--------------------------------------------------

=== 标签统计信息 ===
{
  "总数据量": 5,
  "学科分布": {
    "大气海洋": 4,
    "工程技术": 1
  },
  "置信度分布": {
    "高(>0.3)": 0,
    "中(0.1-0.3)": 1,
    "低(<0.1)": 4
  },
  "功能导向分布": {
    "趋势模式识别": 1,
    "流体动力学分析": 1,
    "环境质量评估": 2,
    "天气数值预报": 1
  },
  "应用领域分布": {
    "数值天气预报": 3,
    "流体力学": 1,
    "遥感应用": 1
  },
  "数据类型分布": {
    "观测数据": 3,
    "仿真数据": 1,
    "再分析数据": 1
  }
}


In [68]:
# 创建科学领域映射字典
def create_domain_mapping():
    """创建科学领域到标签系统第一级的映射"""
    domain_mapping = {
        # 生命科学相关
        "生命科学": "生命科学",
        "生物学": "生命科学", 
        "医学": "生命科学",
        "生物医学": "生命科学",
        "基因组学": "生命科学",
        "蛋白质组学": "生命科学",
        "神经科学": "生命科学",
        "生物信息学": "生命科学",
        
        # 工程技术相关
        "工程技术": "工程技术",
        "机械工程": "工程技术",
        "电气工程": "工程技术", 
        "自动化": "工程技术",
        "控制工程": "工程技术",
        "流体力学": "工程技术",
        "热传递": "工程技术",
        "结构工程": "工程技术",
        
        # 物质科学相关
        "物质科学": "物质科学",
        "化学": "物质科学",
        "材料科学": "物质科学",
        "物理化学": "物质科学",
        "催化": "物质科学",
        "药物化学": "物质科学",
        
        # 空间信息相关
        "空间信息": "空间信息",
        "天体物理": "空间信息",
        "天文学": "空间信息",
        "空间科学": "空间信息",
        "遥感": "空间信息",
        "地理信息": "空间信息",
        
        # 大气海洋相关
        "大气海洋": "大气海洋",
        "气象学": "大气海洋",
        "海洋学": "大气海洋",
        "气候学": "大气海洋",
        "环境科学": "大气海洋",
        "水文学": "大气海洋",
        
        # 其他
        "计算机科学": "其他",
        "数学": "其他",
        "物理学": "其他",
        "地球科学": "其他",
        "社会科学": "其他"
    }
    return domain_mapping

def process_real_data(df, science_domain_col='学科', description_col='介绍'):
    """
    处理真实数据，生成标签列
    
    Args:
        df: pandas DataFrame
        science_domain_col: 科学领域列名
        description_col: 数据介绍列名
    
    Returns:
        pandas DataFrame with new tag column
    """
    # 初始化标签系统
    tagger = ScienceCorpusTagger()
    domain_mapping = create_domain_mapping()
    
    # 创建结果列表
    tags = []
    
    for idx, row in df.iterrows():
        # 获取科学领域
        science_domain = str(row[science_domain_col]) if pd.notna(row[science_domain_col]) else ""
        
        # 获取数据介绍文本
        description = str(row[description_col]) if pd.notna(row[description_col]) else ""
        
        # 映射科学领域到第一级标签
        first_level = "其他"  # 默认值
        for domain_key, mapped_value in domain_mapping.items():
            if domain_key in science_domain:
                first_level = mapped_value
                break
        
        # 使用数据介绍文本分析后三级标签
        if description.strip():
            # 分析描述文本获取详细标签
            function, domain, data_type = tagger.assign_detailed_tags(description, first_level)
        else:
            # 如果没有描述，使用默认值
            function = "跨学科数据融合"
            domain = "数字人文"  
            data_type = "文本数据"
        
        # 组合完整标签
        tag = f"{first_level}/{function}/{domain}/{data_type}"
        tags.append(tag)
    
    # 添加标签列到DataFrame
    result_df = df.copy()
    result_df['标签'] = tags
    
    return result_df

# 处理数据
print("开始处理数据...")
df_with_tags = process_real_data(df_result)
print("处理完成！")



开始处理数据...
处理完成！


In [69]:
# 对标签进行分割处理，按/分为四层
def split_tags_to_columns(df):
    """
    将标签列按/分割为四个独立的列
    标签格式：科学领域/功能标签-任务导向/二级标签-应用领域/二级标签-数据类型
    """
    df_result = df.copy()
    
    # 分割标签列
    tag_splits = df_result['标签'].str.split('/', expand=True)
    
    # 确保有四列
    if tag_splits.shape[1] >= 4:
        # 将分割后的标签填入对应列
        df_result['科学领域'] = tag_splits[0]
        df_result['功能标签-任务导向'] = tag_splits[1] 
        df_result['二级标签-应用领域'] = tag_splits[2]
        
        print("标签分割完成！")
        print(f"处理了 {len(df_result)} 条记录")
        
        # 显示分割后的统计信息
        print("\n各列的唯一值数量:")
        print(f"科学领域: {df_result['科学领域'].nunique()} 个")
        print(f"功能标签-任务导向: {df_result['功能标签-任务导向'].nunique()} 个") 
        print(f"二级标签-应用领域: {df_result['二级标签-应用领域'].nunique()} 个")
        
    else:
        print(f"警告：标签分割结果只有 {tag_splits.shape[1]} 列，预期为4列")
        
    return df_result

# 执行标签分割
df_final = split_tags_to_columns(df_with_tags)

# 查看分割结果示例
print("\n分割结果示例:")
display_cols = ['item_name', '学科', '功能标签-任务导向', '二级标签-应用领域', '标签']
print(df_final[display_cols].head())

标签分割完成！
处理了 101 条记录

各列的唯一值数量:
科学领域: 4 个
功能标签-任务导向: 11 个
二级标签-应用领域: 9 个

分割结果示例:
      item_name    学科 功能标签-任务导向 二级标签-应用领域                     标签
0         dolma  None     多语言处理       语言学      其他/多语言处理/语言学/文本数据
1  SDO_training  None    统计建模预测       教育学     其他/统计建模预测/教育学/文本数据
2     4DNeX-10M  None    统计建模预测      数字人文    其他/统计建模预测/数字人文/文本数据
3    MADLAD-400  None    文本挖掘分析       语言学     其他/文本挖掘分析/语言学/文本数据
4         ASPED  None    知识图谱构建    计算社会科学  其他/知识图谱构建/计算社会科学/文本数据


In [70]:
# 执行数据清洗操作
print("\n=== 开始执行数据清洗 ===")

# 1. 处理模态列
# 将空值替换为"多模态"
modal_null_count = df_final['modalities'].isnull().sum()
df_final['modalities'] = df_final['modalities'].fillna('多模态')

# 处理模态列中的异常值（如"公开"等明显不属于模态的值）
# 这些值也替换为"多模态"
invalid_modal_values = ['公开']
for invalid_value in invalid_modal_values:
    invalid_count = (df_final['modalities'] == invalid_value).sum()
    if invalid_count > 0:
        print(f"将模态列中的 '{invalid_value}' ({invalid_count}条) 替换为 '多模态'")
        df_final['modalities'] = df_final['modalities'].replace(invalid_value, '多模态')

print(f"模态列：已将 {modal_null_count} 个空值替换为 '多模态'")

# 2. 处理协议列
# 将空值替换为"公开"
protocol_null_count = df_final['开源协议'].isnull().sum()
df_final['开源协议'] = df_final['开源协议'].fillna('公开')
print(f"协议列：已将 {protocol_null_count} 个空值替换为 '公开'")

print("\n=== 数据清洗完成 ===")




=== 开始执行数据清洗 ===
模态列：已将 97 个空值替换为 '多模态'
协议列：已将 23 个空值替换为 '公开'

=== 数据清洗完成 ===


# 表格格式整理

In [None]:
# 1. 删除不需要的列
cols_to_drop = ['name', 'display_text', 'tags', 'libraries', 'license', 'description', 'extraction_status', 'tags_cleaned', '识别出的AI公司', '标签', '学科']
df_final = df_final.drop(columns=[col for col in cols_to_drop if col in df_final.columns])

# 2. 替换列名称
df_final = df_final.rename(columns={
    'item_name': '数据原始名称',
    'url': '数据原始链接',
    'author': '备注-平台作者',
    'update_date': '原始数据发布/更新日期',
    'modalities': '模态',
    'size': '数据集大小',
    '介绍': '数据介绍',
    '开源协议': '协议',
    'formats': '主要数据格式'
})

# 3. 按照指定顺序重排
desired_order = [
    '数据原始名称', '科学领域', '功能标签-任务导向', '二级标签-应用领域', '二级标签-数据类型',
    '模态', '协议', '国内/外', '数据集大小', '主要数据格式',
    '原始数据发布/更新日期', '数据介绍', '数据原始发布机构', '数据原始链接', '备注-平台作者'
 ]
df_final = df_final[[col for col in desired_order if col in df_final.columns]]
df_final.head()

Unnamed: 0,数据原始名称,数据翻译名称,是否上传门户,科学领域,功能标签-任务导向,二级标签-应用领域,模态,是否开源,协议,国内/外,数据集内数量（条）,数据集大小,主要数据格式,原始数据发布/更新日期,数据介绍,来源合作机构,来源合作机构数据连接,数据原始发布机构,数据原始链接,备注-平台作者
0,dolma,,否,其他,多语言处理,语言学,多模态,是,odc-by,国外,,1.32 MB,".md, .py, .txt",2024-04-17,Dolma是一个包含3万亿个token的大型数据集，涵盖网页内容、学术出版物、代码、书籍和百...,,,Allen Institute for AI,https://huggingface.co/datasets/allenai/dolma,allenai
1,SDO_training,,否,其他,统计建模预测,教育学,多模态,是,MIT,国外,,329.04 GB,".csv, .md, .part_aa, .part_ab, .part_ac, .part...",2025-05-21,该数据集提供来自NASA太阳动力学观测站（SDO）的机器学习就绪太阳数据，涵盖2010年5月...,,,Huggingface,https://huggingface.co/datasets/nasa-ibm-ai4sc...,nasa-ibm-ai4science
2,4DNeX-10M,,否,其他,统计建模预测,数字人文,多模态,是,Apache-2.0,国外,,6.63 TB,".gz, .md, .zip",2025-08-16,4DNeX-10M是一个大规模混合数据集，汇集了来自不同来源的单目视频，包含静态和动态场景，...,,,Huggingface,https://huggingface.co/datasets/3DTopia/4DNeX-10M,3DTopia
3,MADLAD-400,,否,其他,文本挖掘分析,语言学,多模态,是,odc-by,国外,,12.02 TB,".gz, .md, .zip",2024-09-10,MADLAD-400是一个基于Common Crawl构建的多语言文档级数据集，覆盖419种...,,,Allen Institute for AI,https://huggingface.co/datasets/allenai/MADLAD...,allenai
4,ASPED,,否,其他,知识图谱构建,计算社会科学,多模态,是,CC-BY-4.0,国外,,1.24 TB,".csv, .flac, .md, .mp4",2024-01-24,该数据集为ICASSP 2024会议上提出的ASPED数据集，包含在佐治亚理工学院及周边多个...,,,Huggingface,https://huggingface.co/datasets/urbanaudiosens...,urbanaudiosensing


In [72]:
df_final.to_excel('科学智能语料.xlsx', index=False)