# 爬取模型列表

In [28]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
import time
import csv
import json
import logging
import re
from urllib.parse import urljoin, urlparse, parse_qs

# 设置日志
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class FixedHuggingFaceModelsCrawler:
    def __init__(self, headless=True, delay=3):
        """
        修复版模型爬虫
        :param headless: 是否使用无头模式
        :param delay: 页面加载等待时间
        """
        self.delay = delay
        self.models = []
        self.current_page = 1
        self.setup_driver(headless)
        
    def setup_driver(self, headless=True):
        """
        设置Chrome浏览器驱动
        """
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
        
        # 禁用图片和CSS加载以提高速度
        prefs = {
            "profile.managed_default_content_settings.images": 2,
            "profile.default_content_setting_values.notifications": 2
        }
        chrome_options.add_experimental_option("prefs", prefs)
        
        try:
            self.driver = webdriver.Chrome(options=chrome_options)
            self.wait = WebDriverWait(self.driver, 15)
            logger.info("Chrome浏览器驱动初始化成功")
        except Exception as e:
            logger.error(f"浏览器驱动初始化失败: {e}")
            raise
    
    def load_page(self, url):
        """
        加载页面并等待内容完全加载
        :param url: 页面URL
        :return: 是否成功加载
        """
        try:
            logger.info(f"正在加载页面: {url}")
            self.driver.get(url)
            
            # 等待页面基本结构加载
            self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "main")))
            
            # 等待模型列表加载完成
            time.sleep(self.delay)
            
            # 尝试等待具体的模型容器加载
            try:
                self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "article, .overview-card, h4 a")))
            except TimeoutException:
                logger.warning("未找到明确的模型容器，继续尝试提取")
            
            # 滚动页面确保所有内容加载
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
            time.sleep(1)
            self.driver.execute_script("window.scrollTo(0, 0);")
            time.sleep(1)
            
            return True
        except TimeoutException:
            logger.error(f"页面加载超时: {url}")
            return False
        except Exception as e:
            logger.error(f"页面加载失败: {url}, 错误: {e}")
            return False
    
    def extract_models_from_current_page(self):
        """
        从当前页面提取模型信息 - 改进版
        :return: 模型列表
        """
        models = []
        
        try:
            # 等待页面完全加载
            time.sleep(2)
            
            # 基于实际观察到的HTML结构，使用更精确的选择器
            selectors_and_strategies = [
                # 策略1: 查找包含模型路径的链接
                ("a[href^='/']", self.extract_from_links),
                # 策略2: 查找文章标题中的链接
                ("article h4 a, article h3 a", self.extract_from_article_links),
                # 策略3: 从页面文本中提取
                ("body", self.extract_from_page_text),
            ]
            
            for selector, extraction_func in selectors_and_strategies:
                try:
                    elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                    if elements:
                        extracted_models = extraction_func(elements)
                        if extracted_models:
                            models.extend(extracted_models)
                            logger.info(f"使用策略 '{selector}' 找到 {len(extracted_models)} 个模型")
                            break  # 如果成功提取到模型，就不再尝试其他策略
                except Exception as e:
                    logger.debug(f"策略 '{selector}' 失败: {e}")
                    continue
            
            # 如果上述策略都失败，使用正则表达式从页面源代码提取
            if not models:
                models = self.extract_from_page_source()
            
            return models
            
        except Exception as e:
            logger.error(f"提取模型时出错: {e}")
            return []
    
    def extract_from_links(self, elements):
        """
        从链接元素中提取模型信息
        """
        models = []
        processed_names = set()
        
        for element in elements:
            try:
                href = element.get_attribute("href")
                if not href:
                    continue
                
                model_name = self.extract_model_name_from_url(href)
                if model_name and model_name not in processed_names:
                    display_text = element.text.strip()
                    
                    model_info = {
                        'name': model_name,
                        'url': f"https://huggingface.co/{model_name}",
                        'display_text': display_text[:100] if display_text else "",
                        'href': href
                    }
                    
                    models.append(model_info)
                    processed_names.add(model_name)
                    logger.debug(f"从链接找到模型: {model_name}")
                    
            except Exception as e:
                logger.debug(f"处理链接元素时出错: {e}")
                continue
        
        return models
    
    def extract_from_article_links(self, elements):
        """
        从文章标题链接中提取模型信息
        """
        models = []
        processed_names = set()
        
        for element in elements:
            try:
                href = element.get_attribute("href")
                if not href:
                    continue
                
                model_name = self.extract_model_name_from_url(href)
                if model_name and model_name not in processed_names:
                    display_text = element.text.strip()
                    
                    # 尝试获取父元素的更多信息
                    parent_text = ""
                    try:
                        parent = element.find_element(By.XPATH, "./ancestor::article[1]")
                        parent_text = parent.text.strip()[:200]
                    except:
                        pass
                    
                    model_info = {
                        'name': model_name,
                        'url': f"https://huggingface.co/{model_name}",
                        'display_text': display_text[:100] if display_text else "",
                        'parent_text': parent_text,
                        'href': href
                    }
                    
                    models.append(model_info)
                    processed_names.add(model_name)
                    logger.debug(f"从文章标题找到模型: {model_name}")
                    
            except Exception as e:
                logger.debug(f"处理文章链接时出错: {e}")
                continue
        
        return models
    
    def extract_from_page_text(self, elements):
        """
        从页面文本中提取模型信息
        """
        models = []
        try:
            page_text = self.driver.page_source
            
            # 使用正则表达式查找模型模式
            patterns = [
                # 匹配 用户名/模型名 的模式
                r'([a-zA-Z0-9_.-]+\/[a-zA-Z0-9_.-]+)(?=\s|"|\<|$)',
                # 匹配href中的模型路径
                r'href="\/([a-zA-Z0-9_.-]+\/[a-zA-Z0-9_.-]+)"',
            ]
            
            processed_names = set()
            for pattern in patterns:
                matches = re.findall(pattern, page_text)
                for match in matches:
                    if isinstance(match, tuple):
                        match = match[0] if match[0] else match[1]
                    
                    if self.is_valid_model_name(match) and match not in processed_names:
                        model_info = {
                            'name': match,
                            'url': f"https://huggingface.co/{match}",
                            'display_text': "",
                            'parent_text': "",
                            'href': f"/{match}"
                        }
                        models.append(model_info)
                        processed_names.add(match)
                        logger.debug(f"从页面文本找到模型: {match}")
            
            return models
            
        except Exception as e:
            logger.error(f"从页面文本提取模型时出错: {e}")
            return []
    
    def extract_model_name_from_url(self, url):
        """
        从URL中提取模型名称
        """
        try:
            if not url:
                return None
            
            # 处理完整URL和相对路径
            if url.startswith('http'):
                parsed = urlparse(url)
                path = parsed.path
            else:
                path = url
            
            # 移除开头的斜杠并分割路径
            path = path.strip('/')
            path_parts = path.split('/')
            
            # 模型路径应该是 用户名/模型名 的格式
            if len(path_parts) >= 2:
                username = path_parts[0]
                modelname = path_parts[1]
                
                # 验证格式
                if (username and modelname and 
                    re.match(r'^[a-zA-Z0-9_.-]+$', username) and 
                    re.match(r'^[a-zA-Z0-9_.-]+$', modelname) and
                    not self.is_excluded_path(f"{username}/{modelname}")):
                    return f"{username}/{modelname}"
            
            return None
        except Exception as e:
            logger.debug(f"从URL提取模型名称时出错: {e}")
            return None
    
    def is_excluded_path(self, path):
        """
        检查路径是否应该被排除
        """
        exclude_patterns = [
            'models/tasks', 'models/languages', 'models/libraries',
            'models/licenses', 'docs/', 'datasets/', 'spaces/',
            'settings/', 'login', 'join', 'pricing', 'enterprise'
        ]
        
        return any(pattern in path.lower() for pattern in exclude_patterns)
    
    def extract_from_page_source(self):
        """
        从页面源代码中使用正则表达式提取模型信息
        """
        models = []
        try:
            page_source = self.driver.page_source
            
            # 更精确的正则表达式模式
            patterns = [
                r'href="\/([a-zA-Z0-9_.-]+\/[a-zA-Z0-9_.-]+)"',
                r'"\/([a-zA-Z0-9_.-]+\/[a-zA-Z0-9_.-]+)"',
                r'>([a-zA-Z0-9_.-]+\/[a-zA-Z0-9_.-]+)<',
            ]
            
            processed_names = set()
            for pattern in patterns:
                matches = re.findall(pattern, page_source)
                for match in matches:
                    if self.is_valid_model_name(match) and match not in processed_names:
                        model_info = {
                            'name': match,
                            'url': f"https://huggingface.co/{match}",
                            'display_text': "",
                            'parent_text': "",
                            'href': f"/{match}"
                        }
                        models.append(model_info)
                        processed_names.add(match)
                        logger.debug(f"从源代码找到模型: {match}")
            
            return models
            
        except Exception as e:
            logger.error(f"从页面源代码提取模型时出错: {e}")
            return []
    
    def is_valid_model_name(self, name):
        """
        验证是否是有效的模型名称
        """
        if not name or '/' not in name:
            return False
        
        parts = name.split('/')
        if len(parts) != 2:
            return False
        
        username, modelname = parts
        
        # 检查基本格式
        if (len(username) < 1 or len(modelname) < 1 or
            not re.match(r'^[a-zA-Z0-9_.-]+$', username) or 
            not re.match(r'^[a-zA-Z0-9_.-]+$', modelname)):
            return False
        
        # 排除明显不是模型的路径
        if self.is_excluded_path(name):
            return False
        
        return True
    
    def find_next_page_element(self):
        """
        查找下一页元素 - 改进版
        """
        try:
            # 先滚动到页面底部确保分页元素可见
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            
            # 多种策略查找下一页元素
            next_strategies = [
                # 策略1: 查找包含"Next"文本的链接
                ("//a[contains(text(), 'Next')]", By.XPATH),
                # 策略2: 查找aria-label为Next的元素
                ("a[aria-label*='Next']", By.CSS_SELECTOR),
                # 策略3: 查找包含下一页页码的链接
                (f"//a[contains(@href, 'p={self.current_page + 1}')]", By.XPATH),
                # 策略4: 查找分页容器中的最后一个链接
                (".pagination a:last-child", By.CSS_SELECTOR),
                # 策略5: 通过分页逻辑查找
                ("//a[position()=last() and contains(@href, 'p=')]", By.XPATH),
            ]
            
            for selector, by_type in next_strategies:
                try:
                    if by_type == By.XPATH:
                        elements = self.driver.find_elements(By.XPATH, selector)
                    else:
                        elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                    
                    for element in elements:
                        if (element.is_displayed() and element.is_enabled() and
                            self.is_next_page_element(element)):
                            logger.info(f"找到下一页元素，策略: {selector}")
                            return element
                            
                except Exception as e:
                    logger.debug(f"策略 {selector} 失败: {e}")
                    continue
            
            logger.info("未找到下一页元素")
            return None
            
        except Exception as e:
            logger.error(f"查找下一页元素时出错: {e}")
            return None
    
    def is_next_page_element(self, element):
        """
        验证元素是否是下一页链接
        """
        try:
            # 检查文本内容
            text = element.text.lower().strip()
            if 'next' in text or '>' in text:
                return True
            
            # 检查href属性
            href = element.get_attribute('href')
            if href and f'p={self.current_page + 1}' in href:
                return True
            
            # 检查aria-label
            aria_label = element.get_attribute('aria-label')
            if aria_label and 'next' in aria_label.lower():
                return True
            
            return False
            
        except Exception as e:
            logger.debug(f"验证下一页元素时出错: {e}")
            return False
    
    def go_to_next_page(self):
        """
        跳转到下一页 - 改进版
        """
        try:
            next_element = self.find_next_page_element()
            
            if not next_element:
                logger.info("未找到下一页元素，可能已到最后一页")
                return False
            
            # 获取下一页URL
            next_url = next_element.get_attribute('href')
            if not next_url:
                logger.error("下一页元素没有href属性")
                return False
            
            # 记录当前页面URL用于验证
            current_url = self.driver.current_url
            
            # 方法1: 直接点击元素
            try:
                # 滚动到元素位置
                self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_element)
                time.sleep(1)
                
                # 点击元素
                next_element.click()
                logger.info("已点击下一页元素")
                
            except ElementClickInterceptedException:
                # 方法2: 如果点击被拦截，使用JavaScript点击
                logger.info("点击被拦截，尝试JavaScript点击")
                self.driver.execute_script("arguments[0].click();", next_element)
                
            except Exception as e:
                # 方法3: 如果点击失败，直接导航到URL
                logger.info(f"点击失败 ({e})，直接导航到下一页URL")
                self.driver.get(next_url)
            
            # 等待页面加载
            time.sleep(self.delay)
            
            # 验证页面是否真的发生了变化
            new_url = self.driver.current_url
            if new_url == current_url:
                logger.warning("页面URL没有变化，可能跳转失败")
                return False
            
            # 等待新页面内容加载
            try:
                self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "main")))
                time.sleep(2)  # 额外等待确保内容完全加载
            except TimeoutException:
                logger.warning("新页面加载超时，但继续尝试")
            
            # 更新当前页码
            self.current_page += 1
            logger.info(f"成功跳转到第 {self.current_page} 页")
            return True
            
        except Exception as e:
            logger.error(f"跳转下一页时出错: {e}")
            return False
    
    def crawl_all_pages(self, start_url, max_pages=None):
        """
        爬取所有页面 - 改进版
        """
        all_models = []
        self.current_page = 1
        
        # 加载第一页
        if not self.load_page(start_url):
            logger.error("无法加载起始页面")
            return []
        
        while True:
            logger.info(f"正在处理第 {self.current_page} 页")
            
            # 提取当前页面的模型
            page_models = self.extract_models_from_current_page()
            
            if page_models:
                all_models.extend(page_models)
                logger.info(f"第 {self.current_page} 页找到 {len(page_models)} 个模型")
                
                # 显示前几个找到的模型作为验证
                for i, model in enumerate(page_models[:3], 1):
                    logger.info(f"  {i}. {model['name']}")
                
            else:
                logger.warning(f"第 {self.current_page} 页没有找到模型")
                # 如果连续多页都没有找到模型，可能是页面结构变化了
                if self.current_page > 1:
                    logger.error("可能遇到了页面结构变化，停止爬取")
                    break
            
            # 尝试跳转到下一页
            if not self.go_to_next_page():
                logger.info("没有更多页面或跳转失败")
                break
        
        logger.info(f"爬取完成，总共处理 {self.current_page} 页，找到 {len(all_models)} 个模型")
        return all_models
    
    def save_results(self, models, filename_prefix="huggingface_models_fixed"):
        """
        保存结果到文件 - 已禁用，不保存任何文件
        """
        # 不执行任何保存操作
        pass
    
    def close(self):
        """
        关闭浏览器
        """
        if hasattr(self, 'driver'):
            self.driver.quit()
            logger.info("浏览器已关闭")

In [52]:
def batch_crawl_huggingface_models(save_path=None):
    """
    批量爬取多个HuggingFace模型分类页面
    
    Args:
        save_path: CSV文件保存路径，如果为None则使用默认路径
    """
    # 要爬取的URL列表
    urls_to_crawl = [
        "https://huggingface.co/models?other=climate",
        "https://huggingface.co/models?other=science"
    ]
    
    all_results = {}
    all_models_combined = []  # 存储所有模型数据
    
    for i, url in enumerate(urls_to_crawl, 1):
        print(f"\n{'='*60}")
        print(f"开始爬取第 {i}/{len(urls_to_crawl)} 个网站")
        print(f"URL: {url}")
        print(f"{'='*60}")
        
        # 从URL中提取分类名称
        category = ""
        try:
            from urllib.parse import urlparse, parse_qs
            parsed_url = urlparse(url)
            query_params = parse_qs(parsed_url.query)
            if 'other' in query_params:
                category = query_params['other'][0]
        except:
            category = f"category_{i}"
        
        print(f"分类: {category}")
        
        # 创建爬虫实例
        crawler = FixedHuggingFaceModelsCrawler(headless=True, delay=3)
        
        try:
            # 爬取当前URL
            models = crawler.crawl_all_pages(url)  # 限制为5页用于测试
            
            # 去重
            unique_models = []
            seen_names = set()
            for model in models:
                if model['name'] not in seen_names:
                    unique_models.append(model)
                    seen_names.add(model['name'])
            
            print(f"\n{category} 分类爬取结果:")
            print(f"- 找到 {len(unique_models)} 个唯一模型")
            
            # 显示前5个模型作为示例
            if unique_models:
                print("\n前5个模型:")
                for j, model in enumerate(unique_models[:5], 1):
                    print(f"  {j}. {model['name']}")
                
                # 为合并的CSV添加类别标记
                for model in unique_models:
                    model['category'] = category
                all_models_combined.extend(unique_models)
                
                # 存储到总结果中
                all_results[category] = {
                    'url': url,
                    'models_count': len(unique_models),
                    'models': unique_models[:10]  # 只保存前10个作为示例
                }
            else:
                print("  未找到任何模型")
                all_results[category] = {
                    'url': url,
                    'models_count': 0,
                    'models': []
                }
        
        except Exception as e:
            print(f"爬取 {category} 时出现错误: {e}")
            all_results[category] = {
                'url': url,
                'models_count': 0,
                'models': [],
                'error': str(e)
            }
        
        finally:
            # 关闭当前爬虫实例
            crawler.close()
            
        # 在两次爬取之间暂停
        if i < len(urls_to_crawl):
            print(f"\n等待5秒后开始下一个...")
            time.sleep(5)
    
    # 输出总结
    print(f"\n{'='*80}")
    print("批量爬取完成总结")
    print(f"{'='*80}")
    
    total_models = 0
    for category, result in all_results.items():
        status = "成功" if result['models_count'] > 0 else "失败" if 'error' in result else "无结果"
        print(f"{category:12} | 模型数量: {result['models_count']:4d} | 状态: {status}")
        total_models += result['models_count']
    
    print(f"\n总计爬取到 {total_models} 个模型")
    
    # 保存合并的CSV文件
    if all_models_combined:
        if save_path is None:
            # 默认文件名
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            save_path = f"huggingface_models_combined_{timestamp}.csv"
        
        # 创建DataFrame并保存为CSV
        import pandas as pd
        df = pd.DataFrame(all_models_combined)
        
        # 重新排列列的顺序，把category放在前面
        columns = ['category', 'name', 'url', 'display_text', 'parent_text']
        df = df.reindex(columns=[col for col in columns if col in df.columns])
        
        df.to_csv(save_path, index=False, encoding='utf-8')
        print(f"\n✅ 模型数据已保存为CSV: {save_path}")
        print(f"   总共包含 {len(df)} 个模型")
    
    return all_results

# 执行批量爬取
print("开始批量爬取HuggingFace模型...")

# 用户可以指定CSV文件保存路径
custom_csv_path = "备份_中间流程结果/models_data.csv"  # 用户可以修改这个路径

batch_results = batch_crawl_huggingface_models(save_path=custom_csv_path)

开始批量爬取HuggingFace模型...

开始爬取第 1/1 个网站
URL: https://huggingface.co/models?other=sci
分类: sci


2025-08-26 11:39:57,083 - INFO - Chrome浏览器驱动初始化成功
2025-08-26 11:39:57,087 - INFO - 正在加载页面: https://huggingface.co/models?other=sci
2025-08-26 11:40:05,418 - INFO - 正在处理第 1 页
2025-08-26 11:40:07,620 - INFO - 使用策略 'a[href^='/']' 找到 3 个模型
2025-08-26 11:40:07,621 - INFO - 第 1 页找到 3 个模型
2025-08-26 11:40:07,621 - INFO -   1. search/full-text
2025-08-26 11:40:07,621 - INFO -   2. malteos/aspect-acl-scibert-scivocab-uncased
2025-08-26 11:40:07,622 - INFO -   3. malteos/aspect-cord19-scibert-scivocab-uncased
2025-08-26 11:40:08,670 - INFO - 未找到下一页元素
2025-08-26 11:40:08,671 - INFO - 未找到下一页元素，可能已到最后一页
2025-08-26 11:40:08,671 - INFO - 没有更多页面或跳转失败
2025-08-26 11:40:08,671 - INFO - 爬取完成，总共处理 1 页，找到 3 个模型
2025-08-26 11:40:08,758 - INFO - 浏览器已关闭



sci 分类爬取结果:
- 找到 3 个唯一模型

前5个模型:
  1. search/full-text
  2. malteos/aspect-acl-scibert-scivocab-uncased
  3. malteos/aspect-cord19-scibert-scivocab-uncased

批量爬取完成总结
sci          | 模型数量:    3 | 状态: 成功

总计爬取到 3 个模型

✅ 模型数据已保存为CSV: 备份_中间流程结果/models_data.csv
   总共包含 3 个模型


In [53]:
import pandas as pd
df=pd.read_csv("备份_中间流程结果/models_data.csv")

In [54]:
# 2. 对两个表格的name列进行处理：根据/分割左侧为作者，右侧为数据集/模型名称

def split_name_column(df, df_type):
    """
    分割name列为author和item_name列
    """
    print(f"\n处理{df_type}的name列...")
    
    # 创建副本避免修改原数据
    df_copy = df.copy()
    
    # 查看一些name列的示例
    print(f"name列示例 (前5个):")
    for i, name in enumerate(df_copy['name'].head()):
        print(f"  {i+1}. {name}")
    
    # 分割name列
    split_names = df_copy['name'].str.split('/', n=1, expand=True)
    
    # 添加author和item_name列
    df_copy['author'] = split_names[0] if len(split_names.columns) > 0 else ''
    df_copy['item_name'] = split_names[1] if len(split_names.columns) > 1 else ''
    
    # 处理没有/的情况（整个name作为item_name，author为空）
    mask_no_slash = df_copy['item_name'].isna()
    df_copy.loc[mask_no_slash, 'item_name'] = df_copy.loc[mask_no_slash, 'author']
    df_copy.loc[mask_no_slash, 'author'] = ''
    
    # 清理空值
    df_copy['author'] = df_copy['author'].fillna('')
    df_copy['item_name'] = df_copy['item_name'].fillna('')
    
    print(f"分割完成:")
    print(f"  - 有作者信息的条目: {(df_copy['author'] != '').sum()}")
    print(f"  - 没有作者信息的条目: {(df_copy['author'] == '').sum()}")
    
    return df_copy

# 处理datasets_df
models_df_processed = split_name_column(df, "数据集")


处理数据集的name列...
name列示例 (前5个):
  1. search/full-text
  2. malteos/aspect-acl-scibert-scivocab-uncased
  3. malteos/aspect-cord19-scibert-scivocab-uncased
分割完成:
  - 有作者信息的条目: 3
  - 没有作者信息的条目: 0


In [55]:
import re
from datetime import datetime, timedelta

# 3. 处理display_text列，提取更新日期
def extract_update_date(display_text):
    """
    从display_text中提取更新日期
    """
    if pd.isna(display_text) or display_text == '':
        return None
    
    # 按•分割
    parts = display_text.split('•')
    
    # 查找包含Updated的部分
    update_part = None
    for part in parts:
        if 'Updated' in part:
            update_part = part.strip()
            break
    
    if not update_part:
        return None
    
    # 提取Updated后的日期部分
    # 使用正则表达式匹配Updated后的内容
    match = re.search(r'Updated\s+(.+)', update_part)
    if not match:
        return None
    
    date_str = match.group(1).strip()
    
    try:
        # 1. 处理完整年月日格式，如 "Jun 5, 2024"
        if re.match(r'^[A-Za-z]{3}\s+\d{1,2},\s+\d{4}$', date_str):
            return datetime.strptime(date_str, '%b %d, %Y').strftime('%Y-%m-%d')
        
        # 2. 处理没有年份的格式，如 "Mar 8"，默认年份为2025
        elif re.match(r'^[A-Za-z]{3}\s+\d{1,2}$', date_str):
            date_with_year = f"{date_str}, 2025"
            return datetime.strptime(date_with_year, '%b %d, %Y').strftime('%Y-%m-%d')
        
        # 3. 处理相对时间格式，如 "9 days ago"
        elif 'ago' in date_str.lower():
            base_date = datetime(2025, 8, 22)  # 基准日期
            
            # 匹配数字和时间单位
            time_match = re.search(r'(\d+)\s+(day|month|year|week|hour|minute)s?\s+ago', date_str.lower())
            if time_match:
                number = int(time_match.group(1))
                unit = time_match.group(2)
                
                if unit == 'day':
                    result_date = base_date - timedelta(days=number)
                elif unit == 'week':
                    result_date = base_date - timedelta(weeks=number)
                elif unit == 'month':
                    # 简单处理，假设一个月30天
                    result_date = base_date - timedelta(days=number * 30)
                elif unit == 'year':
                    # 简单处理，假设一年365天
                    result_date = base_date - timedelta(days=number * 365)
                elif unit == 'hour':
                    result_date = base_date - timedelta(hours=number)
                elif unit == 'minute':
                    result_date = base_date - timedelta(minutes=number)
                else:
                    return None
                
                return result_date.strftime('%Y-%m-%d')
        
        # 其他格式尝试直接解析
        else:
            return None
            
    except Exception as e:
        return None

print("\n" + "="*50)
print("开始处理display_text列，提取更新日期...")

# 查看一些display_text的样例
models_df_processed['update_date'] = models_df_processed['display_text'].apply(extract_update_date)
print("\n提取的更新日期示例 (前10个):")
for i, date in enumerate(models_df_processed['update_date'].head(10)):
    print(f"  {i+1}. {date}")


开始处理display_text列，提取更新日期...

提取的更新日期示例 (前10个):
  1. None
  2. 2023-02-07
  3. 2021-11-22


In [56]:
models_df_processed.to_csv(custom_csv_path, index=False, encoding='utf-8')

# 爬取模型详细信息

In [58]:
import pandas as pd
df=pd.read_csv("备份_中间流程结果/models_data.csv")

In [59]:
import requests
from bs4 import BeautifulSoup
import re
import json
from typing import Dict, List, Optional

class HuggingFaceModelScraper:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        
        # 定义不同类型标签的识别规则
        self.tag_categories = {
            "frameworks": ["transformers", "pytorch", "tensorflow", "jax", "safetensors", "onnx", "keras"],
            "tasks": ["fill-mask", "text-classification", "token-classification", "text-generation", 
                     "question-answering", "summarization", "translation", "text2text-generation",
                     "image-classification", "object-detection", "image-segmentation"],
            "languages": ["english", "chinese", "spanish", "french", "german", "japanese", "korean",
                         "arabic", "russian", "portuguese", "italian", "dutch", "multilingual"],
            "model_types": ["bert", "roberta", "gpt", "t5", "bart", "distilbert", "albert", 
                           "electra", "deberta", "llama", "mistral", "gemma"],
            "domains": ["climate", "medical", "biology", "science", "scientific", "chemistry", "clinical"]
        }
    
    def extract_model_info(self, url: str) -> Dict:
        """
        从 Hugging Face model 页面提取关键信息
        
        Args:
            url: model 页面的 URL
            
        Returns:
            包含提取信息的字典
        """
        try:
            response = self.session.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # 提取模型名称
            model_name = self._extract_model_name(soup, url)
            
            # 提取标签信息
            metadata = self._extract_metadata_tags(soup)
            
            # 提取文字描述
            description = self._extract_description(soup)
            
            # 提取统计信息
            stats = self._extract_stats(soup)
            
            result = {
                "model_name": model_name,
                "url": url,
                "metadata": metadata,
                "description": description,
                "stats": stats,
                "extraction_status": "success"
            }
            
            return result
            
        except Exception as e:
            return {
                "model_name": None,
                "url": url,
                "metadata": {},
                "description": [],
                "stats": {},
                "extraction_status": f"error: {str(e)}"
            }
    
    def _extract_model_name(self, soup: BeautifulSoup, url: str) -> str:
        """提取模型名称"""
        # 尝试从页面标题提取
        title_element = soup.find('h1')
        if title_element:
            return title_element.get_text().strip()
        
        # 从 URL 提取
        url_parts = url.rstrip('/').split('/')
        if len(url_parts) >= 2:
            return url_parts[-1]
        
        return "Unknown"
    
    def _extract_metadata_tags(self, soup: BeautifulSoup) -> Dict:
        """提取模型元数据信息"""
        metadata = {
            "frameworks": [],
            "tasks": [],
            "languages": [],
            "model_types": [],
            "domains": [],
            "license": "",
            "arxiv": "",
            "other_tags": []
        }
        
        # 方法1: 提取顶部标签区域的所有标签
        self._extract_header_tags(soup, metadata)
        
        # 方法2: 通过特定选择器查找标签
        self._extract_by_selectors(soup, metadata)
        
        # 方法3: 通过正则表达式查找特定信息
        self._extract_specific_info(soup, metadata)
        
        return metadata
    
    def _extract_header_tags(self, soup: BeautifulSoup, metadata: Dict):
        """提取页面头部标签区域的所有标签"""
        # 常见的标签容器选择器
        tag_selectors = [
            'div[class*="tag"]',
            'span[class*="tag"]',
            'div[class*="badge"]',
            'span[class*="badge"]',
            'div[class*="label"]',
            'a[class*="tag"]',
            # 更具体的选择器
            'header div[class*="flex"] > *',
            'div[class*="model-header"] span',
            'div[class*="model-card"] span',
        ]
        
        found_tags = set()
        
        for selector in tag_selectors:
            try:
                elements = soup.select(selector)
                for element in elements:
                    tag_text = element.get_text().strip().lower()
                    if tag_text and len(tag_text) < 50:  # 避免长文本
                        found_tags.add(tag_text)
            except:
                continue
        
        # 也尝试查找特定的图标 + 文本组合
        self._extract_icon_text_combinations(soup, found_tags)
        
        # 分类标签
        self._categorize_tags(found_tags, metadata)
    
    def _extract_icon_text_combinations(self, soup: BeautifulSoup, found_tags: set):
        """提取图标+文本组合的标签"""
        # 查找包含SVG图标的元素
        icon_containers = soup.find_all(['div', 'span'], attrs={'class': re.compile(r'flex|inline-flex')})
        
        for container in icon_containers:
            # 检查是否包含SVG图标
            svg = container.find('svg')
            if svg:
                # 获取同级或子级的文本
                text_elements = container.find_all(string=True)
                for text in text_elements:
                    clean_text = text.strip().lower()
                    if clean_text and len(clean_text) < 30 and clean_text not in ['', ' ', '\n']:
                        found_tags.add(clean_text)
    
    def _extract_by_selectors(self, soup: BeautifulSoup, metadata: Dict):
        """通过特定选择器查找标签"""
        # 查找许可证信息
        license_selectors = [
            'a[href*="license"]',
            'span[class*="license"]',
            'div[class*="license"]'
        ]
        
        for selector in license_selectors:
            try:
                license_element = soup.select_one(selector)
                if license_element and not metadata["license"]:
                    metadata["license"] = license_element.get_text().strip()
                    break
            except:
                continue
        
        # 查找ArXiv链接
        arxiv_links = soup.find_all('a', href=re.compile(r'arxiv\.org|arxiv:', re.IGNORECASE))
        for link in arxiv_links:
            href = link.get('href', '')
            text = link.get_text().strip()
            
            # 从href中提取ArXiv ID
            arxiv_match = re.search(r'(\d{4}\.\d{4,5})', href + ' ' + text)
            if arxiv_match and not metadata["arxiv"]:
                metadata["arxiv"] = arxiv_match.group(1)
                break
    
    def _extract_specific_info(self, soup: BeautifulSoup, metadata: Dict):
        """通过正则表达式查找特定信息"""
        page_text = soup.get_text()
        
        # 查找ArXiv ID (如果之前没找到)
        if not metadata["arxiv"]:
            arxiv_pattern = r'arxiv[:\s]*(\d{4}\.\d{4,5})'
            arxiv_match = re.search(arxiv_pattern, page_text, re.IGNORECASE)
            if arxiv_match:
                metadata["arxiv"] = arxiv_match.group(1)
        
        # 查找许可证信息 (如果之前没找到)
        if not metadata["license"]:
            license_patterns = [
                r'license[:\s]*([a-z0-9\-\.]+)',
                r'(mit|apache-2\.0|gpl|bsd|cc-by)',
            ]
            for pattern in license_patterns:
                match = re.search(pattern, page_text, re.IGNORECASE)
                if match:
                    metadata["license"] = match.group(1).strip()
                    break
    
    def _categorize_tags(self, found_tags: set, metadata: Dict):
        """将找到的标签分类到不同的类别中"""
        for tag in found_tags:
            tag_lower = tag.lower()
            categorized = False
            
            # 遍历每个类别进行匹配
            for category, keywords in self.tag_categories.items():
                for keyword in keywords:
                    if keyword in tag_lower or tag_lower in keyword:
                        if tag not in metadata[category]:
                            metadata[category].append(tag)
                        categorized = True
                        break
                if categorized:
                    break
            
            # 如果没有分类，加入other_tags
            if not categorized and len(tag) > 1:
                # 过滤一些明显不是标签的内容
                if not self._is_non_tag_text(tag):
                    metadata["other_tags"].append(tag)
    
    def _is_non_tag_text(self, text: str) -> bool:
        """判断文本是否不是标签内容"""
        non_tag_patterns = [
            r'^\d+$',  # 纯数字
            r'^[^\w\s]+$',  # 纯符号
            r'\b(the|and|or|in|on|at|to|for|of|with|by)\b',  # 常见介词
            r'\b(click|view|edit|download|upload|submit)\b',  # 操作词
            r'^(https?://|www\.)',  # URL
            r'\b\d+\s*(gb|mb|kb|tb|downloads?|views?|likes?)\b',  # 统计数据
        ]
        
        text_lower = text.lower()
        return any(re.search(pattern, text_lower) for pattern in non_tag_patterns)
    
    def _extract_description(self, soup: BeautifulSoup) -> List[str]:
        """提取模型描述"""
        descriptions = []
        
        # 模型页面的描述区域选择器
        description_selectors = [
            'div[class*="model-card"]',
            'div[class*="readme"]',
            'div[class*="prose"]',
            'section[class*="model-description"]',
            'div[class*="content"]',
            'article',
            'main',
        ]
        
        description_container = None
        for selector in description_selectors:
            try:
                container = soup.select_one(selector)
                if container:
                    description_container = container
                    break
            except:
                continue
        
        if not description_container:
            description_container = soup
        
        # 查找描述段落
        paragraphs = description_container.find_all('p')
        for p in paragraphs:
            text = p.get_text().strip()
            if (len(text) > 50 and 
                not self._is_navigation_text(text) and
                self._is_meaningful_description(text)):
                descriptions.append(text)
                if len(descriptions) >= 2:
                    break
        
        return descriptions
    
    def _extract_stats(self, soup: BeautifulSoup) -> Dict:
        """提取统计信息"""
        stats = {
            "downloads": "",
            "likes": "",
            "followers": "",
            "last_updated": ""
        }
        
        # 查找统计数字
        stat_patterns = {
            "downloads": [r'(\d+(?:,\d+)*)\s*downloads?', r'downloads?\s*(?:last\s*month)?[:\s]*(\d+(?:,\d+)*)'],
            "likes": [r'(\d+(?:,\d+)*)\s*likes?', r'likes?[:\s]*(\d+(?:,\d+)*)'],
            "followers": [r'(\d+(?:,\d+)*)\s*followers?', r'follow.*?(\d+(?:,\d+)*)']
        }
        
        page_text = soup.get_text()
        
        for stat_type, patterns in stat_patterns.items():
            for pattern in patterns:
                match = re.search(pattern, page_text, re.IGNORECASE)
                if match and not stats[stat_type]:
                    stats[stat_type] = match.group(1)
                    break
        
        return stats
    
    def _is_meaningful_description(self, text: str) -> bool:
        """判断文本是否是有意义的描述内容"""
        descriptive_patterns = [
            r'\b(model|language model|based on|trained|fine-tuned|designed)\b',
            r'\b(machine learning|deep learning|AI|artificial intelligence|NLP)\b',
            r'\b(research|study|analysis|algorithm|neural network)\b',
            r'\b(text|classification|generation|understanding|processing)\b'
        ]
        
        text_lower = text.lower()
        has_descriptive_content = any(re.search(pattern, text_lower) for pattern in descriptive_patterns)
        
        sentence_count = len([s for s in text.split('.') if len(s.strip()) > 10])
        
        # 避免纯技术标签或元数据
        avoid_patterns = [
            r'^\s*\d+(\.\d+)?\s*(GB|MB|KB|TB)\s*$',
            r'^\s*[a-z-]+:[a-z-]+\s*$',
            r'^\s*(download|view|edit|fork|clone)\s*$',
            r'^\s*\d+\s*(downloads?|views?|likes?)\s*$',
        ]
        
        is_metadata = any(re.match(pattern, text.strip(), re.IGNORECASE) for pattern in avoid_patterns)
        
        return (has_descriptive_content and 
                sentence_count >= 1 and 
                not is_metadata and
                len(text.split()) >= 8)
    
    def _is_navigation_text(self, text: str) -> bool:
        """判断是否为导航或菜单文本"""
        if len(text) > 300:
            return False
            
        nav_keywords = [
            'home', 'models', 'datasets', 'spaces', 'docs', 'pricing', 
            'login', 'sign up', 'menu', 'navigation', 'download', 'view',
            'edit', 'fork', 'clone', 'settings', 'discussions', 'files',
            'community', 'license', 'paper', 'model card'
        ]
        
        text_lower = text.lower().strip()
        words = text_lower.split()
        
        if len(words) <= 5:
            nav_word_count = sum(1 for word in words if any(kw in word for kw in nav_keywords))
            if nav_word_count >= len(words) * 0.6:
                return True
        
        return False


In [60]:
import time
from tqdm import tqdm
import pandas as pd

def batch_extract_model_info(df, url_column='url', batch_size=50, delay=1.0):
    """
    批量提取模型信息并添加新列到DataFrame
    
    Args:
        df: 包含URL的DataFrame
        url_column: URL列的名称
        batch_size: 每批处理的数量
        delay: 请求间的延迟时间（秒）
    
    Returns:
        添加了新列的DataFrame
    """
    # 初始化爬虫
    scraper = HuggingFaceModelScraper()
    
    # 为结果创建新列
    new_columns = [
        'Frameworks', 'Tasks', 'Languages', 'Model_types', 
        'Domains', 'License', 'Arxiv', 'Description'
    ]
    
    # 初始化新列（如果不存在）
    for col in new_columns:
        if col not in df.columns:
            df[col] = ""
    
    # 记录统计信息
    stats = {
        'processed': 0,
        'successful': 0,
        'failed': 0,
        'pending': len(df)
    }
    
    print(f"开始批量提取 {len(df)} 个模型的信息...")
    print(f"批次大小: {batch_size}, 延迟: {delay}秒")
    
    # 使用tqdm显示进度
    with tqdm(total=len(df), desc="提取进度") as pbar:
        for idx, row in df.iterrows():
            try:
                url = row[url_column]
                
                # 检查是否已经有数据（避免重复提取）
                if pd.notna(row.get('Frameworks', '')) and row.get('Frameworks', '') != '':
                    stats['processed'] += 1
                    stats['pending'] -= 1
                    pbar.update(1)
                    continue
                
                # 提取模型信息
                result = scraper.extract_model_info(url)
                
                if result["extraction_status"] == "success":
                    # 更新DataFrame
                    metadata = result['metadata']
                    
                    # 转换列表为字符串（用分号分隔）
                    df.at[idx, 'Frameworks'] = '; '.join(metadata.get('frameworks', []))
                    df.at[idx, 'Tasks'] = '; '.join(metadata.get('tasks', []))
                    df.at[idx, 'Languages'] = '; '.join(metadata.get('languages', []))
                    df.at[idx, 'Model_types'] = '; '.join(metadata.get('model_types', []))
                    df.at[idx, 'Domains'] = '; '.join(metadata.get('domains', []))
                    df.at[idx, 'License'] = metadata.get('license', '')
                    df.at[idx, 'Arxiv'] = metadata.get('arxiv', '')
                    
                    # 合并描述段落
                    descriptions = result.get('description', [])
                    df.at[idx, 'Description'] = ' '.join(descriptions[:2])  # 只取前两段
                    
                    stats['successful'] += 1
                else:
                    stats['failed'] += 1
                    # 记录错误
                    df.at[idx, 'Description'] = f"提取失败: {result['extraction_status']}"
                
                stats['processed'] += 1
                stats['pending'] -= 1
                
                # 更新进度条描述
                pbar.set_postfix({
                    '成功': stats['successful'],
                    '失败': stats['failed'],
                    '剩余': stats['pending']
                })
                
                pbar.update(1)
                
                # 延迟以避免过度请求
                time.sleep(delay)
                
                # 每处理一定数量后显示统计信息
                if stats['processed'] % batch_size == 0:
                    print(f"\n已处理: {stats['processed']}, 成功: {stats['successful']}, 失败: {stats['failed']}")
                    
            except Exception as e:
                stats['failed'] += 1
                stats['processed'] += 1
                stats['pending'] -= 1
                df.at[idx, 'Description'] = f"处理异常: {str(e)}"
                pbar.update(1)
                continue
    
    print(f"\n✅ 批量提取完成！")
    print(f"总处理: {stats['processed']}")
    print(f"成功: {stats['successful']}")
    print(f"失败: {stats['failed']}")
    print(f"成功率: {stats['successful']/stats['processed']*100:.1f}%")
    
    return df


In [61]:
# 运行批量提取 (调整延迟为1.5秒以提高效率)
models_df_complete = batch_extract_model_info(
    df.head(10),    
    url_column='url', 
    batch_size=100,  # 每100条显示一次统计
    delay=1.5        # 1.5秒延迟
)

print("\n=== 提取完成 ===")
print("前10条结果预览:")
print(models_df_complete.head(10).to_string(index=False))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http

开始批量提取 3 个模型的信息...
批次大小: 100, 延迟: 1.5秒


提取进度: 100%|██████████| 3/3 [00:06<00:00,  2.15s/it, 成功=3, 失败=0, 剩余=0]


✅ 批量提取完成！
总处理: 3
成功: 3
失败: 0
成功率: 100.0%

=== 提取完成 ===
前10条结果预览:
category                                           name                                                                   url                                                                     display_text  author                              item_name update_date               Frameworks             Tasks             Languages                    Model_types Domains License      Arxiv                                                                                                                                                                                                                                                                     Description
     sci                               search/full-text                               https://huggingface.co/search/full-text                                                                 Full-text search  search                              full-text         NaN        




数据清洗

In [62]:
import pandas as pd
import re

print("开始清洗 models_df_complete 数据...")
print(f"原始数据形状: {models_df_complete.shape}")
print(f"原始列名: {list(models_df_complete.columns)}")

# 创建数据副本进行清洗
models_cleaned = models_df_complete.copy()

# 1. 删除指定列
print("\n1. 删除 Arxiv、Model_types、Languages 列...")
columns_to_drop = ['Arxiv', 'Model_types', 'Languages']
existing_columns_to_drop = [col for col in columns_to_drop if col in models_cleaned.columns]

if existing_columns_to_drop:
    models_cleaned = models_cleaned.drop(columns=existing_columns_to_drop)
    print(f"已删除列: {existing_columns_to_drop}")
else:
    print("未找到要删除的列")

print(f"删除后列名: {list(models_cleaned.columns)}")

# 2. 根据Tasks创建功能标签列
print("\n2. 根据Tasks创建功能标签...")

def classify_function_by_tasks(tasks_str):
    """
    根据Tasks内容分类功能标签
    """
    if pd.isna(tasks_str) or tasks_str == '':
        return '通用处理'
    
    tasks_lower = str(tasks_str).lower()
    
    # 检查是否包含文本相关任务
    if any(keyword in tasks_lower for keyword in ['text', 'language', 'nlp', 'generation', 'classification', 'question', 'translation', 'summarization']):
        return '自然语言处理'
    
    # 检查是否包含视觉相关任务
    elif any(keyword in tasks_lower for keyword in ['image', 'video', 'vision', 'detection', 'segmentation', 'visual']):
        return '计算机视觉'
    
    # 检查是否包含语音相关任务
    elif any(keyword in tasks_lower for keyword in ['voice', 'speech', 'audio', 'sound', 'acoustic']):
        return '语音处理'
    
    # 其他情况
    else:
        return '通用处理'

# 应用功能分类
models_cleaned['功能标签'] = models_cleaned['Tasks'].apply(classify_function_by_tasks)

# 统计功能标签分布
print("功能标签分布:")
function_counts = models_cleaned['功能标签'].value_counts()
for func, count in function_counts.items():
    print(f"  {func}: {count}")

# 删除Tasks列
if 'Tasks' in models_cleaned.columns:
    models_cleaned = models_cleaned.drop('Tasks', axis=1)
    print("已删除Tasks列")

# 3. 根据Domains和tags创建学科列
print("\n3. 根据Domains和其他信息创建学科列...")

def classify_subject_by_domains_and_tags(row):
    """
    根据Domains列和其他信息分类学科，全部关键词用英文
    """
    domains_str = str(row.get('Domains', '')) if pd.notna(row.get('Domains', '')) else ''
    tags_str = str(row.get('tags', '')) if pd.notna(row.get('tags', '')) else ''
    combined_text = (domains_str + ' ' + tags_str).lower()

    # 生命科学
    if any(keyword in combined_text for keyword in [
        'biology', 'medical', 'bio-', 'medicine', 'health', 'genomics', 'protein', 'clinical'
    ]):
        return '生命科学'
    # 物质科学
    elif any(keyword in combined_text for keyword in [
        'chemistry', 'chemical', 'molecule', 'compound', 'material'
    ]):
        return '物质科学'
    # 大气海洋
    elif any(keyword in combined_text for keyword in [
        'climate', 'weather', 'ocean', 'atmospheric', 'meteorology', 'environmental'
    ]):
        return '大气海洋'
    # 空间信息
    elif any(keyword in combined_text for keyword in [
        'remote sensing', 'gis', 'geospatial', 'spatial', 'satellite', 'cartography', 'earth observation', 'map'
    ]):
        return '空间信息'
    # 工程建设
    elif any(keyword in combined_text for keyword in [
        'civil engineering', 'structural', 'infrastructure', 'construction', 'engineering', 'bim', 'bridge', 'road', 'tunnel'
    ]):
        return '工程技术'
    # 其他
    else:
        return '其他学科'

# 应用学科分类
models_cleaned['学科'] = models_cleaned.apply(classify_subject_by_domains_and_tags, axis=1)

# 统计学科分布
print("学科分布:")
subject_counts = models_cleaned['学科'].value_counts()
for subject, count in subject_counts.items():
    print(f"  {subject}: {count}")

print(f"有学科分类的模型数量: {models_cleaned['学科'].notna().sum()}")
print(f"无学科分类的模型数量: {models_cleaned['学科'].isna().sum()}")

# 4. 清洗License列
print("\n4. 清洗License列...")

def clean_license(license_str):
    """
    清洗License列内容
    """
    if pd.isna(license_str) or license_str == '':
        return ''
    
    license_str = str(license_str)
    
    # 删除"📜 License"
    license_str = re.sub(r'📜\s*License\s*', '', license_str, flags=re.IGNORECASE)
    
    # 将other替换为"其他"
    license_str = re.sub(r'\bother\b', '其他', license_str, flags=re.IGNORECASE)
    
    # 清理额外的空格
    license_str = re.sub(r'\s+', ' ', license_str).strip()
    
    return license_str

# 应用License清洗
models_cleaned['License'] = models_cleaned['License'].apply(clean_license)

# 统计License分布
print("License清洗后分布 (前15个):")
license_counts = models_cleaned['License'].value_counts()
for license_name, count in license_counts.head(15).items():
    if license_name:  # 只显示非空值
        print(f"  {license_name}: {count}")

# 最终结果统计
print(f"\n✅ 清洗完成！")
print(f"清洗后数据形状: {models_cleaned.shape}")
print(f"清洗后列名: {list(models_cleaned.columns)}")

# 显示一些示例数据
print(f"\n清洗后数据示例 (前5行):")
display_columns = ['author', 'item_name', '功能标签', '学科', 'License', 'Domains']
available_columns = [col for col in display_columns if col in models_cleaned.columns]
print(models_cleaned[available_columns].head().to_string())

# 保存清洗后的数据
models_df_complete = models_cleaned.copy()

开始清洗 models_df_complete 数据...
原始数据形状: (3, 15)
原始列名: ['category', 'name', 'url', 'display_text', 'author', 'item_name', 'update_date', 'Frameworks', 'Tasks', 'Languages', 'Model_types', 'Domains', 'License', 'Arxiv', 'Description']

1. 删除 Arxiv、Model_types、Languages 列...
已删除列: ['Arxiv', 'Model_types', 'Languages']
删除后列名: ['category', 'name', 'url', 'display_text', 'author', 'item_name', 'update_date', 'Frameworks', 'Tasks', 'Domains', 'License', 'Description']

2. 根据Tasks创建功能标签...
功能标签分布:
  自然语言处理: 2
  通用处理: 1
已删除Tasks列

3. 根据Domains和其他信息创建学科列...
学科分布:
  其他学科: 3
有学科分类的模型数量: 3
无学科分类的模型数量: 0

4. 清洗License列...
License清洗后分布 (前15个):
  mit: 2

✅ 清洗完成！
清洗后数据形状: (3, 13)
清洗后列名: ['category', 'name', 'url', 'display_text', 'author', 'item_name', 'update_date', 'Frameworks', 'Domains', 'License', 'Description', '功能标签', '学科']

清洗后数据示例 (前5行):
    author                               item_name    功能标签    学科 License Domains
0   search                               full-text    通用处理  其他学科               

In [63]:
def clean_license_column(license_text):
    """
    清洗开源协议列：
    1. 字符数大于6的替换为"公开"
    2. 空值替换为"公开"
    3. 其他情况也替换为"公开"
    """
    if pd.isna(license_text) or license_text == "" or license_text == "其他" or license_text is None:
        return "公开"
    
    license_str = str(license_text).strip()

    # 如果字符数大于30，替换为"公开"
    if len(license_str) > 30:
        return "公开"
    return license_str

# 应用开源协议列清洗
if 'License' in models_df_complete.columns:
    print("开始清洗License列...")
    models_df_complete['License'] = models_df_complete['License'].apply(clean_license_column)
    print("License列清洗完成！")
    print("\n清洗后的License列统计:")
    print(models_df_complete['License'].value_counts())
else:
    print("未找到'License'列")

开始清洗License列...
License列清洗完成！

清洗后的License列统计:
License
mit    2
公开     1
Name: count, dtype: int64


In [64]:
import re

def clean_framework_column(framework_text):
    """
    清洗模型框架列：
    1. 创建包含所有主流框架的词典
    2. 使用关键词匹配重新识别
    3. 剔除无效字符
    4. 使用逗号连接多个框架
    5. 空值默认填写transformer
    """
    # 主流机器学习/深度学习框架词典
    framework_dict = {
        # 深度学习框架
        'tensorflow': 'TensorFlow',
        'pytorch': 'PyTorch', 
        'torch': 'PyTorch',
        'keras': 'Keras',
        'jax': 'JAX',
        'flax': 'Flax',
        'mxnet': 'MXNet',
        'caffe': 'Caffe',
        'theano': 'Theano',
        'chainer': 'Chainer',
        'paddlepaddle': 'PaddlePaddle',
        'paddle': 'PaddlePaddle',
        'mindspore': 'MindSpore',
        'oneflow': 'OneFlow',
        
        # 预训练模型库
        'transformers': 'Transformers',
        'transformer': 'Transformers',
        'huggingface': 'Transformers',
        'hf': 'Transformers',
        'timm': 'TIMM',
        'diffusers': 'Diffusers',
        'sentence-transformers': 'Sentence-Transformers',
        'spacy': 'spaCy',
        'flair': 'Flair',
        'allennlp': 'AllenNLP',
        
        # 传统机器学习
        'sklearn': 'Scikit-learn',
        'scikit-learn': 'Scikit-learn',
        'xgboost': 'XGBoost',
        'lightgbm': 'LightGBM',
        'catboost': 'CatBoost',
        'randomforest': 'Random Forest',
        
        # 强化学习
        'stable-baselines': 'Stable-Baselines',
        'stable_baselines': 'Stable-Baselines',
        'ray': 'Ray',
        'rllib': 'RLLib',
        'openai gym': 'OpenAI Gym',
        'gym': 'OpenAI Gym',
        
        # 计算机视觉
        'opencv': 'OpenCV',
        'cv2': 'OpenCV',
        'pillow': 'PIL',
        'pil': 'PIL',
        'detectron': 'Detectron2',
        'yolo': 'YOLO',
        'mmdetection': 'MMDetection',
        'torchvision': 'TorchVision',
        
        # 自然语言处理
        'nltk': 'NLTK',
        'gensim': 'Gensim',
        'fasttext': 'FastText',
        'word2vec': 'Word2Vec',
        'bert': 'BERT',
        'gpt': 'GPT',
        'llama': 'LLaMA',
        't5': 'T5',
        
        # 其他专用框架
        'onnx': 'ONNX',
        'tensorrt': 'TensorRT',
        'openvino': 'OpenVINO',
        'coreml': 'Core ML',
        'mlflow': 'MLflow',
        'wandb': 'Weights & Biases',
        'tensorboard': 'TensorBoard',
        
        # 数据处理
        'pandas': 'Pandas',
        'numpy': 'NumPy',
        'scipy': 'SciPy',
        'dask': 'Dask',
        'spark': 'Apache Spark',
        'pyspark': 'PySpark',
        
        # 可视化
        'matplotlib': 'Matplotlib',
        'seaborn': 'Seaborn',
        'plotly': 'Plotly',
        'bokeh': 'Bokeh',
        
        # 音频处理
        'librosa': 'Librosa',
        'torchaudio': 'TorchAudio',
        'speechbrain': 'SpeechBrain',
        
        # 图神经网络
        'dgl': 'DGL',
        'pyg': 'PyTorch Geometric',
        'pytorch_geometric': 'PyTorch Geometric',
        'spektral': 'Spektral',
        
        # Web框架 (用于模型部署)
        'flask': 'Flask',
        'fastapi': 'FastAPI',
        'streamlit': 'Streamlit',
        'gradio': 'Gradio',
        
        # 其他
        'automl': 'AutoML',
        'autokeras': 'AutoKeras',
        'h2o': 'H2O',
        'pycaret': 'PyCaret'
    }
    
    if pd.isna(framework_text) or framework_text == "" or framework_text is None:
        return "Transformers"
    
    framework_str = str(framework_text).lower().strip()
    
    # 移除无效字符，只保留字母、数字、连字符、下划线和空格
    cleaned_framework = re.sub(r'[^a-zA-Z0-9\-_\s]', ' ', framework_str)
    
    # 识别框架
    identified_frameworks = set()
    
    # 遍历框架词典进行匹配
    for key, value in framework_dict.items():
        # 使用单词边界进行精确匹配
        pattern = r'\b' + re.escape(key.lower()) + r'\b'
        if re.search(pattern, cleaned_framework):
            identified_frameworks.add(value)
    
    # 如果没有识别到任何框架，返回默认值
    if not identified_frameworks:
        return "Transformers"
    
    # 使用逗号连接多个框架，按字母顺序排序
    return ", ".join(sorted(identified_frameworks))

# 应用模型框架列清洗
if 'Frameworks' in models_df_complete.columns:
    print("开始清洗模型框架列...")
    models_df_complete['Frameworks'] = models_df_complete['Frameworks'].apply(clean_framework_column)
    print("模型框架列清洗完成！")
    print("\n清洗后的模型框架列统计:")
    print(models_df_complete['Frameworks'].value_counts().head(20))
else:
    print("未找到'Frameworks'列")

开始清洗模型框架列...
模型框架列清洗完成！

清洗后的模型框架列统计:
Frameworks
PyTorch, Transformers    2
Transformers             1
Name: count, dtype: int64


In [65]:
import random
import numpy as np
models_df_complete['模态'] = models_df_complete.get('模态', pd.Series([np.nan]*len(models_df_complete)))
def fill_modality_by_function_tag(df):
    """
    根据功能标签对模态列进行填补：
    - 自然语言处理：文本
    - 通用处理：文本或多模态随机
    - 计算机视觉：图片或视频随机
    """
    df_processed = models_df_complete.copy()
    
    # 设置随机种子以确保结果可复现
    random.seed(42)
    np.random.seed(42)
    
    # 统计处理前的情况
    print("=== 模态列填补处理 ===")
    print(f"处理前模态列空值数量: {df_processed['模态'].isnull().sum()}")
    
    # 定义填补规则
    def get_modality_by_function_tag(function_tag, current_modality):
        """根据功能标签确定模态"""
        # 如果已有非空模态值，保持不变
        if pd.notna(current_modality) and current_modality != "":
            return current_modality
            
        # 根据功能标签填补
        if pd.isna(function_tag):
            return "多模态"  # 默认值
            
        function_tag_str = str(function_tag).strip()
        
        if "自然语言处理" in function_tag_str:
            return "文本"
        elif "计算机视觉" in function_tag_str:
            # 随机选择图片或视频
            return random.choice(["图片", "视频"])
        elif "通用处理" in function_tag_str:
            # 随机选择文本或多模态
            return random.choice(["文本", "多模态"])
        else:
            return "多模态"  # 其他情况默认为多模态
    
    # 应用填补规则
    print("\n开始填补模态列...")
    
    # 分类统计处理情况
    nlp_count = 0
    cv_count = 0  
    general_count = 0
    other_count = 0
    
    for idx, row in df_processed.iterrows():
        function_tag = row['功能标签']
        current_modality = row['模态']
        
        # 只对空值进行填补
        if pd.isna(current_modality) or current_modality == "":
            new_modality = get_modality_by_function_tag(function_tag, current_modality)
            df_processed.at[idx, '模态'] = new_modality
            
            # 统计各类别的处理数量
            if pd.notna(function_tag):
                function_tag_str = str(function_tag).strip()
                if "自然语言处理" in function_tag_str:
                    nlp_count += 1
                elif "计算机视觉" in function_tag_str:
                    cv_count += 1
                elif "通用处理" in function_tag_str:
                    general_count += 1
                else:
                    other_count += 1
            else:
                other_count += 1
    
    # 输出处理结果统计
    print(f"\n填补统计:")
    print(f"  自然语言处理 → 文本: {nlp_count} 条")
    print(f"  计算机视觉 → 图片/视频: {cv_count} 条")
    print(f"  通用处理 → 文本/多模态: {general_count} 条")
    print(f"  其他 → 多模态: {other_count} 条")
    print(f"  总计填补: {nlp_count + cv_count + general_count + other_count} 条")
    
    return df_processed

# 执行模态列填补
models_df_complete = fill_modality_by_function_tag(models_df_complete)

=== 模态列填补处理 ===
处理前模态列空值数量: 3

开始填补模态列...

填补统计:
  自然语言处理 → 文本: 2 条
  计算机视觉 → 图片/视频: 0 条
  通用处理 → 文本/多模态: 1 条
  其他 → 多模态: 0 条
  总计填补: 3 条


  df_processed.at[idx, '模态'] = new_modality


规范化license列

In [66]:
import pandas as pd
license_df=pd.read_excel('规范协议名称.xlsx')

In [67]:
import re
from difflib import get_close_matches

def normalize_license(license_name, standard_licenses):
    """
    规范化协议名称
    """
    if pd.isna(license_name) or license_name == '':
        return license_name
    
    # 转换为字符串并去除首尾空格
    license_str = str(license_name).strip()
    
    # 如果已经是标准格式，直接返回
    if license_str in standard_licenses:
        return license_str
    
    # 处理常见的大小写和格式问题
    license_upper = license_str.upper()
    
    # 创建映射字典处理常见的变体
    mapping = {
        'APACHE-2.0': 'Apache-2.0',
        'APACHE 2.0': 'Apache-2.0',
        'APACHE2.0': 'Apache-2.0',
        'APACHE': 'Apache-2.0',
        'MIT': 'MIT',
        'MIT LICENSE': 'MIT',
        'CC-BY-4.0': 'CC-BY-4.0',
        'CC BY 4.0': 'CC-BY-4.0',
        'CC-BY-SA-4.0': 'CC-BY-SA-4.0',
        'CC BY SA 4.0': 'CC-BY-SA-4.0',
        'CC-BY-NC-SA-4.0': 'CC-BY-NC-SA-4.0',
        'CC BY NC SA 4.0': 'CC-BY-NC-SA-4.0',
        'CC0-1.0': 'CC0-1.0',
        'CC0 1.0': 'CC0-1.0',
        'BSD-3-CLAUSE': 'BSD-3-Clause',
        'BSD 3 CLAUSE': 'BSD-3-Clause',
        'BSD-2-CLAUSE': 'BSD-2-Clause',
        'BSD 2 CLAUSE': 'BSD-2-Clause',
        'GPL-3.0': 'GPL-3.0',
        'GPL V3': 'GPL-3.0',
        'LGPL-3.0': 'LGPL-3.0',
        'AGPL-3.0': 'AGPL-3.0',
        'UNLICENSE': 'Unlicense',
        '公开': '公开',
        'PUBLIC': '公开',
        'OPENRAIL': 'OPENRAIL',
        'CUSTOM': 'CUSTOM'
    }
    
    # 先检查精确映射
    if license_upper in mapping:
        return mapping[license_upper]
    
    # 使用模糊匹配找到最相近的标准协议
    # 首先尝试在标准协议列表中找到最相似的
    close_matches = get_close_matches(license_str, standard_licenses, n=1, cutoff=0.8)
    if close_matches:
        return close_matches[0]
    
    # 如果没找到匹配，尝试大小写不敏感的匹配
    for std_license in standard_licenses:
        if license_upper == std_license.upper():
            return std_license
    
    # 如果仍然没找到，返回原值
    return license_str

# 获取标准协议列表
standard_licenses_list = license_df['开源协议'].tolist()

print("协议名称规范化函数已创建")

协议名称规范化函数已创建


In [68]:
# 对models_df进行协议名称规范化
print("正在规范化 models_df 的协议名称...")

# 创建备份
models_df_backup = models_df_complete.copy()

# 应用规范化函数
models_df_complete['开源协议'] = models_df_complete['License'].apply(
    lambda x: normalize_license(x, standard_licenses_list)
)

# 显示规范化前后的对比
print("Models DataFrame 协议规范化对比:")
comparison_models = pd.DataFrame({
    '原始协议': models_df_complete['License'],
    '规范化协议': models_df_complete['开源协议']
})
comparison_models = pd.DataFrame({
    '原始协议': models_df_complete['License'],
    '规范化协议': models_df_complete['开源协议']
})

# 显示发生变化的记录
changed_models = comparison_models[comparison_models['原始协议'] != comparison_models['规范化协议']]
if len(changed_models) > 0:
    print(f"共有 {len(changed_models)} 条记录发生了变化:")
    print(changed_models.drop_duplicates().head(20))
else:
    print("没有记录发生变化")

# 显示规范化后的唯一值统计
print(f"\n规范化后的协议唯一值统计:")
print(models_df_complete['开源协议'].value_counts().head(20))

正在规范化 models_df 的协议名称...
Models DataFrame 协议规范化对比:
共有 2 条记录发生了变化:
  原始协议 规范化协议
1  mit   MIT

规范化后的协议唯一值统计:
开源协议
MIT    2
公开     1
Name: count, dtype: int64


去除高度重复的样本

In [69]:
# 检查当前models_df的状态
print("检查当前models_df状态...")
print(f"当前models_df形状: {models_df_complete.shape}")
print(f"当前models_df列名: {models_df_complete.columns.tolist()}")

# 创建备份
models_df_before_dedup = models_df_complete.copy()

# 定义去重策略：对于相同的原始名称，我们保留：
# 1. 优先保留模型发布/更新时间最新的记录  
# 2. 优先保留信息更完整的记录（非空字段更多）

def calculate_completeness_score(row):
    """计算记录的完整性分数"""
    score = 0
    # 重要字段的权重
    important_fields = ['Description', '功能标签', '学科', '模态']
    
    for field in important_fields:
        if pd.notna(row[field]) and str(row[field]).strip() != '':
            score += 1
    
    return score

# 为每条记录计算完整性分数
models_df_complete['completeness_score'] = models_df_complete.apply(calculate_completeness_score, axis=1)

# 排序并去重：对于每个原始名称，保留排序后的第一条记录
# 只使用存在的列进行排序
models_df_deduped = (models_df_complete.sort_values([
    'name',
    'completeness_score'
], ascending=[True, False])
.groupby('name', as_index=False)  # 设置as_index=False保留原始名称列
.first())

# 删除辅助列
models_df_deduped = models_df_deduped.drop(['completeness_score'], axis=1)

print(f"去重前记录数: {len(models_df_before_dedup)}")
print(f"去重后记录数: {len(models_df_deduped)}")
print(f"删除的重复记录数: {len(models_df_before_dedup) - len(models_df_deduped)}")

# 验证列是否完整
print(f"\n去重后的列数: {len(models_df_deduped.columns)}")

# 更新models_df
models_df = models_df_deduped

print("\n去重完成！")

检查当前models_df状态...
当前models_df形状: (3, 15)
当前models_df列名: ['category', 'name', 'url', 'display_text', 'author', 'item_name', 'update_date', 'Frameworks', 'Domains', 'License', 'Description', '功能标签', '学科', '模态', '开源协议']
去重前记录数: 3
去重后记录数: 3
删除的重复记录数: 0

去重后的列数: 15

去重完成！


# 调用AI翻译

In [70]:
prompt = """请你学习这个关于数据集的介绍，并用简洁的中文对进行总结，说明这个数据集的内容和用途。以JSON格式输出，严格遵循如下格式：```json{"介绍":"你的总结的内容"}``` """

In [71]:
import pandas as pd
import os
import json
import time
import logging
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from openai import OpenAI

# -------------------------------
# JSON解析模块（独立模块）
# -------------------------------
def default_json_parser(content, idx=None):
    """
    默认的 JSON 解析器：
    清理输入内容后尝试解析 JSON，
    若成功则返回完整的字典，若失败返回空字典。
    """
    try:
        # 去除代码块标记，清理内容
        cleaned_content = content.replace('```json\n', '').replace('```', '').strip()
        parsed_result = json.loads(cleaned_content)
        return parsed_result
    except json.JSONDecodeError:
        if idx is not None:
            logging.warning(f"警告: 第 {idx} 行解析 JSON 失败")
        return {}
    except Exception as e:
        if idx is not None:
            logging.error(f"错误: 第 {idx} 行解析失败 - {str(e)}")
        return {}

# -------------------------------
# 限流处理器（控制请求频率）
# -------------------------------
class RateLimitedProcessor:
    def __init__(self):
        self.request_timestamps = []
        self.MAX_RPM = 500
        self.window_size = 60  # 60秒窗口

    def _clean_old_records(self, current_time):
        cutoff_time = current_time - timedelta(seconds=self.window_size)
        self.request_timestamps = [ts for ts in self.request_timestamps if ts > cutoff_time]

    def can_make_request(self):
        """检查是否可以发起新请求"""
        current_time = datetime.now()
        self._clean_old_records(current_time)
        if len(self.request_timestamps) >= self.MAX_RPM:
            return False
        self.request_timestamps.append(current_time)
        return True

# -------------------------------
# OpenAI文本处理器
# -------------------------------
class OpenAITextProcessor:
    def __init__(self, api_key=None, model=None, base_url=None, json_parser=None):
        self.client = OpenAI(api_key=api_key,base_url=base_url)
        self.model = model
        self.rate_limiter = RateLimitedProcessor()
        self.n_workers = 14  # 优化后的线程数
        # 如果未提供自定义解析器，则使用默认解析器
        self.json_parser = json_parser if json_parser is not None else default_json_parser

    def process_batch(self, df, text_column, prompt, batch_size=20, delay=1, json_parser=None):
        """
        批量处理文本，支持灵活的 JSON 解析。
        
        参数:
            df: 包含文本数据的 DataFrame
            text_column: 文本所在的列名
            prompt: 系统提示，用于 API 调用
            batch_size: 每个批次处理的文本条数
            delay: 每次请求后的延迟（秒）
            json_parser: 可选的自定义 JSON 解析器，若不传入则使用实例内的解析器
        
        返回:
            新的 DataFrame，包含原始数据及 API 返回结果（通过 JSON 解析获得的各字段）
        """
        parser = json_parser if json_parser is not None else self.json_parser
        results = []  # 保存每次请求解析后的结果（字典形式）

        def process_chunk(chunk_data):
            chunk_results = []
            for idx, text in chunk_data:
                # 限流检测：等待直到可以发送请求
                while not self.rate_limiter.can_make_request():
                    time.sleep(0.1)
                try:
                    response = self.client.chat.completions.create(
                        model=self.model,
                        messages=[
                            {"role": "system", "content": prompt},
                            {"role": "user", "content": text}
                        ],
                        temperature=1,
                        max_tokens=500
                    )
                    # 使用解析器处理响应内容，得到字典格式结果
                    parsed_result = parser(response.choices[0].message.content, idx)
                    chunk_results.append(parsed_result)
                    time.sleep(delay)
                except Exception as e:
                    logging.error(f"错误: 处理第 {idx} 行时发生异常: {str(e)}")
                    chunk_results.append({})
            return chunk_results

        # 将数据分成批次，保留行号信息
        chunks = [
            list(enumerate(df[text_column][i:i+batch_size]))
            for i in range(0, len(df), batch_size)
        ]

        with ThreadPoolExecutor(max_workers=self.n_workers) as executor:
            futures = list(tqdm(
                executor.map(process_chunk, chunks),
                total=len(chunks),
                desc="Processing batches"
            ))
            for chunk_results in futures:
                results.extend(chunk_results)

        # 将解析结果列表转为 DataFrame，并与原 DataFrame 合并
        df_result = df.copy().reset_index(drop=True)
        results_df = pd.json_normalize(results)
        df_result = pd.concat([df_result, results_df], axis=1)

        # 统计处理情况
        success_count = sum(1 for r in results if r)
        total_count = len(results)
        success_rate = (success_count / total_count) * 100 if total_count > 0 else 0
        logging.info(f"处理完成: 总数 {total_count}, 成功 {success_count}, 成功率 {success_rate:.2f}%")
        
        return df_result


In [None]:
processor = OpenAITextProcessor(api_key="此处填入你的deepseek api key", base_url="https://api.deepseek.com",model="deepseek-chat")
df_result = processor.process_batch(
    df=models_df_complete,
    text_column="Description",
    prompt=prompt,
    batch_size=5,
)

Processing batches:   0%|          | 0/1 [00:00<?, ?it/s]2025-08-26 11:40:50,156 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-08-26 11:40:56,229 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-08-26 11:41:05,718 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
Processing batches: 100%|██████████| 1/1 [00:25<00:00, 25.40s/it]
2025-08-26 11:41:15,370 - INFO - 处理完成: 总数 3, 成功 3, 成功率 100.00%


# 规范化底稿格式

In [None]:
# 1. 删除不需要的列
cols_to_drop = ['category', 'name', 'display_textDomains', 'License', 'Description', 'completeness_score']
df_result = df_result.drop(columns=[col for col in cols_to_drop if col in df_result.columns])

# 2. 替换列名称
df_result = df_result.rename(columns={
    'url': '模型链接',
    'author': '备注-author',
    'item_name': '原始名称',
    'update_date': '模型发布/更新时间',
    'Frameworks': '模型框架',
    '学科': '科学领域',
    '介绍': '模型介绍'
})

# 3. 重新排列表格顺序
desired_order = [
    '原始名称', '科学领域', '模态', '模型框架', '开源协议', 
    '模型介绍', '模型发布/更新时间', '功能标签', '模型链接', '备注-author'
 ]
df_result = df_result[[col for col in desired_order if col in df_result.columns]]
df_result.head()

Unnamed: 0,原始名称,翻译名称,是否上传门户,科学领域,模态,模型框架,是否开源,开源协议,国内/外模型,模型介绍,模型发布/更新时间,功能标签,模型链接,备注-author
0,full-text,,否,其他学科,文本,Transformers,是,公开,国外,这是一个关于数据集介绍的学习任务，要求用户学习并总结数据集的内容和用途，然后以指定的JSON...,,通用处理,https://huggingface.co/search/full-text,search
1,aspect-acl-scibert-scivocab-uncased,,否,其他学科,文本,"PyTorch, Transformers",是,MIT,国外,该数据集基于scibert-scivocab-uncased模型，在ACL Antholog...,2023-02-07,自然语言处理,https://huggingface.co/malteos/aspect-acl-scib...,malteos
2,aspect-cord19-scibert-scivocab-uncased,,否,其他学科,文本,"PyTorch, Transformers",是,MIT,国外,该数据集是一个基于SciBERT模型（使用科学词汇表且不区分大小写）的微调版本，专门针对CO...,2021-11-22,自然语言处理,https://huggingface.co/malteos/aspect-cord19-s...,malteos


In [75]:
df_result.to_excel("科学智能模型.xlsx", index=False)