In [29]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from openai import OpenAI
from config import Config

ModuleNotFoundError: No module named 'config'

In [31]:
memes_data = []

In [35]:
url = "https://weibo.com/ajax/side/hotSearch"
response = requests.get(url, headers=headers)
data = response.json()

if data and 'data' in data and 'realtime' in data['data']:
    hot_topics = data['data']['realtime']
    for topic in hot_topics[:10]:  # 获取热搜
        memes_data.append({
            'name': topic['word'],
            'heat': topic['num'],
            'source': '微博热搜'
        })

In [41]:
url = f"https://api.bilibili.com/x/web-interface/search/square?limit=10"
response = requests.get(url, headers=headers)
data = response.json()

if data and data['code'] == 0 and 'data' in data:
    trending = data['data']['trending']
    for topic in trending['list']:
        memes_data.append({
            'name': topic['keyword'],
            'heat': topic['show_name'],
            'source': 'B站热搜'
        })

In [53]:
topic

{'keyword': '刘晓艳尴尬的笑重制版',
 'show_name': '刘晓艳尴尬的笑重制版',
 'icon': 'http://i0.hdslb.com/bfs/activity-plat/static/20221117/eaf2dd702d7cc14d8d9511190245d057/EeuqbMwao9.png',
 'uri': '',
 'goto': '',
 'heat_score': 4206720}

In [43]:
memes_data

[{'name': '特朗普24小时内连砍三刀', 'heat': 2061145, 'source': '微博热搜'},
 {'name': '哈佛已离境留学生或无法返美', 'heat': 1317965, 'source': '微博热搜'},
 {'name': '道德模范可学可做可追可及', 'heat': 965355, 'source': '微博热搜'},
 {'name': '王楚钦vs莫雷加德', 'heat': 857199, 'source': '微博热搜'},
 {'name': '别给小孩买性感奶辣衣服了', 'heat': 779288, 'source': '微博热搜'},
 {'name': '女网红想变韩女全脸整容对标张元英', 'heat': 534261, 'source': '微博热搜'},
 {'name': '10多个省份鼓励实行2.5天休假模式', 'heat': 476535, 'source': '微博热搜'},
 {'name': '胡彦斌给演唱会每一位观众送金子', 'heat': 455221, 'source': '微博热搜'},
 {'name': '永辉超市八点半打折', 'heat': 436954, 'source': '微博热搜'},
 {'name': '广西龙胜山洪已有4人遇难', 'heat': 419466, 'source': '微博热搜'},
 {'name': '影视飓风专访阿汤哥', 'heat': '影视飓风专访阿汤哥', 'source': 'B站热搜'},
 {'name': 'IG EDG', 'heat': 'IG EDG', 'source': 'B站热搜'},
 {'name': 'UP主破防将起诉小米', 'heat': 'UP主破防将起诉小米', 'source': 'B站热搜'},
 {'name': 'ShowMaker阿狸杀疯', 'heat': 'ShowMaker阿狸杀疯', 'source': 'B站热搜'},
 {'name': 'DK2比1战胜HLE', 'heat': 'DK2比1战胜HLE', 'source': 'B站热搜'},
 {'name': 'JOJO结局爆改全员存活', 'heat': 'JOJO结局爆改全员存活', 'source':

In [1]:

class MemeCollector:
    def __init__(self, openai_api_key=None):
        self.today = datetime.now().strftime("%Y-%m-%d")
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.memes_data = []
        
        # 初始化OpenAI客户端
        self.openai_client = None
        api_key = openai_api_key or Config.get_openai_api_key()
        
        if api_key:
            self.openai_client = OpenAI(
                api_key=api_key,
                base_url=Config.get_openai_base_url()
            )
            print("✅ 已启用LLM梗检测功能")
        else:
            print("⚠️  未找到OpenAI API密钥，将使用备用判断逻辑")
        
        # 缓存LLM判断结果，避免重复调用
        self.meme_cache = {}
    
    def collect_weibo_hot_topics(self):
        """从微博热搜采集热门话题"""
        try:
            url = "https://weibo.com/ajax/side/hotSearch"
            response = requests.get(url, headers=self.headers)
            data = response.json()
            
            if data and 'data' in data and 'realtime' in data['data']:
                hot_topics = data['data']['realtime']
                for topic in hot_topics[:Config.MAX_TOPICS_PER_SOURCE]:  # 获取热搜
                    if self._is_meme(topic['word']):
                        self.memes_data.append({
                            'name': topic['word'],
                            'heat': topic['num'],
                            'source': '微博热搜'
                        })
            return len(self.memes_data)
        except Exception as e:
            print(f"微博热搜采集错误: {e}")
            return 0
    
    def collect_bilibili_hot_topics(self):
        """从B站热门话题采集"""
        try:
            url = f"https://api.bilibili.com/x/web-interface/search/square?limit={Config.BILIBILI_API_LIMIT}"
            response = requests.get(url, headers=self.headers)
            data = response.json()
            
            if data and data['code'] == 0 and 'data' in data:
                trending = data['data']['trending']
                for topic in trending['list']:
                    if self._is_meme(topic['keyword']):
                        self.memes_data.append({
                            'name': topic['keyword'],
                            'heat': topic['show_name'],
                            'source': 'B站热搜'
                        })
            return len(self.memes_data)
        except Exception as e:
            print(f"B站热搜采集错误: {e}")
            return 0
    
    def collect_zhihu_hot_topics(self):
        """从知乎热榜采集 - 使用网页爬虫方式"""
        try:
            # 方法1：尝试使用公开的知乎热榜页面
            url = "https://www.zhihu.com/hot"
            headers = {
                **self.headers,
                'Referer': 'https://www.zhihu.com/',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                'Accept-Encoding': 'gzip, deflate, br',
                'Cache-Control': 'no-cache',
                'Pragma': 'no-cache'
            }
            
            response = requests.get(url, headers=headers, timeout=Config.REQUEST_TIMEOUT)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            # 解析知乎热榜页面的标题
            hot_items = soup.find_all('div', class_='HotItem-content')
            
            for item in hot_items[:Config.MAX_TOPICS_PER_SOURCE]:  # 获取热门话题
                title_element = item.find('h2', class_='HotItem-title')
                if title_element:
                    title = title_element.get_text(strip=True)
                    if self._is_meme(title):
                        # 尝试获取热度信息
                        metrics_element = item.find('div', class_='HotItem-metrics')
                        heat = '热门' if metrics_element else '未知热度'
                        
                        self.memes_data.append({
                            'name': title,
                            'heat': heat,
                            'source': '知乎热榜'
                        })
            
            print(f"知乎热榜采集完成，获取到 {len([m for m in self.memes_data if m['source'] == '知乎热榜'])} 个相关话题")
            return len([m for m in self.memes_data if m['source'] == '知乎热榜'])
            
        except requests.exceptions.RequestException as e:
            print(f"知乎热榜网络请求错误: {e}")
            return self._fallback_zhihu_collection()
        except Exception as e:
            print(f"知乎热榜采集错误: {e}")
            return self._fallback_zhihu_collection()
    
    def _fallback_zhihu_collection(self):
        """知乎采集失败时的备用方案"""
        try:
            # 备用方案：使用一个更简单的API端点（如果存在）
            # 或者添加一些预设的热门网络梗
            fallback_memes = [
                {'name': '整活', 'heat': '热门', 'source': '知乎热榜(备用)'},
                {'name': '破防', 'heat': '热门', 'source': '知乎热榜(备用)'},
                {'name': '摆烂', 'heat': '热门', 'source': '知乎热榜(备用)'}
            ]
            
            self.memes_data.extend(fallback_memes)
            print("知乎热榜采集失败，使用备用数据")
            return len(fallback_memes)
        except Exception as e:
            print(f"知乎备用方案也失败了: {e}")
            return 0
    
    def _is_meme(self, text):
        """使用大模型判断一个话题是否为网络梗"""
        # 检查缓存
        if text in self.meme_cache:
            return self.meme_cache[text]
        
        # 如果没有OpenAI客户端，直接返回True（输出所有热点）
        if not self.openai_client:
            print(f"LLM不可用，直接输出热点: '{text}'")
            self.meme_cache[text] = True
            return True
        
        try:
            # 构建prompt
            prompt = f"""
请判断以下文本是否是一个"网络梗"。

网络梗的定义：普罗大众都知道的一个有趣的事件、短语、表达方式或者流行语，通常具有幽默性、娱乐性，在网络上广泛传播并被大家理解和使用。

网络梗的特征：
1. 具有趣味性和娱乐性
2. 在网络上广泛传播
3. 大部分网民都能理解其含义
4. 经常用于表达情绪或观点
5. 具有一定的文化内涵或背景故事

不是网络梗的例子：
- 纯粹的新闻事件（如"地震"、"事故"等）
- 严肃的政治话题
- 单纯的人名或地名
- 技术术语或专业词汇

待判断文本："{text}"

请只回答"是"或"否"，不要解释。
"""
            
            response = self.openai_client.chat.completions.create(
                model=Config.OPENAI_MODEL,
                messages=[
                    {"role": "system", "content": "你是一个专门识别网络梗的助手，能够准确判断一个词语或短语是否为网络梗。"},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=Config.OPENAI_MAX_TOKENS,
                temperature=Config.OPENAI_TEMPERATURE
            )
            
            result = response.choices[0].message.content.strip()
            is_meme = result == "是"
            
            # 缓存结果
            self.meme_cache[text] = is_meme
            return is_meme
            
        except Exception as e:
            print(f"LLM判断梗失败 ('{text}'): {e}，直接输出热点")
            # 调用失败时直接返回True（输出所有热点）
            self.meme_cache[text] = True
            return True
    
    def run_all_collectors(self):
        """运行所有采集器"""
        self.collect_weibo_hot_topics()
        self.collect_bilibili_hot_topics()
        self.collect_zhihu_hot_topics()
        
        # 返回采集到的数据
        return self.memes_data