In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re

def get_stock_info(url):
    """
    从指定URL获取股票相关信息并返回JSON格式数据
    """
    try:
        # 设置请求头，模拟浏览器访问
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }
        
        # 发送GET请求
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        response.encoding = 'utf-8'
        
        # 解析HTML内容
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # 提取股票信息
        stock_data = extract_stock_data(soup)
        
        return json.dumps(stock_data, ensure_ascii=False, indent=2)
        
    except requests.RequestException as e:
        return json.dumps({"error": f"请求失败: {str(e)}"}, ensure_ascii=False, indent=2)
    except Exception as e:
        return json.dumps({"error": f"数据解析失败: {str(e)}"}, ensure_ascii=False, indent=2)

def extract_stock_data(soup):
    """
    从BeautifulSoup对象中提取股票数据
    """
    stock_info = {}
    
    try:
        # 提取股票代码和名称
        title_element = soup.find('title')
        if title_element:
            stock_info['page_title'] = title_element.get_text().strip()
        
        # 提取股票基本信息
        # 尝试从不同的可能位置提取数据
        
        # 查找包含股票代码的元素
        code_elements = soup.find_all(string=re.compile(r'HK\d+|^\d{5}$'))
        if code_elements:
            stock_info['stock_code'] = code_elements[0].strip()
        
        # 查找股票名称
        name_selector = soup.select('.stock-name, .name, h1')
        if name_selector:
            stock_info['stock_name'] = name_selector[0].get_text().strip()
        
        # 查找当前价格
        price_selectors = ['.current-price', '.price', '.now-price', '.stock-price']
        for selector in price_selectors:
            price_element = soup.select_one(selector)
            if price_element:
                stock_info['current_price'] = price_element.get_text().strip()
                break
        
        # 查找涨跌幅
        change_selectors = ['.change-rate', '.rate', '.change-percent']
        for selector in change_selectors:
            change_element = soup.select_one(selector)
            if change_element:
                stock_info['change_rate'] = change_element.get_text().strip()
                break
        
        # 查找涨跌额
        change_amount_selectors = ['.change-amount', '.change', '.price-change']
        for selector in change_amount_selectors:
            change_element = soup.select_one(selector)
            if change_element:
                stock_info['change_amount'] = change_element.get_text().strip()
                break
        
        # 提取表格数据（如果存在）
        tables = soup.find_all('table')
        if tables:
            table_data = []
            for table in tables:
                rows = table.find_all('tr')
                for row in rows:
                    cells = row.find_all(['td', 'th'])
                    if len(cells) >= 2:
                        key = cells[0].get_text().strip()
                        value = cells[1].get_text().strip()
                        if key and value:
                            table_data.append({key: value})
            if table_data:
                stock_info['table_data'] = table_data
        
        # 提取所有文本内容作为备用信息
        all_text = soup.get_text()
        # 查找可能包含股票信息的数字模式
        numbers = re.findall(r'\d+\.?\d*', all_text)
        if numbers:
            stock_info['extracted_numbers'] = numbers[:10]  # 限制前10个数字
        
        # 如果没有提取到主要信息，添加原始HTML的部分内容
        if len(stock_info) <= 1:
            stock_info['raw_content'] = str(soup)[:1000] + "..." if len(str(soup)) > 1000 else str(soup)
        
    except Exception as e:
        stock_info['extraction_error'] = str(e)
    
    return stock_info

# 主函数调用
def main():
    """
    主函数：获取股票信息并返回JSON格式数据
    """
    url = "https://stockpage.10jqka.com.cn/HK2018/"
    result = get_stock_info(url)
    print(result)
    return result

# 执行主函数
if __name__ == "__main__":
    stock_json = main()

{
  "page_title": "瑞声科技(02018)首页概览_港股行情_同花顺金融网",
  "stock_code": "var waplink = 'https://m.10jqka.com.cn/stockpage/hk_HK2018/';\n\ndocument.domain=\"10jqka.com.cn\";\nvar protocol = window.location.protocol;\nif (protocol == \"http:\") {\n    window.location.href = window.location.href.replace(\"http\", \"https\");\n}\nvar JUMP={\n\t\"hash\":(!window.location.hash)?\"\":window.location.hash,\n\t\"agent\":navigator.userAgent,\n\t\"to_mobile\":function() {\n\t\tif ((this.agent.indexOf('Android') != -1 ||this.agent.indexOf('Mobile') != -1) && this.hash != '#pc') {\n\t\t    window.location.href = waplink;\n\t\t}\n\t}\n};\nJUMP.to_mobile();",
  "stock_name": "瑞声科技HK2018",
  "current_price": "--",
  "table_data": [
    {
      "股票\n\n\n每日必读\n\n\n新股频道\n\n\n个股\n\n\n滚动新闻\n\n\n同花顺原创\n\n\n股指期货\n\n\n创业板\n\n\n港股频道\n\n\n港股频道": "财经\n\n\n财经要闻\n\n\n互动平台\n\n\n国内经济\n\n\n国际经济\n\n\n宏观经济\n\n\n每日专题\n\n\n财经人物\n\n\n产经新闻\n\n\n大事件直播"
    },
    {
      "股票名称": "最新价"
    },
    {
      "公司名称：瑞声科技控股有限公司": "上市日期：20