# 1. 2023年10月6日开始重构

- 1.akshare有一个接口更新了名字
- 2.券商研报的表头更新了
- 3.通过萝卜投研爬取10年pe平均值做了线程池优化（12个workers让本来50分钟的爬取减少到5分钟）

In [4]:
#!/usr/bin/env python=3.10
# coding=utf-8
# author: KarlQu
# Version: 1.1
# Date: 2023-10-6
# webdriver=Firefox2023.04
# pip install akshare -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com  --upgrade
#  mamba install -c conda-forge selenium=4.13.0
# selenium 4 与 selenium 3 对对象选择语法做了修改
#  mamba install -c conda-forge jupyter
# akshare.__version__ = '1.11.23 '
# !最重要的是g和g的可达性
# akshare更新非常频繁
# pip install akshare --upgrade -i https://pypi.org/simple

# 是否需要更新10年pe均值和券商研报, pe_ave_10(ifupdate = False), 输出位置 f'./backups/{DATE}/nets_end.xlsx'

import numpy as np
import pandas as pd
import requests
import akshare as ak
import tqdm
import time
import datetime as dt
import os
import calendar
from selenium import webdriver
# from selenium.webdriver.support.wait import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.common.by import By
from concurrent import futures

DATE = dt.date.today() # str(DATE) ; dt.date.today().year

UA = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0'
}

def open_file(file_name:str, upperdir = 'backups'):
    """创建文件夹
    Parameters
    ----------
    file_name : 日期字符串
    upperdir : 上级目录名
    """
    if os.path.exists(f'./{upperdir}/' + file_name):
        pass
    else:
        os.mkdir(f'./{upperdir}/' + file_name)

def update_g(ifupdate = False) -> pd.DataFrame:
    """akshare的个股研报接口

    Parameters
    ----------
    ifupdate : bool, optional
        _description_, by default False

    Returns
    -------
    pd.DataFrame
        columns = ['代码','名称','研报数','2021预测每股收益','2022预测每股收益','2023预测每股收益','2024预测每股收益']
        
        共4年的每股收益,2023年更新为'2022预测每股收益','2023预测每股收益','2024预测每股收益','2025预测每股收益'
    """
    if ifupdate:
        open_file(str(DATE), upperdir = 'backups')
        profit_forecast = ak.stock_profit_forecast_em().iloc[:,[1,2,3,9,10,11,12]].drop_duplicates()
        profit_forecast.to_excel(f'./backups/{DATE}/profit_forecast.xlsx',index=False)

    else:
        latest_file = list(os.walk('./backups/'))[0][1][-1]
        profit_forecast = pd.read_excel(f'./backups/{latest_file}/profit_forecast.xlsx',dtype={'代码':'str'})
    # 调整g,收益>0、每期g>10%、每期g可信值35%。 得到调整的几何均数g
    profit_forecast = profit_forecast[(profit_forecast['2022预测每股收益']>0) & (profit_forecast['2023预测每股收益']>0 )& ( profit_forecast['2024预测每股收益']>0)]
    profit_forecast
    profit_forecast1 = profit_forecast.iloc[:,[3,4,5,6]].pct_change(axis=1)[['2023预测每股收益','2024预测每股收益',	'2025预测每股收益']]
    profit_forecast1
    profit_forecast2 = profit_forecast1[(profit_forecast1['2023预测每股收益']>0.1) & (profit_forecast1['2024预测每股收益']>0.1 )& ( profit_forecast1['2025预测每股收益']>0.1)]
    profit_forecast2
    profit_forecast2[profit_forecast2>0.35] = 0.35
    profit_forecast3 = profit_forecast2 + 1
    g_scaled = ((profit_forecast3['2023预测每股收益']*profit_forecast3['2024预测每股收益']*profit_forecast3['2025预测每股收益'])**(1/3)-1)
    g_scaled.name = 'g'
    g = pd.concat([g_scaled,profit_forecast],axis=1,join='inner')[['代码','名称','g','研报数']].reset_index(drop=True)
    return g


def ten_year_ago(date: dt.date = DATE) -> str:
        """10年前日期，月底对应月底
        Parameters
        ----------
        date : datetime.date
        Returns
        -------
        str
        """
        year = date.year - 10
        day = min(date.day, calendar.monthrange(year, date.month)[1])
        dt = date.replace(year=year, month=date.month, day=day)
        # print(dt)
        bigen_date = str(dt).replace('-','')
        return bigen_date

def pe_ave_10(date:str = str(DATE), ifupdate = False):
    """获取or更新10年pe均值和券商研报的预期每股收益

    Parameters
    ----------
    date : str, optional
        _description_, by default str(DATE)
    ifupdate : bool, optional
        _description_, by default False
    """
    if ifupdate:
        g = update_g(ifupdate=ifupdate)
        stock_ids = g.代码.to_list() # 每次爬pe历史均值的时候都更新一下券商研报
        print('代码前五个')
        print(stock_ids[0:5])
        # 1. selenium登录获取cookies
        denglu_link = 'https://robo.datayes.com/v2/landing/peband'
        cruser = webdriver.Firefox()
        cruser.get(denglu_link)
        input("扫码登录后按回车键")
        # WebDriverWait(cruser,60).until(EC.presence_of_all_elements_located((By.ID, 'app')))
        cookies = {x['name']:x['value'] for x in cruser.get_cookies()}
        print(cookies)
        end_date = date.replace('-','')
        begin_date = ten_year_ago()
        ave_pettm = []
        executor = futures.ThreadPoolExecutor(max_workers=12)
        fs = []
        total_tasks = len(stock_ids)
        with tqdm.tqdm(total=total_tasks) as pbar:
            for ids in stock_ids:
                def get_ids_peave(ids, begin_date, end_date, cookies, UA, ave_pettm):
                    try:
                        url = f'https://gw.datayes.com/rrp_adventure/web/stockModel/band/{ids}' # 2022old
                        response = requests.get(url=url,
                                                params= {
                                                    'apiType': '4',
                                                    'category': '1',    # 1为PE，2为PB
                                                    'subCategory': '1',
                                                    'flag': '-1',
                                                    'beginDate': f'{begin_date}',
                                                    'endDate': f'{end_date}'
                                                },
                                                cookies=cookies,
                                                headers=UA)
                        ave_pettm.append([ids, response.json()['data']['mean']])
                        pbar.update(1) # 更新tqdm进度条用
                        time.sleep(np.random.rand()) # 1536/1536 [33:05<00:00,  1.29s/it]
                        # print(f'正常{response.status_code} , code: {ids}')
                    except:
                        print(f'异常{response.status_code} , code: {ids}')
                        pass

                f= executor.submit(get_ids_peave, ids, begin_date, end_date, cookies, UA, ave_pettm)
                fs.append(f)
            futures.wait(fs)
        cruser.quit()
        # 3. 本地保存
        pe_ave = pd.DataFrame(ave_pettm, columns=['code','ave_pettm'])
        open_file(str(DATE), upperdir = 'backups')
        pe_ave.to_excel(f'./backups/{DATE}/avepettm_backup.xlsx',index=False)
        latest_file = DATE
    else:
        latest_file = list(os.walk('./backups/'))[0][1][-1]
        print(f'研报和10年期pe平均值的最近一次更新时间为 {latest_file}')
        g = update_g(ifupdate=ifupdate)
        pe_ave = pd.read_excel(f'./backups/{latest_file}/avepettm_backup.xlsx',dtype={'code':'str'})
    nets = pd.merge(g.rename(columns={'代码':'code'}) , pe_ave , on='code',how='left')
    nets['peg'] = (nets.ave_pettm / (100*nets.g))
    nets1 = nets[(nets['peg']<1.2)]
    # (nets['peg']<1.2).sum()
    # 2. 得到筛选后的股票代码，后续可以爬取pettm和dvttm了
    stock_ids_selected = nets1.code.to_list()
    print('DEBUG:print 代码前五')
    print(stock_ids_selected[0:5])
    pe_ttm = []
    for ids in tqdm.tqdm(stock_ids_selected):
        try:
            rrr = ak.stock_a_indicator_lg(symbol=f'{ids}').iloc[-1,[0,2,7]] # pettm and dvttm 接口
            rrr['industry'] = ak.stock_individual_info_em(symbol=f"{ids}").loc[2].value # 行业接口
            rrr.name = ids
            pe_ttm.append(rrr)
        except:
            print(f'{ids} 查无!')
        continue

    pe_and_dv = pd.concat(pe_ttm,axis=1).T
    pe_and_dv_backup = pe_and_dv.reset_index(drop=False)
    pe_and_dv_backup = pe_and_dv_backup.rename(columns={'index':'code'}) 
    open_file(str(DATE), upperdir = f'backups/{latest_file}')
    pe_and_dv_backup.to_excel(f'./backups/{latest_file}/{DATE}/pe_and_dv.xlsx',index=False)
    # pe_ttm = pd.read_excel(f'./backups/{latest_file}/{DATE}/pe_and_dv.xlsx',dtype={'code':'str'})
    nets2 = pd.merge(nets1,pe_and_dv_backup,on='code',how='left')
    nets3 = nets2[nets2.pe_ttm<nets2.ave_pettm]
    nets3['peg'] = (nets3.pe_ttm / (100*nets3.g))
    nets3['exp_reward_yearly'] = 100*((nets3.ave_pettm/nets3.pe_ttm)**(1/3)*(1+nets3.g)-1)
    nets_end = nets3[nets3.exp_reward_yearly>35].drop_duplicates()
    nets_end.to_excel(f'./backups/{latest_file}/{DATE}/nets_end.xlsx',index=False)   
    print('done! open nets_end.xlsx')

In [5]:
# pe_ave_10(ifupdate=True)
pe_ave_10(ifupdate=False)

最近一次更新时间为 2023-10-06


  profit_forecast1 = profit_forecast.iloc[:,[3,4,5,6]].pct_change(axis=1)[['2023预测每股收益','2024预测每股收益',	'2025预测每股收益']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  profit_forecast2[profit_forecast2>0.35] = 0.35
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  profit_forecast2[profit_forecast2>0.35] = 0.35


代码前五个
['603369', '603801', '002867', '002142', '002271']


 78%|███████▊  | 342/439 [09:13<12:26,  7.69s/it]

605277 查无!


 93%|█████████▎| 408/439 [10:52<00:49,  1.61s/it]

832469 查无!


100%|██████████| 439/439 [11:39<00:00,  1.59s/it]


done! open nets_end.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nets3['peg'] = (nets3.pe_ttm / (100*nets3.g))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nets3['exp_reward_yearly'] = 100*((nets3.ave_pettm/nets3.pe_ttm)**(1/3)*(1+nets3.g)-1)


# 2. AkShare的legulegu.com/robots.txt协议对爬虫没做限制,但是！？
尝试也改造成线程池加速
> 被封ip了。。。

In [3]:
import os
list(os.walk('./backups/')) # [0][1][-1]

[('./backups/', ['2022-10-16', '2023-10-06'], []),
 ('./backups/2022-10-16',
  [],
  ['avepettm_backup.xlsx',
   'nets_end.xlsx',
   'pe_and_dv.xlsx',
   'profit_forecast.xlsx']),
 ('./backups/2023-10-06',
  [],
  ['avepettm_backup.xlsx',
   'nets_end.xlsx',
   'pe_and_dv.xlsx',
   'profit_forecast.xlsx'])]