## 1. Index Components

In [1]:
import os
# Folder that receives the downloaded annual-report PDFs.
save_path = "./Data/AnnualReports"
# exist_ok=True keeps this cell re-runnable when the folder already exists.
os.makedirs(save_path, exist_ok=True)

In [2]:
import pandas as pd

# Index constituents exported from iFind. The sheet is renamed positionally,
# so it must hold exactly these five columns.
# NOTE: a later cell writes the enriched table back to this same path, so
# re-running this cell afterwards reads a workbook with a different layout.
components = pd.read_excel("./Data/Components.xlsx")
components.columns = ["code_full", "name", "price", "rtn", "type"]
# Keep the part in front of the exchange suffix ("000001.SZ" -> "000001")
# and left-pad it to six characters.
components["code"] = (
    components["code_full"]
    .map(lambda full_code: full_code.split(".")[0])
    .str.zfill(6)
)
components.head()

Unnamed: 0,code_full,name,price,rtn,type,code
0,000001.SZ,平安银行,10.14,-1.27,沪深300,1
1,000002.SZ,万科A,6.81,-1.3,沪深300,2
2,000063.SZ,中兴通讯,26.61,-2.03,沪深300,63
3,000100.SZ,TCL科技,3.85,-1.28,沪深300,100
4,000166.SZ,申万宏源,4.57,-1.3,沪深300,166


In [4]:
# Write the table (now with the `code` column) back to the workbook it was read from.
components.to_excel("./Data/Components.xlsx", index=False)

## 2. Downloading Annual Reports

In [7]:
save_path = "./Data/AnnualReports"

import pandas as pd
import os

# Read `code` as text: without an explicit dtype, read_excel re-infers the
# column as integers and the leading zeros of tickers such as "000001" are lost.
components = pd.read_excel("./Data/Components.xlsx", dtype={"code": str})
# Re-pad in case the workbook stored the codes as numeric cells.
components["code"] = components["code"].str.zfill(6)
components = components.set_index("code")
components.head()

Unnamed: 0_level_0,code_full,name,price,rtn,type
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,000001.SZ,平安银行,10.14,-1.27,沪深300
2,000002.SZ,万科A,6.81,-1.3,沪深300
63,000063.SZ,中兴通讯,26.61,-2.03,沪深300
100,000100.SZ,TCL科技,3.85,-1.28,沪深300
166,000166.SZ,申万宏源,4.57,-1.3,沪深300


In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
import re
import time

In [3]:
# Disclosure search page on cninfo that the Selenium driver opens.
page_url = "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index"
# Date range typed into the page's date picker (YYYY-MM-DD).
# NOTE: the "Target Return" section rebinds these two names using a
# different format (YYYYMMDD).
start_date = "2021-01-01"
end_date = "2022-12-31"

def setup_driver(edge_driver_path, url=page_url, start_date=start_date, end_date=end_date):
    """Start a headless Edge session on the search page with the date range applied.

    Args:
        edge_driver_path: Path to the msedgedriver executable.
        url: Page to open; defaults to the disclosure search page.
        start_date: Lower bound typed into the date picker. Bound at definition
            time, so later rebinding of the global of the same name does not
            change what this function uses.
        end_date: Upper bound typed into the date picker (bound like start_date).

    Returns:
        The WebDriver, left on the search page with the date filter submitted.

    Raises:
        Any Selenium exception raised during setup (e.g. a timeout while
        waiting for the date picker). The browser is closed before re-raising.
    """
    options = webdriver.EdgeOptions()
    # The `options.headless` setter is no longer supported by current
    # Selenium 4 releases, where assigning it has no effect; the command-line
    # switch is the supported way to run without a window.
    options.add_argument("--headless")
    driver = webdriver.Edge(service=Service(edge_driver_path), options=options)

    try:
        driver.get(url)

        # wait for the page to load; `until` returns the located element
        date_picker = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "handle-datepicker"))
        )
        date_picker.click()

        # Clear and set the start date
        start_date_input = driver.find_element(By.CSS_SELECTOR, ".handle-datepicker input[placeholder='开始日期']")
        start_date_input.click()
        start_date_input.clear()
        time.sleep(0.5)
        start_date_input.send_keys(start_date)

        # Clear and set the end date
        end_date_input = driver.find_element(By.CSS_SELECTOR, ".handle-datepicker input[placeholder='结束日期']")
        end_date_input.click()
        end_date_input.clear()
        time.sleep(0.5)
        end_date_input.send_keys(end_date)

        # ENTER submits the date range
        end_date_input.send_keys(Keys.ENTER)
    except Exception:
        # Do not leave an orphaned browser/driver process behind on failure.
        driver.quit()
        raise
    return driver

# replace the path with the path to your Edge driver
# (machine-specific absolute path; it will not exist on other computers)
edge_driver_path = r"D:\Program Files\edgedriver_win64\msedgedriver.exe"
# Opens the search page and applies the date range; the scraping cells below reuse this driver.
driver = setup_driver(edge_driver_path)

In [4]:
# Tickers to search for, normalised to six-character strings; the same list
# replaces the index of `components` so rows can be looked up by padded code.
stocks_codes = [str(raw_code).zfill(6) for raw_code in components.index.tolist()]
components.index = stocks_codes
# Keyword typed into the title search box.
search_query = "年度报告"

In [5]:
# Scrape the data
def scrape_data(driver, stock_code, search_query=search_query):
    """Search the disclosure page for one stock and collect its annual-report links.

    Args:
        driver: Selenium WebDriver already showing the search page.
        stock_code: Code typed into the code box; also used as the row label
            when looking up the stock name in the module-level `components`.
        search_query: Keyword typed into the title box.

    Returns:
        A list of dicts with keys "StockCode", "StockName", "Title", "Id",
        "Time" and "Link", one per result row whose title ends with "年度报告"
        and does not contain "半". The list is empty when the page shows its
        no-data marker.
    """
    # input the stock code
    code_input = driver.find_element(By.CSS_SELECTOR, "input[placeholder='代码/简称/拼音']")
    code_input.click()
    code_input.clear()
    time.sleep(0.5)
    code_input.send_keys(stock_code)
    code_input.send_keys(Keys.TAB)

    # input the search query
    title_input = driver.find_element(By.CSS_SELECTOR, "input[placeholder='标题关键字']")
    title_input.click()
    title_input.clear()
    time.sleep(0.5)
    title_input.send_keys(search_query)
    title_input.send_keys(Keys.ENTER)

    time.sleep(0.5)

    links = []
    # find_elements returns an empty list when nothing matches, so the
    # "no data" check needs no exception handling. A bare `except:` here
    # would also swallow KeyboardInterrupt and unrelated driver errors.
    if not driver.find_elements(By.CSS_SELECTOR, "div.no-data"):
        # wait for the table to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.el-table__body-wrapper"))
        )
        time.sleep(0.2)

        # get the data
        rows = driver.find_elements(By.CSS_SELECTOR, "div.el-table__body-wrapper tr")
        for row in rows:
            anchor = row.find_element(By.CSS_SELECTOR, "td:nth-child(3) a")
            # get_attribute may return None when the attribute is absent
            title = anchor.get_attribute("title") or ""
            title = title.replace("<em>", "").replace("</em>", "")
            if not title.endswith("年度报告") or "半" in title:
                continue

            link = anchor.get_attribute("href") or ""
            id_match = re.search(r'announcementId=(\d+).*announcementTime=([\d-]+)', link)
            if id_match is None:
                # Link without the expected query parameters: skip the row
                # instead of failing on `None.group(...)`.
                continue
            links.append(
                {
                    "StockCode": stock_code,
                    "StockName": components.loc[stock_code, "name"],
                    "Title": title,
                    "Id": id_match.group(1),
                    "Time": id_match.group(2),
                    "Link": link
                }
            )

    # clear all search tags
    tags = driver.find_elements(By.CSS_SELECTOR, "div.select-box .jc-el-tag")
    for tag in tags:
        close_button = tag.find_element(By.CSS_SELECTOR, "i.el-tag__close")
        close_button.click()
    return links

In [6]:
from tqdm import tqdm

# Run the search once per stock and accumulate every matching report link.
all_links = []
tqdm_bar = tqdm(stocks_codes, desc="Scraping data")
try:
    for stock_code in tqdm_bar:
        links = scrape_data(driver, stock_code)
        all_links.extend(links)
        if len(links) == 0:
            tqdm_bar.set_postfix_str(f"Stock {stock_code} has no data. Number of links: {len(all_links)}")
        else:
            tqdm_bar.set_postfix_str(f"Stock {stock_code}'s data is scraped. Number of links: {len(all_links)}")
        # pause between stocks
        time.sleep(1)
finally:
    # Always close the browser, even when a search raises part-way through;
    # the links collected so far stay available in `all_links`.
    driver.quit()

Scraping data: 100%|██████████| 300/300 [16:23<00:00,  3.28s/it, Stock 688599's data is scraped. Number of links: 560]


In [9]:
# Peek at the first two collected records to check their structure.
all_links[:2]

[{'StockCode': '000001',
  'StockName': '平安银行',
  'Title': '2021年年度报告',
  'Id': '1212533413',
  'Time': '2022-03-10',
  'Link': 'http://www.cninfo.com.cn/new/disclosure/detail?stockCode=000001&announcementId=1212533413&orgId=gssz0000001&announcementTime=2022-03-10'},
 {'StockCode': '000001',
  'StockName': '平安银行',
  'Title': '2020年年度报告',
  'Id': '1209224370',
  'Time': '2021-02-02',
  'Link': 'http://www.cninfo.com.cn/new/disclosure/detail?stockCode=000001&announcementId=1209224370&orgId=gssz0000001&announcementTime=2021-02-02'}]

In [10]:
# Browser-like User-Agent sent with every download request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
}

def download_pdf(download_url, destination, headers=headers, timeout=30):
    """Download one file and write the response body to `destination`.

    Args:
        download_url: URL to fetch.
        destination: Local file path to write (opened in binary mode).
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    # NOTE(review): verify=False turns off TLS certificate verification. It is
    # kept to preserve existing behaviour; confirm it is still needed.
    response = requests.get(download_url, headers=headers, verify=False, timeout=timeout)
    response.raise_for_status()

    with open(destination, "wb") as f:
        f.write(response.content)

In [11]:
# Download every collected report. A failed download is reported and skipped
# so that one bad file does not stop the rest.
# (The failure printed below was caused by a local network issue.)
for code in all_links:  # each `code` is one link record (a dict), not a bare stock code
    download_url = f'http://www.cninfo.com.cn/new/announcement/download?bulletinId={code["Id"]}&announceTime={code["Time"]}'
    destination = os.path.join(save_path, f'{code["StockCode"]}_{code["StockName"]}_{code["Title"]}.pdf')
    try:
        download_pdf(download_url, destination)
    except Exception as exc:
        # `Exception` (not a bare except) lets Ctrl-C interrupt the loop, and
        # printing the error keeps the cause of each failure visible.
        print(f"Failed to download {code['StockCode']}_{code['StockName']}_{code['Title']}.pdf: {exc!r}")

Failed to download 000063_中兴通讯_2021年年度报告.pdf


In [None]:
# Save the links to a file
# NOTE: this rebinds `all_links` from a list of dicts to a DataFrame, so the
# download loop above must not be re-run after this cell.
all_links = pd.DataFrame(all_links)
all_links.to_excel("./Data/ReportsLinks.xlsx", index=False)

In [14]:
# Fetch one report by hand, using a fixed bulletin id and announcement date.
download_url = (
    "http://www.cninfo.com.cn/new/announcement/download"
    "?bulletinId=1212529495&announceTime=2022-03-09"
)
destination = os.path.join(save_path, "000063_中兴通讯_2021年年度报告.pdf")
download_pdf(download_url, destination)

## 3. Target Return

In [14]:
import pandas as pd
import os
import akshare as ak
from tqdm import tqdm

In [15]:
# Read `code` as text so the leading zeros of the tickers survive the round
# trip through Excel, then re-pad in case the cells were stored as numbers.
components = pd.read_excel("./Data/Components.xlsx", dtype={"code": str})
components["code"] = components["code"].str.zfill(6)
components.head()

Unnamed: 0,code_full,name,price,rtn,type,code
0,000001.SZ,平安银行,10.14,-1.27,沪深300,1
1,000002.SZ,万科A,6.81,-1.3,沪深300,2
2,000063.SZ,中兴通讯,26.61,-2.03,沪深300,63
3,000100.SZ,TCL科技,3.85,-1.28,沪深300,100
4,000166.SZ,申万宏源,4.57,-1.3,沪深300,166


In [16]:
# Folder for the per-stock monthly price history CSVs.
save_path = "./Data/HistoryData"
os.makedirs(save_path, exist_ok=True)

# Query window in YYYYMMDD form.
start_date = "20220101"
end_date = "20231231"

# Maps six-digit stock code -> Series of compounded return indexed by year.
return_df = {}

for stock in tqdm(components["code"]):
    stock = str(stock).zfill(6)
    # use "hfq" to adjust the data
    df = ak.stock_zh_a_hist(symbol=stock, period="monthly", start_date=start_date, end_date=end_date, adjust="hfq")
    if df.empty:
        # No rows came back for this stock. Skipping it avoids a KeyError on
        # the missing "日期" column that would abort the whole loop; the
        # stock simply has no entry in `return_df`.
        continue
    df["year"] = pd.to_datetime(df["日期"]).dt.year
    df.to_csv(os.path.join(save_path, f"{stock}.csv"), index=False)
    # Compound the monthly percentage changes within each calendar year.
    return_df[stock] = df.groupby("year")["涨跌幅"].apply(lambda x: (1 + x / 100).prod() - 1)

100%|██████████| 300/300 [00:07<00:00, 38.79it/s]


In [17]:
# Turn the {code: yearly-return Series} mapping into a frame with one row per
# stock and one column per year, and blank out the columns' axis name.
# NOTE: this rebinds `return_df`; re-running the cell on its own would
# transpose the already-built frame again.
return_df = pd.DataFrame(return_df).transpose()
return_df = return_df.rename_axis(columns="")
return_df.head()

Unnamed: 0,2022,2023
1,-0.176295,-0.241602
2,-0.021555,-0.266972
63,-0.210217,0.036873
100,-0.326498,0.212967
166,-0.186012,0.106854


In [18]:
# Normalise the codes to six-character strings so they match the index of
# `return_df`, then attach the yearly returns; the left join keeps every stock.
padded_codes = components["code"].astype(str).str.zfill(6)
components["code"] = padded_codes
components = components.merge(return_df, how="left", left_on="code", right_index=True)
components.head()

Unnamed: 0,code_full,name,price,rtn,type,code,2022,2023
0,000001.SZ,平安银行,10.14,-1.27,沪深300,1,-0.176295,-0.241602
1,000002.SZ,万科A,6.81,-1.3,沪深300,2,-0.021555,-0.266972
2,000063.SZ,中兴通讯,26.61,-2.03,沪深300,63,-0.210217,0.036873
3,000100.SZ,TCL科技,3.85,-1.28,沪深300,100,-0.326498,0.212967
4,000166.SZ,申万宏源,4.57,-1.3,沪深300,166,-0.186012,0.106854


In [19]:
# Check dtypes and non-null counts; a yearly-return column with fewer non-null rows than the frame has missing returns.
components.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   code_full  300 non-null    object 
 1   name       300 non-null    object 
 2   price      300 non-null    float64
 3   rtn        300 non-null    float64
 4   type       300 non-null    object 
 5   code       300 non-null    object 
 6   2022       298 non-null    float64
 7   2023       300 non-null    float64
dtypes: float64(4), object(4)
memory usage: 18.9+ KB


In [20]:
# Drop the snapshot columns that are no longer needed. A non-inplace drop with
# errors="ignore" keeps the cell re-runnable: running it twice no longer raises
# KeyError for columns that are already gone.
components = components.drop(columns=["price", "rtn", "type"], errors="ignore")
# Overwrites the components workbook with the slimmed-down table.
components.to_excel("./Data/Components.xlsx", index=False)

## 4. Check Annual Reports

In [1]:
import os

# Folder holding the downloaded annual reports.
path = "./Data/AnnualReports"
# os.listdir returns entry names in arbitrary order and includes every entry in the folder.
files = os.listdir(path)
files[:5]

['000001_平安银行_2020年年度报告.pdf',
 '000001_平安银行_2021年年度报告.pdf',
 '000002_万科A_2020年年度报告.pdf',
 '000002_万科A_2021年年度报告.pdf',
 '000063_中兴通讯_2020年年度报告.pdf']

In [9]:
import pandas as pd

# Split each file name "<code>_<name>_<title>[_<more>...]" into columns.
expected_columns = ["code", "name", "title", "title2", "title3"]

# rsplit(".", 1) removes only the final extension, so a title that itself
# contains a dot is not truncated. dtype="object" keeps the .str accessor
# usable when the folder is empty.
file_stems = pd.Series([file.rsplit(".", 1)[0] for file in files], dtype="object")

# Cap the number of splits so there are never more pieces than column names
# (any surplus underscores stay inside the last column), and reindex so all
# five columns exist even when no file name has that many parts.
files_series = (
    file_stems.str.split("_", n=len(expected_columns) - 1, expand=True)
    .reindex(columns=range(len(expected_columns)))
)
files_series.columns = expected_columns
files_series = files_series.set_index("code")
files_series.head()

Unnamed: 0_level_0,name,title,title2,title3
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,平安银行,2020年年度报告,,
1,平安银行,2021年年度报告,,
2,万科A,2020年年度报告,,
2,万科A,2021年年度报告,,
63,中兴通讯,2020年年度报告,,


In [10]:
# Number of downloaded files per stock code; show the codes with more than two.
cond = files_series.groupby(level="code").size()
cond.loc[cond.gt(2)]

code
000063    3
003816    3
600011    3
600029    3
601236    3
601238    4
601328    4
601398    4
601808    3
601865    4
601899    3
601988    4
dtype: int64

In [13]:
# List the files belonging to every code that has more than two reports, for manual inspection.
files_series.loc[cond[cond > 2].index]

Unnamed: 0_level_0,name,title,title2,title3
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
63,中兴通讯,2020年年度报告,,
63,中兴通讯,2021年年度报告,,
63,中兴通讯,关于按照《香港上市规则》公布2020年年度报告,,
3816,中国广核,2021年年度报告,,
3816,中国广核,境内同步披露公告- 2021年度报告,,
3816,中国广核,境内同步披露公告2020年度报告,,
600011,华能国际,华能国际2020年年度报告,,
600011,华能国际,华能国际H股2020年年度报告,,
600011,华能国际,华能国际H股2021年年度报告,,
600029,南方航空,H股公告-年度报告,,


In [None]:
"""
Manual processing:
    Rename: 南方航空	H股公告-年度报告 to H股公告-2020年年度报告
    Delete:
        东吴证券股份有限公司关于红塔证券股份有限公司2021年度持续督导年度报告
        国泰君安证券股份有限公司关于福莱特玻璃集团股份有限公司2020年持续督导年度报告
        国泰君安证券股份有限公司关于福莱特玻璃集团股份有限公司2021年持续督导年度报告
        光大证券股份有限公司关于宁波德业科技股份有限公司2021年持续督导年度报告
    Download:
        宁波德业科技股份有限公司2021年年度报告全文

Notes:
    Some companies may have only one report because they went public later.
    We think "H股公告", "境内" or "境外" reports are also useful, so we keep them.
    We did not notice at first that some report titles end with "年度报告全文", but we had already met the target of 500 reports, so we left them out.
"""