In [1]:
import requests
import numpy as np
import pandas as pd
import bs4
import re
import json

In [2]:
def extract_link(line):
    """Return the absolute chinamoney URL of the first ``<a href=...>`` found in `line`.

    `line` must expose a ``find`` method (it is called as ``line.find('a', href=True)``).
    Returns None when no anchor is found or the anchor has no ``href`` attribute.
    """
    anchor = line.find('a', href = True)

    # Nothing to link to: fall through with None, exactly like an implicit return.
    if anchor is None or 'href' not in anchor.attrs:
        return None

    return 'http://www.chinamoney.com.cn' + anchor['href']

In [3]:
def parse_link(link):
    """Issue an HTTP GET for `link` and return the response body parsed with 'html.parser'."""
    response = requests.get(link)
    return bs4.BeautifulSoup(response.content, 'html.parser')

In [4]:
# Inspect the page-navigation cell ('market-note-1') of a parsed page and
# report whether it lacks a "next page" ('下一页') link.

def last_page(page):
    """Return True when no anchor inside the 'market-note-1' cell reads '下一页'."""
    navigation = page.find('td', class_ = 'market-note-1')
    anchors = navigation.find_all('a')
    return not any(anchor.get_text() == '下一页' for anchor in anchors)

In [5]:
# make a list of strings unique by appending "_2", "_3", etc. to duplicate entries

def uniquify(df_columns):
    """Yield the items of `df_columns` in order, renaming repeats so every yielded name is unique.

    The first occurrence of a name is yielded unchanged. A later occurrence is
    yielded as ``"<name>_<k>"`` where k is the smallest integer >= 2 for which
    the result has not been yielded before.
    """
    used = set()

    for original in df_columns:
        candidate = original
        suffix = 2

        # Keep bumping the numeric suffix until the candidate is free.
        while candidate in used:
            candidate = "{}_{}".format(original, suffix)
            suffix += 1

        used.add(candidate)
        yield candidate

In [159]:
# download terms and conditions from a hyperlink

def get_TC(link):
    """Download a bond detail page and return its terms and conditions as a one-row DataFrame.

    Parameters
    ----------
    link : str
        URL of the page, fetched and parsed through `parse_link`.

    Returns
    -------
    pandas.DataFrame
        A single row whose column names come from the even-positioned cells and
        whose values come from the odd-positioned cells. Column names are
        translated by `format_bond_list` and then made unique by `uniquify`.
    """
    tcpage = parse_link(link)

    cells = tcpage.find_all('td', {'class': ['bdr-rgt1', 'row2']})

    # Strip all whitespace from every cell. A raw string is required here:
    # '\s' in a plain literal is an invalid escape sequence.
    # The slice skips the first 10 cells and the last one (the last line is
    # redundant; presumably the first 10 are page header cells - TODO confirm).
    tc = [re.sub(r'\s', '', cell.get_text()) for cell in cells][10:-1]

    # Cells alternate label, value, label, value, ...
    tcdf = pd.DataFrame([tc[1::2]], columns = tc[0:-1:2])

    # Change column names from Chinese to English
    tcdf = format_bond_list(tcdf)

    # Make columns unique by appending numbers if there are multiple follow-ons
    tcdf.columns = list(uniquify(tcdf.columns))

    return tcdf

In [157]:
# Append two dataframes preserving the column orders

def TC_append(df1, df2):
    """Stack `df2` under `df1`, keeping the column order of the wider frame.

    The frame with more columns dictates the leading column order (on a tie,
    `df2` does). Columns that exist only in the other frame are appended after
    them instead of being dropped, so no scraped field is silently lost.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The row-wise concatenation of both inputs; row labels are kept as-is.
    """
    if len(df1.columns) > len(df2.columns):
        wider, narrower = df1, df2
    else:
        wider, narrower = df2, df1

    cols = list(wider.columns)
    # Keep fields that only the narrower frame knows about.
    cols.extend(col for col in narrower.columns if col not in wider.columns)

    return pd.concat([df1, df2])[cols]

In [8]:
def list_from_page(page):
    """Turn the bond table of a parsed listing page into a DataFrame.

    Collects every ``<td>`` with class 'dreport-row1' or 'dreport-row2', lays
    the cell texts out six per row, and adds a 'Link' column built by calling
    `extract_link` on the first cell of each row.
    """
    column_names = ['Short Name', 'Code', 'Issuer', 'Bond Type', 'Issue Date', 'Latest Rating']

    # Isolate the table cells holding bond information, based on tag and classes
    cells = page.find_all('td', {'class': ['dreport-row1', 'dreport-row2']})

    # Cell texts with non-breaking spaces removed
    texts = [cell.get_text().replace('\xa0', '') for cell in cells]
    n_cells = len(texts)

    # Six consecutive cells make up one bond record
    grid = np.array(texts).reshape(n_cells//6, 6)
    table = pd.DataFrame(grid, columns = column_names)

    # The first cell of every record is the one passed to extract_link
    row_heads = cells[::6]
    table['Link'] = pd.Series([extract_link(head) for head in row_heads])

    return table

In [9]:
def format_bond_list(df):
    """Rename the Chinese column headers of `df` to their English equivalents.

    Column names without an entry in the mapping are left untouched. The
    frame is modified in place and also returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with renamed columns.
    """
    # Both '发行日期' and '债券发行日' map to 'Issue Date'.
    nameDict = {'债券简称': 'Short Name',
                '债券代码': 'Code',
                '发行人': 'Issuer',
                '债券类型': 'Bond Type',
                '发行日期': 'Issue Date',
                '最新评级': 'Latest Rating',
                '债券发行日': 'Issue Date',
                '到期兑付日': 'Maturity Date',
                '上市交易日': 'First Trade Date',
                '债券摘牌日': 'Last Trade Date',
                '债券期限': 'Tenor',
                '流通范围': 'Circulation Range',
                '面值(元)': 'Face Value',
                '发行价格(元)': 'Issue Price',
                '计划发行量(亿)': 'Planned Issue Amount (100 millions)',
                '实际发行量(亿)': 'Actual Issue Amount (100 millions)',
                '币种': 'Currency',
                '计息基础': 'Day Count Basis',
                '息票类型': 'Coupon Type',
                '债券起息日': 'Dated Date',
                '付息频率': 'Coupon Frequency',
                '票面利率(%)': 'Coupon Rate',
                '发行收益率(%)': 'Yield at Issue',
                '参考收益率(%)': 'Yield for Reference',
                '基准利率': 'Reference Rate',
                '基准利差(BP)': 'Spread to Reference Rate',
                '信用评级机构一': 'Rating Agency 1',
                '债项/主体评级一': 'Issue/Issuer Rating 1',
                '信用评级机构二': 'Rating Agency 2',
                '债项/主体评级二': 'Issue/Issuer Rating 2',
                '行权类型': 'Option Type',
                '行权日期': 'Option Date',
                '托管机构': 'Clearing House',
                '续发行发行日': 'Follow-on Issue Date',
                # Previously had a stray leading space in the English name.
                '续发行上市交易日': 'Follow-on First Trade Date',
                '计划续发行总额(亿)': 'Planned Follow-on Amount (100 millions)',
                '实际续发行总额(亿)': 'Actual Follow-on Amount (100 millions)',
                '续发行价格(元)': 'Follow-on Issue Price',
                '续发行收益率(%)': 'Follow-on Yield at Issue',
                '备注': 'Notes'}

    # Unknown headers fall back to themselves.
    df.columns = df.columns.map(lambda name: nameDict.get(name, name))

    return df

In [None]:
def get_bond_list(bondType = '', couponType = '', issueYear = '', startPage = 1, endPage = ''):
    """Download the terms and conditions of every bond returned by the bond search endpoint.

    Parameters
    ----------
    bondType, couponType, issueYear : str
        Search filters sent as query parameters; an empty string sends the
        filter empty.
    startPage : int
        First result page to request.
    endPage : int or ''
        Last result page to request (inclusive). When left empty ('' or None)
        the loop runs until the page number exceeds the 'totalPages' value
        reported by the service.

    Returns
    -------
    pandas.DataFrame
        One row per bond, combined with `TC_append`.
    """
    output = pd.DataFrame()
    params = {'bondName':'', 'bondCode':'', 'bondType': bondType, 'issueEnty':'',
             'couponType':couponType, 'issueYear':issueYear, 'enty':'', 'rtngShrt':'', 'pagingPage_il':startPage}

    # The default endPage is '', which cannot be used in range(); treat it as "no upper bound".
    unbounded = endPage in ('', None)
    page = startPage

    while unbounded or page <= endPage:

        params['pagingPage_il'] = page

        response = requests.get('http://www.chinamoney.com.cn/dqs/rest/cm-u-pt/bondInfoList', params = params)

        #convert response to a dict
        response_dict = json.loads(response.text)

        #check if the loop has reached the last page
        if page > response_dict['data']['totalPages']:
            break

        #bond list from the 'records' entry in the dict
        bond_list = pd.DataFrame(response_dict['records'])

        #for each bond in the list, download its T&C and add it to the result
        for code in bond_list['definedCode']:
            output = TC_append(output, TC_from_code(code))

        page += 1

    return output

In [18]:
# Disabled earlier implementation of get_bond_list that scraped the HTML search pages.
# It is kept as a string literal so it is never executed; the literal was missing its
# closing quotes, which are restored on the last line.
'''def get_bond_list(bondType = '', couponType = '', issueYear = '', rtngShrt= '', startPage = 1, endPage = ''):
    
    output = pd.DataFrame()
    page = startPage
    
    while True:
        link = ('http://www.chinamoney.com.cn/fe/chinamoney/searchBondInfoForward.action?rtngShrt=' + str(rtngShrt)
                + '&pagingPage_il_='+ str(page) + '&issueEnty=&couponType=' + str(couponType)
                + '&enty=&issueYear=' + str(issueYear) + '&bondType=' + str(bondType) + '&bondName=&bondCode=&')
        
        webpage = parse_link(link)
        
        # bond list
        bondList = list_from_page(webpage)
        
        # For each bond in the links, get its T&C and combine all T&C into a dataframe called TC
        TC = pd.DataFrame()
        for link in bondList['Link']: TC = TC_append(TC, get_TC(link))
        
        # Merge the bond list with the TC table
        bondList = bondList.merge(TC, how = 'outer')
            
            
        output = TC_append(output, bondList)
        page += 1
        
        if last_page(webpage) or page > endPage: return output #if the current downloaded page is the last page, exit loop
'''

In [11]:
# remove unnecessary columns and write results to csv

def output_list(df, path):
    """Write `df` to `path` as UTF-8 CSV without the index, leaving out a fixed set of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export; it is not modified.
    path : str or file-like
        Destination handed to ``DataFrame.to_csv``.

    Columns in the removal list that are not present in `df` are skipped
    rather than raising KeyError, because the scraped frames do not always
    carry every field.
    """
    cols = ['Last Trade Date', 'Tenor', 'Circulation Range', 'Planned Issue Amount (100 millions)',
            'Yield for Reference', 'Reference Rate', 'Spread to Reference Rate', 'Rating Agency 1',
            'Issue/Issuer Rating 1', 'Rating Agency 2', 'Issue/Issuer Rating 2']

    df.drop(columns = cols, errors = 'ignore').to_csv(path, index = False, encoding = 'utf-8')

In [28]:
# download the master mapping table from cninfo

def get_bond_map():
    """Fetch the cninfo bond cross-reference page and return its table as a DataFrame.

    The texts of all ``<td>`` cells with class 'zx_center_title' or 'zx_data6'
    are arranged eight per row; the first row supplies the column headers and
    the remaining rows the data.
    """
    response = requests.get('http://www.cninfo.com.cn/disclosure/bondcross-references.jsp')

    # encoding parameter is necessary as some characters used are outside of the specified char set
    parsed = bs4.BeautifulSoup(response.content, 'html.parser', from_encoding='GB18030')

    cells = parsed.find_all('td', {'class': ['zx_center_title', 'zx_data6']})
    texts = [cell.get_text() for cell in cells]

    n_rows = len(texts)//8
    grid = np.array(texts).reshape(n_rows, 8)

    # first row holds the headers, everything below it is data
    header = grid[0,:]
    body = grid[1:,:]

    return pd.DataFrame(body, columns = header)

In [126]:
# Change the layout of the map to enable lookup

def gather_map(bond_map):
    """Reshape the wide bond map into long form, one row per (bond, exchange) pair.

    The three code columns ('深交所代码', '上交所代码', '银行间代码') are melted into
    a '交易所' column (with the '代码' suffix stripped from the label) and a '代码'
    column holding the value; '债券名称' and '到期日期' are kept as identifiers.
    The input frame is left unchanged.
    """
    identifier_columns = ['债券名称', '到期日期']
    code_columns = ['深交所代码', '上交所代码', '银行间代码']

    long_map = bond_map.melt(id_vars = identifier_columns,
                             value_vars = code_columns,
                             var_name = '交易所',
                             value_name = '代码')

    # '深交所代码' -> '深交所', etc.
    exchange_labels = long_map['交易所'].str.replace('代码', '')
    long_map['交易所'] = exchange_labels

    return long_map

In [269]:
def map_exchange(row, bondmap):
    """Look up the '交易所' value for a row's (Code, Maturity Date) pair.

    `bondmap` is expected to be indexed by that pair; NaN is returned when the
    pair is not in ``bondmap.index``.
    """
    key = (row['Code'], row['Maturity Date'])

    if key not in bondmap.index:
        return np.nan

    return bondmap.loc[key, '交易所']
        
# NOTE(review): `output1` and `bondmap1` are not defined in any cell visible here, so this
# line depends on kernel state created elsewhere - confirm it survives Restart & Run All.
# Every displayed row of the recorded output is NaN, which suggests no (Code, Maturity Date)
# pair was found in bondmap1.index - verify how bondmap1 is indexed.
output1.apply(lambda row: map_exchange(row, bondmap1), axis = 1).sort_values()

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
10    NaN
11    NaN
12    NaN
13    NaN
14    NaN
15    NaN
16    NaN
17    NaN
18    NaN
19    NaN
20    NaN
21    NaN
22    NaN
23    NaN
24    NaN
25    NaN
26    NaN
27    NaN
28    NaN
29    NaN
       ..
258   NaN
259   NaN
260   NaN
261   NaN
262   NaN
263   NaN
264   NaN
265   NaN
266   NaN
267   NaN
268   NaN
269   NaN
270   NaN
271   NaN
272   NaN
273   NaN
274   NaN
275   NaN
276   NaN
277   NaN
278   NaN
279   NaN
280   NaN
281   NaN
282   NaN
283   NaN
284   NaN
285   NaN
286   NaN
287   NaN
Length: 288, dtype: float64

## Functions to download T&C given a list of symbols

In [31]:
# Convert symbols to codes used in the URLs

def find_code(symbol):
    """Query the chinamoney search endpoint for `symbol` and extract the first run of digits.

    Returns a ``(flag, code)`` pair: ``(True, '<digits>')`` when the parsed
    response text contains a digit sequence, otherwise ``(False, None)``.
    """
    search_url = ('http://www.chinamoney.com.cn/fe/chinamoney/seniorSearchAjaxAction.action?bondCode='
                  + str(symbol) + '&bondName=')

    response_text = str(parse_link(search_url))
    match = re.search('[0-9]+', response_text)

    if match is None:
        return False, None

    return True, match.group(0)

In [108]:
def TC_from_code(code):
    """Build the detail-page URL for `code` and return the T&C frame produced by `get_TC`."""
    detail_url = 'http://www.chinamoney.com.cn/fe/chinamoney/searchDetailInfo.action?entyCode=' + code
    return get_TC(detail_url)

In [144]:
# Disabled helper kept as a string literal, so it is never executed.
# It would loop over mmlist[start:end], resolve each symbol with find_code and append the
# T&C of every symbol that resolved. NOTE(review): `mmlist` is not defined in the visible cells.
'''def TC_from_symbol(start, end):
    
    TC = pd.DataFrame()
    
    for symbol in mmlist[start:end]:
        
        flag, code = find_code(symbol)
        if flag == True:
            link = 'http://www.chinamoney.com.cn/fe/chinamoney/searchDetailInfo.action?entyCode=' + code
            TC = TC_append(TC, get_TC(link))
        
    return TC'''