In [8]:
from bs4 import BeautifulSoup
import pandas as pd
import os
from io import StringIO


def htmls2csv(column_index,
              html_files_directory='./tempHTML/房地产公司汇总/',
              encode='utf-8',
              selector='div.ant-table-body > table',
              output_file='output_combined'):
    """
    Collect every table matching ``selector`` from the HTML files of a folder
    and export them, stacked row-wise, as a single CSV file.

    Files are visited in sorted filename order (plain string sort). Rows and
    columns that are entirely empty are dropped before the header is applied.

    :param column_index: column names assigned to the combined table; its
        length must equal the number of columns left after all-empty columns
        have been dropped
    :param html_files_directory: folder scanned (non-recursively) for
        ``*.html`` files
    :param encode: encoding of the written CSV (the HTML files themselves are
        always read as UTF-8)
    :param selector: CSS selector identifying the tables to extract
    :param output_file: output path without the ``.csv`` extension; missing
        parent directories are created
    :raises ValueError: if no matching table is found in any file, or if
        ``column_index`` does not match the number of extracted columns
    """
    output_csv_file = output_file + '.csv'

    html_files = sorted(
        filename for filename in os.listdir(html_files_directory)
        if filename.endswith(".html")
    )
    print(len(html_files))

    df_list = []
    for filename in html_files:
        file_path = os.path.join(html_files_directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file_in:
            soup = BeautifulSoup(file_in.read(), 'lxml')

        tables = soup.select(selector=selector)
        if not tables:
            print(f"there is no tables with the specified style in the {filename}")
            continue

        for table in tables:
            print(file_path)
            # read_html wants a file-like object; each selected <table> yields
            # exactly one DataFrame. No per-table index reset is needed because
            # the concat below discards the original indexes anyway.
            df_list.append(pd.read_html(StringIO(str(table)))[0])

    # pd.concat([]) fails with an opaque "No objects to concatenate"; fail
    # early with a message that points at the actual cause instead.
    if not df_list:
        raise ValueError(
            f"no table matching selector {selector!r} was found in any "
            f".html file under {html_files_directory!r}"
        )

    combined_df = pd.concat(df_list, ignore_index=True)
    # Drop rows, then columns, that contain nothing but missing values.
    combined_df = combined_df.dropna(axis=0, how='all')
    combined_df = combined_df.dropna(axis=1, how='all')
    combined_df = combined_df.reset_index(drop=True)

    # Validate before assigning so a wrong header list is reported clearly
    # and nothing is written to disk.
    if len(column_index) != combined_df.shape[1]:
        raise ValueError(
            f"column_index has {len(column_index)} names but the combined "
            f"table has {combined_df.shape[1]} columns"
        )
    combined_df.columns = column_index

    # to_csv does not create intermediate folders.
    output_dir = os.path.dirname(output_csv_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    combined_df.to_csv(output_csv_file, index=True, encoding=encode)
    # print(combined_df)

In [9]:
def getAllEntreprise():
    """
    Export the list of real-estate companies that have issued bonds, as saved
    from 企业预警通, to ``DATA/企业预警通/所有发行债券企业.csv``.

    The source pages are read from ``./tempHTML/房地产公司汇总/``.

    NOTE(review): the original note says only non-state-owned, non-central
    enterprises are kept. No such filtering happens here - presumably it was
    applied when the HTML pages were saved; confirm.
    """
    # Header of the "all bond-issuing enterprises" table, in on-page order.
    header = [
        '序号', '企业名称', '经营状态', '法定代表人',
        '注册资本', '注册资本币种', '成立日期', '核准日期',
        '所属省', '所属市', '所属区/县', '统一社会信用代码',
        '联系电话', '邮箱', '参保人', '企业性质',
        '组织形式', '国标行业门类', '国标行业大类', '国标行业中类',
        '国标行业小类', '网址', '注册地址', '通信地址', '是否发行债券', '经营范围',
    ]
    htmls2csv(
        header,
        html_files_directory='./tempHTML/房地产公司汇总/',
        encode='utf-8',
        selector="div.ant-table-body > table",
        output_file="DATA/企业预警通/所有发行债券企业",
    )
    
def getDaultBond():
    """
    Export the defaulted-bond list for real-estate companies, as saved from
    企业预警通, to ``DATA/企业预警通/债券违约.csv``.

    The source pages are read from ``./tempHTML/债券违约/``.
    """
    # Header of the bond-default table, in on-page order.
    header = [
        '序号', '债券代码', '债券简称', '最新违约日',
        '最新违约金额', '发行人', '主承', '违约原因', '最新违约类型',
        '累计违约金额', '首次违约日', '企业性质', '行业',
        '地区', '最新主体评级', '债券市场', '债券类型',
    ]
    htmls2csv(
        header,
        html_files_directory='./tempHTML/债券违约/',
        encode='utf-8',
        selector="div.ant-table-body > table",
        output_file="DATA/企业预警通/债券违约",
    )

def getDefaultEntity():
    """
    Export the list of defaulting entities (issuers with bond defaults), as
    saved from 企业预警通, to ``DATA/企业预警通/违约主体.csv``.

    The source pages are read from ``./tempHTML/违约主体/``.
    """
    # Header of the defaulting-entity table, in on-page order.
    header = [
        '序号', '违约主体', '首次违约日期', '违约只数',
        '违约金额', '已偿还', '偿还进度', '行业', '地区',
        '企业性质',
    ]
    htmls2csv(
        header,
        html_files_directory='./tempHTML/违约主体/',
        encode='utf-8',
        selector="div.ant-table-body > table",
        output_file="DATA/企业预警通/违约主体",
    )
    
def getAmericanBond():
    """
    Export the tables saved under ``./tempHTML/美债违约/`` to
    ``DATA/企业预警通/美债违约.csv``.

    NOTE(review): judging by the folder name and the headers below, this is
    the list of defaulted USD-denominated bonds - confirm.
    """
    # Header applied to the combined table, in on-page order.
    header = [
        '序号', 'ISIN代码', '债券全称',
        '违约日期', '信用主体', '违约原因',
        '违约金额(亿美元)', '违约金额币种', '发行人',
        '发行规模(亿元)', '起息日', '最新票息利率(%)',
        '兑付日', '企业性质', '主体评级', '行业',
    ]
    htmls2csv(
        header,
        html_files_directory='./tempHTML/美债违约/',
        encode='utf-8',
        selector="div.ant-table-body > table",
        output_file="DATA/企业预警通/美债违约",
    )

def getBondDetail():
    """
    Export the tables saved under ``./tempHTML/债券展期明细/`` to
    ``DATA/企业预警通/债券展期明细.csv``.

    NOTE(review): judging by the folder name and the headers below, this is
    the per-bond maturity-extension detail table - confirm.
    """
    # Header applied to the combined table, in on-page order.
    header = [
        '序号', '债券代码', '债券简称', '发债人',
        '展期公告日', '状态', '展期总额(亿)',
        '已兑付金额(亿)', '展期本金(亿)', '展期利息(亿)', '展期孳息(亿)',
        '首次展期时余额(亿)', '首次展期日期', '展期起始日', '展期截止日',
        '展期类型', '历程', '债券类型', '首次展期债项评级', '是否城投',
        '企业性质', '所属行业', '所属地区', '首次展期主体评级',
    ]
    htmls2csv(
        header,
        html_files_directory='./tempHTML/债券展期明细/',
        encode='utf-8',
        selector="div.ant-table-body > table",
        output_file="DATA/企业预警通/债券展期明细",
    )

In [10]:
if __name__ == "__main__":
    # Each call below rebuilds one CSV from its own folder of saved HTML pages.
    # Only the bond-default export is currently enabled; uncomment another
    # call to regenerate the corresponding file.
    # getAllEntreprise()
    getDaultBond()
    # getDefaultEntity()
    # getAmericanBond()
    # getBondDetail()

4
./tempHTML/债券违约/1.html
./tempHTML/债券违约/2.html
./tempHTML/债券违约/3.html
./tempHTML/债券违约/4.html
