In [1]:
import re
import os
import json
import pandas as pd
from tqdm import tqdm
from glob import glob
from bs4 import BeautifulSoup
from scrapy.selector import Selector

In [2]:
class BaseParser(object):
    def __init__(self):
        self._font_map = {
            'title': 22,
            'h1': 20,
            'TLV1': 20,
            'h2': 18,
            'TLV2': 18,
            'h3': 16,
            'TLV3': 16,
            'h4': 14,
            'TLV4': 14,
        }
        pass

    def parser(self):
        pass
    
    def file2text(self):
        pass
    
    def recursive_dir(self, root_path, output_path, type_list=[], filter_map=None):
        for path in os.listdir(root_path):
            concat_ipath = os.path.join(root_path, path)
            if os.path.isdir(concat_ipath):
                concat_opath = os.path.join(output_path, path)
                if not os.path.exists(concat_opath):
                    os.mkdir(concat_opath)
                self.recursive_dir(concat_ipath, concat_opath, type_list=type_list, filter_map=filter_map)
            else:
                files = os.path.splitext(path)
                file_name_o, file_suffix = files[0], files[-1][1:]
                if file_suffix not in type_list:
                    continue

                start = 1
                if filter_map is not None and file_name_o in filter_map:
                    start = filter_map[file_name_o]

                # self.file2text(concat_ipath, output_path, start=start)
                try:
                    self.file2text(concat_ipath, output_path, start=start)
                except Exception as e:
                    print(e)
    
    def construct_content(self, content_infos, main_font_size=12):
        pre_infos = []
        pre_fonts = []
        results = []
        for info in content_infos:
            font_size = info['font-size']
            content = info['content']
            if len(pre_fonts) == 0:
                if font_size <= main_font_size:
                    tmp = {
                        'title': '绪论',
                        'contents': [content],
                        'childs': []
                    }
                else:
                    tmp = {
                        'title': content,
                        'contents': [],
                        'childs': []
                    }
                
                results.append(tmp)
                pre_infos.append(tmp)
                pre_fonts.append(font_size)

            else:
                if font_size <= main_font_size:
                    pre_infos[-1]['contents'].append(content)
                else:
                    while len(pre_fonts) > 0 and font_size >= pre_fonts[-1]:
                        pre_fonts.pop()
                        pre_infos.pop()
                    tmp = {
                        'title': content,
                        'contents': [],
                        'childs': []
                    }
                    if len(pre_fonts) == 0:
                        results.append(tmp)
                    else:
                        pre_infos[-1]['childs'].append(tmp)
                    
                    pre_infos.append(tmp)
                    pre_fonts.append(font_size)
        
        return results


In [None]:
import pandas as pd

In [38]:
class ChmParser(BaseParser):
    def __init__(self, input_path, output_path):
        super().__init__()
        """
        chm文件解析代码
        输入：
        - input_path: chm文件路径 (必须)
        - output_path: 输出文件路径（路径，非文件名），文件为原文件名保持一致的.json文件
        """
        super().__init__()
        self._input_path = input_path
        self._output_path = output_path

    def parser(self):
        if not os.path.exists(self._output_path):
            os.makedirs(self._output_path, exist_ok=True)
        
        self.recursive_dir(self._input_path, self._output_path, type_list=['chm', 'CHM'])

    def file2text(self, chm_file, output_path, start=1):
        file_name = os.path.splitext(os.path.basename(chm_file))[0]
        if '&' in file_name:
            file_name = file_name.replace('&', 'and')
            new_chm_file = chm_file.replace('&', 'and')
            os.rename(chm_file, new_chm_file)
            chm_file = new_chm_file

        file_name_m = file_name.replace(' ', '_')
        # chm文件转成html文件
        output_file = os.path.join(output_path, f'chm_html_{file_name_m}')
        if not os.path.exists(output_file):
            os.makedirs(output_file, exist_ok=True)
            print(f"hh -decompile {output_file} {chm_file}")
            os.system(f"hh -decompile {output_file} {chm_file}")

        txt_file = os.path.join(output_path, file_name + '.txt')
        json_file = os.path.join(output_path, file_name + '.json')

        texts = [file_name]
        content_infos = [{'font-size': 24, 'content': file_name}]
        
        

        for root, dirs, file_names in os.walk(output_file):
            """
            os.walk()
            root 所指的是当前正在遍历的这个文件夹的本身的地址
            files_names同样是 list , 内容是该文件夹中所有的文件(不包括子目录)
            函数会自动改变root的值使得遍历所有的子文件夹。
            所以返回的三元元组的个数为所有子文件夹（包括子子文件夹，子子子文件夹等等）加上1（根文件夹）
            """
            print(root)
            # 把file_names列表元素重新排个序(但是里面不仅有htm文件，还有hhc和hhk文件)
            # file_names是list类型，现在里面的元素都是str类型（数字后面加‘.htm’这种），直接排序会按长度
            # 先去掉.htm，转成int，排序后，再转成str，统一加上.htm？
            # print(file_name)
            print(len(file_names))
            
            new_file_names = []
            for file_name in file_names:
                if not file_name.endswith('htm'):
                    continue          
                else:
                    # print(file_name)
                    new_file_names.append(file_name)

            if new_file_names != []:
#                 new_file_names = [i.zfill(3) for i in new_file_names]
#                 new_file_names.sort()
                new_file_names = [i.replace('.htm', '') for i in new_file_names]
                new_file_names = list(map(int, new_file_names))
                new_file_names.sort() 
                new_file_names = [str(i) for i in new_file_names]
                new_file_names = [i+'.htm' for i in new_file_names]
                new_file_names.pop(0)
                # print(type((new_file_names))
                # print("444")
                for file_name in new_file_names:
                    chtml_file = os.path.join(root, file_name) #问题出在这里？
                    change_path_chtml_file = chtml_file.replace('\\', '/')
                    # print(change_path_chtml_file)
                    soup = BeautifulSoup(open(change_path_chtml_file, 'rb'), features='html.parser')
                    # item = soup.body.descendants
                    # item = soup.body
                    # item = soup.select('td')[0]
                    item = soup.find('td',{'valign':'top'})
                    # print(type(item))
                    if item is None:
                        continue
                    else:
                        self.recursive_parser(item, texts, content_infos, "")
                
    
        with open(txt_file, 'w', encoding="utf-8") as f:
            f.write('\n'.join(texts))
        
        # 结构化
        structual_contents = self.construct_content(content_infos)
        with open(json_file, 'w', encoding="utf-8") as f:
            f.write(json.dumps(structual_contents, indent=2, ensure_ascii=False))


    def recursive_parser(self, item, texts, content_infos, pre_class):
        for child in item:
            try:
                if isinstance(child, str):
                    content = child.strip()
                    if len(content) > 0:
                        texts.append(content)
                        content_infos.append({'font-size': 12, 'content': content})
                    continue
                
                tag = child.name

                if tag in ['script', 'link', 'table', 'tbody']:
                    continue
                
            
                content = child.text.strip()
                if len(content) == 0:
                    continue
                    
                if "图缺" in content:
                    continue
                
                if tag == 'center':
                    p_info = {}
                    p_info['font-size'] = 20
                    p_info['content'] = content
                    content_infos.append(p_info)
                    texts.append(content)
                    continue
                
                if tag == 'div' and 'id' in child.attrs and len(child.attrs['id']) > 0:
                    id_ = child.attrs['id']
                    if id_ in ['printpreview_header']:
                        continue
                
                if tag == 'div' and 'class' in child.attrs and len(child.attrs['class']) > 0:
                    # 对div标签的class进行判别，不同情况分别处理
                    class_ = child.attrs['class'][0]
                    if class_ in ['SECONDARY', 'AUTHORS', 'HNUM', 'BY', 'SG', 'PN', 'pagenum', 'TB', 'SIDEBAR']:
                        continue

                    p_info = {}
                    p_info['content'] = content
                    if class_ in ['h1', 'h2', 'h3', 'h4']:
                        p_info['font-size'] = self._font_map[class_]
                        content_infos.append(p_info)
                        texts.append(content)
                    
                    elif class_ in ['HD', 'FRTITLE']:
                        if pre_class in self._font_map:
                            p_info['font-size'] = self._font_map[pre_class]
                        else:
                            p_info['font-size'] = 14
                        content_infos.append(p_info)
                        texts.append(content)

                    elif class_ == 'P':
                        p_info['font-size'] = 12
                        content_infos.append(p_info)
                        texts.append(content)
                    
                    else:
                        self.recursive_parser(child, texts, content_infos, class_)

                elif tag in ['section', 'div', 'ul', 'ol']:
                    # 这几类标签 直接递归处理子标签
                    self.recursive_parser(child, texts, content_infos, '')
                    
                else:
                    # 剩余情况直接获取标签内容，并根据标签类型确定字体大小
                    p_info = {}
                    p_info['content'] = content
                    if tag in self._font_map:
                        p_info['font-size'] = self._font_map[tag]
                    else:
                        p_info['font-size'] = 12

                    content_infos.append(p_info)
                    texts.append(content)
            
            except Exception as e:
                print(e)

In [None]:
import urllib 

In [None]:
if __name__ == '__main__':
    source_path = 'C:/Users/myzhang43/Desktop/Project/chmparser/data/中医著作_chm文件'
    output_path = 'C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析'
    chmparser = ChmParser(source_path, output_path)
    chmparser.parser()

C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《一得集》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《一得集》\829
103
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《一草亭目科全书》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《一草亭目科全书》\241
17
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《丁甘仁医案》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《丁甘仁医案》\674
81
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《万氏秘传外科心法》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《万氏秘传外科心法》\867
112
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《万氏秘传片玉心书》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《万氏秘传片玉心书》\787
269
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《万病回春》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《万病回春》\616
184
C:/Users/myzhang43/Desktop/Proje

C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《伤寒法祖》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《伤寒法祖》\837
17
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《伤寒百证歌》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《伤寒百证歌》\862
102
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《伤寒直格》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《伤寒直格》\847
53
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《伤寒舌鉴》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《伤寒舌鉴》\904
12
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《伤寒补例》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《伤寒补例》\261
20
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《伤寒论》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《伤寒论》\98
138
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析

C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《医学从众录》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《医学从众录》\738
95
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《医学传心录》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《医学传心录》\142
104
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《医学传灯》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《医学传灯》\320
40
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《医学入门》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《医学入门》\346
1126
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《医学启源》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《医学启源》\784
80
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《医学妙谛》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《医学妙谛》\825
64
C:/Users/myzhang43/Desktop/Project/chmparser/data/c

C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《古今医统大全》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《古今医统大全》\336
1668
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《古今医鉴》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《古今医鉴》\613
201
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《古今名医汇粹》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《古今名医汇粹》\631
109
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《古代房中秘方》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《古代房中秘方》\1023
75
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《史载之方》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《史载之方》\836
62
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《叶天士医案精华》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《叶天士医案精华》\802
40
C:/Users/myzhang43/Desktop/Project/

C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《女科切要》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《女科切要》\835
130
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《女科折衷纂要》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《女科折衷纂要》\874
84
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《女科指掌》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《女科指掌》\856
65
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《女科指要》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《女科指要》\266
16
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《女科撮要》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《女科撮要》\855
35
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《女科旨要》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《女科旨要》\200
54
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm

C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《幼幼集成》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《幼幼集成》\630
176
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《幼科切要》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《幼科切要》\294
46
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《幼科发挥》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《幼科发挥》\754
68
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《幼科心法要诀》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《幼科心法要诀》\774
205
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《幼科折衷》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《幼科折衷》\755
74
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《幼科指南》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《幼科指南》\873
32
C:/Users/myzhang43/Desktop/Project/chmparser/data/ch

C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《景景医话》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《景景医话》\148
60
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《曹仁伯医案论》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《曹仁伯医案论》\226
27
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《服食导饵》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《服食导饵》\1022
27
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《望诊遵经》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《望诊遵经》\736
104
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《本经逢原》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《本经逢原》\633
771
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《本草乘雅半偈》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《本草乘雅半偈》\938
423
C:/Users/myzhang43/Desktop/Project/chmparser/d

C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《温病指南》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《温病指南》\271
14
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《温病条辨》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《温病条辨》\681
88
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《温病正宗》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《温病正宗》\723
82
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《滇南本草》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《滇南本草》\939
512
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《濒湖脉学》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《濒湖脉学》\110
31
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《灵枢悬解》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《灵枢悬解》\1291
85
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解

C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《类经图翼》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《类经图翼》\641
166
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《类证治裁》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《类证治裁》\592
394
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《类证活人书》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《类证活人书》\691
28
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《素灵微蕴》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《素灵微蕴》\1297
31
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《素问悬解》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《素问悬解》\1290
137
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《经方实验录》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《经方实验录》\1032
100
C:/Users/myzhang43/Desktop/Project/chmparser/da

C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《评注产科心法》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《评注产科心法》\891
83
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《评琴书屋医略》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《评琴书屋医略》\1040
58
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《诊宗三昧》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《诊宗三昧》\845
26
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《诊家枢要》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《诊家枢要》\217
16
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《诊家正眼》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《诊家正眼》\801
79
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《诊脉三十二辨》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《诊脉三十二辨》\248
36
C:/Users/myzhang43/Desktop/Project/chmparser/

C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《针经指南》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《针经指南》\933
20
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《钱氏秘传产科方书名试验录》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《钱氏秘传产科方书名试验录》\199
10
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《银海指南》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《银海指南》\751
69
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《银海精微》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《银海精微》\781
103
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《长沙药解》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《长沙药解》\1299
164
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《阴证略例》
2
C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析\chm_html_《阴证略例》\871
50
C:/Users/myzhang43/Desktop/Project/chmp

In [5]:
path_results1 = "C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析/json"
path_results2 = "C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析/txt"
path_merges = "C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析/results"
# 合并json文件
merges_json = os.path.join(path_merges, "results.json")
with open(merges_json, "w", encoding="utf-8") as f0:
    for file in os.listdir(path_results1):
        with open(os.path.join(path_results1, file), "r", encoding="utf-8") as f1:
            for line in tqdm.tqdm(f1):
                line_dict = json.loads(line)
                js = json.dumps(line_dict, ensure_ascii=False)
                f0.write(js + '\n')
            f1.close()
    f0.close()

# # 合并txt文件
# merges_txt = os.path.join(path_merges, "results.txt")
# with open(merges_txt, "w", encoding="utf-8") as f0:
#     for file in os.listdir(path_results2)
#         with open(os.path.join(path_results2, file), "r", encoding="utf-8") as f1:
#             for line in tqdm.tqdm(f1):
#                 f0.writelines(line + '\n')
            
#             f1.close()
#     f0.close()

AttributeError: type object 'tqdm' has no attribute 'tqdm'

In [4]:
path_results1 = "C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析/json"
path_results2 = "C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析/txt"
path_merges = "C:/Users/myzhang43/Desktop/Project/chmparser/data/chm文件解析/results"
filenames=os.listdir(path_results2) 

file=open('results.txt','w', encoding='utf8') 
#先遍历文件名  
for filename in filenames:  
    filepath=path_results2+'\\'
    filepath=filepath+filename
    #遍历单个文件，读取行数  
    for line in open(filepath, encoding='utf8'):  
        file.writelines(line)  
    file.write('\n')  
#关闭文件  
file.close()