In [2]:
# 导入相关包
import os
import pathlib as pl
import pandas as pd
import numpy as np
import re
from io import StringIO
from datetime import datetime,timedelta
import time
from IPython.core.interactiveshell import InteractiveShell
from tqdm.autonotebook import *
import pdfplumber
import collections
tqdm.pandas()
InteractiveShell.ast_node_interactivity = "all"
sys.path.append("..")

from tgrocery import Grocery
import jieba

In [3]:
# 数据准备(train_output文件中格式有点问题，需要提前用excel或者wps打开然后另存为excel文件)
train_outputs = pd.read_excel('datasets/train_output.xlsx')

# 获取pdf中文字和表格
def extract_pdf_content(pdf_path):
    text_list = []
    table_list = []
    with pdfplumber.open(pdf_path) as pdf:
        for index_page in np.arange(0, len(pdf.pages), 1):
            # 读取多页
            page = pdf.pages[index_page]   # 第n页的信息
            text = page.extract_text()
            text_list.append(text)
            table = page.extract_tables()
            for t in table:
                table_list.append(t)
    return text_list, table_list

def get_dir_file(path):
    '''
    输入文件夹位置，输出整理好的dataframe
    '''
    path_list = os.listdir(path)
    id_list = []
    file_path_list = []
    text_list = []
    table_list = []
    for i in tqdm(path_list):
        if '.PDF' in i:
            file_path = path + i
            id_list.append(int(i.split('.')[0]))
            file_path_list.append(file_path)
            try:
                text_temp, table_temp = extract_pdf_content(file_path)
            except Exception:
                print('此pdf无法读取')
                text_temp, table_temp = [], []
            text_list.append(text_temp)
            table_list.append(table_temp)
            
    df = pd.DataFrame()
    df['sample_id'] = id_list
    df['file_path'] = file_path_list
    df['text'] = text_list
    df['tabel'] = table_list
    df = df.sort_values('sample_id')
    return df

# 文件处理太慢，可持续化保存文件
train_path = 'datasets/train.csv'
if os.path.exists(train_path):
    train_df = pd.read_csv(train_path)
else:
    train_df = get_dir_file('datasets/train_data/')
    train_df.to_csv(train_path,index=False)
    train_df = pd.read_csv(train_path)

test_path =  'datasets/test.csv'
if os.path.exists(test_path):
    test_df = pd.read_csv(test_path)
else:
    test_df = get_dir_file('datasets/test_data/')
    test_df.to_csv(test_path,index=False)
    test_df = pd.read_csv(test_path)

train_outputs.head(2)
train_df.head(2)
test_df.head(2)

Unnamed: 0,sample_id,认购日期,理财产品名称,产品发行方名称,理财类型,认购金额(万元),产品起息日,产品到息日,产品期限,资金来源,实际购买公司名称,实际购买公司和上市公司关系,买卖方是否有关联关系,公告日期
0,1,2019-03-27,汇聚金1号,中融国际信托有限公司,信托,10000.0,2019-03-27,2019-09-23,180天,自有资金,恒生电子股份有限公司,公司本身,否,2019-04-25
1,1,2019-03-27,招商银行步步生金8699,招商银行,银行理财产品,200.0,2019-03-27,NaT,,自有资金,恒生电子股份有限公司,公司本身,否,2019-04-25


Unnamed: 0,sample_id,file_path,text,tabel
0,1,datasets/train_data/1.PDF,[' ...,"[[['', None, None, '', None, None, '', None, N..."
1,2,datasets/train_data/2.PDF,[' ...,"[[['', None, None, '', None, None, '', None, N..."


Unnamed: 0,sample_id,file_path,text,tabel
0,11188,datasets/test_data/11188.PDF,['北京京西文化旅游股份有限公司监事会\n \n \n关于使用部分闲置募集资金购买理财产品的...,[]
1,11189,datasets/test_data/11189.PDF,['北京京西文化旅游股份有限公司 \n监事会关于使用部分自有资金购买理财产品的意见 \n根据...,[]


In [4]:
# 构造训练集验证集
train_df = train_df.sample(frac=1, random_state=1017)

val_df = train_df[:1800]
# val_df = train_df[:1800].head(20)
# train_df = train_df[1800:].head(20)
# test_df=test_df.head(20)

train_outputs["sample_id"]=train_outputs["sample_id"].astype(str)
val_df["sample_id"]=val_df["sample_id"].astype(str)
# train_df["sample_id"]=train_df["sample_id"].astype(str)
# test_df["sample_id"]=test_df["sample_id"].astype(str)


# 有效文本挖掘

In [11]:
#text=text.replace(r'[ ]+',' ').replace('\r','^')
def get_title(text):
    global title_num_char
    title_list=[]
    title_type_list=[]
    text_start_iter_list=[]
    text_end_iter_list=[]
 
    for item in title_num_char:
        pattern = re.compile(item+"[0-9A-Za-z\u4e00-\u9fa5 ()\[\]（）【】：:]*?[\^]")
        #pattern = re.compile(item+r"[ ]*?[^ ]+?[ ]")
        #pattern = re.compile(item+"[0-9A-Za-z\u4e00-\u9fa5 ()[]（）【】][]")#*?[\^]
        tmp=pattern.finditer(text)
        for i in tmp:
            title_list.append(i.group())
            text_start_iter_list.append(i.span(0)[0])
            title_type_list.append(1)
            text_end_iter_list.append(i.span(0)[1])

    # for item in title_list:
    for item in s_title_num_char:
        # pattern = re.compile(item+r"[ ]*?[ ]*?[\d][^ ]+?:?[ ]?") 
        # pattern = re.compile(item+"[0-9A-Za-z\u4e00-\u9fa5\^ ]*")
        # pattern = re.compile(item+'[0-9\u4e00-\u9fa5()[]（）【】*?[\^]]')*?[\^]
    #    pattern = re.compile(item+r"[\d]*?\u4e00-\u9fa5+[ ]")
        pattern = re.compile(item+"[0-9A-Za-z\u4e00-\u9fa5 ()\[\]（）【】：:]*?[ ][\^]")
        tmp=pattern.finditer(text)
        for i in tmp:
            title_list.append(i.group())
            text_start_iter_list.append(i.span(0)[0])
            title_type_list.append(1)
            text_end_iter_list.append(i.span(0)[1])

    title_list.append("引言")
    title_type_list.append(1)
    text_start_iter_list.append(0)
    text_end_iter_list.append(0)

    result_df=pd.DataFrame([title_list,title_type_list,text_start_iter_list,text_end_iter_list]).T.sort_values(by=2).reset_index(drop=True)
    # print(result_df)
    return result_df

def get_title_text(text,title_df):
    # print(title_df)
    title_1_df=title_df[title_df[1]==1]
    text_iter_list=[]
    text_list=[]
    # print(title_1_df)
    for iter1,iter2 in title_1_df[[2,3]].values:
        # print(iter1)
        if(len(text_iter_list)!=0):
            text_iter_list.append(iter1)
        text_iter_list.append(iter2)
    # text_iter_list.append(text_iter_list[len(text_iter_list)-1])
    text_iter_list.append(len(text))
    for index in range(int(len(text_iter_list)/2)):
        text_list.append(text[text_iter_list[2*index]:text_iter_list[2*index+1]])
    
    title_1_df[4]=text_list

    return title_1_df.reset_index(drop=True)

#from fuzzywuzzy import fuzz
def judge_title(sample_id=0,text=r"test\n"):
    # print(text)
    text=text.replace(r"\n","^").replace(r'[ ]+',' ')
    title_df=get_title(text)
    title_df["sample_id"]=[sample_id for x in range(title_df.shape[0])]
    # print(title_df)`
    title_1_df=get_title_text(text,title_df)[["sample_id",0,1,2,3,4]]

    

    global val_df
    global train_outputs
    val_true_name=train_outputs[train_outputs["sample_id"]==sample_id]["理财产品名称"]
    
    index=0
    neg_index=[]
    for title_des in title_1_df[0].values:
        for item in title_neg_words:
            if re.search(item,title_des) is not None:
                neg_index.append(index)
                break
        index+=1


    return title_1_df.drop(neg_index)
    # print(title_list)

def get_judge_title_result(val_df):

    judge_title_result=None


    for sample_id,text in tqdm(val_df[["sample_id","text"]].values):
        # print(sample_id)
        # print(text)
        judge_title_result= judge_title(sample_id,text) if judge_title_result is None else pd.concat([judge_title_result,judge_title(sample_id,text)])

        judge_title_result["sample_id"]=judge_title_result["sample_id"].astype(str)        
    return judge_title_result

title_num_char=["一、","二、","三、","四、","五、","六、","七、","八、","九、","十、","十一、","十二、","十三、","十四、","十五、"]
s_title_num_char=["（一）","（二）","（三）","（四）","（五）","（六）","（七）","（八）","（九）","（十）","（十一）","（十二）","（十三）","（十四）","（十五）"]
s_title_num_char.extend(["[(]一[)]","[(]二[)]","[(]三[)]","[(]四[)]","[(]五[)]","[(]六[)]","[(]七[)]","[(]八[)]","[(]九[)]","[(]十[)]","[(]十一[)]","[(]十二[)]","[(]十三[)]","[(]十四[)]","[(]十五[)]"])

title_pos_words=[]
title_neg_words=[]
title_neg_words=["备查","日前","过去","履行","审批","程序","风险","措施","影响","累计","赎回","到期[^日].*[^2][^0]","到期.{0,2}[/^]","截至","意见","十二个月内","公告前","报备文件","前期"]

val_judge_title_result=get_judge_title_result(val_df)
# train_judge_title_result=get_judge_title_result(train_df)
# test_judge_title_result=get_judge_title_result(test_df)
# judge_title_result.to_excel("训练集段落标题分类结果.xlsx",index=None)

100%|██████████| 1800/1800 [00:58<00:00, 30.95it/s]


In [12]:
###9月11日
#val_judge_title_result.head(100)
def get_content_df(val_judge_title_result):
    sample_id=val_judge_title_result['sample_id'].drop_duplicates() 
    text_list=[]
    text_sampleid_list=[]
    content_df=pd.DataFrame()
    for i in tqdm(sample_id):
        text_df=val_judge_title_result[val_judge_title_result['sample_id']==i]
    #  text_df.iloc[0,5]
        #text_df
        n=len(text_df)
        text_sampleid_list.append(i)
        text=''
        for y in range(0,n):
            text=text+text_df.iloc[y,1]+text_df.iloc[y,5]
            text=re.sub("[ ]+"," ",text).replace("（","(").replace("）",")")
        #text
        text_list.append(text)
    content_df["sample_id"]=text_sampleid_list
    content_df["text"]=text_list
    return content_df.reset_index(drop=True)


val_content_df=get_content_df(val_judge_title_result)
train_content_df=get_content_df(train_judge_title_result)
test_content_df=get_content_df(test_judge_title_result)

val_content_df=pd.merge(val_content_df,val_df[["sample_id","tabel"]],on="sample_id")
# train_content_df=pd.merge(train_content_df,train_df[["sample_id","tabel"]],on="sample_id")
# test_content_df=pd.merge(test_content_df,test_df[["sample_id","tabel"]],on="sample_id")

100%|██████████| 1800/1800 [00:12<00:00, 147.52it/s]


NameError: name 'train_judge_title_result' is not defined

# 名称提取

In [13]:
######计算文本位置长度#############
def len_count(my_list,my_str):
    my_len=0
    for i in my_list:
        my_len=my_len+len(i)
    my_len=my_len+(len(my_list)-1)*len(my_str)
    return my_len


######基于冒号做文本拼接与分割####效果优
def maohao_cat(text):
    text=text.replace(r"\n","^")
    text=text.replace(r"', '","^")
    text=text.replace(' ','')
    text=text.replace('（','(').replace('）',')').replace('；','^')     ###############;直接去掉,有极个别答案(答案中出现了引号)将会收到负反馈，多数正反馈
    text=re.sub('\^[1-9]\^','^',text)
    text=re.sub('\^\-[1-9]\-\^','^',text)
    text=re.sub("[（][^）]*?[\^]*$",'',text)
    text=re.sub('[，]*$','',text)
    pos_word=["名称","期限","年","月","日","时间","产品","资金","金额","天","来源","总额","类型","元","关系","无关","不存在关"]
    if '^' not in text:
        return 
    my_list=text.split("^")
    my_str=''
    str_list=[]
    # for i in my_list:
    #     if "：" not in i and ":" not in i and "无关" not in i and "不存在关" not in i and "名称" not in my_str:
    #         my_str=my_str+i
    #         # if len(i)<=6:
    #         #     str_list.append(my_str)      #############解决最后一行被后函数因字数过长舍弃的办法
    #         #     continue
    #     else:
    #         str_list.append(my_str)
    #         my_str=i
    
    for i in my_list:
        if "：" not in i and ":" not in i and len(re.sub('[^\u4e00-\u9fa5]*','',i))<5:
            my_str=my_str+i
            # if "不存在关" in my_str:
            #     str_list.append(my_str)
            #     my_str=''
            # if len(i)<=6:
            #     str_list.append(my_str)      #############解决最后一行被后函数因字数过长舍弃的办法
            #     continue
        else:
            str_list.append(my_str)
            my_str=i
    
    str_list.append(my_str)
    # print(my_list)
    # print('-----')
    # print(str_list)
    # print('-----')
    cat_text=''
    # print(str_list)
    for i in str_list:
        cat_text=cat_text+i+r'^'
        # print(cat_text)
    # print(cat_text.replace(r'^','\n'))
    return cat_text

    

        





######寻找文本中的答案##############
####以如下及冒号行做切片#############
def wjc_spl(text):
    spl1=[]      ###如下切片
    maohao=3  ####设定冒号出现次数阈值
    zishu=30   ####设定单行冒号后字数阈值
    my_str=""
    text=maohao_cat(text)    #########调用函数做text正规化
    spl2=[]      ###行切片
    spl3=[]
    neg_word=[]
    pos_word=["名称","期限","年","月","日","时间","产品","资金","金额","天","来源","总额","类型","元","关系","额","金","受托","签约银行"]
    # pos_word=["名称"]
    # print(len(text))
    if text==None:
        return -1,-1,-1
    text=text.replace(r"\n","^")
    # print(len(text))
    text=text.replace(r"', '","^")    ######解决换页符被置为', '的问题
    # print(len(text))
    text=text.replace(' ','')
    text=text.replace('（','(').replace('）',')').replace(':','：').replace('如^下：','如下：').replace('如下^：','如下：')
    # print(len(text))
    # print(text[:10000])
    if "如下：" not in text:
        return -1,-1,-1
    else:
        move=[]
        spl1=text.split("如下：")   #####首次切割####
        # print(len_count(spl1,'如下：'))
        # print(len(spl1[0])+len(spl1[1]))
        my_len=len(spl1[0])+len('如下：')
        spl1=spl1[1:]              ######第一次如下前的内容不关心##########
        len_list=[]
        tag1=[]
        # print(spl1)
        for i in spl1:
            len_list.append(len(i))
            if i.count("：")<=maohao:
                # print('aaaaaaa',len(len_list))
                tag1.append(len(len_list))
                move.append(i)
        # move=list(set(move))
        for x in move:
            # print('pppppppppppppppppppppppppppp')
            # print(x)
            spl1.remove(x)    ####去除冒号过少的部分切片
                
        # print(my_len+0)
        # print(spl1)
        for i in spl1:
            test=i.split('^')
            #print(test[10])
            for j in test:
                spl2.append(j)
            
        # print(spl2[100])
        move=[]
        for i in spl2:
            #print(i)
            if i.count("：")!=1 and i.count(":")!=1:
                # print(i.count("："))
                move.append(i)    ####以行做切片，去除单行里非只有一个冒号的行
        for x in move:
            # print(x)
            spl2.remove(x)
        # print(spl2[0])
        move=[]
        for i in spl2:
            judge1=re.sub('[a-zA-Z]','',i.split("：")[-1])
            judge1=re.sub(r"\d",'',judge1)
            judge1=re.sub(r'”','',judge1)
            judge1=re.sub(r'“','',judge1)
            judge1=re.sub(r'（','',judge1)
            judge1=re.sub(r'）','',judge1)
            # print(len(judge1))
            if len(judge1)>=zishu or len(judge1)<1:     #######去除冒号后关心字段所提中文内容过长或过短的行
                move.append(i)
                # print(i)
        for x in move:
            # print(x)
            spl2.remove(x)
        # print(spl2)
        move=[]
        # print(spl2)
        # print('-----------')

        # print(spl2)
        if spl2==[]:
            # print('该文本变量做行切片结果为空')
            return -1,-1,-1

        for i in spl2:
            x=0
            for j in pos_word:
                judge2=i.split("：")[0]      ########舍弃冒号前（分类字段）不包含pos_word的行
                if j not in judge2:
                    x=x+1
                if x==len(pos_word):
                    # print(i)
                    move.append(i)
            new_move=list(set(move))
            # print(new_move)
        for x in new_move:
            # print(x)
            spl2.remove(x)
            
        # print(spl2)

        


    
    tag1=0
    key_pos_word=['名称']                 ############计算一个pdf生成的dataframe一共需要几行，及每行需要多少列###########
    x=1
    ele_times=[]
    judge3=0
    for i in spl2:
        for j in key_pos_word:
            if j not in i:
                x=x+1                     ############若不含名称字段则结果会是实际+1###############
            else:
                ele_times.append(x)
                x=1
    
            # print(i)
            # print(x)
    ele_times.append(x)

    
    # print(ele_times)
    if len(ele_times)>1:
        mo_times=ele_times[0]+ele_times[-1]-1
        # print(type(mo_times))
        ele_times=ele_times[1:-1]
        ele_times.append(mo_times)
    else:
        test=0
        # print(len(spl2))
        # print(ele_times[0])
        if len(ele_times)==1 and ele_times[0]==len(spl2)+1:
            # print("不含名称字段")                              ##############此处需处理，或者返回一个标记以后处理  冒号行不包含名称字段的情况
            tag1=1
    

    ele_times1=ele_times
    # print(ele_times)
    # print(type(ele_times))

    tag2=0
    key_pos_word=['金额']                 ############计算一个pdf生成的dataframe一共需要几行，及每行需要多少列###########
    x=1
    ele_times=[]
    judge3=0
    for i in spl2:
        for j in key_pos_word:
            if j not in i:
                x=x+1                     ############若不含金额字段则结果会是实际+1###############
            else:
                ele_times.append(x)
                x=1
    
            # print(i)
            # print(x)
    ele_times.append(x)
    # print(ele_times)
    if len(ele_times)>1:
        mo_times=ele_times[0]+ele_times[-1]-1
        # print(type(mo_times))
        ele_times=ele_times[1:-1]
        ele_times.append(mo_times)
    else:
        test=0
        # print(len(spl2))
        # print(ele_times[0])
        if len(ele_times)==1 and ele_times[0]==len(spl2)+1:
            # print("不含金额字段")                              ##############此处需处理，或者返回一个标记以后处理  冒号行不包含金额字段的情况
            tag2=1
    
    ele_times2=ele_times
    # print(ele_times2)

    # print('---')

    

    if len(spl2)<=2:
        return -1,-1,-1

    

    # print(ele_times2)
    # if len(spl2)==1 and spl2[0]=='':
    #     return
    # if len(spl2)==1:
    #     spl2[0].replace(' ','')
   
    return spl2,ele_times1,tag1     ######仅返回“关心”的行切片    #############注意返回变量可能造成函数无法执行



def cut_list(text):
    spl2,ele_times1,tag1=wjc_spl(text)
    # print(spl2)
    # print(ele_times1)
    # print(tag1)
    my_list=[]
    if tag1==0:
        for i in ele_times1:
            # print(i)
            get=spl2[0:i]
            my_list.append(get)
            spl2=spl2[i:]
    else:
        return 0
    # print(my_list)
    return my_list



def get_mc(text):
    # my_list=cut_list(text)
    mc_list=[]
    my_list=cut_list(text)
    if my_list==0:
        return 
    else:
        my_list=cut_list(text)

        for i in my_list:
            this_str=i[0]
            if "名" not in this_str or "受托方" in this_str or "公司名称" in this_str:
                continue
            index=this_str.index("：")
            this_str=this_str[index+1:].replace('。','').replace('；','').replace(';','').replace('、','')
            mc_list.append(this_str)
            # print(mc_list)
    if mc_list ==[]:
        return
    return mc_list



def spl2_iswm(text):
    spl2,ele_times1,tag1=wjc_spl(text)
    # print(spl2)
    # print(ele_times1)
    # print(tag1)
    my_list=[]
    if tag1==1:
        return "无名称"
    else:
        return 



# 位置定位

In [14]:
# 找到关键字在文本中的位置
def find_pos_in_text(text, keys, sample_id = 0):
    ret = []

    if not keys or len(keys) == 0 or not text:
        return None

    textWithArrow = re.sub(' +', ' ', text).replace(r'\n', '^').replace(' ', '^')
    textWithoutWhite = textWithArrow.replace('^', '')

    # 每个非空词的前缀空格个数
    seq, totalSpace = 0, 0
    preleadingSpaceDict = collections.defaultdict(int)
    for i in range(len(textWithArrow)):
        if (textWithArrow[i]) == '^':
            totalSpace += 1
            preleadingSpaceDict[seq] = totalSpace
        else:
            preleadingSpaceDict[seq] = totalSpace
            seq += 1        

    for key in keys:
        nkey = str(key).replace('(', r'\(').replace(')', r'\)').replace('[', r'\[').replace(']', r'\]')
        for it in re.finditer(nkey, textWithoutWhite):
            ret.append([key, it.span()[0]+preleadingSpaceDict[it.span()[0]], \
                it.span()[1]+preleadingSpaceDict[it.span()[1]], sample_id])

    if not len(ret):
        return None

    ret.sort(key = lambda x: x[1])

    return ret

# 理财名称提取

In [15]:
def get_product_text(val_content_df):
    result_df=None
    for sample_id,text in tqdm(val_content_df[["sample_id","text"]].values):
        mc_list=get_mc(text)
        if mc_list is not None:
            mc_list=list(set(mc_list))
            # print(mc_list)
        # print("-----------")
        # sample_id
        tmp_df=pd.DataFrame(find_pos_in_text(text,mc_list),columns=[0,1,2,3])
        tmp_df[[0,1,2,3]]=tmp_df[[0,3,1,2]]
        tmp_df[1]=1
        tmp_df[3]=tmp_df[2]
        tmp_df["sample_id"]=sample_id
        result_df=get_title_text(text,tmp_df) if result_df is None else pd.concat([result_df,get_title_text(text,tmp_df)])
    return result_df.reset_index(drop=True)
    
val_product_text=get_product_text(val_content_df)
# train_product_text=get_product_text(train_content_df)
# test_product_text=get_product_text(test_content_df)

100%|██████████| 1800/1800 [01:16<00:00, 23.46it/s]


In [20]:
  def get_F1(val_pred, val_true):
      val_pred = list(val_pred)
      val_true = list(val_true)
      curr = list(set(val_pred).intersection(set(val_true)))
      R = len(curr)/len(val_true)
      P = len(curr)/len(val_pred)
      return 2*P*R/(P+R)

  r = pd.merge(val_df[['sample_id']], train_outputs, on='sample_id', how='left')
  val_true = r['sample_id'].astype(str) + r['理财产品名称'].astype(str)
    # r.to_excel("result_after_drop.xlsx",index=None)
  r=val_product_text.drop_duplicates(subset=["sanple_id",0])

  val_pred = r['sanple_id'].astype(str) + r[0].astype(str)
  score = get_F1(val_pred, val_true)
  score

0.34086359736917354

In [41]:
a=val_product_text.drop_duplicates(subset=["sanple_id",0])

b="lhl\验证集row数量对比.xlsx"
b=pd.read_excel(b).reset_index(drop=True)
b["sample_id"]=b["sample_id"].astype(str)
b=b[b["实际"]<=4]
a=set(list(pd.merge(a,b,left_on="sanple_id",right_on="sample_id")["sample_id"].values))
b=set(list(b["sample_id"].values))
print(b.difference(a))

{'1837', '9967', '3622', '3982', '5288', '5614', '9329', '125', '3327', '5227', '2430', '2547', '7465', '185', '1328', '5107', '1852', '3993', '11025', '1009', '7203', '10803', '8099', '3979', '9399', '3319', '10446', '7617', '5126', '4185', '9129', '3052', '10031', '2549', '1713', '1238', '10120', '5592', '6486', '5128', '4107', '5447', '1464', '2410', '1234', '1832', '6492', '7751', '7124', '3245', '8404', '5575', '8115', '8618', '7152', '953', '1789', '7505', '5116', '6697', '10122', '4522', '1783', '1172', '7748', '10121', '7175', '1982', '4972', '6842', '5596', '5135', '6706', '1755', '7600', '3605', '4275', '3983', '4420', '5226', '3989', '3604', '3089', '10105', '3945', '3961', '7204', '10974', '3247', '10177', '1631', '1070', '4261', '3974', '4202', '6840', '7618', '10460', '7182', '4406', '10131', '2565', '9652', '6966', '5891', '9330', '6684', '1606', '6863', '9369', '9143', '6780', '1814', '9181', '1810', '5173', '4147', '8525', '7880', '10183', '8441', '7149', '5188', '1045

# 理财名称所属文本提取

# 理财信息字段提取

# 理财信息字段整合

# 检验筛选