# 导入相关包

In [2]:
# 导入相关包
import os
import pathlib as pl
import pandas as pd
import numpy as np
import re
from io import StringIO
from datetime import datetime 
import time
from IPython.core.interactiveshell import InteractiveShell
from tqdm.autonotebook import *
import collections
import functools
from fuzzywuzzy import fuzz

# import pdfplumber
tqdm.pandas()
InteractiveShell.ast_node_interactivity = "all"
sys.path.append("..")


# PDF解析原始数据 
## 加载数据并采用pdfplumber抽取PDF中的文字和表格


In [3]:
# 数据准备(train_output文件中格式有点问题，需要提前用excel或者wps打开然后另存为excel文件)
train_outputs = pd.read_excel('../datasets/train_output.xlsx')

# 获取pdf中文字和表格
def extract_pdf_content(pdf_path):
    text_list = []
    table_list = []
    with pdfplumber.open(pdf_path) as pdf:
        for index_page in np.arange(0, len(pdf.pages), 1):
            # 读取多页
            page = pdf.pages[index_page]   # 第n页的信息
            text = page.extract_text()
            text_list.append(text)
            table = page.extract_tables()
            for t in table:
                table_list.append(t)
    return text_list, table_list

def get_dir_file(path):
    '''
    输入文件夹位置，输出整理好的dataframe
    '''
    path_list = os.listdir(path)
    id_list = []
    file_path_list = []
    text_list = []
    table_list = []
    for i in tqdm(path_list):
        if '.PDF' in i:
            file_path = path + i
            id_list.append(int(i.split('.')[0]))
            file_path_list.append(file_path)
            try:
                text_temp, table_temp = extract_pdf_content(file_path)
            except Exception:
                print('此pdf无法读取')
                text_temp, table_temp = [], []
            text_list.append(text_temp)
            table_list.append(table_temp)
            
    df = pd.DataFrame()
    df['sample_id'] = id_list
    df['file_path'] = file_path_list
    df['text'] = text_list
    df['tabel'] = table_list
    df = df.sort_values('sample_id')
    return df

# 文件处理太慢，可持续化保存文件
train_path = '../datasets/train.csv'
if os.path.exists(train_path):
    train_df = pd.read_csv(train_path)
else:
    train_df = get_dir_file('datasets/train_data/')
    train_df.to_csv(train_path,index=False)
    train_df = pd.read_csv(train_path)

test_path =  '../datasets/test.csv'
if os.path.exists(test_path):
    test_df = pd.read_csv(test_path)
else:
    test_df = get_dir_file('datasets/test_data/')
    test_df.to_csv(test_path,index=False)
    test_df = pd.read_csv(test_path)

train_outputs.head(2)
train_df.head(2)
test_df.head(2)

Unnamed: 0,sample_id,认购日期,理财产品名称,产品发行方名称,理财类型,认购金额(万元),产品起息日,产品到息日,产品期限,资金来源,实际购买公司名称,实际购买公司和上市公司关系,买卖方是否有关联关系,公告日期
0,1,2019-03-27,汇聚金1号,中融国际信托有限公司,信托,10000.0,2019-03-27,2019-09-23,180天,自有资金,恒生电子股份有限公司,公司本身,否,2019-04-25
1,1,2019-03-27,招商银行步步生金8699,招商银行,银行理财产品,200.0,2019-03-27,NaT,,自有资金,恒生电子股份有限公司,公司本身,否,2019-04-25


Unnamed: 0,sample_id,file_path,text,tabel
0,1,datasets/train_data/1.PDF,[' ...,"[[['', None, None, '', None, None, '', None, N..."
1,2,datasets/train_data/2.PDF,[' ...,"[[['', None, None, '', None, None, '', None, N..."


Unnamed: 0,sample_id,file_path,text,tabel
0,11188,datasets/test_data/11188.PDF,['北京京西文化旅游股份有限公司监事会\n \n \n关于使用部分闲置募集资金购买理财产品的...,[]
1,11189,datasets/test_data/11189.PDF,['北京京西文化旅游股份有限公司 \n监事会关于使用部分自有资金购买理财产品的意见 \n根据...,[]


In [4]:
# 构造训练集验证集
train_df = train_df.sample(frac=1, random_state=1017)
val_df = train_df[:1800]
train_df = train_df[1800:]

# 数据处理
## 抽取整体数据（一个sampleid内此字段内容都相同）
## 公告时间，实际购买公司

In [5]:
######计算文本位置长度#############
def len_count(my_list,my_str):
    my_len=0
    for i in my_list:
        my_len=my_len+len(i)
    my_len=my_len+(len(my_list)-1)*len(my_str)
    return my_len


######基于冒号做文本拼接与分割####效果优
def maohao_cat(text):
    text=text.replace(r"\n","^")
    text=text.replace(r"', '","^")
    text=text.replace(' ','')
    text=text.replace('（','(').replace('）',')').replace('；','^')     ###############;直接去掉,有极个别答案(答案中出现了引号)将会收到负反馈，多数正反馈
    text=re.sub('\^[1-9]\^','^',text)
    text=re.sub('\^\-[1-9]\-\^','^',text)
    text=re.sub("[（][^）]*?[\^]*$",'',text)
    text=re.sub('[，]*$','',text)
    pos_word=["名称","期限","年","月","日","时间","产品","资金","金额","天","来源","总额","类型","元","关系","无关","不存在关"]
    if '^' not in text:
        return 
    my_list=text.split("^")
    my_str=''
    str_list=[]
    # for i in my_list:
    #     if "：" not in i and ":" not in i and "无关" not in i and "不存在关" not in i and "名称" not in my_str:
    #         my_str=my_str+i
    #         # if len(i)<=6:
    #         #     str_list.append(my_str)      #############解决最后一行被后函数因字数过长舍弃的办法
    #         #     continue
    #     else:
    #         str_list.append(my_str)
    #         my_str=i
    
    for i in my_list:
        if "：" not in i and ":" not in i and len(re.sub('[^\u4e00-\u9fa5]*','',i))<5:
            my_str=my_str+i
            # if "不存在关" in my_str:
            #     str_list.append(my_str)
            #     my_str=''
            # if len(i)<=6:
            #     str_list.append(my_str)      #############解决最后一行被后函数因字数过长舍弃的办法
            #     continue
        else:
            str_list.append(my_str)
            my_str=i
    
    str_list.append(my_str)
    # print(my_list)
    # print('-----')
    # print(str_list)
    # print('-----')
    cat_text=''
    # print(str_list)
    for i in str_list:
        cat_text=cat_text+i+r'^'
        # print(cat_text)
    # print(cat_text.replace(r'^','\n'))
    return cat_text

    

        





######寻找文本中的答案##############
####以如下及冒号行做切片#############
def wjc_spl(text):
    global spl1      ###如下切片
    global maohao  ####设定冒号出现次数阈值
    global spl2      ###行切片
    global zishu   ####设定单行冒号后字数阈值
    maohao=3
    zishu=30
    my_str=""
    text=maohao_cat(text)    #########调用函数做text正规化
    spl2=[]
    spl3=[]
    neg_word=[]
    pos_word=["名称","期限","年","月","日","时间","产品","资金","金额","天","来源","总额","类型","元","关系","额","金"]
    # pos_word=["名称"]
    # print(len(text))
    if text==None:
        return -1,-1,-1
    text=text.replace(r"\n","^")
    # print(len(text))
    text=text.replace(r"', '","^")    ######解决换页符被置为', '的问题
    # print(len(text))
    text=text.replace(' ','')
    text=text.replace('（','(').replace('）',')').replace(':','：').replace('如^下：','如下：').replace('如下^：','如下：')
    # print(len(text))
    # print(text[:10000])
    if "如下：" not in text:
        return -1,-1,-1
    else:
        move=[]
        spl1=text.split("如下：")   #####首次切割####
        # print(len_count(spl1,'如下：'))
        # print(len(spl1[0])+len(spl1[1]))
        my_len=len(spl1[0])+len('如下：')
        spl1=spl1[1:]              ######第一次如下前的内容不关心##########
        len_list=[]
        tag1=[]
        # print(spl1)
        for i in spl1:
            len_list.append(len(i))
            if i.count("：")<=maohao:
                # print('aaaaaaa',len(len_list))
                tag1.append(len(len_list))
                move.append(i)
        # move=list(set(move))
        for x in move:
            # print('pppppppppppppppppppppppppppp')
            # print(x)
            spl1.remove(x)    ####去除冒号过少的部分切片
                
        # print(my_len+0)
        # print(spl1)
        for i in spl1:
            test=i.split('^')
            #print(test[10])
            for j in test:
                spl2.append(j)
            
        # print(spl2[100])
        move=[]
        for i in spl2:
            #print(i)
            if i.count("：")!=1 and i.count(":")!=1:
                # print(i.count("："))
                move.append(i)    ####以行做切片，去除单行里非只有一个冒号的行
        for x in move:
            # print(x)
            spl2.remove(x)
        # print(spl2[0])
        move=[]
        for i in spl2:
            judge1=re.sub('[a-zA-Z]','',i.split("：")[-1])
            judge1=re.sub(r"\d",'',judge1)
            judge1=re.sub(r'”','',judge1)
            judge1=re.sub(r'“','',judge1)
            judge1=re.sub(r'（','',judge1)
            judge1=re.sub(r'）','',judge1)
            # print(len(judge1))
            if len(judge1)>=zishu or len(judge1)<1:     #######去除冒号后关心字段所提中文内容过长或过短的行
                move.append(i)
                # print(i)
        for x in move:
            # print(x)
            spl2.remove(x)
        # print(spl2)
        move=[]
        # print(spl2)
        # print('-----------')

        # print(spl2)
        if spl2==[]:
            # print('该文本变量做行切片结果为空')
            return -1,-1,-1

        for i in spl2:
            x=0
            for j in pos_word:
                judge2=i.split("：")[0]      ########舍弃冒号前（分类字段）不包含pos_word的行
                if j not in judge2:
                    x=x+1
                if x==len(pos_word):
                    # print(i)
                    move.append(i)
            new_move=list(set(move))
            # print(new_move)
        for x in new_move:
            # print(x)
            spl2.remove(x)
            
        # print(spl2)

        


    
    tag1=0
    key_pos_word=['名称']                 ############计算一个pdf生成的dataframe一共需要几行，及每行需要多少列###########
    x=1
    ele_times=[]
    judge3=0
    for i in spl2:
        for j in key_pos_word:
            if j not in i:
                x=x+1                     ############若不含名称字段则结果会是实际+1###############
            else:
                ele_times.append(x)
                x=1
    
            # print(i)
            # print(x)
    ele_times.append(x)

    
    # print(ele_times)
    if len(ele_times)>1:
        mo_times=ele_times[0]+ele_times[-1]-1
        # print(type(mo_times))
        ele_times=ele_times[1:-1]
        ele_times.append(mo_times)
    else:
        test=0
        # print(len(spl2))
        # print(ele_times[0])
        if len(ele_times)==1 and ele_times[0]==len(spl2)+1:
            # print("不含名称字段")                              ##############此处需处理，或者返回一个标记以后处理  冒号行不包含名称字段的情况
            tag1=1
    

    ele_times1=ele_times
    # print(ele_times)
    # print(type(ele_times))

    tag2=0
    key_pos_word=['金额']                 ############计算一个pdf生成的dataframe一共需要几行，及每行需要多少列###########
    x=1
    ele_times=[]
    judge3=0
    for i in spl2:
        for j in key_pos_word:
            if j not in i:
                x=x+1                     ############若不含金额字段则结果会是实际+1###############
            else:
                ele_times.append(x)
                x=1
    
            # print(i)
            # print(x)
    ele_times.append(x)
    # print(ele_times)
    if len(ele_times)>1:
        mo_times=ele_times[0]+ele_times[-1]-1
        # print(type(mo_times))
        ele_times=ele_times[1:-1]
        ele_times.append(mo_times)
    else:
        test=0
        # print(len(spl2))
        # print(ele_times[0])
        if len(ele_times)==1 and ele_times[0]==len(spl2)+1:
            # print("不含金额字段")                              ##############此处需处理，或者返回一个标记以后处理  冒号行不包含金额字段的情况
            tag2=1
    
    ele_times2=ele_times
    # print(ele_times2)

    # print('---')

    

    if len(spl2)<=2:
        return -1,-1,-1

    

    # print(ele_times2)
    # if len(spl2)==1 and spl2[0]=='':
    #     return
    # if len(spl2)==1:
    #     spl2[0].replace(' ','')
   
    return spl2,ele_times1,tag1     ######仅返回“关心”的行切片    #############注意返回变量可能造成函数无法执行



def cut_list(text):
    spl2,ele_times1,tag1=wjc_spl(text)
    # print(spl2)
    # print(ele_times1)
    # print(tag1)
    my_list=[]
    if tag1==0:
        for i in ele_times1:
            # print(i)
            get=spl2[0:i]
            my_list.append(get)
            spl2=spl2[i:]
    else:
        return 0
    # print(my_list)
    return my_list



def get_mc(text):
    # my_list=cut_list(text)
    mc_list=[]
    my_list=cut_list(text)
    if my_list==0:
        return 
    else:
        my_list=cut_list(text)

        for i in my_list:
            this_str=i[0]
            if "名" not in this_str or "受托方" in this_str or "公司名称" in this_str:
                continue
            index=this_str.index("：")
            this_str=this_str[index+1:].replace('。','').replace('；','').replace(';','').replace('、','')
            mc_list.append(this_str)
            # print(mc_list)
    if mc_list ==[]:
        return
    return mc_list



def get_all(text):
    # my_list=cut_list(text)
    mc_list=[]
    my_list=cut_list(text)
    str_money=''
    str_len_time=''
    str_fxf=''    #####发行方####
    time_list=[]
    all_list=[]
    part_list=[]
    if my_list==0:
        return 
    else:
        my_list=cut_list(text)

        for i in my_list:
            for j in i:
                if "金额" in j or "总额" in j or "元" in j or ("币种" not in j and "人民币" in j):                     ########此处因可能造成晒错，所以统一遍历
                    str_money=j
                    part_list.append(str_money)
                    continue
                if "期限" in j:
                    str_len_time=j
                    part_list.append(str_len_time)
                if "无关" in j or "不存在关" in j or "受托方" in j:
                    str_fxf=j
                    part_list.append(str_fxf)
                if "年" in j and "月" in j and "日" in j:
                    time_list.append(j)
                    part_list.append(time_list)
            all_list.append(part_list)

    print(all_list)
    if all_list==[]:
        return 
    return all_list



# str_test=val_df['text'][660]
# print(type(val_df['text'][201]))
# spllll=wjc_spl(str_test)
#print(spllll)

# def zuhe(text):
#     my_str=maohao_cat(text)
#     spllll=wjc_spl(mystr)
#     return spllll


###test模块##########
# str_test=val_df['text'][201]
# # mystr=maohao_cat(str_test)
# # print(mystr)
# # spllll=wjc_spl(str_test)
# # cut_list(str_test)
# get_all(str_test)
# # print(times1)



############全test模块##########
val_text=val_df[['sample_id','text']]
# val_text['sample_id']=val_df['sample_id'].astype(str)
val_text['提取文本关键区域'] = val_df.progress_apply(lambda row: wjc_spl(row['text']), axis=1)
# df["列名"]=df.apply(lambda x:方法名(x,入参2),axis=1)
suc_val_text=val_text[val_text['提取文本关键区域'].notnull() & (val_text['提取文本关键区域'] != "")]
print('一共有',val_text.sample_id.size,'行')
print('其中text抽取了',suc_val_text.提取文本关键区域.size,'行')

suc_val_text['mc'] = val_df.progress_apply(lambda row: get_mc(row['text']), axis=1)

suc_val_text=suc_val_text.dropna()



# print(len(suc_val_text.iloc[858,3]))
# print(suc_val_text.iloc[858,3])
# suc_val_text = suc_val_text[suc_val_text['提取文本关键区域'].str.contains("万")]
# letters=0
# letter=''
# for i in suc_val_text['提取文本关键区域']:
#     if j in i:
#         letter=letter+j
#     if '万' in letter:
#         letters=letters+1
# print('万', letters)
# print(type(suc_val_text.iloc[1,3]))

100%|██████████| 1800/1800 [00:02<00:00, 789.16it/s]
  3%|▎         | 46/1800 [00:00<00:03, 454.61it/s]一共有 1800 行
其中text抽取了 1800 行
100%|██████████| 1800/1800 [00:03<00:00, 526.74it/s]


In [6]:
# 找到关键字在文本中的位置
def find_pos_in_text(text, keys, sample_id = 0):
    ret = []

    if not keys or len(keys) == 0 or not text:
        return None

    textWithArrow = re.sub(' +', ' ', text).replace(r'\n', '^').replace(' ', '^')
    textWithoutWhite = textWithArrow.replace('^', '')

    # 每个非空词的前缀空格个数
    seq, totalSpace = 0, 0
    preleadingSpaceDict = collections.defaultdict(int)
    for i in range(len(textWithArrow)):
        if (textWithArrow[i]) == '^':
            totalSpace += 1
            preleadingSpaceDict[seq] = totalSpace
        else:
            preleadingSpaceDict[seq] = totalSpace
            seq += 1        

    for key in keys:
        nkey = str(key).replace('(', r'\(').replace(')', r'\)').replace('[', r'\[').replace(']', r'\]')
        for it in re.finditer(nkey, textWithoutWhite):
            ret.append([key, it.span()[0]+preleadingSpaceDict[it.span()[0]], \
                it.span()[1]+preleadingSpaceDict[it.span()[1]], sample_id])

    if not len(ret):
        return None

    ret.sort(key = lambda x: x[1])

    return ret

_ok = 0
_fail = 0
def test_find_pos_in_text(sample_df, key_df):
    def valid_pos(train_pos):
        global _ok
        global _fail
        if not train_pos or len(train_pos) == 0:
            return

        for pos in train_pos:
            df = sample_df[sample_df['sample_id'] == pos[3]]
            text = re.sub(' +', ' ', df['text'].values[0]).replace(r'\n', '^')
            if text[pos[1]:pos[2]].replace(' ', '').replace('^', '') == pos[0]:
                _ok += 1
            else:
                print('sample_id=%s, 位置%d-%d, 长跑=%s, 提取到的内容=%s' % \
                    (pos[3], pos[1], pos[2], pos[0], text[pos[1]:pos[2]]))
                _fail +=1

    # 找到所有的key
    all_key = collections.defaultdict(list)
    def to_all_key(dict, row):
        dict[row['sample_id']] = row['mc']
    key_df.progress_apply(lambda row: to_all_key(all_key, row), axis=1)

    for sample_id in all_key:
        df = sample_df[sample_df['sample_id'] == sample_id]
        if len(df['text'].values) == 0:
            continue
        ret = find_pos_in_text(df['text'].values[0], all_key[sample_id], sample_id)
        valid_pos(ret)
        print(ret)
    
    print('成功: %d, 失败: %d' % (_ok, _fail))

test_find_pos_in_text(val_df, suc_val_text)



15, 522, 813], ['结构性存款', 2368, 2373, 813], ['结构性存款', 2415, 2420, 813], ['结构性存款', 2462, 2468, 813], ['结构性存款', 2505, 2511, 813], ['结构性存款', 2987, 2993, 813], ['结构性存款', 3434, 3439, 813], ['结构性存款', 3481, 3487, 813], ['结构性存款', 3769, 3775, 813], ['结构性存款', 3815, 3820, 813], ['结构性存款', 3865, 3870, 813], ['结构性存款', 4387, 4392, 813], ['结构性存款', 5111, 5117, 813], ['结构性存款', 5443, 5449, 813], ['结构性存款', 5512, 5517, 813]]
[['“一海通财.理财宝”系列收益凭证普通版182天期第983号”', 583, 615, 10951], ['“一海通财.理财宝”系列收益凭证普通版182天期第984号”', 870, 902, 10951]]
[['中国农业银行“金钥匙.安心得利”2016年第1157期人', 639, 669, 6470], ['“乾元—豫满添利”开放式资产组合型人民币理财产品', 1078, 1104, 6470], ['中国农业银行“本利丰步步高”开放式人民币理财产品', 2025, 2051, 6470], ['工银理财·浙江稳健1号法人开放式人民币理财产品', 2686, 2712, 6470], ['工银理财·浙江稳健1号法人开放式人民币理财产品', 2686, 2712, 6470], ['工银理财·浙江稳健1号法人开放式人民币理财产品', 3234, 3260, 6470], ['工银理财·浙江稳健1号法人开放式人民币理财产品', 3234, 3260, 6470], ['中国工商银行“工银同利”系列随心E人民币理财产品', 3789, 3816, 6470], ['中银日积月累-收益累进第5页共8页', 4469, 4496, 6470], ['中国农业银行“安心.灵动.20天”人民币理财产品', 5133, 5159, 6470]]
[['广州农商银行赢家月月盈

# 判断文本是否包含表格

In [122]:
def is_section_contain_row(section, row, sample_id = 0, table = [[]], statis = None):
    # 用\n分割表头
    ret = []
    for col in row:
        if col == None or len(col) == 0:
            continue
        ret.append(col.split("\n"))
    
    # 包含最多的是哪一个？
    cnt_map = collections.defaultdict(int)
    for row in ret:
        cnt_map[len(row)] += 1

    maxCnt = -1
    maxCntLen = 1
    for i in cnt_map:
        if cnt_map[i] > maxCnt:
            maxCnt = cnt_map[i]
            maxCntLen = i

    def escape_reg(r):
        return r.replace(' ', '').replace("\n", '').replace('(', r'[(]').replace(')', r'[)]')

    # 带所有关键字的正则
    keys = []
    for row in ret:
        if len(row) == maxCntLen:
            keys.append(row[-1])
    r = escape_reg(r'.*'.join(keys))

    # 去掉首位的正则
    r_2 = '!!!??!!!'
    if len(keys) > 4:
        r_2 = escape_reg(r'.*'.join(keys[1:-1]))

    section = section.replace(' ', '')
    
    if re.search(r, section) != None or re.search(r_2, section) != None :
        statis['regex'] += 1
        return True
    else:
        keys_str = ''.join(keys)
        score = 0
        match_line = ''
        for line in section.split(r'\n'):
            radio = fuzz.ratio(keys_str, line)
            if radio > score:
                score = radio
                match_line = line

        if score > 90:
            statis['fuzzy0'] += 1
            # print('%d: %s %d%%像 %s, %s' % (sample_id, keys_str, score, match_line, str(r)))
            return True

    keys = []
    for row in ret:
        keys += row
    # 都不行，用乱序匹配多行文本
    key_str = escape_reg(r''.join(keys))
    all_lines = section.split(r'\n')
    for i in range(len(all_lines)):
        line_str = ''
        k = 1
        while len(line_str) < len(key_str) and i + k < len(all_lines):
            line_str += all_lines[i+k]
            k += 1
        
        def sort_str(str):
            str_list = list(str)
            str_list.sort()
            return ''.join(str_list)
        radio = fuzz.token_sort_ratio(sort_str(line_str), sort_str(key_str))
        if radio > 80 and len(line_str) > 15:
            print(str(sample_id) + '|' + key_str + ' |' + str(radio) + '|' + line_str)
            statis['fuzzy1'] += 1
            return True
        # print(key_str + ' |' + str(radio) + '| '+str(len(key_str) - len(line_str))+'' + line_str)
    statis['err'] = (sample_id, key_str, r, section)
    return False

def is_section_contain_table(section, table, sample_id = 0, statis = None):
    if statis == None:
        statis = collections.defaultdict(int)
    for row in table:
        if is_section_contain_row(section, row, sample_id = sample_id, table = table, statis = statis):
            return True

    # print(statis['err'])
    return False

def test_is_section_contain_table_with_train_data():
    def check_witch_data(row, statis = None):
        statis['total'] += 1
        text = row['text']
        sample_id = row['sample_id']
        table = eval(row['tabel'])

        if table == None or len(table) == 0:
            statis['none'] += 1
            return

        table = table[0][:3]
        
        if is_section_contain_table(text, table, sample_id = sample_id, statis = statis):
            statis['ok'] += 1
        else:
            statis['fail'] += 1

    def nagative_check_witch_data(row, statis = None):
        table = eval(row['tabel'])
        if table == None or len(table) == 0:
            statis['none'] += 1
            return
        table = table[0][:3]

        def check(my_row, statis = None):
            statis['total'] += 1

            if is_section_contain_table(my_row['text'], table, sample_id = my_row['sample_id'], statis = statis):
                statis['ok'] += 1
            else:
                statis['fail'] += 1

        df = train_df[train_df['sample_id'] != row['sample_id']]
        print(row['sample_id'])
        df.head(1000).apply(lambda my_row: check(my_row, statis = statis), axis=1)

    # statis = collections.defaultdict(int)
    # # train_df[train_df['sample_id'] == 3103].progress_apply(lambda row: check_witch_data(row, statis = statis), axis=1)
    # train_df.progress_apply(lambda row: check_witch_data(row, statis = statis), axis=1)
    # print('\n正向执行结果: 总数:%d, 空table=%d, 【失败=%d】, 正则=%d, fuzzy0=%d, fuzzy1=%d' % \
    #     (statis['total'], statis['none'], statis['fail'], \
    #     statis['regex'], statis['fuzzy0'], statis['fuzzy1']))
    
    statis = collections.defaultdict(int)
    train_df.head(3).progress_apply(lambda row: nagative_check_witch_data(row, statis = statis), axis=1)
    print('\n反向结果:  总数:%d, 空table=%d, 成功=%d, 正常=%d, 【误匹配, fuzzy0=%d, fuzzy1=%d】' % \
        (statis['total'], statis['none'], statis['ok'], \
        statis['regex'], statis['fuzzy0'], statis['fuzzy1']))


test_is_section_contain_table_with_train_data()

0%|          | 0/3 [00:00<?, ?it/s]9362
100%|██████████| 3/3 [00:22<00:00,  7.49s/it]
反向结果:  总数:1000, 空table=2, 成功=222, 正常=222, 【误匹配, fuzzy0=0, fuzzy1=0】



#### 2.抽取实际购买公司

# 首先针对任务抽取时间（每个时间跟每个id是一一对应的）
# 要不是取第一个时间，要不就是取最后一个时间（或者时间加一）这里可以建立一个模型预测
# base这里面直接取最后一个时间作为发布日期

CN_NUM = {
    u'〇': 0, u'一': 1, u'二': 2, u'三': 3,
    u'四': 4, u'五': 5, u'六': 6, u'七': 7,
    u'八': 8, u'九': 9, u'零': 0, u'壹': 1,
    u'贰': 2, u'叁': 3, u'肆': 4, u'伍': 5,
    u'陆': 6, u'柒': 7, u'捌': 8, u'玖': 9,
    u'貮': 2, u'两': 2,
}


def get_put_time_from_text(row):
    row = row.replace(' ', '').replace('\\n', '')
    for key in CN_NUM:
        row = row.replace(key, str(CN_NUM[key]))   
    r = row.replace("年", "-").replace("月", "-").replace("日", " ").replace("/", "-").strip()
    regex = "(\d{4}-\d{1,2}-\d{1,2})"
    r = re.findall(regex, r)
    if len(r)==0:
        return np.nan
    time_str = r[-1]
    first = time_str.split('-')[0]
    second = time_str.split('-')[1]
    last = time_str.split('-')[-1]
    second = str.zfill(second, 2)
    last = str.zfill(last, 2)
    r = '-'.join([first, second, last])
    return r

val_result = pd.DataFrame()
val_result['sample_id'] = val_df['sample_id']
val_result['predict_time'] = val_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)
test_gg = train_outputs.groupby('sample_id').apply(lambda row:list(row['公告日期'])[0]).reset_index()
test_gg.columns = ['sample_id', 'time']
val_result = pd.merge(val_result, test_gg, on='sample_id', how='left')

# 判断验证集的准确率
np.sum(val_result['predict_time'].astype(str) == val_result['time'].astype(str))/len(val_result)

val_time = val_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)
test_time = test_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)

In [31]:
# 抽取购买公司
# 前几句话出现
# 将其按照\\n 和空格切割
def get_gm(row):
    result = re.split('[\\\\n ]',row)
    for i in result:
        if '公司' in i:
            return i

val_gm = val_df.progress_apply(lambda row:get_gm(row['text']), axis=1)
test_gm = test_df.progress_apply(lambda row:get_gm(row['text']), axis=1)
print(val_gm)

100%|██████████| 1800/1800 [00:00<00:00, 4894.71it/s]
100%|██████████| 8660/8660 [00:01<00:00, 8106.25it/s]1419         成都康弘药业集团股份有限公司
1480    海联金汇科技股份有限公司关于三级子公司
5755           四川广安爱众股份有限公司
2446          普莱柯生物工程股份有限公司
5657          江苏太平洋石英股份有限公司
               ...         
392         上海爱婴室商务服务股份有限公司
177              银泰资源股份有限公司
2767             兴业证券股份有限公司
6973         烟台东诚药业集团股份有限公司
4840             今创集团股份有限公司
Length: 1800, dtype: object



#### 3.清洗提取出来的tabel数据，主要是清洗掉有问题的列 

In [32]:
# 将table转换格式以及处理
def deal_tabel(row):
    row = eval(row)
    if len(row)==0:
        return []
    else:
        new_row = []
        for i in row:
            for d in i:
                new_temp = []
                for h in d:
                    # 这里处理空数据或者错误的数据
                    h = str(h).replace('None', '').replace('\n','').replace(' ', '')                    
                    if h=='':
                        continue
                    if h=='.':
                        continue
                    if h=='/':
                        continue
                    new_temp.append(h)
                new_row.append(new_temp)
        # 这里判断是否构成一个完整得认购数据(通过一个list进行判断)
        new_new_row = []
        for i in new_row:
            if len(i) == 0:
                continue
            elif len(i) <= 4:
                continue
            else:
                new_new_row.append(i)
        return new_new_row

print(val_df)
train_df_tabel = train_df['tabel'].progress_apply(lambda row:deal_tabel(row))
val_df_tabel = val_df['tabel'].progress_apply(lambda row:deal_tabel(row))
test_df_tabel = test_df['tabel'].progress_apply(lambda row:deal_tabel(row))

1%|▏         | 94/7217 [00:00<00:07, 897.98it/s]      sample_id                     file_path  \
1419       1739  datasets/train_data/1739.PDF   
1480       1806  datasets/train_data/1806.PDF   
5755       7032  datasets/train_data/7032.PDF   
2446       2974  datasets/train_data/2974.PDF   
5657       6903  datasets/train_data/6903.PDF   
...         ...                           ...   
392         442   datasets/train_data/442.PDF   
177         193   datasets/train_data/193.PDF   
2767       3359  datasets/train_data/3359.PDF   
6973       8553  datasets/train_data/8553.PDF   
4840       5913  datasets/train_data/5913.PDF   

                                                   text  \
1419  ['证券代码：002773                    证券简称：康弘药业    ...   
1480  [' \n证券代码：002537              证券简称：海联金汇       ...   
5755  ['证券代码：600979         证券简称：广安爱众     公告编号：临 201...   
2446  [' \n证券代码：603566         证券简称： 普莱柯       公告编号：...   
5657  ['证券代码：603688    证券简称：石英股份    公告编号：临 2017-071 ...   
...      

#### 4.抽取的是单独的数据包含
#### 起息日，到息日， 金额，认购日期，产品发行方，理财产品

In [33]:
# 直接提取时间
# 如果出现两个时间第一个就是起息日，第二个就是到期日
# 如果出现一个时间就是起息日
# 出现的第一个money就是最后的金额
# 从这里面抽取所有序列
# 这里认为有逗号出现的就是money

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        pass
 
    try:
        import unicodedata
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass
    return False

from src.time_extractor import TimeFinder
import datetime
def get_list_data(df):
    df = list(df)
    new_df = []
    for i in tqdm(df):
        temp_df = []
        for h in i:
            new_h = []
            for digital in h:
                if ',' in digital:
                    # 这里也是为了统一数据有些是用元，有些是用万元
                    try:
                        ttt = float(digital.replace(',', '').replace('万元', '').replace('人民币', '').replace('元', ''))
                    except Exception:
                        continue
                    if ttt > 20000:
                        ttt = ttt/10000
                    new_h.append(ttt)
                else:
                    continue
            if len(new_h) == 0:
                continue
            temp_single = {}
            a = '_'.join(h)
            # 抽取时间和money
            t = TimeFinder()
            time_all = t.find_time(a)
            if time_all == None:
                continue
            rgrq = time_all[0]
            cpqxr = time_all[0]
            if len(time_all) > 1:
                try:
                    cpdxr = time_all[1]
                    # 相减
                    d1 = datetime.datetime.strptime(cpqxr, '%Y-%m-%d')
                    d2 = datetime.datetime.strptime(cpdxr, '%Y-%m-%d')
                    d = d2 - d1
                    cpqx = str(d.days) + '天'
                except Exception:
                    cpdxr = np.nan
                    cpqx = np.nan
            else:
                cpdxr = np.nan
                cpqx = np.nan
                
            # 筛选出除开数字与包含时间的列
            # 末尾是
            last_two = ['公司', '银行', '信托', '证券',  '分行', '支行', '中心', '业部', '商行', '建行']
            mowei = np.nan
            selected_bank_and_works = []
            for l in h:
                new_l = list(str(l))
                new_l_test = ''.join(l[-2:])
                if new_l_test in last_two:
                    mowei = l
                    continue
                if '资金' in l:
                    continue
                if '收益' in l:
                    continue
                if '到期' in l:
                    continue
                if ',' in l:
                    continue
                if '.' in l:
                    continue
                if '/' in l:
                    continue
                if '年' in l:
                    continue
                if '-' in l:
                    continue
                if len(l) < 4:
                    continue
                if is_number(l):
                    continue
                selected_bank_and_works.append(l)
            if len(selected_bank_and_works) < 1:
                continue
            
            temp_single['认购日期'] = rgrq
            temp_single['产品起息日'] = cpqxr
            temp_single['产品到期日'] = cpdxr
            temp_single['产品期限'] = cpqx
            temp_single['认购金额(万元)'] = new_h[0]
            temp_single['产品发行方名称'] = mowei
            temp_single['理财产品名称'] = selected_bank_and_works[0]
            temp_df.append(temp_single)
        new_df.append(temp_df)
    return new_df

val_contain_date = get_list_data(val_df_tabel)
test_contain_data = get_list_data(test_df_tabel) 

100%|██████████| 1800/1800 [05:09<00:00,  5.81it/s]
 56%|█████▌    | 4863/8660 [07:37<05:57, 10.63it/s]


KeyboardInterrupt: 

#### 5.汇总整理数据

In [None]:
# 将前面提取到的数据整理成对应格式
sample_id_list = []
rgrq_list = []
lccp_list = []
cpfxf_list = []
rgje_list = []
cpqxr_list = []
cpdxr_list = []
cpqx_list = []
sjgmgsmc_list = []
ggrq_list = []

sample_id = list(val_df['sample_id'])
gg = list(val_gm)
time = list(val_time)
for i, value in enumerate(sample_id):
    for j in val_contain_date[i]:
        sample_id_list.append(sample_id[i])
        rgrq_list.append(j['认购日期'])
        lccp_list.append(j['理财产品名称'])
        cpfxf_list.append(j['产品发行方名称'])
        rgje_list.append(j['认购金额(万元)'])
        cpqxr_list.append(j['产品起息日'])
        cpdxr_list.append(j['产品到期日'])
        cpqx_list.append(j['产品期限'])
        sjgmgsmc_list.append(gg[i])
        ggrq_list.append(time[i])

result = pd.DataFrame()
result['sample_id'] = sample_id_list
result['认购日期'] = rgrq_list
result['理财产品名称'] = lccp_list
result['产品发行方名称'] = cpfxf_list
result['认购金额(万元)'] = rgje_list
result['产品起息日'] = cpqxr_list
result['产品到期日'] = cpdxr_list
result['产品期限'] = cpqx_list
result['实际购买公司名称'] = sjgmgsmc_list
result['公告日期'] = ggrq_list
val_result = result

In [None]:
sample_id_list = []
rgrq_list = []
lccp_list = []
cpfxf_list = []
rgje_list = []
cpqxr_list = []
cpdxr_list = []
cpqx_list = []
sjgmgsmc_list = []
ggrq_list = []

sample_id = list(test_df['sample_id'])
gg = list(test_gm)
time = list(test_time)
for i, value in enumerate(sample_id):
    for j in test_contain_data[i]:
        sample_id_list.append(sample_id[i])
        rgrq_list.append(j['认购日期'])
        lccp_list.append(j['理财产品名称'])
        cpfxf_list.append(j['产品发行方名称'])
        rgje_list.append(j['认购金额(万元)'])
        cpqxr_list.append(j['产品起息日'])
        cpdxr_list.append(j['产品到期日'])
        cpqx_list.append(j['产品期限'])
        sjgmgsmc_list.append(gg[i])
        ggrq_list.append(time[i])

result = pd.DataFrame()
result['sample_id'] = sample_id_list
result['认购日期'] = rgrq_list
result['理财产品名称'] = lccp_list
result['产品发行方名称'] = cpfxf_list
result['认购金额(万元)'] = rgje_list
result['产品起息日'] = cpqxr_list
result['产品到期日'] = cpdxr_list
result['产品期限'] = cpqx_list
result['实际购买公司名称'] = sjgmgsmc_list
result['公告日期'] = ggrq_list
test_result = result
test_result