# 导入相关包

In [1]:
# 导入相关包
import os
import pathlib as pl
import pandas as pd
import numpy as np
import re
from io import StringIO
from datetime import datetime 
import time
from IPython.core.interactiveshell import InteractiveShell
from tqdm.autonotebook import *
import pdfplumber
import collections
tqdm.pandas()
InteractiveShell.ast_node_interactivity = "all"
sys.path.append("..")


# PDF解析原始数据 
## 加载数据并采用pdfplumber抽取PDF中的文字和表格


In [2]:
# 数据准备(train_output文件中格式有点问题，需要提前用excel或者wps打开然后另存为excel文件)
train_outputs = pd.read_excel('../datasets/train_output.xlsx')

# 获取pdf中文字和表格
def extract_pdf_content(pdf_path):
    text_list = []
    table_list = []
    with pdfplumber.open(pdf_path) as pdf:
        for index_page in np.arange(0, len(pdf.pages), 1):
            # 读取多页
            page = pdf.pages[index_page]   # 第n页的信息
            text = page.extract_text()
            text_list.append(text)
            table = page.extract_tables()
            for t in table:
                table_list.append(t)
    return text_list, table_list

def get_dir_file(path):
    '''
    输入文件夹位置，输出整理好的dataframe
    '''
    path_list = os.listdir(path)
    id_list = []
    file_path_list = []
    text_list = []
    table_list = []
    for i in tqdm(path_list):
        if '.PDF' in i:
            file_path = path + i
            id_list.append(int(i.split('.')[0]))
            file_path_list.append(file_path)
            try:
                text_temp, table_temp = extract_pdf_content(file_path)
            except Exception:
                print('此pdf无法读取')
                text_temp, table_temp = [], []
            text_list.append(text_temp)
            table_list.append(table_temp)
            
    df = pd.DataFrame()
    df['sample_id'] = id_list
    df['file_path'] = file_path_list
    df['text'] = text_list
    df['tabel'] = table_list
    df = df.sort_values('sample_id')
    return df

# 文件处理太慢，可持续化保存文件
train_path = '../datasets/train.csv'
if os.path.exists(train_path):
    train_df = pd.read_csv(train_path)
else:
    train_df = get_dir_file('datasets/train_data/')
    train_df.to_csv(train_path,index=False)
    train_df = pd.read_csv(train_path)

test_path =  '../datasets/test.csv'
if os.path.exists(test_path):
    test_df = pd.read_csv(test_path)
else:
    test_df = get_dir_file('datasets/test_data/')
    test_df.to_csv(test_path,index=False)
    test_df = pd.read_csv(test_path)

train_outputs.head(2)
train_df.head(2)
test_df.head(5)

Unnamed: 0,sample_id,认购日期,理财产品名称,产品发行方名称,理财类型,认购金额(万元),产品起息日,产品到息日,产品期限,资金来源,实际购买公司名称,实际购买公司和上市公司关系,买卖方是否有关联关系,公告日期
0,1,2019-03-27,汇聚金1号,中融国际信托有限公司,信托,10000.0,2019-03-27,2019-09-23,180天,自有资金,恒生电子股份有限公司,公司本身,否,2019-04-25
1,1,2019-03-27,招商银行步步生金8699,招商银行,银行理财产品,200.0,2019-03-27,NaT,,自有资金,恒生电子股份有限公司,公司本身,否,2019-04-25


Unnamed: 0,sample_id,file_path,text,tabel
0,1,datasets/train_data/1.PDF,[' ...,"[[['', None, None, '', None, None, '', None, N..."
1,2,datasets/train_data/2.PDF,[' ...,"[[['', None, None, '', None, None, '', None, N..."


Unnamed: 0,sample_id,file_path,text,tabel
0,11188,datasets/test_data/11188.PDF,['北京京西文化旅游股份有限公司监事会\n \n \n关于使用部分闲置募集资金购买理财产品的...,[]
1,11189,datasets/test_data/11189.PDF,['北京京西文化旅游股份有限公司 \n监事会关于使用部分自有资金购买理财产品的意见 \n根据...,[]
2,11190,datasets/test_data/11190.PDF,['证券代码：000802 证券简称：北京文化 公告编号：2...,"[[['委托方', '受托方', '产品 \n类型', '认购金额\n（万元）', '起息日..."
3,11191,datasets/test_data/11191.PDF,['证券代码：000802 证券简称：北京文化 公告编号：2...,"[[['委托方', '受托方', '产品 \n类型', '认购金额\n（万元）', '起息日..."
4,11192,datasets/test_data/11192.PDF,['证券代码：000802 证券简称：北京文化 公告编号：2...,[]


In [3]:
# 构造训练集验证集
train_df = train_df.sample(frac=1, random_state=1017)
val_df = train_df[:1800]
train_df = train_df[1800:]

# 数据处理
## 抽取整体数据（一个sampleid内此字段内容都相同）
## 公告时间，实际购买公司

#### 1.抽取公告时间

In [4]:
# 首先针对任务抽取时间（每个时间跟每个id是一一对应的）
# 要不是取第一个时间，要不就是取最后一个时间（或者时间加一）这里可以建立一个模型预测
# base这里面直接取最后一个时间作为发布日期

##wjc字典里添加了十
CN_NUM = {
    u'月十日':'月10日',u'十月':'10月', u'十日': '0日',u'二十':'二',u'三十':'三', u'十': 1, u'○': 0, u'O':'0', u'Ο':'0',
    u'〇': 0, u'一': 1, u'二': 2, u'三': 3,
    u'四': 4, u'五': 5, u'六': 6, u'七': 7,
    u'八': 8, u'九': 9, u'零': 0, u'壹': 1,
    u'贰': 2, u'叁': 3, u'肆': 4, u'伍': 5,
    u'陆': 6, u'柒': 7, u'捌': 8, u'玖': 9,
    u'貮': 2, u'两': 2,  
}
######### u'○': 0, u'O':'0', u'Ο':'0',##########可删除这些字典，改用添加内容效果一致，这里暂且保留字典


def get_put_time_from_text(row):
    row = row.replace(' ', '').replace('\\n', '')

    ############添加#################改善年份中0有多个不识别的问题########
    start=1
    for word in '一二三四五六七八九':   
        row=re.sub('二.一'+word,'201'+str(start),row)
        start=start+1
    ########添加############慢#######################

    for key in CN_NUM:
        row = row.replace(key, str(CN_NUM[key]))   

    r = row.replace("年", "-").replace("月", "-").replace("日", " ").replace("/", "-").strip()
    regex = "(\d{4}-\d{1,2}-\d{1,2})"
    r = re.findall(regex, r)
    if len(r)==0:
        return np.nan
    time_str = r[-1]
    first = time_str.split('-')[0]
    second = time_str.split('-')[1]
    last = time_str.split('-')[-1]
    second = str.zfill(second, 2)
    last = str.zfill(last, 2)
    r = '-'.join([first, second, last])
    return r




val_result = pd.DataFrame()
val_result['sample_id'] = val_df['sample_id']
val_result['predict_time'] = val_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)
test_gg = train_outputs.groupby('sample_id').apply(lambda row:list(row['公告日期'])[0]).reset_index()
test_gg.columns = ['sample_id', 'time']
val_result = pd.merge(val_result, test_gg, on='sample_id', how='left')
##val_result['time'].head(10)
###val_result['predict_time'].head(10)

np.set_printoptions(threshold=np.inf)
pd.set_option('max_colwidth',100)

testid=6375
print(train_df[train_df.sample_id==testid]['text'].astype(str))
print(val_df[val_df.sample_id==testid]['text'].astype(str))


# 对于每一行，通过列名name访问对应的元素
for index,row in val_result.iterrows():
    #type(row['predict_time'])
    #print(row['sample_id'],row['predict_time']) # 输出每一行
    try:
        val_result.at[index,'predict_time1']=datetime.strptime(row['predict_time'], "%Y-%m-%d").strftime('%Y/%m/%d')
        #val_result.at[index,'predict_time1']=time.strptime(row['predict_time'], "%Y-%m-%d")
    except:
        #val_result.at[index,'predict_time1']='1900/01/01'
        continue
        # print(row['sample_id'])
        # print(val_df[val_df.sample_id==row['sample_id']]['text'].astype(str))
        
for index,row in val_result.iterrows():
    val_result.at[index,'time1']=val_result.at[index,'time'].strftime("%Y-%m-%d %H:%M:%S")[0:10]
    val_result.at[index,'time2']=val_result.at[index,'time1'].replace('-','')
    try:
        val_result.at[index,'predict_time2']=val_result.at[index,'predict_time'].replace('-','')
    except:
        continue
    #######粗计算时间具体误差################
    val_result.at[index,'差值']=(int)(val_result.at[index,'predict_time2'])-(int)(val_result.at[index,'time2'])

fail_val_result=val_result[val_result['差值']!=0]
#######抽取错误列放入fail的dataframe########################


# val_result['predict_time_inf']=pd.to_datetime(val_result['predict_time'],format='%Y/%m/%d')
# val_result['predict_time_inf'].head(10)

#val_result['predict_time1'] = datetime.strptime(val_result['predict_time'], "%Y-%m-%d").strftime('%Y/%m/%d')
#val_result['predict_time_inf']=pd.to_datetime(val_result['predict_time1'],format='%Y/%m/%d')
#val_result['日期差值']=val_result['predict_time_inf']-val_result['time']
#val_result.head(10)

# 判断验证集的准确率
np.sum(val_result['predict_time'].astype(str) == val_result['time'].astype(str))/len(val_result)

val_time = val_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)
test_time = test_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)



100%|██████████| 1800/1800 [00:00<00:00, 2868.79it/s]
Series([], Name: text, dtype: object)
5162    [None]
Name: text, dtype: object


0.5883333333333334

100%|██████████| 1800/1800 [00:00<00:00, 2689.48it/s]
100%|██████████| 8660/8660 [00:02<00:00, 4201.86it/s]


#### 2.抽取实际购买公司

In [5]:
# 抽取购买公司
# 前几句话出现
# 将其按照\\n 和空格切割
def get_gm(row):
    row=row.replace('[','').replace(']','').replace('\'','')    ######粗datawash,去除部分典型符号###########
    #head_row=re.split('资金',row)[0]
    #head_row=re.split('使用',head_row)[0]
    #tag_head_row=headrow.replace('[\\\\n ]','$|$')
    result = re.split('[\\\\n ]',row)
    for i in result:
        if '公司' in i:
            i=i.replace('（','(').replace('）',')') ##########修改中文括号#################
            regex="(^.*公司)"##################此四行为添加的##############################
            i=re.findall(regex, i)###########取此行最后一个“公司”前的字符#######
            i=i[0]###############格式转换，list取出str####################################
            if i=='公司':
                continue   ###########跳过字段为公司的答案进入下一个循环#####################
            return i




########################粗改进######################################
def my_get_gm(row):
    row=row.replace('[','').replace(']','').replace('\'','')    ######粗datawash,去除部分典型符号###########
    re_row = re.split('[\\\\n ]',row)
    
    ##################取第一次出现公司的行开始计算###################
    for i in re_row:
        if '公司' not in i:
            re_row.remove(i)
        else:
            break
    
    head_row=""
    #################list替换为文本##########################
    for i in re_row:
        head_row=head_row+i+'$|$'
    
    

    # print(type(head_row.find("公告编号")))
    # head_row=head_row[head_row.find("公告编号"):-1]

    # head_row=head_row.split("^.*公告编号：")[0]
    # head_row_a=head_row.split("资金")[0]
    # head_row=head_row_a.split("使用")[0]
    # head_row=head_row_a.split("委托")[0]
    # head_row=head_row_a.split("董事")[0]
    head_row=head_row.split("理财")[0].split("资金")[0].split("使用")[0].split("董事")[0]
    # print(head_row_a)
    tag_head_row=head_row.replace(' ','').replace('子公司','$|$').replace('公司','公司$|$').replace('关于','$|$').replace('-','$|$')
    # spl_head_row = re.split('\$\|\$',tag_head_row)
    spl_head_row = tag_head_row.split('$|$')
    spl_head_row.reverse()
    result=spl_head_row
    for i in result:
        if '公司' in i:
            i=i.replace('（','(').replace('）',')') ##########修改中文括号#################
            regex="(^.*公司)"##################此四行为添加的##############################
            i=re.findall(regex, i)###########取此行最后一个“公司”前的字符#######
            i=i[0]###############格式转换，list取出str####################################
            if i=='公司' or len(i)<=4 or '”' in i or '“' in i or '简称' in i:
                continue   ###########跳过字段为公司的答案进入下一个循环#####################
            return i
#################到此为止####################################




########################二次改进,文本内容查找追加，成功率暂时比不追加要低一点######################################
def my_get_gm2(row):
    row=row.replace('[','').replace(']','').replace('\'','')    ######粗datawash,去除部分典型符号###########
    re_row = re.split('[\\\\n ]',row)
    
    ###################取第一次出现公司的行开始计算###################
    for i in re_row:
        if '公司' not in i:
            re_row.remove(i)
        else:
            break
    
    
    head_row=""
    #################list替换为文本##########################
    for i in re_row:
        head_row=head_row+i+'$|$'
    
    #################去除标题#############################
    text_row1=head_row[head_row.find("公告"):-1]
    text_row2=head_row[head_row.find("意见"):-1]
    regex = "(子公司.*公司.*购买)"
    text1 = re.findall(regex, text_row1)
    text2 = re.findall(regex, text_row2)
    text=text1+text2
    for i in text:
        if '$|$' in i:
            text.remove(i)
    my_list=[]
    for i in text:
        spl_i=i.replace("购买","$|$").replace("子公司","$|$").replace("公司","公司$|$")
        spl_i=spl_i.split("$|$")
        spl_i.reverse()
        for j in spl_i:
            if '公司' in j:
                j=j.replace('（','(').replace('）',')')
                regex="(^.*公司)"
                j=re.findall(regex, j)
                j=j[0]
                if j=='公司' or len(j)<=4 or len(j)>30 or '”' in j or '“' in j or '简称' in j:
                    continue
                my_list.append(j)
    #print(text1)

    
    
    head_row=head_row.split("理财")[0].split("资金")[0].split("使用")[0].split("董事")[0]
    # print(head_row_a)
    tag_head_row=head_row.replace(' ','').replace('子公司','$|$').replace('公司','公司$|$').replace('关于','$|$').replace('-','$|$')
    # spl_head_row = re.split('\$\|\$',tag_head_row)
    spl_head_row = tag_head_row.split('$|$')
    spl_head_row.reverse()
    result=spl_head_row
    for i in result:
        if '公司' in i:
            i=i.replace('（','(').replace('）',')') ##########修改中文括号#################
            regex="(^.*公司)"##################此四行为添加的##############################
            i=re.findall(regex, i)###########取此行最后一个“公司”前的字符#######
            i=i[0]###############格式转换，list取出str####################################
            if i=='公司' or len(i)<=4 or '”' in i or '“' in i or '简称' in i:
                continue   ###########跳过字段为公司的答案进入下一个循环#####################
            my_list.insert(0,i)
            i=my_list[-1]
            return i
#################到此为止####################################





###val_gm = val_df.progress_apply(lambda row:get_gm(row['text']), axis=1)
val_gm = val_df.progress_apply(lambda row:my_get_gm(row['text']), axis=1)
#test_gm = test_df.progress_apply(lambda row:get_gm(row['text']), axis=1)
test_gm = test_df.progress_apply(lambda row:my_get_gm(row['text']), axis=1)


my_val_result = pd.DataFrame()
my_val_result['sample_id'] = val_df['sample_id']
my_val_result['predict_gs'] = val_df.progress_apply(lambda row: my_get_gm(row['text']), axis=1)
my_test_gs = train_outputs.groupby('sample_id').apply(lambda row:list(row['实际购买公司名称'])[0]).reset_index()
my_test_gs.columns = ['sample_id', 'gs']
my_val_result = pd.merge(my_val_result, my_test_gs, on='sample_id', how='left')
my_val_result['是否相等']=my_val_result['predict_gs']==my_val_result['gs']
fail_my_val_result=my_val_result[my_val_result['是否相等']!=True]


np.set_printoptions(threshold=np.inf)
pd.set_option('max_colwidth',100)

testid=3597
print(train_df[train_df.sample_id==testid]['text'].astype(str))
print(val_df[val_df.sample_id==testid]['text'].astype(str))

# 判断验证集的准确率
np.sum(my_val_result['predict_gs'].astype(str) == my_val_result['gs'].astype(str))/len(my_val_result)

100%|██████████| 1800/1800 [00:01<00:00, 1036.97it/s]
100%|██████████| 8660/8660 [00:06<00:00, 1359.76it/s]
100%|██████████| 1800/1800 [00:02<00:00, 893.25it/s]
Series([], Name: text, dtype: object)
2929    [None, None, None, None, None]
Name: text, dtype: object


0.7972222222222223

In [16]:
######计算文本位置长度#############
def len_count(my_list,my_str):
    my_len=0
    for i in my_list:
        my_len=my_len+len(i)
    my_len=my_len+(len(my_list)-1)*len(my_str)
    return my_len


######基于冒号做文本拼接与分割####效果优
def maohao_cat(text):
    text=text.replace('•','·')
    text=text.replace(r"\n","^")
    text=text.replace(r"', '","^")
    text=text.replace(' ','')
    text=text.replace('（','(').replace('）',')').replace('；','^')     ###############;直接去掉,有极个别答案(答案中出现了引号)将会收到负反馈，多数正反馈
    text=re.sub('\^[1-9]\^','^',text)
    text=re.sub('\^\-[1-9]\-\^','^',text)
    text=re.sub("[（][^）]*?[\^]*$",'',text)
    text=re.sub('[，]*$','',text)
    pos_word=["名称","期限","年","月","日","时间","产品","资金","金额","天","来源","总额","类型","元","关系","无关","不存在关","公司","额","金","受托","签约银行","银行"]
    if '^' not in text:
        return 
    my_list=text.split("^")
    my_str=''
    str_list=[]
    # for i in my_list:
    #     if "：" not in i and ":" not in i and "无关" not in i and "不存在关" not in i and "名称" not in my_str:
    #         my_str=my_str+i
    #         # if len(i)<=6:
    #         #     str_list.append(my_str)      #############解决最后一行被后函数因字数过长舍弃的办法
    #         #     continue
    #     else:
    #         str_list.append(my_str)
    #         my_str=i
    
    for i in my_list:
        if "：" not in i and ":" not in i and len(re.sub('[^\u4e00-\u9fa5]*','',i))<12:
            my_str=my_str+i
            # if "不存在关" in my_str:
            #     str_list.append(my_str)
            #     my_str=''
            # if len(i)<=6:
            #     str_list.append(my_str)      #############解决最后一行被后函数因字数过长舍弃的办法
            #     continue
        else:
            str_list.append(my_str)
            my_str=i
    
    str_list.append(my_str)
    # print(my_list)
    # print('-----')
    # print(str_list)
    # print('-----')
    cat_text=''
    # print(str_list)
    for i in str_list:
        cat_text=cat_text+i+r'^'
        # print(cat_text)
    # print(cat_text.replace(r'^','\n'))
    return cat_text

    
############单独查找名称函数###########
def new_get_mc(text):
    text=maohao_cat(text)  
    if text==None:
        return 
    text=text.replace(r"\n","^")
    text=text.replace(r"', '","^")
    text=text.replace(' ','')
    text=text.replace('（','(').replace('）',')').replace(':','：')
    
    return 0





######寻找文本中的答案##############
####以如下及冒号行做切片#############
def wjc_spl(text):
    spl1=[]      ###如下切片
    maohao=3  ####设定冒号出现次数阈值
    zishu=40   ####设定单行冒号后字数阈值
    my_str=""
    text=maohao_cat(text)    #########调用函数做text正规化
    spl2=[]      ###行切片
    spl3=[]
    neg_word=[]
    pos_word=["名称","期限","年","月","日","时间","产品","资金","金额","天","来源","总额","类型","元","关系","额","金","受托","签约银行"]
    # pos_word=["名称"]
    # print(len(text))

    

    if text==None:
        return -1,-1,-1
    text=text.replace(r"\n","^")
    # print(len(text))
    text=text.replace(r"', '","^")    ######解决换页符被置为', '的问题
    # print(len(text))
    text=text.replace(' ','')
    text=text.replace('（','(').replace('）',')').replace(':','：').replace('如^下：','如下：').replace('如下^：','如下：') #######此处旨在解决如下失效问题
    biaoti=['一、','二、','三、','四、','五、','六、','七、','八、','九、','十、']
    for i in biaoti:
        text=text.replace(i,'如下：')
    # print(len(text))
    # print(text[:10000])
    if "如下：" not in text:
        return -1,-1,-1
    else:
        move=[]
        spl1=text.split("如下：")   #####首次切割####
        # print(len_count(spl1,'如下：'))
        # print(len(spl1[0])+len(spl1[1]))
        my_len=len(spl1[0])+len('如下：')
        spl1=spl1[1:]              ######第一次如下前的内容不关心##########
        len_list=[]
        tag1=[]
        # print(spl1)
        for i in spl1:
            len_list.append(len(i))
            if i.count("：")<=maohao:
                # print('aaaaaaa',len(len_list))
                tag1.append(len(len_list))
                move.append(i)
        # move=list(set(move))
        for x in move:
            # print('pppppppppppppppppppppppppppp')
            # print(x)
            spl1.remove(x)    ####去除冒号过少的部分切片
                
        # print(my_len+0)
        # print(spl1)
        for i in spl1:
            test=i.split('^')
            #print(test[10])
            for j in test:
                spl2.append(j)
            
        # print(spl2[100])
        move=[]
        for i in spl2:
            #print(i)
            if i.count("：")!=1 and i.count(":")!=1:
                # print(i.count("："))
                move.append(i)    ####以行做切片，去除单行里非只有一个冒号的行
        for x in move:
            # print(x)
            spl2.remove(x)
        # print(spl2[0])
        move=[]
        for i in spl2:
            judge1=re.sub('[a-zA-Z]','',i.split("：")[-1])
            judge1=re.sub(r"\d",'',judge1)
            judge1=re.sub(r'”','',judge1)
            judge1=re.sub(r'“','',judge1)
            judge1=re.sub(r'（','',judge1)
            judge1=re.sub(r'）','',judge1)
            # print(len(judge1))
            if len(judge1)>=zishu or len(judge1)<1:     #######去除冒号后关心字段所提中文内容过长或过短的行
                move.append(i)
                # print(i)
        for x in move:
            # print(x)
            spl2.remove(x)
        # print(spl2)
        move=[]
        # print(spl2)
        # print('-----------')

        # print(spl2)
        if spl2==[]:
            # print('该文本变量做行切片结果为空')
            return -1,-1,-1

        for i in spl2:
            x=0
            for j in pos_word:
                judge2=i.split("：")[0]      ########舍弃冒号前（分类字段）不包含pos_word的行
                if j not in judge2:
                    x=x+1
                if x==len(pos_word):
                    # print(i)
                    move.append(i)
            new_move=list(set(move))
            # print(new_move)
        for x in new_move:
            # print(x)
            spl2.remove(x)
            
        # print(spl2)

        


    
    tag1=0
    key_pos_word=['名称']                 ############计算一个pdf生成的dataframe一共需要几行，及每行需要多少列###########
    x=1
    ele_times=[]
    judge3=0
    for i in spl2:
        for j in key_pos_word:
            if j not in i:
                x=x+1                     ############若不含名称字段则结果会是实际+1###############
            else:
                ele_times.append(x)
                x=1
    
            # print(i)
            # print(x)
    ele_times.append(x)

    
    # print(ele_times)
    if len(ele_times)>1:
        mo_times=ele_times[0]+ele_times[-1]-1
        # print(type(mo_times))
        ele_times=ele_times[1:-1]
        ele_times.append(mo_times)
    else:
        test=0
        # print(len(spl2))
        # print(ele_times[0])
        if len(ele_times)==1 and ele_times[0]==len(spl2)+1:
            # print("不含名称字段")                              ##############此处需处理，或者返回一个标记以后处理  冒号行不包含名称字段的情况
            tag1=1
    

    ele_times1=ele_times
    # print(ele_times)
    # print(type(ele_times))

    tag2=0
    key_pos_word=['金额']                 ############计算一个pdf生成的dataframe一共需要几行，及每行需要多少列###########
    x=1
    ele_times=[]
    judge3=0
    for i in spl2:
        for j in key_pos_word:
            if j not in i:
                x=x+1                     ############若不含金额字段则结果会是实际+1###############
            else:
                ele_times.append(x)
                x=1
    
            # print(i)
            # print(x)
    ele_times.append(x)
    # print(ele_times)
    if len(ele_times)>1:
        mo_times=ele_times[0]+ele_times[-1]-1
        # print(type(mo_times))
        ele_times=ele_times[1:-1]
        ele_times.append(mo_times)
    else:
        test=0
        # print(len(spl2))
        # print(ele_times[0])
        if len(ele_times)==1 and ele_times[0]==len(spl2)+1:
            # print("不含金额字段")                              ##############此处需处理，或者返回一个标记以后处理  冒号行不包含金额字段的情况
            tag2=1
    
    ele_times2=ele_times
    # print(ele_times2)

    # print('---')

    

    if len(spl2)<=2:
        return -1,-1,-1

    

    # print(ele_times2)
    # if len(spl2)==1 and spl2[0]=='':
    #     return
    # if len(spl2)==1:
    #     spl2[0].replace(' ','')
   
    return spl2,ele_times1,tag1     ######仅返回“关心”的行切片    #############注意返回变量可能造成函数无法执行



def cut_list(text):
    spl2,ele_times1,tag1=wjc_spl(text)
    # print(spl2)
    # print(ele_times1)
    # print(tag1)
    my_list=[]
    if tag1==0:
        for i in ele_times1:
            # print(i)
            get=spl2[0:i]
            my_list.append(get)
            spl2=spl2[i:]
    else:
        return 0
    # print(my_list)
    return my_list



def get_mc(text):
    # my_list=cut_list(text)
    mc_list=[]
    my_list=cut_list(text)
    my_ziduan=''
    str_spe=''
    if my_list==0:
        return 
    else:
        my_list=cut_list(text)

        for i in my_list:
            this_str=i[0]
            if "名" not in this_str or "受托方" in this_str or "公司名称" in this_str:
                continue
            index=this_str.index("：")
            this_str=this_str[index+1:].replace('。','').replace('；','').replace(';','').replace('、','')
            this_str=re.sub('[(][以下简].*','',this_str)
            this_str=re.sub("[(][^)]*?[\^]*$",'',this_str)
            this_str=re.sub("[，].*$",'',this_str)
            if "银行" in this_str:
                if len(this_str.split("银行")[1])==0:
                    return 
            if "支行" in this_str:
                if len(this_str.split("支行")[1])==0:
                    return 
            if "产品购" in this_str:
                this_str=re.sub('[产品购].*','产品',this_str)

            mc_list.append(this_str)
            # print(mc_list)
    if mc_list ==[]:
        return
    return mc_list



def get_time(text):
    my_list=[]
    my_list,ele_times1,tag1=wjc_spl(text)
    # print(my_list)
    if tag1!=0:
        return
    str_time=''
    str_time_list=[]
    alist=[]
    tmp_list=[]
    if my_list==0:
        # print(111)
        return 
    # if tag1==0:
    #     for i in ele_times1:
    #         # print(i)
    #         get=spl2[0:i]
    #         my_list.append(get)
    #         spl2=spl2[i:]
    # else:
    #     return 0
    else:
        for i in my_list:
            alist=[]
            print(i)
            for j in i:
                # print(j)
                if "年" in j and "月" in j and "日" in j:
                    
                    str_time=re.sub("^.*：","",j)
                    str_time=re.sub("^.*:","",str_time)
                    print(111)
                if '日' not in str_time:
                    return 
                tmp_list=str_time_list.split("日")[0:-1]
                
                for k in tmp_list:
                    str_time=str_time+k+'日'
            alist.append(str_time)
            
        str_time_list.append(alist)
    return str_time_list

        




def spl2_iswm(text):
    spl2,ele_times1,tag1=wjc_spl(text)
    # print(spl2)
    # print(ele_times1)
    # print(tag1)
    my_list=[]
    if tag1==1:
        return "无名称"
    else:
        return ""



# def get_all(text):
#     # my_list=cut_list(text)
#     mc_list=[]
#     my_list=cut_list(text)
#     str_money=''
#     str_len_time=''
#     str_fxf=''    #####发行方####
#     time_list=[]
#     all_list=[]
#     part_list=[]
#     if my_list==0:
#         return 
#     else:
#         my_list=cut_list(text)

#         for i in my_list:
#             for j in i:
#                 if "金额" in j or "总额" in j or "元" in j or ("币种" not in j and "人民币" in j):                     ########此处因可能造成晒错，所以统一遍历
#                     str_money=j
#                     part_list.append(str_money)
#                     continue
#                 if "期限" in j:
#                     str_len_time=j
#                     part_list.append(str_len_time)
#                 if "无关" in j or "不存在关" in j or "受托方" in j:
#                     str_fxf=j
#                     part_list.append(str_fxf)
#                 if "年" in j and "月" in j and "日" in j:
#                     time_list.append(j)
#                 part_list.append(time_list)
#                 all_list.append(part_list)
#             part_list=[]
#     print('---------')
#     print(all_list)
#     if all_list==[]:
#         return 
#     return all_list





# str_test=val_df['text'][660]
# print(type(val_df['text'][201]))
# spllll=wjc_spl(str_test)
#print(spllll)

# def zuhe(text):
#     my_str=maohao_cat(text)
#     spllll=wjc_spl(mystr)
#     return spllll


###test模块##########
# str_test=val_df['text'][226]
# # print(str_test)
# # mystr=maohao_cat(str_test)
# # print(mystr)
# spllll=get_time(str_test)
# # cut_list(str_test)
# # get_all(str_test)
# # print(times1)
# print(spllll)



############全test模块##########
val_text=val_df[['sample_id','text']]
# val_text['sample_id']=val_df['sample_id'].astype(str)
val_text['提取文本关键区域'] = val_df.progress_apply(lambda row: wjc_spl(row['text']), axis=1)
# df["列名"]=df.apply(lambda x:方法名(x,入参2),axis=1)
suc_val_text=val_text[val_text['提取文本关键区域'].notnull() & (val_text['提取文本关键区域'] != "")]
print('一共有',val_text.sample_id.size,'行')
print('其中text抽取了',suc_val_text.提取文本关键区域.size,'行')

suc_val_text['mc'] = val_df.progress_apply(lambda row: get_mc(row['text']), axis=1)
suc_val_text['以名称拆分的list'] = val_df.progress_apply(lambda row: cut_list(row['text']), axis=1)

suc_val_text=suc_val_text.dropna()


# ##########测试无名称的sampleid######    #######仅测试冒号无缺失情况
wm_val_text=val_df[['sample_id','text']]
wm_val_text['无名称标记']=val_df.progress_apply(lambda row: spl2_iswm(row['text']), axis=1)
wm_val_text=wm_val_text[wm_val_text['无名称标记'].notnull() & (wm_val_text['无名称标记'] != "")]
wm_val_text['提取文本关键区域'] = val_df.progress_apply(lambda row: wjc_spl(row['text']), axis=1)

wm_val_text=wm_val_text.dropna()


# print(wm_val_text[5002,'text'])
# print(len(suc_val_text.iloc[858,3]))
# print(suc_val_text.iloc[858,3])
# suc_val_text = suc_val_text[suc_val_text['提取文本关键区域'].str.contains("万")]
# letters=0
# letter=''
# for i in suc_val_text['提取文本关键区域']:
#     if j in i:
#         letter=letter+j
#     if '万' in letter:
#         letters=letters+1
# print('万', letters)
# print(type(suc_val_text.iloc[1,3]))

100%|██████████| 1800/1800 [00:03<00:00, 522.45it/s]
  2%|▏         | 32/1800 [00:00<00:05, 315.91it/s]一共有 1800 行
其中text抽取了 1800 行
100%|██████████| 1800/1800 [00:05<00:00, 311.79it/s]
100%|██████████| 1800/1800 [00:03<00:00, 479.71it/s]
100%|██████████| 1800/1800 [00:03<00:00, 565.99it/s]
100%|██████████| 1800/1800 [00:03<00:00, 560.02it/s]


In [8]:
#text=text.replace(r'[ ]+',' ').replace('\r','^')
def get_title(text):
    global title_num_char
    title_list=[]
    title_type_list=[]
    text_start_iter_list=[]
    text_end_iter_list=[]
 
    for item in title_num_char:
        pattern = re.compile(item+"[0-9A-Za-z\u4e00-\u9fa5 ()\[\]（）【】：:]*?[\^]")
        pattern = re.compile(item+"[0-9A-Za-z\u4e00-\u9fa5 ()\[\]（）【】：:]*?[\^]")
        #pattern = re.compile(item+r"[ ]*?[^ ]+?[ ]")
        #pattern = re.compile(item+"[0-9A-Za-z\u4e00-\u9fa5 ()[]（）【】][]")#*?[\^]
        tmp=pattern.finditer(text)
        for i in tmp:
            title_list.append(i.group())
            text_start_iter_list.append(i.span(0)[0])
            title_type_list.append(1)
            text_end_iter_list.append(i.span(0)[1])

    # for item in title_list:
    for item in s_title_num_char:
        # pattern = re.compile(item+r"[ ]*?[ ]*?[\d][^ ]+?:?[ ]?") 
        # pattern = re.compile(item+"[0-9A-Za-z\u4e00-\u9fa5\^ ]*")
        # pattern = re.compile(item+'[0-9\u4e00-\u9fa5()[]（）【】*?[\^]]')*?[\^]
    #    pattern = re.compile(item+r"[\d]*?\u4e00-\u9fa5+[ ]")
        pattern = re.compile(item+"[0-9A-Za-z\u4e00-\u9fa5\"(\.*?)\" ()\[\]（）【】：:]*?[\^]")
        tmp=pattern.finditer(text)
        for i in tmp:
            title_list.append(i.group())
            text_start_iter_list.append(i.span(0)[0])
            title_type_list.append(1)
            text_end_iter_list.append(i.span(0)[1])

    title_list.append("引言")
    title_type_list.append(1)
    text_start_iter_list.append(0)
    text_end_iter_list.append(0)

    result_df=pd.DataFrame([title_list,title_type_list,text_start_iter_list,text_end_iter_list]).T.sort_values(by=2).reset_index(drop=True)
    # print(result_df)
    return result_df

def get_title_text(text,title_df):
    # print(title_df)
    title_1_df=title_df[title_df[1]==1]
    text_iter_list=[]
    text_list=[]
    # print(title_1_df)
    for iter1,iter2 in title_1_df[[2,3]].values:
        # print(iter1)
        if(len(text_iter_list)!=0):
            text_iter_list.append(iter1)
        text_iter_list.append(iter2)
    # text_iter_list.append(text_iter_list[len(text_iter_list)-1])
    text_iter_list.append(len(text))
    for index in range(int(len(text_iter_list)/2)):
        text_list.append(text[text_iter_list[2*index]:text_iter_list[2*index+1]])
    
    title_1_df[4]=text_list

    return title_1_df.reset_index(drop=True)

#from fuzzywuzzy import fuzz
def judge_title(sample_id=0,text=r"test\n"):
    # print(text)
    text=text.replace(r"\n","^").replace(r'[ ]+',' ')
    title_df=get_title(text)
    title_df["sample_id"]=[sample_id for x in range(title_df.shape[0])]
    # print(title_df)`
    title_1_df=get_title_text(text,title_df)[["sample_id",0,1,2,3,4]]

    global val_df
    global train_outputs
    val_true_name=train_outputs[train_outputs["sample_id"]==sample_id]["理财产品名称"]
    
    index=0
    neg_index=[]
    for title_des in title_1_df[0].values:
        flag=True
        for item1 in title_pos_words:
            if re.search(item1,title_des) is not None:
                flag=False
                break
        if flag:
            for item in title_neg_words:
                if re.search(item,title_des) is not None:
                    neg_index.append(index)
                    break
        index+=1
    return title_1_df.reset_index(drop=True).drop(neg_index)
    # print(title_list)

def get_judge_title_result(val_df):

    judge_title_result=None


    for sample_id,text in tqdm(val_df[["sample_id","text"]].values):
        # print(sample_id)
        # print(text)
        judge_title_result= judge_title(sample_id,text) if judge_title_result is None else pd.concat([judge_title_result,judge_title(sample_id,text)])
    
    return judge_title_result

title_num_char=["一、","二、","三、","四、","五、","六、","七、","八、","九、","十、","十一、","十二、","十三、","十四、","十五、"]
s_title_num_char=["（一）","（二）","（三）","（四）","（五）","（六）","（七）","（八）","（九）","（十）","（十一）","（十二）","（十三）","（十四）","（十五）"]
s_title_num_char.extend(["[(]一[)]","[(]二[)]","[(]三[)]","[(]四[)]","[(]五[)]","[(]六[)]","[(]七[)]","[(]八[)]","[(]九[)]","[(]十[)]","[(]十一[)]","[(]十二[)]","[(]十三[)]","[(]十四[)]","[(]十五[)]", \
    '1、', '2、', '3、', '4、', '5、', '6、', '7、', '8、', '9、', "《", "<", ">", "》", "\“", "\""
    ])

title_pos_words=[]#"含本次","含本公告","公司及控股子公司",]#,"继续使用","新购买","近期","信托","计划","项目",'']
title_neg_words=["备查","日前","过去","履行","审批","程序","风险","措施","影响","累计","赎回","意见","十二个月内","公告前","报备文件","前期","截至","到期","已到期",'前次',"控制","本公告日前","分析","披露","主体","关联","条款","合同","说明","协议"]
# val_judge_title_result=get_judge_title_result(val_df)
# test_judge_title_result=get_judge_title_result(test_df)
# judge_title_result.to_excel("训练集段落标题分类结果.xlsx",index=None)

# 获取潜在标题
def extra_potential_line(df):
    ret = get_judge_title_result(df)
    return ret[['sample_id', 0]].values.tolist()

# 过滤潜在标题
def filter_line(lines):
    def exclude(line):
        exclude = ['产品情况', '产品.*：', '产品性质：']
        for ex in exclude:
            if re.search(ex, line):
                return True

    ret = []
    for [sample_id, line] in lines:
        if not exclude(line):
            ret.append([sample_id, line])

    return ret

# 给定数据集，获取带有产品名称的所有行
def get_line_container_product(df):
    return filter_line(extra_possible_line(df))

# 给定行，计算有几行是有效的
def count_valid_line(all_titles):
    answer = list(set(train_outputs['理财产品名称'].tolist()))
    statis = collections.defaultdict(int)

    for [_, title] in all_titles:
        for a in answer:
            if a == '理财产品' or a == '收益凭证' or a == '银行理财' or a == '银行理财产品' or len(str(a)) < 4:
                continue

            title = title.replace('（', '').replace('）', '')
            a = str(a).replace('（', '').replace('）', '')

            if title.find(str(a)) != -1:
                statis['ok'] += 1
                # print(title + '|=|' + str(a))

    return statis['ok']

def test_extra_possible_line():
    lines = extra_potential_line(wm_val_text)

    n = count_valid_line(lines)
    print('提取的标题的有效答案: %d' % n)

    lines = filter_line(lines)
    k = count_valid_line(lines)
    print('过滤完成后的有效行数: %d' % k)

    df = pd.DataFrame(lines, columns=['sample_id', '产品名称']) 
    print(df)
    
    return df


test_extra_possible_line()

100%|██████████| 133/133 [00:02<00:00, 56.79it/s]


NameError: name 'collections' is not defined

In [17]:
# loc.c[df['sample_id']].tolist()
# print(type(suc_val_text.loc[:,['sample_id']].tolist))
# suc_val_text.sample_id.tolist()
print(type(suc_val_text.sample_id.tolist()))
# for i in suc_val_text[]
suc_val_text.loc[2446,'mc']
dic = {}
dic1= {}
dic2= {}
x=0
my_df=pd.DataFrame()
for i in suc_val_text.index.tolist():
    my_mc=suc_val_text.loc[i,'mc']
    my_id=suc_val_text.loc[i,'sample_id']
    dic1[i]=my_id
    dic2[i]=my_mc
    # dic[my_id]=my_mc
for key in dic2:
    my_list=dic2[key]
    for j in my_list:
        my_df.loc[x,1]=dic1[key].astype(str)
        my_df.loc[x,2]=j
        x=x+1
my_df.rename(columns={1:'sample_id',2:'理财产品名称'},inplace=True)
# wjc_df = pd.DataFrame([dic])
# new_wjc_df = wjc_df.T
# new_wjc_df.head(10)
my_df.head(10)
temp_single=my_df

def get_F1(val_pred, val_true):
    val_pred = list(val_pred)
    val_true = list(val_true)
    curr = list(set(val_pred).intersection(set(val_true)))
    R = len(curr)/len(val_true)
    P = len(curr)/len(val_pred)
    return 2*P*R/(P+R)

temp_single['sample_id']=temp_single['sample_id'].astype(int)

r = pd.merge(val_df[['sample_id']], train_outputs, on='sample_id', how='left')
val_true = r['sample_id'].astype(str) + r['理财产品名称'].astype(str) 
t_t_r=r
r = temp_single

val_pred = r['sample_id'].astype(str) + r['理财产品名称'].astype(str) 
score = get_F1(val_pred, val_true)
score
i


temp_single=temp_single.drop_duplicates()
my_train_output=train_outputs[['sample_id','理财产品名称']]
my_train_output=my_train_output.drop_duplicates()
# temp_single=temp_single.rename(columns={mc:'理财产品名称'},inplace=False)
compare=pd.merge(temp_single, my_train_output, on=['sample_id','理财产品名称'], how='inner')
# my_train_output1.head(10)
temp_single_a=temp_single
temp_single_a = temp_single_a.append(compare)
temp_single_a = temp_single_a.append(compare)
temp_single_a = temp_single_a.drop_duplicates(subset=['sample_id','理财产品名称'],keep=False)
print('----')
print(len(temp_single_a))
temp_single_a.head(10)
print('----')

r1 = pd.merge(temp_single[['sample_id']], my_train_output, on='sample_id', how='left')
val_true1 = r1['sample_id'].astype(str) + r1['理财产品名称'].astype(str) 
t_t_r1=r1
r1 = temp_single

val_pred1 = r1['sample_id'].astype(str) + r1['理财产品名称'].astype(str) 
score1 = get_F1(val_pred1, val_true1)
score1
i

print(temp_single.loc[873,'理财产品名称'])
print(train_outputs.loc[7800,'理财产品名称'])

strange_inf=['•']
success=['·']

print('总计所提的名称数',len(temp_single))
print('失败的名称数',len(temp_single_a))
print('成功率',(len(compare)-len(temp_single_a))/len(compare))


<class 'list'>


['中国建设银行河南分行“乾元”保本型2017年第41期理财产品', '蕴通财富·日增利60天', '蕴通财富·日增利60天']

Unnamed: 0,sample_id,理财产品名称
0,2974,中国建设银行河南分行“乾元”保本型2017年第41期理财产品
1,2974,蕴通财富·日增利60天
2,2974,蕴通财富·日增利60天
3,446,兴业银行企业金融结构性存款(封闭式)
4,446,单位大额存单
5,446,“乾元-福顺盈”开放式资产组合型理财产品
6,446,现金丰利集合资金信托计划
7,7237,招商银行单位大额存单2016年第1767期
8,4791,中国工商银行保本型法人91天稳利人民币理财产品
9,4791,中国工商银行保本型法人91天稳利人民币理财产品


0.3683409436834094

4840

----
93


Unnamed: 0,sample_id,理财产品名称
6,446,现金丰利集合资金信托计划
11,9004,农业银行“本利丰90天”人民币理财产品
12,9004,宁波银行稳健型861053号单位结构性存款
13,9004,宁波银行稳健型861056号单位结构性存款
14,9004,宁波银行稳健型861057号单位结构性存款
15,9004,浦发银行利多多对公结构性存款2016年JG426期
21,9004,农业银行“汇利丰”2016年第144期金制通结构性存款
29,4028,本理财产品无名义存续期限(受提前终止条款约束)
167,10534,挂钩利率结构性存款-SDGA180203(东至广信)
184,1464,中银保本理财人民币全球智选【理20171374】-全国澳元中


----


0.5110385870608563

4840

江苏银行“聚宝财富天添鑫溢”开放式理财
中建投信托·安泉75号(江阴泰禾)集合资金信托计划
总计所提的名称数 1424
失败的名称数 93
成功率 0.9301277235161532


In [9]:
########标题筛选出产品名称############
#text=text.replace(r'[ ]+',' ').replace('\r','^')
def get_title(text):
    global title_num_char
    title_list=[]
    title_type_list=[]
    text_start_iter_list=[]
    text_end_iter_list=[]
 
    for item in title_num_char:
        pattern = re.compile(item+"[0-9A-Za-z\u4e00-\u9fa5 ()\[\]（）【】：:]*?[\^]")
        #pattern = re.compile(item+r"[ ]*?[^ ]+?[ ]")
        #pattern = re.compile(item+"[0-9A-Za-z\u4e00-\u9fa5 ()[]（）【】][]")#*?[\^]
        tmp=pattern.finditer(text)
        for i in tmp:
            title_list.append(i.group())
            text_start_iter_list.append(i.span(0)[0])
            title_type_list.append(1)
            text_end_iter_list.append(i.span(0)[1])

    # for item in title_list:
    for item in s_title_num_char:
        # pattern = re.compile(item+r"[ ]*?[ ]*?[\d][^ ]+?:?[ ]?") 
        # pattern = re.compile(item+"[0-9A-Za-z\u4e00-\u9fa5\^ ]*")
        # pattern = re.compile(item+'[0-9\u4e00-\u9fa5()[]（）【】*?[\^]]')*?[\^]
    #    pattern = re.compile(item+r"[\d]*?\u4e00-\u9fa5+[ ]")
        pattern = re.compile(item+"[0-9A-Za-z\u4e00-\u9fa5 ()\[\]（）【】：:]*?[ ][\^]")
        tmp=pattern.finditer(text)
        for i in tmp:
            title_list.append(i.group())
            text_start_iter_list.append(i.span(0)[0])
            title_type_list.append(1)
            text_end_iter_list.append(i.span(0)[1])

    title_list.append("引言")
    title_type_list.append(1)
    text_start_iter_list.append(0)
    text_end_iter_list.append(0)

    result_df=pd.DataFrame([title_list,title_type_list,text_start_iter_list,text_end_iter_list]).T.sort_values(by=2).reset_index(drop=True)
    # print(result_df)
    return result_df

def get_title_text(text,title_df):
    # print(title_df)
    title_1_df=title_df[title_df[1]==1]
    text_iter_list=[]
    text_list=[]
    # print(title_1_df)
    for iter1,iter2 in title_1_df[[2,3]].values:
        # print(iter1)
        if(len(text_iter_list)!=0):
            text_iter_list.append(iter1)
        text_iter_list.append(iter2)
    # text_iter_list.append(text_iter_list[len(text_iter_list)-1])
    text_iter_list.append(len(text))
    for index in range(int(len(text_iter_list)/2)):
        text_list.append(text[text_iter_list[2*index]:text_iter_list[2*index+1]])
    
    title_1_df[4]=text_list

    return title_1_df.reset_index(drop=True)

#from fuzzywuzzy import fuzz
def judge_title(sample_id=0,text=r"test\n"):
    # print(text)
    text=text.replace(r"\n","^").replace(r'[ ]+',' ')
    title_df=get_title(text)
    title_df["sample_id"]=[sample_id for x in range(title_df.shape[0])]
    # print(title_df)`
    title_1_df=get_title_text(text,title_df)[["sample_id",0,1,2,3,4]]

    

    global val_df
    global train_outputs
    val_true_name=train_outputs[train_outputs["sample_id"]==sample_id]["理财产品名称"]
    
    index=0
    neg_index=[]
    for title_des in title_1_df[0].values:
        for item in title_neg_words:
            if re.search(item,title_des) is not None:
                neg_index.append(index)
                break
        index+=1


    return title_1_df.drop(neg_index)
    # print(title_list)

def get_judge_title_result(val_df):

    judge_title_result=None


    for sample_id,text in tqdm(val_df[["sample_id","text"]].values):
        # print(sample_id)
        # print(text)
        judge_title_result= judge_title(sample_id,text) if judge_title_result is None else pd.concat([judge_title_result,judge_title(sample_id,text)])
    
    return judge_title_result

title_num_char=["一、","二、","三、","四、","五、","六、","七、","八、","九、","十、","十一、","十二、","十三、","十四、","十五、"]
s_title_num_char=["（一）","（二）","（三）","（四）","（五）","（六）","（七）","（八）","（九）","（十）","（十一）","（十二）","（十三）","（十四）","（十五）"]
s_title_num_char.extend(["[(]一[)]","[(]二[)]","[(]三[)]","[(]四[)]","[(]五[)]","[(]六[)]","[(]七[)]","[(]八[)]","[(]九[)]","[(]十[)]","[(]十一[)]","[(]十二[)]","[(]十三[)]","[(]十四[)]","[(]十五[)]"])

title_pos_words=[]
title_neg_words=["备查","日前","过去","履行","审批","程序","风险","措施","影响","累计","赎回","到期","截至","意见","十二个月内","公告前","报备文件","前期"]
val_judge_title_result=get_judge_title_result(val_df)
#test_judge_title_result=get_judge_title_result(test_df)
# judge_title_result.to_excel("训练集段落标题分类结果.xlsx",index=None)



##############此处开始提取标题中的产品名称#################
def get_not_mc():
    val_judge_title_result=get_judge_title_result(val_df)
    return 0

100%|██████████| 1800/1800 [00:19<00:00, 90.65it/s]


In [91]:
title_num_char=["一、","二、","三、","四、","五、","六、","七、","八、","九、","十、","十一、","十二、","十三、","十四、","十五、"]
s_title_num_char=["（一）","（二）","（三）","（四）","（五）","（六）","（七）","（八）","（九）","（十）","（十一）","（十二）","（十三）","（十四）","（十五）"]
s_title_num_char.extend(["[(]一[)]","[(]二[)]","[(]三[)]","[(]四[)]","[(]五[)]","[(]六[)]","[(]七[)]","[(]八[)]","[(]九[)]","[(]十[)]","[(]十一[)]","[(]十二[)]","[(]十三[)]","[(]十四[)]","[(]十五[)]"])

title_pos_words=[]
title_neg_words=["备查","日前","过去","履行","审批","程序","风险","措施","影响","累计","赎回","到期","截至"]


def get_title(text):
    global title_num_char
    title_list=[]
    title_type_list=[]
    text_start_iter_list=[]
    text_end_iter_list=[]
    for item in title_num_char:
        pattern = re.compile(item+r"[ ]*?[^ ]+?[ ]")
        tmp=pattern.finditer(text)
        for i in tmp:
            title_list.append(i.group())
            text_start_iter_list.append(i.span(0)[0])
            title_type_list.append(1)
            text_end_iter_list.append(i.span(0)[1])
    
    # for item in title_list:
    for item in s_title_num_char:
        pattern = re.compile(item+r"[ ]*?[^ ]+?[ ]")
        tmp=pattern.finditer(text)
        for i in tmp:
            title_list.append(i.group())
            text_start_iter_list.append(i.span(0)[0])
            title_type_list.append(2)
            text_end_iter_list.append(i.span(0)[1])

    title_list.append("引言")
    title_type_list.append(1)
    text_start_iter_list.append(0)
    text_end_iter_list.append(0)

    result_df=pd.DataFrame([title_list,title_type_list,text_start_iter_list,text_end_iter_list]).T.sort_values(by=2).reset_index(drop=True)
    # print(result_df)
    return result_df

def get_title_text(text,title_df):
    # print(title_df)
    title_1_df=title_df[title_df[1]==1]
    text_iter_list=[]
    text_list=[]
    # print(title_1_df)
    for iter1,iter2 in title_1_df[[2,3]].values:
        # print(iter1)
        if(len(text_iter_list)!=0):
            text_iter_list.append(iter1)
        text_iter_list.append(iter2)
    # text_iter_list.append(text_iter_list[len(text_iter_list)-1])
    text_iter_list.append(len(text))
    for index in range(int(len(text_iter_list)/2)):
        text_list.append(text[text_iter_list[2*index]:text_iter_list[2*index+1]])
    
    title_1_df[4]=text_list

    return title_1_df.reset_index(drop=True)

from fuzzywuzzy import fuzz
def judge_title(sample_id=0,text=r"test\n"):
    # print(text)

    ##############换行暂时保留#################
    #text=text.replace(r"\n","")


    title_df=get_title(text)
    title_df["sample_id"]=[sample_id for x in range(title_df.shape[0])]
    # print(title_df)
    title_1_df=get_title_text(text,title_df)[["sample_id",0,1,2,3,4]]

    

    global val_df
    global train_outputs
    val_true_name=train_outputs[train_outputs["sample_id"]==sample_id]["理财产品名称"]
    
    index=0
    neg_index=[]
    for title_des in title_1_df[0].values:
        for item in title_neg_words:
            if re.search(item,title_des) is not None:
                neg_index.append(index)
                break
        index+=1


    return title_1_df.drop(neg_index)
    # print(title_list)

judge_title_result=None


for sample_id,text in tqdm(val_df[["sample_id","text"]].values):
    # print(sample_id)
    # print(text)
    judge_title_result= judge_title(sample_id,text) if judge_title_result is None else pd.concat([judge_title_result,judge_title(sample_id,text)])

# judge_title_result.to_excel("训练集段落标题分类结果.xlsx",index=None)


# is_from_text(val_df[val_df["sample_id"]==930]["sample_id"].iloc[0],val_df[val_df["sample_id"]==930]["text"].iloc[0])
# is_from_text(val_df[val_df["sample_id"]==125]["text"].iloc[0])

100%|██████████| 1800/1800 [00:17<00:00, 100.27it/s]


#### 3.清洗提取出来的tabel数据，主要是清洗掉有问题的列 

In [7]:
# 将table转换格式以及处理
def deal_tabel(row):
    row = eval(row)
    if len(row)==0:
        return []
    else:
        new_row = []
        for i in row:
            for d in i:
                new_temp = []
                for h in d:
                    # 这里处理空数据或者错误的数据
                    h = str(h).replace('None', '').replace('\n','').replace(' ', '')                    
                    if h=='':
                        continue
                    if h=='.':
                        continue
                    if h=='/':
                        continue
                    new_temp.append(h)
                new_row.append(new_temp)
        # 这里判断是否构成一个完整得认购数据(通过一个list进行判断)
        new_new_row = []
        for i in new_row:
            if len(i) == 0:
                continue
            elif len(i) <= 4:
                continue
            else:
                new_new_row.append(i)
        return new_new_row
train_df_tabel = train_df['tabel'].progress_apply(lambda row:deal_tabel(row))
val_df_tabel = val_df['tabel'].progress_apply(lambda row:deal_tabel(row))
test_df_tabel = test_df['tabel'].progress_apply(lambda row:deal_tabel(row))

100%|██████████| 7217/7217 [00:23<00:00, 301.14it/s]
100%|██████████| 1800/1800 [00:06<00:00, 291.72it/s]
100%|██████████| 8660/8660 [00:10<00:00, 788.60it/s]


#### 4.抽取的是单独的数据包含
#### 起息日，到息日， 金额，认购日期，产品发行方，理财产品

In [8]:
# 直接提取时间
# 如果出现两个时间第一个就是起息日，第二个就是到期日
# 如果出现一个时间就是起息日
# 出现的第一个money就是最后的金额
# 从这里面抽取所有序列
# 这里认为有逗号出现的就是money

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        pass
 
    try:
        import unicodedata
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass
    return False

from src.time_extractor import TimeFinder
import datetime
def get_list_data(df):
    df = list(df)
    new_df = []
    for i in tqdm(df):
        temp_df = []
        for h in i:
            new_h = []
            for digital in h:
                if ',' in digital:
                    # 这里也是为了统一数据有些是用元，有些是用万元
                    try:
                        ttt = float(digital.replace(',', '').replace('万元', '').replace('人民币', '').replace('元', ''))
                    except Exception:
                        continue
                    if ttt > 20000:
                        ttt = ttt/10000
                    new_h.append(ttt)
                else:
                    continue
            if len(new_h) == 0:
                continue
            temp_single = {}
            a = '_'.join(h)
            # 抽取时间和money
            t = TimeFinder()
            time_all = t.find_time(a)
            if time_all == None:
                continue
            rgrq = time_all[0]
            cpqxr = time_all[0]
            if len(time_all) > 1:
                try:
                    cpdxr = time_all[1]
                    # 相减
                    d1 = datetime.datetime.strptime(cpqxr, '%Y-%m-%d')
                    d2 = datetime.datetime.strptime(cpdxr, '%Y-%m-%d')
                    d = d2 - d1
                    cpqx = str(d.days) + '天'
                except Exception:
                    cpdxr = np.nan
                    cpqx = np.nan
            else:
                cpdxr = np.nan
                cpqx = np.nan
                
            # 筛选出除开数字与包含时间的列
            # 末尾是
            last_two = ['公司', '银行', '信托', '证券',  '分行', '支行', '中心', '业部', '商行', '建行']
            mowei = np.nan
            selected_bank_and_works = []
            for l in h:
                new_l = list(str(l))
                new_l_test = ''.join(l[-2:])
                if new_l_test in last_two:
                    mowei = l
                    continue
                if '资金' in l:
                    continue
                if '收益' in l:
                    continue
                if '到期' in l:
                    continue
                if ',' in l:
                    continue
                if '.' in l:
                    continue
                if '/' in l:
                    continue
                if '年' in l:
                    continue
                if '-' in l:
                    continue
                if len(l) < 4:
                    continue
                if is_number(l):
                    continue
                selected_bank_and_works.append(l)
            if len(selected_bank_and_works) < 1:
                continue
            
            temp_single['认购日期'] = rgrq
            temp_single['产品起息日'] = cpqxr
            temp_single['产品到期日'] = cpdxr
            temp_single['产品期限'] = cpqx
            temp_single['认购金额(万元)'] = new_h[0]
            temp_single['产品发行方名称'] = mowei
            temp_single['理财产品名称'] = selected_bank_and_works[0]
            temp_df.append(temp_single)
        new_df.append(temp_df)
    return new_df

val_contain_date = get_list_data(val_df_tabel)
test_contain_data = get_list_data(test_df_tabel) 

2%|▏         | 44/1800 [00:38<25:25,  1.15it/s]


KeyboardInterrupt: 

#### 5.汇总整理数据

In [8]:
# 将前面提取到的数据整理成对应格式
sample_id_list = []
rgrq_list = []
lccp_list = []
cpfxf_list = []
rgje_list = []
cpqxr_list = []
cpdxr_list = []
cpqx_list = []
sjgmgsmc_list = []
ggrq_list = []

sample_id = list(val_df['sample_id'])
gg = list(val_gm)
time = list(val_time)
for i, value in enumerate(sample_id):
    for j in val_contain_date[i]:
        sample_id_list.append(sample_id[i])
        rgrq_list.append(j['认购日期'])
        lccp_list.append(j['理财产品名称'])
        cpfxf_list.append(j['产品发行方名称'])
        rgje_list.append(j['认购金额(万元)'])
        cpqxr_list.append(j['产品起息日'])
        cpdxr_list.append(j['产品到期日'])
        cpqx_list.append(j['产品期限'])
        sjgmgsmc_list.append(gg[i])
        ggrq_list.append(time[i])

result = pd.DataFrame()
result['sample_id'] = sample_id_list
result['认购日期'] = rgrq_list
result['理财产品名称'] = lccp_list
result['产品发行方名称'] = cpfxf_list
result['认购金额(万元)'] = rgje_list
result['产品起息日'] = cpqxr_list
result['产品到期日'] = cpdxr_list
result['产品期限'] = cpqx_list
result['实际购买公司名称'] = sjgmgsmc_list
result['公告日期'] = ggrq_list
val_result = result

NameError: name 'val_df' is not defined

In [9]:
sample_id_list = []
rgrq_list = []
lccp_list = []
cpfxf_list = []
rgje_list = []
cpqxr_list = []
cpdxr_list = []
cpqx_list = []
sjgmgsmc_list = []
ggrq_list = []

sample_id = list(test_df['sample_id'])
gg = list(test_gm)
time = list(test_time)
for i, value in enumerate(sample_id):
    for j in test_contain_data[i]:
        sample_id_list.append(sample_id[i])
        rgrq_list.append(j['认购日期'])
        lccp_list.append(j['理财产品名称'])
        cpfxf_list.append(j['产品发行方名称'])
        rgje_list.append(j['认购金额(万元)'])
        cpqxr_list.append(j['产品起息日'])
        cpdxr_list.append(j['产品到期日'])
        cpqx_list.append(j['产品期限'])
        sjgmgsmc_list.append(gg[i])
        ggrq_list.append(time[i])

result = pd.DataFrame()
result['sample_id'] = sample_id_list
result['认购日期'] = rgrq_list
result['理财产品名称'] = lccp_list
result['产品发行方名称'] = cpfxf_list
result['认购金额(万元)'] = rgje_list
result['产品起息日'] = cpqxr_list
result['产品到期日'] = cpdxr_list
result['产品期限'] = cpqx_list
result['实际购买公司名称'] = sjgmgsmc_list
result['公告日期'] = ggrq_list
test_result = result
test_result

NameError: name 'test_df' is not defined