In [11]:
# 导入相关包
import os
import pathlib as pl
import pandas as pd
import numpy as np
import re
from io import StringIO
from datetime import datetime,timedelta
import time
from IPython.core.interactiveshell import InteractiveShell
from tqdm.autonotebook import *
import pdfplumber
import collections
tqdm.pandas()
InteractiveShell.ast_node_interactivity = "all"
sys.path.append("..")

from tgrocery import Grocery
import jieba

In [2]:
# 数据准备(train_output文件中格式有点问题，需要提前用excel或者wps打开然后另存为excel文件)
train_outputs = pd.read_excel('datasets/train_output.xlsx')

# 获取pdf中文字和表格
def extract_pdf_content(pdf_path):
    text_list = []
    table_list = []
    with pdfplumber.open(pdf_path) as pdf:
        for index_page in np.arange(0, len(pdf.pages), 1):
            # 读取多页
            page = pdf.pages[index_page]   # 第n页的信息
            text = page.extract_text()
            text_list.append(text)
            table = page.extract_tables()
            for t in table:
                table_list.append(t)
    return text_list, table_list

def get_dir_file(path):
    '''
    输入文件夹位置，输出整理好的dataframe
    '''
    path_list = os.listdir(path)
    id_list = []
    file_path_list = []
    text_list = []
    table_list = []
    for i in tqdm(path_list):
        if '.PDF' in i:
            file_path = path + i
            id_list.append(int(i.split('.')[0]))
            file_path_list.append(file_path)
            try:
                text_temp, table_temp = extract_pdf_content(file_path)
            except Exception:
                print('此pdf无法读取')
                text_temp, table_temp = [], []
            text_list.append(text_temp)
            table_list.append(table_temp)
            
    df = pd.DataFrame()
    df['sample_id'] = id_list
    df['file_path'] = file_path_list
    df['text'] = text_list
    df['tabel'] = table_list
    df = df.sort_values('sample_id')
    return df

# 文件处理太慢，可持续化保存文件
train_path = 'datasets/train.csv'
if os.path.exists(train_path):
    train_df = pd.read_csv(train_path)
else:
    train_df = get_dir_file('datasets/train_data/')
    train_df.to_csv(train_path,index=False)
    train_df = pd.read_csv(train_path)

test_path =  'datasets/test.csv'
if os.path.exists(test_path):
    test_df = pd.read_csv(test_path)
else:
    test_df = get_dir_file('datasets/test_data/')
    test_df.to_csv(test_path,index=False)
    test_df = pd.read_csv(test_path)

train_outputs.head(2)
train_df.head(2)
test_df.head(2)

Unnamed: 0,sample_id,file_path,text,tabel
0,11188,datasets/test_data/11188.PDF,['北京京西文化旅游股份有限公司监事会\n \n \n关于使用部分闲置募集资金购买理财产品的...,[]
1,11189,datasets/test_data/11189.PDF,['北京京西文化旅游股份有限公司 \n监事会关于使用部分自有资金购买理财产品的意见 \n根据...,[]


In [3]:
# 构造训练集验证集
train_df = train_df.sample(frac=1, random_state=1017)

val_df = train_df[:1800]
# val_df = train_df[:1800].head(20)
train_df = train_df[1800:]
# test_df=test_df.head(20)

train_outputs["sample_id"]=train_outputs["sample_id"].astype(str)
val_df["sample_id"]=val_df["sample_id"].astype(str)
train_df["sample_id"]=train_df["sample_id"].astype(str)
# test_df["sample_id"]=test_df["sample_id"].astype(str)


# 有效文本挖掘

In [15]:
# 首先针对任务抽取时间（每个时间跟每个id是一一对应的）
# 要不是取第一个时间，要不就是取最后一个时间（或者时间加一）这里可以建立一个模型预测
# base这里面直接取最后一个时间作为发布日期

##wjc字典里添加了十
CN_NUM = {
    u'月十日':'月10日',u'十月':'10月', u'十日': '0日',u'二十':'二',u'三十':'三', u'十': 1, u'○': 0, u'O':'0', u'Ο':'0',
    u'〇': 0, u'一': 1, u'二': 2, u'三': 3,
    u'四': 4, u'五': 5, u'六': 6, u'七': 7,
    u'八': 8, u'九': 9, u'零': 0, u'壹': 1,
    u'贰': 2, u'叁': 3, u'肆': 4, u'伍': 5,
    u'陆': 6, u'柒': 7, u'捌': 8, u'玖': 9,
    u'貮': 2, u'两': 2,  
}
######### u'○': 0, u'O':'0', u'Ο':'0',##########可删除这些字典，改用添加内容效果一致，这里暂且保留字典


def get_put_time_from_text(row):
    row = row.replace(' ', '').replace('\\n', '')

    ############添加#################改善年份中0有多个不识别的问题########
    start=1
    for word in '一二三四五六七八九':   
        row=re.sub('二.一'+word,'201'+str(start),row)
        start=start+1
    ########添加############慢#######################

    for key in CN_NUM:
        row = row.replace(key, str(CN_NUM[key]))   

    r = row.replace("年", "-").replace("月", "-").replace("日", " ").replace("/", "-").strip()
    regex = "(\d{4}-\d{1,2}-\d{1,2})"
    r = re.findall(regex, r)
    if len(r)==0:
        return np.nan
    time_str = r[-1]
    first = time_str.split('-')[0]
    second = time_str.split('-')[1]
    last = time_str.split('-')[-1]
    second = str.zfill(second, 2)
    last = str.zfill(last, 2)
    r = '-'.join([first, second, last])
    return r




val_result = pd.DataFrame()
val_result['sample_id'] = val_df['sample_id']
val_result['predict_time'] = val_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)
test_gg = train_outputs.groupby('sample_id').apply(lambda row:list(row['公告日期'])[0]).reset_index()
test_gg.columns = ['sample_id', 'time']
val_result = pd.merge(val_result, test_gg, on='sample_id', how='left')
##val_result['time'].head(10)
###val_result['predict_time'].head(10)

np.set_printoptions(threshold=np.inf)
pd.set_option('max_colwidth',100)

testid=6375
print(train_df[train_df.sample_id==testid]['text'].astype(str))
print(val_df[val_df.sample_id==testid]['text'].astype(str))


# 对于每一行，通过列名name访问对应的元素
for index,row in val_result.iterrows():
    #type(row['predict_time'])
    #print(row['sample_id'],row['predict_time']) # 输出每一行
    try:
        val_result.at[index,'predict_time1']=datetime.strptime(row['predict_time'], "%Y-%m-%d").strftime('%Y/%m/%d')
        #val_result.at[index,'predict_time1']=time.strptime(row['predict_time'], "%Y-%m-%d")
    except:
        #val_result.at[index,'predict_time1']='1900/01/01'
        continue
        # print(row['sample_id'])
        # print(val_df[val_df.sample_id==row['sample_id']]['text'].astype(str))
        
for index,row in val_result.iterrows():
    val_result.at[index,'time1']=val_result.at[index,'time'].strftime("%Y-%m-%d %H:%M:%S")[0:10]
    val_result.at[index,'time2']=val_result.at[index,'time1'].replace('-','')
    try:
        val_result.at[index,'predict_time2']=val_result.at[index,'predict_time'].replace('-','')
    except:
        continue
    #######粗计算时间具体误差################
    val_result.at[index,'差值']=(int)(val_result.at[index,'predict_time2'])-(int)(val_result.at[index,'time2'])

fail_val_result=val_result[val_result['差值']!=0]
#######抽取错误列放入fail的dataframe########################


# val_result['predict_time_inf']=pd.to_datetime(val_result['predict_time'],format='%Y/%m/%d')
# val_result['predict_time_inf'].head(10)

#val_result['predict_time1'] = datetime.strptime(val_result['predict_time'], "%Y-%m-%d").strftime('%Y/%m/%d')
#val_result['predict_time_inf']=pd.to_datetime(val_result['predict_time1'],format='%Y/%m/%d')
#val_result['日期差值']=val_result['predict_time_inf']-val_result['time']
#val_result.head(10)

# 判断验证集的准确率
np.sum(val_result['predict_time'].astype(str) == val_result['time'].astype(str))/len(val_result)

# val_time = val_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)
# test_time = test_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)

val_time=pd.DataFrame()
val_time["公告日期"]=val_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)
# val_gm = val_df.progress_apply(lambda row:my_get_gm(row['text']), axis=1)
val_time['sample_id'] = val_df['sample_id']
#test_gm = test_df.progress_apply(lambda row:get_gm(row['text']), axis=1)

test_time=pd.DataFrame()
test_time["公告日期"]=test_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)
# val_gm = val_df.progress_apply(lambda row:my_get_gm(row['text']), axis=1)
test_time['sample_id'] = test_df['sample_id']
#test_gm = test_df.progress_apply(lambda row:get_gm(row['text']), axis=1)

100%|██████████| 1800/1800 [00:00<00:00, 2140.31it/s]
Series([], Name: text, dtype: object)
Series([], Name: text, dtype: object)


0.5883333333333334

100%|██████████| 1800/1800 [00:00<00:00, 2441.53it/s]
100%|██████████| 8660/8660 [00:02<00:00, 3037.53it/s]


In [14]:
# 抽取购买公司
# 前几句话出现
# 将其按照\\n 和空格切割
def get_gm(row):
    row=row.replace('[','').replace(']','').replace('\'','')    ######粗datawash,去除部分典型符号###########
    #head_row=re.split('资金',row)[0]
    #head_row=re.split('使用',head_row)[0]
    #tag_head_row=headrow.replace('[\\\\n ]','$|$')
    result = re.split('[\\\\n ]',row)
    for i in result:
        if '公司' in i:
            i=i.replace('（','(').replace('）',')') ##########修改中文括号#################
            regex="(^.*公司)"##################此四行为添加的##############################
            i=re.findall(regex, i)###########取此行最后一个“公司”前的字符#######
            i=i[0]###############格式转换，list取出str####################################
            if i=='公司':
                continue   ###########跳过字段为公司的答案进入下一个循环#####################
            return i




########################粗改进######################################
def my_get_gm(row):
    row=row.replace('[','').replace(']','').replace('\'','')    ######粗datawash,去除部分典型符号###########
    re_row = re.split('[\\\\n ]',row)
    
    ##################取第一次出现公司的行开始计算###################
    for i in re_row:
        if '公司' not in i:
            re_row.remove(i)
        else:
            break
    
    head_row=""
    #################list替换为文本##########################
    for i in re_row:
        head_row=head_row+i+'$|$'
    
    

    # print(type(head_row.find("公告编号")))
    # head_row=head_row[head_row.find("公告编号"):-1]

    # head_row=head_row.split("^.*公告编号：")[0]
    # head_row_a=head_row.split("资金")[0]
    # head_row=head_row_a.split("使用")[0]
    # head_row=head_row_a.split("委托")[0]
    # head_row=head_row_a.split("董事")[0]
    head_row=head_row.split("理财")[0].split("资金")[0].split("使用")[0].split("董事")[0]
    # print(head_row_a)
    tag_head_row=head_row.replace(' ','').replace('子公司','$|$').replace('公司','公司$|$').replace('关于','$|$').replace('-','$|$')
    # spl_head_row = re.split('\$\|\$',tag_head_row)
    spl_head_row = tag_head_row.split('$|$')
    spl_head_row.reverse()
    result=spl_head_row
    for i in result:
        if '公司' in i:
            i=i.replace('（','(').replace('）',')') ##########修改中文括号#################
            regex="(^.*公司)"##################此四行为添加的##############################
            i=re.findall(regex, i)###########取此行最后一个“公司”前的字符#######
            i=i[0]###############格式转换，list取出str####################################
            if i=='公司' or len(i)<=4 or '”' in i or '“' in i or '简称' in i:
                continue   ###########跳过字段为公司的答案进入下一个循环#####################
            return i
#################到此为止####################################




########################二次改进,文本内容查找追加，成功率暂时比不追加要低一点######################################
def my_get_gm2(row):
    row=row.replace('[','').replace(']','').replace('\'','')    ######粗datawash,去除部分典型符号###########
    re_row = re.split('[\\\\n ]',row)
    
    ###################取第一次出现公司的行开始计算###################
    for i in re_row:
        if '公司' not in i:
            re_row.remove(i)
        else:
            break
    
    
    head_row=""
    #################list替换为文本##########################
    for i in re_row:
        head_row=head_row+i+'$|$'
    
    #################去除标题#############################
    text_row1=head_row[head_row.find("公告"):-1]
    text_row2=head_row[head_row.find("意见"):-1]
    regex = "(子公司.*公司.*购买)"
    text1 = re.findall(regex, text_row1)
    text2 = re.findall(regex, text_row2)
    text=text1+text2
    for i in text:
        if '$|$' in i:
            text.remove(i)
    my_list=[]
    for i in text:
        spl_i=i.replace("购买","$|$").replace("子公司","$|$").replace("公司","公司$|$")
        spl_i=spl_i.split("$|$")
        spl_i.reverse()
        for j in spl_i:
            if '公司' in j:
                j=j.replace('（','(').replace('）',')')
                regex="(^.*公司)"
                j=re.findall(regex, j)
                j=j[0]
                if j=='公司' or len(j)<=4 or len(j)>30 or '”' in j or '“' in j or '简称' in j:
                    continue
                my_list.append(j)
    #print(text1)

    
    
    head_row=head_row.split("理财")[0].split("资金")[0].split("使用")[0].split("董事")[0]
    # print(head_row_a)
    tag_head_row=head_row.replace(' ','').replace('子公司','$|$').replace('公司','公司$|$').replace('关于','$|$').replace('-','$|$')
    # spl_head_row = re.split('\$\|\$',tag_head_row)
    spl_head_row = tag_head_row.split('$|$')
    spl_head_row.reverse()
    result=spl_head_row
    for i in result:
        if '公司' in i:
            i=i.replace('（','(').replace('）',')') ##########修改中文括号#################
            regex="(^.*公司)"##################此四行为添加的##############################
            i=re.findall(regex, i)###########取此行最后一个“公司”前的字符#######
            i=i[0]###############格式转换，list取出str####################################
            if i=='公司' or len(i)<=4 or '”' in i or '“' in i or '简称' in i:
                continue   ###########跳过字段为公司的答案进入下一个循环#####################
            my_list.insert(0,i)
            i=my_list[-1]
            return i
#################到此为止####################################





###val_gm = val_df.progress_apply(lambda row:get_gm(row['text']), axis=1)
val_gm=pd.DataFrame()
val_gm["实际购买公司名称"]=val_df.progress_apply(lambda row:my_get_gm(row['text']), axis=1)
# val_gm = val_df.progress_apply(lambda row:my_get_gm(row['text']), axis=1)
val_gm['sample_id'] = val_df['sample_id']
#test_gm = test_df.progress_apply(lambda row:get_gm(row['text']), axis=1)

test_gm=pd.DataFrame()
test_gm["实际购买公司名称"] = test_df.progress_apply(lambda row:my_get_gm(row['text']), axis=1)
test_gm['sample_id']=test_df['sample_id']


my_val_result = pd.DataFrame()
my_val_result['sample_id'] = val_df['sample_id']
my_val_result['predict_gs'] = val_df.progress_apply(lambda row: my_get_gm(row['text']), axis=1)
my_test_gs = train_outputs.groupby('sample_id').apply(lambda row:list(row['实际购买公司名称'])[0]).reset_index()
my_test_gs.columns = ['sample_id', 'gs']
my_val_result = pd.merge(my_val_result, my_test_gs, on='sample_id', how='left')
my_val_result['是否相等']=my_val_result['predict_gs']==my_val_result['gs']
fail_my_val_result=my_val_result[my_val_result['是否相等']!=True]


np.set_printoptions(threshold=np.inf)
pd.set_option('max_colwidth',100)

testid=3597
print(train_df[train_df.sample_id==testid]['text'].astype(str))
print(val_df[val_df.sample_id==testid]['text'].astype(str))

# 判断验证集的准确率
np.sum(my_val_result['predict_gs'].astype(str) == my_val_result['gs'].astype(str))/len(my_val_result)

100%|██████████| 1800/1800 [00:02<00:00, 681.04it/s]
100%|██████████| 8660/8660 [00:08<00:00, 1067.16it/s]
100%|██████████| 1800/1800 [00:03<00:00, 580.46it/s]
Series([], Name: text, dtype: object)
Series([], Name: text, dtype: object)


0.7972222222222223

In [4]:
#text=text.replace(r'[ ]+',' ').replace('\r','^')
def get_title(text):
    global title_num_char
    title_list=[]
    title_type_list=[]
    text_start_iter_list=[]
    text_end_iter_list=[]
 
    for item in title_num_char:
        pattern = re.compile(item+"[0-9A-Za-z\u4e00-\u9fa5 ()\[\]（）【】：:]*?[\^]")
        #pattern = re.compile(item+r"[ ]*?[^ ]+?[ ]")
        #pattern = re.compile(item+"[0-9A-Za-z\u4e00-\u9fa5 ()[]（）【】][]")#*?[\^]
        tmp=pattern.finditer(text)
        for i in tmp:
            title_list.append(i.group())
            text_start_iter_list.append(i.span(0)[0])
            title_type_list.append(1)
            text_end_iter_list.append(i.span(0)[1])

    # for item in title_list:
    for item in s_title_num_char:
        # pattern = re.compile(item+r"[ ]*?[ ]*?[\d][^ ]+?:?[ ]?") 
        # pattern = re.compile(item+"[0-9A-Za-z\u4e00-\u9fa5\^ ]*")
        # pattern = re.compile(item+'[0-9\u4e00-\u9fa5()[]（）【】*?[\^]]')*?[\^]
    #    pattern = re.compile(item+r"[\d]*?\u4e00-\u9fa5+[ ]")
        pattern = re.compile(item+"[0-9A-Za-z\u4e00-\u9fa5 ()\[\]（）【】：:]*?[ ][\^]")
        tmp=pattern.finditer(text)
        for i in tmp:
            title_list.append(i.group())
            text_start_iter_list.append(i.span(0)[0])
            title_type_list.append(1)
            text_end_iter_list.append(i.span(0)[1])

    title_list.append("引言")
    title_type_list.append(1)
    text_start_iter_list.append(0)
    text_end_iter_list.append(0)

    result_df=pd.DataFrame([title_list,title_type_list,text_start_iter_list,text_end_iter_list]).T.sort_values(by=2).reset_index(drop=True)
    # print(result_df)
    return result_df

def get_title_text(text,title_df):
    # print(title_df)
    title_1_df=title_df[title_df[1]==1]
    text_iter_list=[]
    text_list=[]
    # print(title_1_df)
    for iter1,iter2 in title_1_df[[2,3]].values:
        # print(iter1)
        if(len(text_iter_list)!=0):
            text_iter_list.append(iter1)
        text_iter_list.append(iter2)
    # text_iter_list.append(text_iter_list[len(text_iter_list)-1])
    text_iter_list.append(len(text))
    for index in range(int(len(text_iter_list)/2)):
        text_list.append(text[text_iter_list[2*index]:text_iter_list[2*index+1]])
    
    title_1_df[4]=text_list

    return title_1_df.reset_index(drop=True)

#from fuzzywuzzy import fuzz
def judge_title(sample_id=0,text=r"test\n"):
    # print(text)
    text=text.replace(r"\n","^").replace(r'[ ]+',' ')
    title_df=get_title(text)
    title_df["sample_id"]=[sample_id for x in range(title_df.shape[0])]
    # print(title_df)`
    title_1_df=get_title_text(text,title_df)[["sample_id",0,1,2,3,4]]

    

    global val_df
    global train_outputs
    val_true_name=train_outputs[train_outputs["sample_id"]==sample_id]["理财产品名称"]
    
    index=0
    neg_index=[]
    for title_des in title_1_df[0].values:
        for item in title_neg_words:
            if re.search(item,title_des) is not None:
                neg_index.append(index)
                break
        index+=1


    return title_1_df.drop(neg_index)
    # print(title_list)

def get_judge_title_result(val_df):

    judge_title_result=None


    for sample_id,text in tqdm(val_df[["sample_id","text"]].values):
        # print(sample_id)
        # print(text)
        judge_title_result= judge_title(sample_id,text) if judge_title_result is None else pd.concat([judge_title_result,judge_title(sample_id,text)])

        judge_title_result["sample_id"]=judge_title_result["sample_id"].astype(str)        
    return judge_title_result

title_num_char=["一、","二、","三、","四、","五、","六、","七、","八、","九、","十、","十一、","十二、","十三、","十四、","十五、"]
s_title_num_char=["（一）","（二）","（三）","（四）","（五）","（六）","（七）","（八）","（九）","（十）","（十一）","（十二）","（十三）","（十四）","（十五）"]
s_title_num_char.extend(["[(]一[)]","[(]二[)]","[(]三[)]","[(]四[)]","[(]五[)]","[(]六[)]","[(]七[)]","[(]八[)]","[(]九[)]","[(]十[)]","[(]十一[)]","[(]十二[)]","[(]十三[)]","[(]十四[)]","[(]十五[)]"])

title_pos_words=[]
title_neg_words=[]
title_neg_words=["备查","日前","过去","履行","审批","程序","风险","措施","影响","累计","赎回","到期[^日].*[^2][^0]","到期.{0,2}[/^]","截至","意见","十二个月内","公告前","报备文件","前期"]

val_judge_title_result=get_judge_title_result(val_df)
train_judge_title_result=get_judge_title_result(train_df)
# test_judge_title_result=get_judge_title_result(test_df)
# judge_title_result.to_excel("训练集段落标题分类结果.xlsx",index=None)

100%|██████████| 1800/1800 [00:41<00:00, 43.16it/s]
100%|██████████| 7217/7217 [02:29<00:00, 48.23it/s]


In [12]:
###9月11日
#val_judge_title_result.head(100)
def get_content_df(val_judge_title_result):
    sample_id=val_judge_title_result['sample_id'].drop_duplicates() 
    text_list=[]
    text_sampleid_list=[]
    content_df=pd.DataFrame()
    for i in tqdm(sample_id):
        text_df=val_judge_title_result[val_judge_title_result['sample_id']==i]
    #  text_df.iloc[0,5]
        #text_df
        n=len(text_df)
        text_sampleid_list.append(i)
        text=''
        for y in range(0,n):
            text=text+text_df.iloc[y,1]+text_df.iloc[y,5]
            text=re.sub("[ ]+"," ",text).replace("（","(").replace("）",")")
        #text
        text_list.append(text)
    content_df["sample_id"]=text_sampleid_list
    content_df["text"]=text_list
    return content_df.reset_index(drop=True)


val_content_df=get_content_df(val_judge_title_result)
train_content_df=get_content_df(train_judge_title_result)
test_content_df=get_content_df(test_judge_title_result)

val_content_df=pd.merge(val_content_df,val_df[["sample_id","tabel"]],on="sample_id")
# train_content_df=pd.merge(train_content_df,train_df[["sample_id","tabel"]],on="sample_id")
# test_content_df=pd.merge(test_content_df,test_df[["sample_id","tabel"]],on="sample_id")

100%|██████████| 1800/1800 [00:12<00:00, 147.52it/s]


NameError: name 'train_judge_title_result' is not defined

# 名称提取

In [13]:
######计算文本位置长度#############
def len_count(my_list,my_str):
    my_len=0
    for i in my_list:
        my_len=my_len+len(i)
    my_len=my_len+(len(my_list)-1)*len(my_str)
    return my_len


######基于冒号做文本拼接与分割####效果优
def maohao_cat(text):
    text=text.replace(r"\n","^")
    text=text.replace(r"', '","^")
    text=text.replace(' ','')
    text=text.replace('（','(').replace('）',')').replace('；','^')     ###############;直接去掉,有极个别答案(答案中出现了引号)将会收到负反馈，多数正反馈
    text=re.sub('\^[1-9]\^','^',text)
    text=re.sub('\^\-[1-9]\-\^','^',text)
    text=re.sub("[（][^）]*?[\^]*$",'',text)
    text=re.sub('[，]*$','',text)
    pos_word=["名称","期限","年","月","日","时间","产品","资金","金额","天","来源","总额","类型","元","关系","无关","不存在关"]
    if '^' not in text:
        return 
    my_list=text.split("^")
    my_str=''
    str_list=[]
    # for i in my_list:
    #     if "：" not in i and ":" not in i and "无关" not in i and "不存在关" not in i and "名称" not in my_str:
    #         my_str=my_str+i
    #         # if len(i)<=6:
    #         #     str_list.append(my_str)      #############解决最后一行被后函数因字数过长舍弃的办法
    #         #     continue
    #     else:
    #         str_list.append(my_str)
    #         my_str=i
    
    for i in my_list:
        if "：" not in i and ":" not in i and len(re.sub('[^\u4e00-\u9fa5]*','',i))<5:
            my_str=my_str+i
            # if "不存在关" in my_str:
            #     str_list.append(my_str)
            #     my_str=''
            # if len(i)<=6:
            #     str_list.append(my_str)      #############解决最后一行被后函数因字数过长舍弃的办法
            #     continue
        else:
            str_list.append(my_str)
            my_str=i
    
    str_list.append(my_str)
    # print(my_list)
    # print('-----')
    # print(str_list)
    # print('-----')
    cat_text=''
    # print(str_list)
    for i in str_list:
        cat_text=cat_text+i+r'^'
        # print(cat_text)
    # print(cat_text.replace(r'^','\n'))
    return cat_text

    

        





######寻找文本中的答案##############
####以如下及冒号行做切片#############
def wjc_spl(text):
    spl1=[]      ###如下切片
    maohao=3  ####设定冒号出现次数阈值
    zishu=30   ####设定单行冒号后字数阈值
    my_str=""
    text=maohao_cat(text)    #########调用函数做text正规化
    spl2=[]      ###行切片
    spl3=[]
    neg_word=[]
    pos_word=["名称","期限","年","月","日","时间","产品","资金","金额","天","来源","总额","类型","元","关系","额","金","受托","签约银行"]
    # pos_word=["名称"]
    # print(len(text))
    if text==None:
        return -1,-1,-1
    text=text.replace(r"\n","^")
    # print(len(text))
    text=text.replace(r"', '","^")    ######解决换页符被置为', '的问题
    # print(len(text))
    text=text.replace(' ','')
    text=text.replace('（','(').replace('）',')').replace(':','：').replace('如^下：','如下：').replace('如下^：','如下：')
    # print(len(text))
    # print(text[:10000])
    if "如下：" not in text:
        return -1,-1,-1
    else:
        move=[]
        spl1=text.split("如下：")   #####首次切割####
        # print(len_count(spl1,'如下：'))
        # print(len(spl1[0])+len(spl1[1]))
        my_len=len(spl1[0])+len('如下：')
        spl1=spl1[1:]              ######第一次如下前的内容不关心##########
        len_list=[]
        tag1=[]
        # print(spl1)
        for i in spl1:
            len_list.append(len(i))
            if i.count("：")<=maohao:
                # print('aaaaaaa',len(len_list))
                tag1.append(len(len_list))
                move.append(i)
        # move=list(set(move))
        for x in move:
            # print('pppppppppppppppppppppppppppp')
            # print(x)
            spl1.remove(x)    ####去除冒号过少的部分切片
                
        # print(my_len+0)
        # print(spl1)
        for i in spl1:
            test=i.split('^')
            #print(test[10])
            for j in test:
                spl2.append(j)
            
        # print(spl2[100])
        move=[]
        for i in spl2:
            #print(i)
            if i.count("：")!=1 and i.count(":")!=1:
                # print(i.count("："))
                move.append(i)    ####以行做切片，去除单行里非只有一个冒号的行
        for x in move:
            # print(x)
            spl2.remove(x)
        # print(spl2[0])
        move=[]
        for i in spl2:
            judge1=re.sub('[a-zA-Z]','',i.split("：")[-1])
            judge1=re.sub(r"\d",'',judge1)
            judge1=re.sub(r'”','',judge1)
            judge1=re.sub(r'“','',judge1)
            judge1=re.sub(r'（','',judge1)
            judge1=re.sub(r'）','',judge1)
            # print(len(judge1))
            if len(judge1)>=zishu or len(judge1)<1:     #######去除冒号后关心字段所提中文内容过长或过短的行
                move.append(i)
                # print(i)
        for x in move:
            # print(x)
            spl2.remove(x)
        # print(spl2)
        move=[]
        # print(spl2)
        # print('-----------')

        # print(spl2)
        if spl2==[]:
            # print('该文本变量做行切片结果为空')
            return -1,-1,-1

        for i in spl2:
            x=0
            for j in pos_word:
                judge2=i.split("：")[0]      ########舍弃冒号前（分类字段）不包含pos_word的行
                if j not in judge2:
                    x=x+1
                if x==len(pos_word):
                    # print(i)
                    move.append(i)
            new_move=list(set(move))
            # print(new_move)
        for x in new_move:
            # print(x)
            spl2.remove(x)
            
        # print(spl2)

        


    
    tag1=0
    key_pos_word=['名称']                 ############计算一个pdf生成的dataframe一共需要几行，及每行需要多少列###########
    x=1
    ele_times=[]
    judge3=0
    for i in spl2:
        for j in key_pos_word:
            if j not in i:
                x=x+1                     ############若不含名称字段则结果会是实际+1###############
            else:
                ele_times.append(x)
                x=1
    
            # print(i)
            # print(x)
    ele_times.append(x)

    
    # print(ele_times)
    if len(ele_times)>1:
        mo_times=ele_times[0]+ele_times[-1]-1
        # print(type(mo_times))
        ele_times=ele_times[1:-1]
        ele_times.append(mo_times)
    else:
        test=0
        # print(len(spl2))
        # print(ele_times[0])
        if len(ele_times)==1 and ele_times[0]==len(spl2)+1:
            # print("不含名称字段")                              ##############此处需处理，或者返回一个标记以后处理  冒号行不包含名称字段的情况
            tag1=1
    

    ele_times1=ele_times
    # print(ele_times)
    # print(type(ele_times))

    tag2=0
    key_pos_word=['金额']                 ############计算一个pdf生成的dataframe一共需要几行，及每行需要多少列###########
    x=1
    ele_times=[]
    judge3=0
    for i in spl2:
        for j in key_pos_word:
            if j not in i:
                x=x+1                     ############若不含金额字段则结果会是实际+1###############
            else:
                ele_times.append(x)
                x=1
    
            # print(i)
            # print(x)
    ele_times.append(x)
    # print(ele_times)
    if len(ele_times)>1:
        mo_times=ele_times[0]+ele_times[-1]-1
        # print(type(mo_times))
        ele_times=ele_times[1:-1]
        ele_times.append(mo_times)
    else:
        test=0
        # print(len(spl2))
        # print(ele_times[0])
        if len(ele_times)==1 and ele_times[0]==len(spl2)+1:
            # print("不含金额字段")                              ##############此处需处理，或者返回一个标记以后处理  冒号行不包含金额字段的情况
            tag2=1
    
    ele_times2=ele_times
    # print(ele_times2)

    # print('---')

    

    if len(spl2)<=2:
        return -1,-1,-1

    

    # print(ele_times2)
    # if len(spl2)==1 and spl2[0]=='':
    #     return
    # if len(spl2)==1:
    #     spl2[0].replace(' ','')
   
    return spl2,ele_times1,tag1     ######仅返回“关心”的行切片    #############注意返回变量可能造成函数无法执行



def cut_list(text):
    spl2,ele_times1,tag1=wjc_spl(text)
    # print(spl2)
    # print(ele_times1)
    # print(tag1)
    my_list=[]
    if tag1==0:
        for i in ele_times1:
            # print(i)
            get=spl2[0:i]
            my_list.append(get)
            spl2=spl2[i:]
    else:
        return 0
    # print(my_list)
    return my_list



def get_mc(text):
    # my_list=cut_list(text)
    mc_list=[]
    my_list=cut_list(text)
    if my_list==0:
        return 
    else:
        my_list=cut_list(text)

        for i in my_list:
            this_str=i[0]
            if "名" not in this_str or "受托方" in this_str or "公司名称" in this_str:
                continue
            index=this_str.index("：")
            this_str=this_str[index+1:].replace('。','').replace('；','').replace(';','').replace('、','')
            mc_list.append(this_str)
            # print(mc_list)
    if mc_list ==[]:
        return
    return mc_list



def spl2_iswm(text):
    spl2,ele_times1,tag1=wjc_spl(text)
    # print(spl2)
    # print(ele_times1)
    # print(tag1)
    my_list=[]
    if tag1==1:
        return "无名称"
    else:
        return 



# 位置定位

In [14]:
# 找到关键字在文本中的位置
def find_pos_in_text(text, keys, sample_id = 0):
    ret = []

    if not keys or len(keys) == 0 or not text:
        return None

    textWithArrow = re.sub(' +', ' ', text).replace(r'\n', '^').replace(' ', '^')
    textWithoutWhite = textWithArrow.replace('^', '')

    # 每个非空词的前缀空格个数
    seq, totalSpace = 0, 0
    preleadingSpaceDict = collections.defaultdict(int)
    for i in range(len(textWithArrow)):
        if (textWithArrow[i]) == '^':
            totalSpace += 1
            preleadingSpaceDict[seq] = totalSpace
        else:
            preleadingSpaceDict[seq] = totalSpace
            seq += 1        

    for key in keys:
        nkey = str(key).replace('(', r'\(').replace(')', r'\)').replace('[', r'\[').replace(']', r'\]')
        for it in re.finditer(nkey, textWithoutWhite):
            ret.append([key, it.span()[0]+preleadingSpaceDict[it.span()[0]], \
                it.span()[1]+preleadingSpaceDict[it.span()[1]], sample_id])

    if not len(ret):
        return None

    ret.sort(key = lambda x: x[1])

    return ret

# 理财名称提取

In [15]:
def get_product_text(val_content_df):
    result_df=None
    for sample_id,text in tqdm(val_content_df[["sample_id","text"]].values):
        mc_list=get_mc(text)
        if mc_list is not None:
            mc_list=list(set(mc_list))
            # print(mc_list)
        # print("-----------")
        # sample_id
        tmp_df=pd.DataFrame(find_pos_in_text(text,mc_list),columns=[0,1,2,3])
        tmp_df[[0,1,2,3]]=tmp_df[[0,3,1,2]]
        tmp_df[1]=1
        tmp_df[3]=tmp_df[2]
        tmp_df["sample_id"]=sample_id
        result_df=get_title_text(text,tmp_df) if result_df is None else pd.concat([result_df,get_title_text(text,tmp_df)])
    return result_df.reset_index(drop=True)
    
val_product_text=get_product_text(val_content_df)
train_product_text=get_product_text(train_content_df)
test_product_text=get_product_text(test_content_df)

100%|██████████| 1800/1800 [01:16<00:00, 23.46it/s]


In [20]:
  def get_F1(val_pred, val_true):
      val_pred = list(val_pred)
      val_true = list(val_true)
      curr = list(set(val_pred).intersection(set(val_true)))
      R = len(curr)/len(val_true)
      P = len(curr)/len(val_pred)
      return 2*P*R/(P+R)

  r = pd.merge(val_df[['sample_id']], train_outputs, on='sample_id', how='left')
  val_true = r['sample_id'].astype(str) + r['理财产品名称'].astype(str)
    # r.to_excel("result_after_drop.xlsx",index=None)
  r=val_product_text.drop_duplicates(subset=["sanple_id",0])

  val_pred = r['sanple_id'].astype(str) + r[0].astype(str)
  score = get_F1(val_pred, val_true)
  score

0.34086359736917354

In [41]:
a=val_product_text.drop_duplicates(subset=["sanple_id",0])

b="lhl\验证集row数量对比.xlsx"
b=pd.read_excel(b).reset_index(drop=True)
b["sample_id"]=b["sample_id"].astype(str)
b=b[b["实际"]<=4]
a=set(list(pd.merge(a,b,left_on="sanple_id",right_on="sample_id")["sample_id"].values))
b=set(list(b["sample_id"].values))
print(b.difference(a))

{'1837', '9967', '3622', '3982', '5288', '5614', '9329', '125', '3327', '5227', '2430', '2547', '7465', '185', '1328', '5107', '1852', '3993', '11025', '1009', '7203', '10803', '8099', '3979', '9399', '3319', '10446', '7617', '5126', '4185', '9129', '3052', '10031', '2549', '1713', '1238', '10120', '5592', '6486', '5128', '4107', '5447', '1464', '2410', '1234', '1832', '6492', '7751', '7124', '3245', '8404', '5575', '8115', '8618', '7152', '953', '1789', '7505', '5116', '6697', '10122', '4522', '1783', '1172', '7748', '10121', '7175', '1982', '4972', '6842', '5596', '5135', '6706', '1755', '7600', '3605', '4275', '3983', '4420', '5226', '3989', '3604', '3089', '10105', '3945', '3961', '7204', '10974', '3247', '10177', '1631', '1070', '4261', '3974', '4202', '6840', '7618', '10460', '7182', '4406', '10131', '2565', '9652', '6966', '5891', '9330', '6684', '1606', '6863', '9369', '9143', '6780', '1814', '9181', '1810', '5173', '4147', '8525', '7880', '10183', '8441', '7149', '5188', '1045

# 理财名称所属文本提取

# 理财信息字段提取

In [18]:
# 最后一部分字段采用预测好的部分，跟提取的text做交互采用双输入lstm在dense层做交互预测最后几个字段

# train_lstm_input = pd.merge(train_df, train_outputs, on='sample_id', how='left')
# result_matrix
from tgrocery import Grocery
train_lstm_input = pd.merge(train_df, train_outputs, on='sample_id', how='left')

train_lstm_input = train_lstm_input.fillna('否')

# label_1理财类型-10  label_2资金来源-3 label_3实际购买公司和上市公司关系-3 label_4买卖方是否有关联关系-2
from sklearn.preprocessing import LabelEncoder
label_1 = LabelEncoder()
# label_2 = LabelEncoder()
# label_3 = LabelEncoder()
# label_4 = LabelEncoder()

train_data = pd.DataFrame()
tmp=pd.DataFrame()
train_data['text_1'] = train_lstm_input['理财产品名称'].astype(str) 

# train_data['text_1'] = train_lstm_input['理财产品名称'].astype(str) + '_' + train_lstm_input['产品发行方名称'].astype(str)

# train_data['text_2'] = train_lstm_input['text'].astype(str)

# train_lstm_input["文本类别"]="理财产品"

train_data['label_1'] = "理财产品"


train_data2=train_lstm_input[train_lstm_input["产品发行方名称"]!="否"].reset_index(drop=True)

# train_data2["文本类别"]="发行方"

tmp['text_1']=train_data2["产品发行方名称"].astype(str)

# tmp['text_2']= train_data2["text"].astype(str)

tmp['label_1']="发行方"

train_data = pd.concat([train_data,tmp]).reset_index(drop=True)

# train_data2=train_lstm_input[train_lstm_input["实际购买公司名称"]!="否"].reset_index(drop=True)

# # train_data2["文本类别"]="发行方"

# tmp['text_1']=train_data2["实际购买公司名称"].astype(str)

# # tmp['text_2']= train_data2["text"].astype(str)

# tmp['label_1']="购买公司"

# train_data = pd.concat([train_data,tmp]).reset_index(drop=True)


other_columns_list=["认购金额(万元)","认购日期","资金来源","实际购买公司和上市公司关系","实际购买公司名称"]

for item in other_columns_list:

    train_lstm_input[item]=train_lstm_input[item].astype(str)
    train_data2=train_lstm_input[train_lstm_input[item]!="否"].reset_index(drop=True)

    # train_data2["文本类别"]=item

    tmp['text_1']=train_data2[item].astype(str)

    # tmp['text_2']= train_data2["text"].astype(str)

    tmp['label_1']="其它"

    
    train_data = pd.concat([train_data,tmp]).reset_index(drop=True)



# train_data['label_2'] = label_2.fit_transform(train_lstm_input['资金来源'])
# train_data['label_3'] = label_3.fit_transform(train_lstm_input['实际购买公司和上市公司关系'])
# train_data['label_4'] = label_4.fit_transform(train_lstm_input['买卖方是否有关联关系'])
train_data

train_src=[]
for text,label in train_data[["text_1","label_1"]].values:
    train_src.append([label,text])


grocery=Grocery("productOrcounter")


grocery.train(train_src)

grocery.save()



Unnamed: 0,text_1,label_1
0,中银保本理财-人民币按期开放理财产品,理财产品
1,中银保本理财-人民币按期开放理财产品,理财产品
2,与利率挂钩的结构性产品,理财产品
3,广发银行“薪加薪”16号XJXCKJ2578,理财产品
4,兴业银行“金雪球-优悦”保本开放式人民币理财产品(2M),理财产品
...,...,...
185903,上海浦兴投资发展有限公司,其它
185904,上海浦兴投资发展有限公司,其它
185905,上海浦兴投资发展有限公司,其它
185906,上海浦兴投资发展有限公司,其它


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\lsqlh\AppData\Local\Temp\jieba.cache
Loading model cost 1.910 seconds.
Prefix dict has been built successfully.


<tgrocery.Grocery at 0x1e133b82be0>

In [8]:
grocery.predict("浦发银行利多多").dec_values

{'理财产品': 0.5882297461996445,
 '发行方': 0.22094697741834643,
 '其它': -0.809176723618003}

# 理财信息字段整合

In [22]:
from fuzzywuzzy import fuzz
from src.time_extractor import TimeFinder
import datetime

data_file=pl.Path(r"D:\Work\数据挖掘\baseline_青青草原我的家\2_test_result.csv")

data_df=pd.read_csv(data_file)

In [23]:
data_df=data_df[["sample_id","理财产品名称","时间","认购金额","产品发行方名称","实际购买公司名称"]]

In [24]:

result_dict={}
result_dict["sample_id"]=[]
result_dict["columns"]=[]
result_dict["product_df"]=[]
for sample_id in tqdm(data_df["sample_id"].unique()):
    results_df=None
    tmp_df=data_df[data_df["sample_id"]==sample_id][["理财产品名称","时间","认购金额","产品发行方名称","实际购买公司名称"]]
    table_column=pd.DataFrame(list(tmp_df.columns)).T.reset_index(drop=True).loc[0]
    product_df=pd.DataFrame()
    product_df[[0,1,2,3,4]]=tmp_df
    product_df[0]=product_df[0].astype(str)
    product_df[1]=product_df[1].astype(str)
    product_df[2]=product_df[2].astype(str)
    product_df[3]=product_df[3].astype(str)
    product_df[4]=product_df[4].astype(str)
    results_df = product_df if results_df is None else pd.concat([results_df,product_df])
    result_dict["sample_id"].append(sample_id)
    result_dict["columns"].append(table_column)
    result_dict["product_df"].append(results_df)

result_dict=pd.DataFrame(result_dict)

100%|██████████| 3734/3734 [00:42<00:00, 87.70it/s]


In [25]:
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        pass
 
    try:
        import unicodedata
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass
    return False

def judge_type(columns):
    type_index=[]#1:产品名,2:金额,3:发行方，4:期限
    columns=columns.map(lambda x:x.replace("（","(").replace("）",")"))
    product_name_pos_words=["产品名称","产品名册","产品名","理财产品","项目名","回购名","回购品","标的名","金融产","投资项"]#"存款种类","基金类型"#不能为空
    # product_name_neg_words=["编号","代码"]
    amt_pos_words=["存款金","认购金","投资金","投入金","受托金","理财金","金额","（元","(元","(万元","（万元","(亿元","（亿元","人民币","投资规","认购规","存款规","投入规","理财规"]
    counter_name_pos_words=["受托方","银行机","机构名","合作方名","合作银","合作机","受托人","发行主","签约方","协议方","受托机","受托银","认购银","签约银","签约机","协议机","发生主","存放银","存款银","存款机","存放机","购买银","购买机","管理人","管理银","管理机","银行名","发行机","发行主","发行人","对手方","开户银","开户行","开户机"]#可以为空
    time_length_pos_words=["期限","(天)","持有时间"]
    for words in product_name_pos_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
        # columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==0))
        if True in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==0):
        for words in ["种类","类型","类别"]:
            judge_flag=[]
            columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
            # columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==0))
            if True  in judge_flag:
                type_index.append(judge_flag.index(True))
                break
        if(len(type_index)==0):
            type_index.append(-1)
        

    for words in amt_pos_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
        if True  in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==1):
        type_index.append(-1)
    
    for words in counter_name_pos_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
        if True  in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==2):
        type_index.append(-1)
    
    for words in time_length_pos_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
        if True  in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==3):
        type_index.append(-1)
    
    
    return type_index
def get_answer_matrix(result_matrix,sample_id=None):

    temp_single={}
    temp_single['认购日期'] = []
    temp_single['产品起息日'] = []
    temp_single['产品到息日'] = []
    temp_single['产品期限'] = []
    temp_single['认购金额(万元)'] = []
    temp_single['产品发行方名称'] = []
    temp_single['理财产品名称'] = []
    temp_single['sample_id'] = []

    temp_single["实际购买公司名称"]=[]
    

    if(sample_id is not None):
        result_matrix=result_matrix[result_matrix["sample_id"]==sample_id]
    for sample_id,columns,product_df in tqdm(result_matrix[["sample_id","columns","product_df"]].values):
        
        type_index=judge_type(columns)
        # product_df
        # columns
        # type_index
        for index in product_df.index:
            tmp_df=product_df.loc[index]
            if(len(tmp_df.shape) ==2 ):
                tmp_df=tmp_df.reset_index(drop=True).loc[0]
            product_name=""
            amt=""
            counter_name=""
            pur_dt=""
            val_dt=""
            coupon_dt=""
            time_limit=""
            #产品名
            if(type_index[0]!=-1):
                if (str(grocery.predict(tmp_df.loc[type_index[0]])) == "理财产品"):
                    product_name=tmp_df.loc[type_index[0]]
            else:
                candidate_list={}
                candidate_list["理财产品"]=[]
                candidate_list["发行方"]=[]
                candidate_list["其它"]=[]
                candidate_list["购买公司"]=[]
                for each_word in tmp_df.head(1):
                    if not (is_number(each_word)):
                        candidate_list[str(grocery.predict(each_word))].append(each_word)
                if(len(candidate_list["理财产品"])!=0):
                    product_name=candidate_list["理财产品"][0]
            #金额
            if(type_index[1]!=-1):
                amt=tmp_df.loc[type_index[1]].replace("（","").replace("）","").replace("(","").replace("(","").replace("元","").replace("圆","")
                type_amt=0
                if("万" in amt or "万" in columns.loc[type_index[1]]):
                    type_amt=1
                if("亿" in amt or "亿" in columns.loc[type_index[1]]):
                    type_amt=2
                amt=re.sub("[^0-9.]","",amt)
                amt=re.sub("[^0-9.]","",amt)
                if(is_number(amt)):
                    amt=float(amt)
                    if(type_amt==0 and amt/10000 >float(50)):
                        amt/=10000
                    if(type_amt==2):
                        amt*=10000
                # print(amt)
            else:
                candidate_list=[]
                value_list=list(tmp_df)
                for item in value_list:
                    tmp=str(item).replace("（","").replace("）","").replace("(","").replace("(","").replace("元","").replace("圆","").replace("亿","").replace("万","")
                    tmp=re.sub("[^0-9.]*额[^0-9.]*","",tmp)
                    tmp=re.sub("[^0-9.]*币[^0-9.]*","",tmp)
                    if(is_number(tmp)):
                        candidate_list.append(float(tmp))

                if len(candidate_list)>0:
                    real_tmp=sorted(candidate_list,reverse=True)[0]

                    for item in value_list:
                        tmp=str(item)
                        tmp=re.sub("[^0-9.]*额[^0-9.]*","",tmp)
                        tmp=re.sub("[^0-9.]*币[^0-9.]*","",tmp)
                        if(is_number(tmp) and float(tmp)==real_tmp):
                            amt=item
                            type_amt=0
                        else:
                            continue
                        if("万" in amt ):
                            type_amt=1
                        if("亿" in amt ):
                            type_amt=2
                        amt=re.sub("[^0-9.]","",amt)
                        if(is_number(amt)):
                            amt=float(amt)
                            # print(amt)
                            if(type_amt==0 and amt/10000 >float(50)):
                                amt/=10000
                            if(type_amt==2):
                                amt*=10000
                        if(amt !="" or amt!=np.nan):
                            break
            #发行方
            if(type_index[2]!=-1):
                if (str(grocery.predict(tmp_df.loc[type_index[2]])) == "发行方"):
                    counter_name=tmp_df.loc[type_index[2]]
            else:
                candidate_list={}
                candidate_list["理财产品"]=[]
                candidate_list["发行方"]=[]
                candidate_list["其它"]=[]
                candidate_list["购买公司"]=[]
                for each_word in tmp_df:
                    if not (is_number(each_word)):
                        each_word=each_word.replace("^","").replace("\n","").replace(" ","")
                        candidate_list[str(grocery.predict(each_word))].append(each_word)
                # print(candidate_list)
                if(len(candidate_list["发行方"])!=0):
                    counter_name=sorted(candidate_list["发行方"],reverse=True)[0]
            
            #期限
            if(type_index[3]!=-1):
                text=str(tmp_df.loc[type_index[3]])
                # print(text)
                a=re.search("\d+?[天]+?",text)
                if (a is None):
                    a=re.search("\d+?[个]+[月]+?",text)
                if a is None:
                    a=re.search("[^\d]\d[年]+?",text)
                    if(a is not None):
                        a=re.search("\d[年]+?",a.group())
                if a is None:
                    a=re.search("^\d[年]+?",text)
                if a is not None:
                    time_limit=a.group().replace("（","").replace("）","").replace("(","").replace("(","")
                else:
                    if(is_number(text) and type(text) is not float):
                        time_limit=str(text)+"天"
                        # print(time_limit)
                    else:
                        time_limit=""

            #三个日期
            # tmp_df
            value_list=[]
            noshow=tmp_df.map(lambda x:value_list.append(str(x)))
            sum_value=(" and ").join(i for i in value_list)
            t = TimeFinder()
            time_all = t.find_time(sum_value)
            # print(time_all)
            if(time_all is not None):
                time_all=sorted(list(set(time_all)),reverse=True)
                # print(len(time_all))
                # print(sum_value)
                # print(product_name)
                if(len(time_all)==1):
                    pur_dt=time_all[0]
                    val_dt=pur_dt
                elif(len(time_all)==2):
                    # time_all=sorted(list(set(time_all)),reverse=True)
                    if(re.search("随时",sum_value) is not None or re.search("工作日",sum_value)):
                        time_limit=""
                        pur_dt=time_all[1]
                        val_dt=time_all[0]
                        coupon_dt = ""
                    else:
                        pur_dt=time_all[1]
                        val_dt=time_all[1]
                        coupon_dt = time_all[0]
                        try:
                            # 相减
                            if(type_index[3]==-1 or time_limit==""):
                                d1 = datetime.datetime.strptime(val_dt, '%Y-%m-%d')
                                d2 = datetime.datetime.strptime(coupon_dt, '%Y-%m-%d')
                                d = d2 - d1
                                time_limit = str(d.days) + '天'
                        except Exception:
                            coupon_dt = ""
                            time_limit = ""
                elif(len(time_all)==3):
                        # print(time_all)
                        pur_dt=time_all[2]
                        val_dt=time_all[1]
                        coupon_dt = time_all[0]
                        try:
                            # 相减
                            d1 = datetime.datetime.strptime(pur_dt, '%Y-%m-%d')
                            d2 = datetime.datetime.strptime(val_dt, '%Y-%m-%d')
                            d = d2 - d1
                            if str(d.days)=="1":
                                d1 = datetime.datetime.strptime(val_dt, '%Y-%m-%d')
                                d2 = datetime.datetime.strptime(coupon_dt, '%Y-%m-%d')
                                d = d2 - d1
                                time_limit=str(d.days)+"天"
                            else:
                                pur_dt=time_all[2]
                                val_dt=pur_dt
                                coupon_dt=""
                        except Exception:
                            coupon_dt = ""
                            time_limit = ""
                elif(len(time_all)>4):
                        time_all=sorted(time_all)
                        pur_dt=time_all[0]
                        val_dt=""
                        coupon_dt=""
                if pur_dt!="" and coupon_dt=="" and time_limit!="":
                    try:
                        coupon_dt=datetime.datetime.strftime(datetime.datetime.strptime(pur_dt, '%Y-%m-%d')+datetime.timedelta(days=int(re.search("\d*",time_limit).group())), '%Y-%m-%d')
                    except:
                        pass

            temp_single['认购日期'].append(pur_dt)
            temp_single['产品起息日'].append(val_dt)
            temp_single['产品到息日'].append(coupon_dt)
            temp_single['产品期限'] .append(time_limit)
            temp_single['认购金额(万元)'].append(amt)
            temp_single['产品发行方名称'] .append(counter_name.replace(" ",""))
            temp_single['理财产品名称'] .append(product_name)
            temp_single['sample_id'].append(sample_id)

            temp_single["实际购买公司名称"].append(tmp_df[4])
        
    temp_single=pd.DataFrame(temp_single)
    return temp_single

val_temp_single=get_answer_matrix(result_dict)

100%|██████████| 3734/3734 [06:48<00:00,  9.13it/s]


In [20]:
val_temp_single.to_excel("val_text_result.xlsx")

In [130]:
  def get_F1(val_pred, val_true):
      val_pred = list(val_pred)
      val_true = list(val_true)
      curr = list(set(val_pred).intersection(set(val_true)))
      R = len(curr)/len(val_true)
      P = len(curr)/len(val_pred)
      return 2*P*R/(P+R)

r = pd.merge(val_df[['sample_id']], train_outputs, on='sample_id', how='left')
val_true = r['sample_id'].astype(str) + r['理财产品名称'].astype(str) + r['认购金额(万元)'].astype(str)+ r['认购日期'].astype(str) + r['产品起息日'].astype(str)+ r['产品到息日'].astype(str)+ r['产品期限'].astype(str)+r['产品发行方名称'].astype(str)+r["实际购买公司名称"].astype(str)
# r.to_excel("result_after_drop.xlsx",index=None)
  
r=val_temp_single


val_pred = r['sample_id'].astype(str) + r['理财产品名称'].astype(str) + r['认购金额(万元)'].astype(str) + r['认购日期'].astype(str) + r['产品起息日'].astype(str)+ r['产品到息日'].astype(str) + r['产品期限'].astype(str)+r['产品发行方名称'].astype(str)+r["实际购买公司"].astype(str)
score = get_F1(val_pred, val_true)
score

0.06003647075326134

In [34]:
r[r["认购金额(万元)"]==""].shape

(5016, 8)

In [30]:
def get_result(judge_title_result,time_list,gm_list,result_matrix):
    # tmp_result=pd.merge(time_list,gm_list,on=["sample_id"])
    # tmp_result["sample_id"]=tmp_rersult["sample_id"].astype(str)
    r=result_matrix.fillna("").reset_index(drop=True)
    i=0
    i_list=[]
    r["sample_id"]=r["sample_id"].astype(str)
    result=pd.merge(time_list,r,on=["sample_id"]).reset_index(drop=True)
    for index in r.index:
        if r.loc[index].dropna().shape[0]<=5 or type(r.loc[index]["理财产品名称"]) is float or len(r.loc[index]["理财产品名称"])<2:
            i_list.append(i)
        i+=1
    r=r.drop(i_list)
    r=r.fillna("").applymap(lambda x:str(x).replace(" ",""))
    # r=drop_judge(judge_title_result,r)


    
    return result
val_temp_single["sample_id"]=val_temp_single["sample_id"].astype(str)
test_time["sample_id"]=test_time["sample_id"].astype(str)
val_result=get_result(None,test_time,test_gm,val_temp_single)
# test_result=get_result(None,test_time,test_gm,test_temp_single)

In [32]:
val_result
val_result.to_excel("text_test_2210_result.xlsx")

Unnamed: 0,公告日期,sample_id,认购日期,产品起息日,产品到息日,产品期限,认购金额(万元),产品发行方名称,理财产品名称,实际购买公司名称
0,2016-10-27,11190,2016-10-26,2016-10-26,,,24000,中国工商银行,无固定期限超短期人民币理财产品,北京京西文化旅游股份有限公司
1,2016-07-28,11191,2016-06-28,2016-06-28,,,1000,中国建设银行,中国建设银行乾元-日鑫月溢理财产品,北京京西文化旅游股份有限公司
2,2016-07-28,11191,2016-07-01,2016-07-01,,,11500,包商银行,包商银行企业鑫喜16005理财产品,北京京西文化旅游股份有限公司
3,2016-07-28,11191,2016-07-27,2016-07-27,,,1000,南京银行,“珠联璧合安享系列-季安享”人民币理财产品,北京京西文化旅游股份有限公司
4,2016-07-28,11191,2016-07-27,2016-07-27,2016-08-24,28天,500,交通银行,“蕴通财富·稳得利”28天周期型理财产品,北京京西文化旅游股份有限公司
...,...,...,...,...,...,...,...,...,...,...
5364,2016-03-02,22363,2015-10-26,2015-10-26,,,600,,交通银行“蕴通财富•日增利S款”,南京全信传输科技股份有限公司
5365,2016-03-02,22363,2016-03-02,2016-03-02,2016-06-08,98天,1000,南京银行,南京银行“珠联璧合-季稳鑫1号”,南京全信传输科技股份有限公司
5366,2018-08-21,22365,2018-08-17,2018-08-17,2019-02-15,2天,10000,盛京银行大连星海支行,盛京银行单位结构性存款2018年第106期,大连派思燃气系统股份有限公司
5367,2017-02-06,22369,2017-02-04,2017-02-04,2017-03-03,27天,5500,兴业银行股份有限公司,“兴业金雪球—优先2号”人民币理财产品,安徽安德利百货股份有限公司


In [84]:
data_df1=val_result.drop(labels=["产品发行方名称"],axis=1)
data_df2=pd.DataFrame()
data_df2[["sample_id","理财产品名称","产品发行方名称","实际购买公司名称"]]=data_df[["sample_id","理财产品名称","产品发行方名称","实际购买公司名称"]]
dafa_df3=pd.merge(data_df1,data_df2,on=["sample_id","理财产品名称","产品发行方名称"])

KeyError: '产品发行方名称'

# 检验筛选

In [114]:
a = pd.merge(val_df[['sample_id']], train_outputs, on='sample_id', how='left')
b=val_result
b["sample_id"]=b["sample_id"].astype(str)
b["认购金额(万元)"]=b["认购金额(万元)"].astype(str)
a["sample_id"]=a["sample_id"].astype(str)
a["认购金额(万元)"]=b["认购金额(万元)"].astype(str)

In [116]:
c=pl.Path("test_result.csv")
pd.read_csv(c,encoding="gbk")

In [118]:
val_temp_single.to_csv("val_txt_result.csv",index=None)

In [119]:
d=pd.read_excel("result_after_drop.xlsx")

In [122]:
c=val_temp_single

In [140]:
c["认购金额(万元)"]=c["认购金额(万元)"].astype(str)
d["认购金额(万元)"]=d["认购金额(万元)"].astype(str)
c_d_df=pd.merge(c,d,on=["理财产品名称","认购金额(万元)"])

In [141]:
c_d_df.to_excel("c_d.xlsx")

In [142]:
c_d_df

Unnamed: 0,认购日期_x,产品起息日_x,产品到息日_x,产品期限_x,认购金额(万元),产品发行方名称_x,理财产品名称,sample_id_x,实际购买公司,认购日期_y,产品起息日_y,产品到息日_y,产品期限_y,产品发行方名称_y,sample_id_y
0,2018-09-30,2018-09-30,,,19000.0,中国民生银行股份有限公司沈阳分行,中银保本理财-人民币按期开放【CNYAQKF】,10447,桃李面包股份有限公司,2018-10-08,2018-10-08,2018-12-28,81天,,10447
1,2018-09-30,2018-09-30,,,19000.0,中国民生银行股份有限公司沈阳分行,中银保本理财-人民币按期开放【CNYAQKF】,10447,桃李面包股份有限公司,2018-10-08,2018-10-08,2018-12-28,81天,,10444
2,2018-10-09,2018-10-09,2018-12-28,80天,12000.0,中国民生银行股份有限公司沈阳分行,挂钩利率结构性存款SDGA180883D,10447,桃李面包股份有限公司,2018-10-09,2018-10-09,2018-12-28,80天,,10447
3,2018-10-09,2018-10-09,2018-12-28,80天,12000.0,中国民生银行股份有限公司沈阳分行,挂钩利率结构性存款SDGA180883D,10447,桃李面包股份有限公司,2018-10-09,2018-10-09,2018-12-28,80天,,10445
4,2019-03-07,2019-03-07,2019-04-16,40天,1000.0,中国民生银行股份有限公司长沙分行侯家塘支行,挂钩指数结构性存款,9111,喜爱集团股份有限公司,2019-03-07,2019-03-07,2019-04-16,40天,中国民生银行股份有限公司长沙分行侯家塘支行,9110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
694,2018-10-26,2018-10-26,2019-02-01,98天,1500.0,中信银行股份有限公司南京江北支行,中信理财之共赢利率结构22496期人民币结构性存款产品-C189T0196,4235,基蛋生物科技股份有限公司,2018-10-26,2018-10-26,2019-02-01,98天,中信银行股份有限公司南京江北支行,4234
695,2018-10-26,2018-10-26,2019-02-01,98天,1500.0,中信银行股份有限公司南京江北支行,中信理财之共赢利率结构22496期人民币结构性存款产品-C189T0196,4235,基蛋生物科技股份有限公司,2018-10-26,2018-10-26,2019-02-01,98天,中信银行股份有限公司南京江北支行,4235
696,2018-07-05,2018-07-05,2019-01-03,2天,4800.0,江苏银行南京龙江支行,江苏银行“聚宝财富宝溢融”人民币开放式B6机构27理财产品,4247,基蛋生物科技股份有限公司,2018-07-05,2018-07-05,2019-01-03,182天,江苏银行南京龙江支行,4247
697,2018-12-14,2018-12-14,2019-12-11,362天,7000.0,华夏银行股份有限公司宁波分行,华夏银行慧盈人民币单位结构性存款产品0243,8737,浙江省围海建设集团股份有限公司,2018-12-14,2018-12-14,2019-12-11,362天,华夏银行股份有限公司宁波分行,8735
