# 导入相关包

In [1]:
# 导入相关包
import os
import pathlib as pl
import pandas as pd
import numpy as np
import re
from io import StringIO
from datetime import datetime 
import time
from IPython.core.interactiveshell import InteractiveShell
from tqdm.autonotebook import *
import pdfplumber
tqdm.pandas()
InteractiveShell.ast_node_interactivity = "all"


def extract_pdf_content(pdf_path):
    text_list = []
    table_list = []
    with pdfplumber.open(pdf_path) as pdf:
        for index_page in np.arange(0, len(pdf.pages), 1):
            # 读取多页
            page = pdf.pages[index_page]   # 第n页的信息
            text = page.extract_text()
            text_list.append(text)
            table = page.extract_tables()
            for t in table:
                table_list.append(t)
    return text_list, table_list

# a='train_output'
# b="train"+r"_\d"
# print(re.match(b,a))
def csv_2_df(file_path):
    file_list=file_path.parents[0].glob("*.csv")
    result=None
    for item in file_list:
        if re.match(file_path.stem.split("_")[0]+r"_\d",item.stem) is not None:
            print(item.name)
            result=pd.read_csv(item) if result is None else pd.concat([result,pd.read_csv(item)])
    return result

test_path = pl.Path('datasets/train_1.csv')

test_df = csv_2_df(test_path)
test_df.to_csv("datasets/train.csv",index=None)

# PDF解析原始数据 
## 加载数据并采用pdfplumber抽取PDF中的文字和表格


In [2]:
# 数据准备(train_output文件中格式有点问题，需要提前用excel或者wps打开然后另存为excel文件)
train_outputs = pd.read_excel('datasets/train_output.xlsx')

# 获取pdf中文字和表格
def extract_pdf_content(pdf_path):
    text_list = []
    table_list = []
    with pdfplumber.open(pdf_path) as pdf:
        for index_page in np.arange(0, len(pdf.pages), 1):
            # 读取多页
            page = pdf.pages[index_page]   # 第n页的信息
            text = page.extract_text()
            text_list.append(text)
            table = page.extract_tables()
            for t in table:
                table_list.append(t)
    return text_list, table_list

def get_dir_file(path):
    '''
    输入文件夹位置，输出整理好的dataframe
    '''
    path_list = os.listdir(path)
    id_list = []
    file_path_list = []
    text_list = []
    table_list = []
    for i in tqdm(path_list):
        if '.PDF' in i:
            file_path = path + i
            id_list.append(int(i.split('.')[0]))
            file_path_list.append(file_path)
            try:
                text_temp, table_temp = extract_pdf_content(file_path)
            except Exception:
                print('此pdf无法读取')
                text_temp, table_temp = [], []
            text_list.append(text_temp)
            table_list.append(table_temp)
            
    df = pd.DataFrame()
    df['sample_id'] = id_list
    df['file_path'] = file_path_list
    df['text'] = text_list
    df['tabel'] = table_list
    df = df.sort_values('sample_id')
    return df

# 文件处理太慢，可持续化保存文件
train_path = 'datasets/train.csv'
if os.path.exists(train_path):
    train_df = pd.read_csv(train_path)
else:
    train_df = get_dir_file('datasets/train_data/')
    train_df.to_csv(train_path,index=False)
    train_df = pd.read_csv(train_path)

test_path =  'datasets/test.csv'
if os.path.exists(test_path):
    test_df = pd.read_csv(test_path)
else:
    test_df = get_dir_file('datasets/test_data/')
    test_df.to_csv(test_path,index=False)
    test_df = pd.read_csv(test_path)

train_outputs.head(2)
train_df.head(2)
test_df.head(2)

Unnamed: 0,sample_id,认购日期,理财产品名称,产品发行方名称,理财类型,认购金额(万元),产品起息日,产品到息日,产品期限,资金来源,实际购买公司名称,实际购买公司和上市公司关系,买卖方是否有关联关系,公告日期
0,1,2019-03-27,汇聚金1号,中融国际信托有限公司,信托,10000.0,2019-03-27,2019-09-23,180天,自有资金,恒生电子股份有限公司,公司本身,否,2019-04-25
1,1,2019-03-27,招商银行步步生金8699,招商银行,银行理财产品,200.0,2019-03-27,NaT,,自有资金,恒生电子股份有限公司,公司本身,否,2019-04-25


Unnamed: 0,sample_id,file_path,text,tabel
0,1,datasets/train_data/1.PDF,[' ...,"[[['', None, None, '', None, None, '', None, N..."
1,2,datasets/train_data/2.PDF,[' ...,"[[['', None, None, '', None, None, '', None, N..."


Unnamed: 0,sample_id,file_path,text,tabel
0,11188,datasets/test_data/11188.PDF,['北京京西文化旅游股份有限公司监事会\n \n \n关于使用部分闲置募集资金购买理财产品的...,[]
1,11189,datasets/test_data/11189.PDF,['北京京西文化旅游股份有限公司 \n监事会关于使用部分自有资金购买理财产品的意见 \n根据...,[]


In [3]:
# 构造训练集验证集
train_df = train_df.sample(frac=1, random_state=1017)
val_df = train_df[:1800]
train_df = train_df[1800:]

# 数据处理
## 抽取整体数据（一个sampleid内此字段内容都相同）
## 公告时间，实际购买公司

#### 1.抽取公告时间

In [65]:
# 首先针对任务抽取时间（每个时间跟每个id是一一对应的）
# 要不是取第一个时间，要不就是取最后一个时间（或者时间加一）这里可以建立一个模型预测
# base这里面直接取最后一个时间作为发布日期

##wjc字典里添加了十
CN_NUM = {
    u'月十日':'月10日',u'十月':'10月', u'十日': '0日',u'二十':'二',u'三十':'三', u'十': 1, u'○': 0, u'O':'0', u'Ο':'0',
    u'〇': 0, u'一': 1, u'二': 2, u'三': 3,
    u'四': 4, u'五': 5, u'六': 6, u'七': 7,
    u'八': 8, u'九': 9, u'零': 0, u'壹': 1,
    u'贰': 2, u'叁': 3, u'肆': 4, u'伍': 5,
    u'陆': 6, u'柒': 7, u'捌': 8, u'玖': 9,
    u'貮': 2, u'两': 2,  
}
######### u'○': 0, u'O':'0', u'Ο':'0',##########可删除这些字典，改用添加内容效果一致，这里暂且保留字典


def get_put_time_from_text(row):
    row = row.replace(' ', '').replace('\\n', '')

    ############添加#################改善年份中0有多个不识别的问题########
    start=1
    for word in '一二三四五六七八九':   
        row=re.sub('二.一'+word,'201'+str(start),row)
        start=start+1
    ########添加############慢#######################

    for key in CN_NUM:
        row = row.replace(key, str(CN_NUM[key]))   

    r = row.replace("年", "-").replace("月", "-").replace("日", " ").replace("/", "-").strip()
    regex = "(\d{4}-\d{1,2}-\d{1,2})"
    r = re.findall(regex, r)
    if len(r)==0:
        return np.nan
    time_str = r[-1]
    first = time_str.split('-')[0]
    second = time_str.split('-')[1]
    last = time_str.split('-')[-1]
    second = str.zfill(second, 2)
    last = str.zfill(last, 2)
    r = '-'.join([first, second, last])
    return r




val_result = pd.DataFrame()
val_result['sample_id'] = val_df['sample_id']
val_result['predict_time'] = val_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)
test_gg = train_outputs.groupby('sample_id').apply(lambda row:list(row['公告日期'])[0]).reset_index()
test_gg.columns = ['sample_id', 'time']
val_result = pd.merge(val_result, test_gg, on='sample_id', how='left')
##val_result['time'].head(10)
###val_result['predict_time'].head(10)

np.set_printoptions(threshold=np.inf)
pd.set_option('max_colwidth',100)

testid=6375
print(train_df[train_df.sample_id==testid]['text'].astype(str))
print(val_df[val_df.sample_id==testid]['text'].astype(str))


# 对于每一行，通过列名name访问对应的元素
for index,row in val_result.iterrows():
    #type(row['predict_time'])
    #print(row['sample_id'],row['predict_time']) # 输出每一行
    try:
        val_result.at[index,'predict_time1']=datetime.strptime(row['predict_time'], "%Y-%m-%d").strftime('%Y/%m/%d')
        #val_result.at[index,'predict_time1']=time.strptime(row['predict_time'], "%Y-%m-%d")
    except:
        #val_result.at[index,'predict_time1']='1900/01/01'
        continue
        # print(row['sample_id'])
        # print(val_df[val_df.sample_id==row['sample_id']]['text'].astype(str))
        
for index,row in val_result.iterrows():
    val_result.at[index,'time1']=val_result.at[index,'time'].strftime("%Y-%m-%d %H:%M:%S")[0:10]
    val_result.at[index,'time2']=val_result.at[index,'time1'].replace('-','')
    try:
        val_result.at[index,'predict_time2']=val_result.at[index,'predict_time'].replace('-','')
    except:
        continue
    #######粗计算时间具体误差################
    val_result.at[index,'差值']=(int)(val_result.at[index,'predict_time2'])-(int)(val_result.at[index,'time2'])

fail_val_result=val_result[val_result['差值']!=0]
#######抽取错误列放入fail的dataframe########################


# val_result['predict_time_inf']=pd.to_datetime(val_result['predict_time'],format='%Y/%m/%d')
# val_result['predict_time_inf'].head(10)

#val_result['predict_time1'] = datetime.strptime(val_result['predict_time'], "%Y-%m-%d").strftime('%Y/%m/%d')
#val_result['predict_time_inf']=pd.to_datetime(val_result['predict_time1'],format='%Y/%m/%d')
#val_result['日期差值']=val_result['predict_time_inf']-val_result['time']
#val_result.head(10)

# 判断验证集的准确率
np.sum(val_result['predict_time'].astype(str) == val_result['time'].astype(str))/len(val_result)

# val_time = val_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)
# test_time = test_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)

val_time=pd.DataFrame()
val_time["公告日期"]=val_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)
# val_gm = val_df.progress_apply(lambda row:my_get_gm(row['text']), axis=1)
val_time['sample_id'] = val_df['sample_id'].astype(str)
#test_gm = test_df.progress_apply(lambda row:get_gm(row['text']), axis=1)

test_time=pd.DataFrame()
test_time["公告日期"]=test_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)
# val_gm = val_df.progress_apply(lambda row:my_get_gm(row['text']), axis=1)
test_time['sample_id'] = test_df['sample_id'].astype(str)

val_time.shape
val_time.dropna().shape
def fix_time(df):
    df=df.dropna()
    index_list=[]
    df=df.reset_index(drop=True)
    index=-1
    for sample_id,date in df[["sample_id","公告日期"]].values:
        index+=1
        try:
            tmp=datetime.strptime(date,"%Y-%m-%d")
        except:
            print(index)
            index_list.append(index)
    return df.drop(index_list).reset_index(drop=True)

val_time=fix_time(val_time)
test_time=fix_time(test_time)
#test_gm = test_df.progress_apply(lambda row:get_gm(row['text']), axis=1)


100%|██████████| 1800/1800 [00:01<00:00, 1247.40it/s]
Series([], Name: text, dtype: object)
5162    [None]
Name: text, dtype: object


0.5883333333333334

100%|██████████| 1800/1800 [00:00<00:00, 2038.50it/s]
100%|██████████| 8660/8660 [00:04<00:00, 2115.81it/s]


(1800, 2)

(1797, 2)

344
847
953
1096


#### 2.抽取实际购买公司

In [69]:
# 抽取购买公司
# 前几句话出现
# 将其按照\\n 和空格切割
def get_gm(row):
    row=row.replace('[','').replace(']','').replace('\'','')    ######粗datawash,去除部分典型符号###########
    #head_row=re.split('资金',row)[0]
    #head_row=re.split('使用',head_row)[0]
    #tag_head_row=headrow.replace('[\\\\n ]','$|$')
    result = re.split('[\\\\n ]',row)
    for i in result:
        if '公司' in i:
            i=i.replace('（','(').replace('）',')') ##########修改中文括号#################
            regex="(^.*公司)"##################此四行为添加的##############################
            i=re.findall(regex, i)###########取此行最后一个“公司”前的字符#######
            i=i[0]###############格式转换，list取出str####################################
            if i=='公司':
                continue   ###########跳过字段为公司的答案进入下一个循环#####################
            return i




########################粗改进######################################
def my_get_gm(row):
    row=row.replace('[','').replace(']','').replace('\'','')    ######粗datawash,去除部分典型符号###########
    re_row = re.split('[\\\\n ]',row)
    
    ##################取第一次出现公司的行开始计算###################
    for i in re_row:
        if '公司' not in i:
            re_row.remove(i)
        else:
            break
    
    head_row=""
    #################list替换为文本##########################
    for i in re_row:
        head_row=head_row+i+'$|$'
    
    

    # print(type(head_row.find("公告编号")))
    # head_row=head_row[head_row.find("公告编号"):-1]

    # head_row=head_row.split("^.*公告编号：")[0]
    # head_row_a=head_row.split("资金")[0]
    # head_row=head_row_a.split("使用")[0]
    # head_row=head_row_a.split("委托")[0]
    # head_row=head_row_a.split("董事")[0]
    head_row=head_row.split("理财")[0].split("资金")[0].split("使用")[0].split("董事")[0]
    # print(head_row_a)
    tag_head_row=head_row.replace(' ','').replace('子公司','$|$').replace('公司','公司$|$').replace('关于','$|$').replace('-','$|$')
    # spl_head_row = re.split('\$\|\$',tag_head_row)
    spl_head_row = tag_head_row.split('$|$')
    spl_head_row.reverse()
    result=spl_head_row
    for i in result:
        if '公司' in i:
            i=i.replace('（','(').replace('）',')') ##########修改中文括号#################
            regex="(^.*公司)"##################此四行为添加的##############################
            i=re.findall(regex, i)###########取此行最后一个“公司”前的字符#######
            i=i[0]###############格式转换，list取出str####################################
            if i=='公司' or len(i)<=4 or '”' in i or '“' in i or '简称' in i:
                continue   ###########跳过字段为公司的答案进入下一个循环#####################
            return i
#################到此为止####################################




########################二次改进,文本内容查找追加，成功率暂时比不追加要低一点######################################
def my_get_gm2(row):
    row=row.replace('[','').replace(']','').replace('\'','')    ######粗datawash,去除部分典型符号###########
    re_row = re.split('[\\\\n ]',row)
    
    ###################取第一次出现公司的行开始计算###################
    for i in re_row:
        if '公司' not in i:
            re_row.remove(i)
        else:
            break
    
    
    head_row=""
    #################list替换为文本##########################
    for i in re_row:
        head_row=head_row+i+'$|$'
    
    #################去除标题#############################
    text_row1=head_row[head_row.find("公告"):-1]
    text_row2=head_row[head_row.find("意见"):-1]
    regex = "(子公司.*公司.*购买)"
    text1 = re.findall(regex, text_row1)
    text2 = re.findall(regex, text_row2)
    text=text1+text2
    for i in text:
        if '$|$' in i:
            text.remove(i)
    my_list=[]
    for i in text:
        spl_i=i.replace("购买","$|$").replace("子公司","$|$").replace("公司","公司$|$")
        spl_i=spl_i.split("$|$")
        spl_i.reverse()
        for j in spl_i:
            if '公司' in j:
                j=j.replace('（','(').replace('）',')')
                regex="(^.*公司)"
                j=re.findall(regex, j)
                j=j[0]
                if j=='公司' or len(j)<=4 or len(j)>30 or '”' in j or '“' in j or '简称' in j:
                    continue
                my_list.append(j)
    #print(text1)

    
    
    head_row=head_row.split("理财")[0].split("资金")[0].split("使用")[0].split("董事")[0]
    # print(head_row_a)
    tag_head_row=head_row.replace(' ','').replace('子公司','$|$').replace('公司','公司$|$').replace('关于','$|$').replace('-','$|$')
    # spl_head_row = re.split('\$\|\$',tag_head_row)
    spl_head_row = tag_head_row.split('$|$')
    spl_head_row.reverse()
    result=spl_head_row
    for i in result:
        if '公司' in i:
            i=i.replace('（','(').replace('）',')') ##########修改中文括号#################
            regex="(^.*公司)"##################此四行为添加的##############################
            i=re.findall(regex, i)###########取此行最后一个“公司”前的字符#######
            i=i[0]###############格式转换，list取出str####################################
            if i=='公司' or len(i)<=4 or '”' in i or '“' in i or '简称' in i:
                continue   ###########跳过字段为公司的答案进入下一个循环#####################
            my_list.insert(0,i)
            i=my_list[-1]
            return i
#################到此为止####################################





###val_gm = val_df.progress_apply(lambda row:get_gm(row['text']), axis=1)
val_gm=pd.DataFrame()
val_gm["实际购买公司名称"]=val_df.progress_apply(lambda row:my_get_gm(row['text']), axis=1)
# val_gm = val_df.progress_apply(lambda row:my_get_gm(row['text']), axis=1)
val_gm['sample_id'] = val_df['sample_id']
#test_gm = test_df.progress_apply(lambda row:get_gm(row['text']), axis=1)
val_gm=val_gm.dropna()


test_gm=pd.DataFrame()
test_gm["实际购买公司名称"] = test_df.progress_apply(lambda row:my_get_gm(row['text']), axis=1)
test_gm['sample_id']=test_df['sample_id']
test_gm=test_gm.dropna()

my_val_result = pd.DataFrame()
my_val_result['sample_id'] = val_df['sample_id']
my_val_result['predict_gs'] = val_df.progress_apply(lambda row: my_get_gm(row['text']), axis=1)
my_test_gs = train_outputs.groupby('sample_id').apply(lambda row:list(row['实际购买公司名称'])[0]).reset_index()
my_test_gs.columns = ['sample_id', 'gs']
my_val_result = pd.merge(my_val_result, my_test_gs, on='sample_id', how='left')
my_val_result['是否相等']=my_val_result['predict_gs']==my_val_result['gs']
fail_my_val_result=my_val_result[my_val_result['是否相等']!=True]


np.set_printoptions(threshold=np.inf)
pd.set_option('max_colwidth',100)

testid=3597
print(train_df[train_df.sample_id==testid]['text'].astype(str))
print(val_df[val_df.sample_id==testid]['text'].astype(str))

# 判断验证集的准确率
np.sum(my_val_result['predict_gs'].astype(str) == my_val_result['gs'].astype(str))/len(my_val_result)

100%|██████████| 1800/1800 [00:03<00:00, 502.72it/s]
100%|██████████| 8660/8660 [00:12<00:00, 708.81it/s]
100%|██████████| 1800/1800 [00:03<00:00, 578.41it/s]
Series([], Name: text, dtype: object)
2929    [None, None, None, None, None]
Name: text, dtype: object


0.7972222222222223

文章段落初步分段

In [67]:
# 最后一部分字段采用预测好的部分，跟提取的text做交互采用双输入lstm在dense层做交互预测最后几个字段

# train_lstm_input = pd.merge(train_df, train_outputs, on='sample_id', how='left')
# result_matrix
from tgrocery import Grocery
train_lstm_input = pd.merge(train_df, train_outputs, on='sample_id', how='left')

train_lstm_input = train_lstm_input.fillna('否')

# label_1理财类型-10  label_2资金来源-3 label_3实际购买公司和上市公司关系-3 label_4买卖方是否有关联关系-2
from sklearn.preprocessing import LabelEncoder
label_1 = LabelEncoder()
# label_2 = LabelEncoder()
# label_3 = LabelEncoder()
# label_4 = LabelEncoder()

train_data = pd.DataFrame()
tmp=pd.DataFrame()
train_data['text_1'] = train_lstm_input['理财产品名称'].astype(str) 

# train_data['text_1'] = train_lstm_input['理财产品名称'].astype(str) + '_' + train_lstm_input['产品发行方名称'].astype(str)

# train_data['text_2'] = train_lstm_input['text'].astype(str)

# train_lstm_input["文本类别"]="理财产品"

train_data['label_1'] = "理财产品"


train_data2=train_lstm_input[train_lstm_input["产品发行方名称"]!="否"].reset_index(drop=True)

# train_data2["文本类别"]="发行方"

tmp['text_1']=train_data2["产品发行方名称"].astype(str)

# tmp['text_2']= train_data2["text"].astype(str)

tmp['label_1']="发行方"

train_data = pd.concat([train_data,tmp]).reset_index(drop=True)

train_data2=train_lstm_input[train_lstm_input["实际购买公司名称"]!="否"].reset_index(drop=True)

# # train_data2["文本类别"]="发行方"

tmp['text_1']=train_data2["实际购买公司名称"].astype(str)

# tmp['text_2']= train_data2["text"].astype(str)

tmp['label_1']="购买公司"

train_data = pd.concat([train_data,tmp]).reset_index(drop=True)


other_columns_list=["认购金额(万元)","认购日期","资金来源","实际购买公司和上市公司关系","实际购买公司名称"]

for item in other_columns_list:

    train_lstm_input[item]=train_lstm_input[item].astype(str)

    train_data2=train_lstm_input[train_lstm_input[item]!="否"].reset_index(drop=True)

    # train_data2["文本类别"]=item

    tmp['text_1']=train_data2[item].astype(str)

    # tmp['text_2']= train_data2["text"].astype(str)

    tmp['label_1']="其它"

    
    train_data = pd.concat([train_data,tmp]).reset_index(drop=True)



# train_data['label_2'] = label_2.fit_transform(train_lstm_input['资金来源'])
# train_data['label_3'] = label_3.fit_transform(train_lstm_input['实际购买公司和上市公司关系'])
# train_data['label_4'] = label_4.fit_transform(train_lstm_input['买卖方是否有关联关系'])
train_data

train_src=[]
for text,label in train_data[["text_1","label_1"]].values:
    train_src.append([label,text])


grocery=Grocery("productOrcounter")


grocery.train(train_src)

grocery.save()



Unnamed: 0,text_1,label_1
0,中银保本理财-人民币按期开放理财产品,理财产品
1,中银保本理财-人民币按期开放理财产品,理财产品
2,与利率挂钩的结构性产品,理财产品
3,广发银行“薪加薪”16号XJXCKJ2578,理财产品
4,兴业银行“金雪球-优悦”保本开放式人民币理财产品(2M),理财产品
...,...,...
212362,上海浦兴投资发展有限公司,其它
212363,上海浦兴投资发展有限公司,其它
212364,上海浦兴投资发展有限公司,其它
212365,上海浦兴投资发展有限公司,其它


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\lsqlh\AppData\Local\Temp\jieba.cache
Loading model cost 1.903 seconds.
Prefix dict has been built successfully.


<tgrocery.Grocery at 0x193be8cdc10>

#### 3.清洗提取出来的tabel数据，主要是清洗掉有问题的列 

In [101]:
result_data=pl.Path(r"datasets\1_traintocsv_result.txt")
with open(result_data,encoding="utf8") as f:
    val_csv_df=eval(f.read())

val_csv_df=pd.DataFrame(val_csv_df)

In [102]:
type(val_csv_df["tables"].iloc[0])

list

In [104]:

def str2tables(tabel):
    table=tabel
    table_result=[]
    product_df_list=[]

    for item in table:
        if(len(item)>0):
            # print(item[0])
            table_result.append(pd.Series(item[0]))
            
            
            product_df_list.append(pd.DataFrame(item[1:]))
            # print(pd.DataFrame(item[1:]))
            # print("--------")
        else:
            continue
    return table_result,product_df_list
    
def get_column_table(val_df):
    tmp_table_column={}
    tmp_table_column["sample_id"]=[]
    tmp_table_column["columns"]=[]
    tmp_table_column["product_df"]=[]
    for sample_id,tabel in tqdm(val_df[["sample_id","tables"]].values):
        table_result,product_df_list=str2tables(tabel)
        index=-1
        # print(table_result)
        for item in table_result:
            index+=1
            tmp_table_column["sample_id"].append(sample_id)
            tmp_table_column["columns"].append(item)
            tmp_table_column["product_df"].append(product_df_list[index])
    
    return pd.DataFrame(tmp_table_column)

val_table_column=get_column_table(val_csv_df)
# test_table_column=get_column_table(test_df)

100%|██████████| 9017/9017 [00:19<00:00, 461.90it/s]


In [97]:
val_table_column

Unnamed: 0,sample_id,columns,product_df
0,12850,0 受托人 1 关联关系 2 投资期限 3 产品类型 4 购买金额（元） 5 购入日期 6 终止日期 7 ...,0 1 2 3 4 5 6 7 \ 0 浦发银行昆明关上支行 无 ...
1,13596,0 序号 1 投资主体 2 受托方 3 关联关系 4 产品名称 5 产品类型 ...,0 1 2 3 4 5 6 7 \ 0 1 广州龙之杰科技有限...
2,13596,0 序号 1 投资主体 2 受托方 3 关联关系 4 产品名称 5 产品类型 ...,0 1 2 3 \ 0 1 广州龙之杰科技有限公司 中国工商...
3,13555,0 受托方 1 产品名称 2 关联关系 3 认购金额（万元） 4 起止期限 5 状态 6 利息收入（万元...,0 1 2 3 4 5 6 0 中国光大银行股份有限公司惠州分行 ...
4,13555,0 受托方 1 产品名称 2 关联关系 3 认购金额（万元） 4 起止期限 5 状态 6 利息收入（万元...,0 1 2 3 4 5 6 0 中国民生银行深圳分行 保本型理财产品 ...
...,...,...,...
1444,12861,0 序号 1 委托方 2 受托方 3 委托理财产品名称 4 产品类型 5 委托金额（万元） 6 期限（天...,0 1 2 3 4 5 6 7 \ 0 1 必康新沂 江苏银行 聚宝财富2017...
1445,12875,0 序号 1 公告编号 2 委托方 3 受托方 4 起息日 5 到期日 6 金额（万元） 7 ...,0 1 2 3 4 5 6 7 \ 0 1 2016-156 必康新沂...
1446,12875,0 序号 1 委托方 2 受托方 3 委托理财产品名称 4 产品类型 5 委托金额（万元） 6 期限（天...,0 1 2 3 4 5 6 7 \ 0 1 必康新沂 交通银行徐州分行 蕴通财富...
1447,12691,0 受托人 1 产品名称 2 产品类型 3 金额 4 期限（天） 5 理财起始日 6 理财终止日 7 ...,0 1 2 3 4 5 \ 0 中信银行股份有限公司宁波分行 ...


# 入表函数，生成答案的matrix

In [116]:
from fuzzywuzzy import fuzz
from src.time_extractor import TimeFinder

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        pass
 
    try:
        import unicodedata
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass
    return False

def judge_type(columns):
    type_index=[]#1:产品名,2:金额,3:发行方,4:期限,5:实际购买公司,6:认购日期,7:公告日期,8:到息日期,9:非购买金额,10:金额单位
    columns=columns.map(lambda x:x.replace("（","(").replace("）",")").replace(" ",""))
    product_name_pos_words=["产品名称","产品名册","产品名","理财产品","项目名","回购名","回购品","标的名","金融产","投资项"]#"存款种类","基金类型"#不能为空
    # product_name_neg_words=["编号","代码"]
    amt_pos_words=["存款金","认购金","投资金","投入金","受托金","理财金","金额","（元","(元","(万元","（万元","(亿元","（亿元","人民币","投资规","认购规","存款规","投入规","理财规","本金"]
    counter_name_pos_words=["受托方","银行机","机构名","合作方名","合作银","合作机","受托人","发行主","签约方","协议方","受托机","受托银","认购银","签约银","签约机","协议机","发生主","存放银","存款银","存款机","存放机","购买银","购买机","管理人","管理银","管理机","银行名","发行机","发行主","发行人","对手方","开户银","开户行","开户机"]#可以为空
    time_length_pos_words=["期限","(天)","持有时间"]
    amt_neg_words=["实际收回","收回","赎回","实际获得","实际损益","收益情况","投资盈亏","投资收益","理财盈亏","理财收益","盈亏","收益（元","收益(元","收益(万元","收益（万元","到期收益","到期收","是否到","是否已","目前状","到期情","到息情"]

    purchaser_pos_words=["委托人","委托公","购买公","委托方","购买单"]
    subscription_dt_pos_words=["认购日期","认购时间"]
    announcement_dt_pos_words=["公告日期","公告时间"]
    coupon_dt_pos_words=["到期日","到息日","终止日","到账日"]
    curr_words=["金额单位"]

    for words in product_name_pos_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
        # columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==0))
        if True in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==0):
        for words in ["种类","类型","类别"]:
            judge_flag=[]
            columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
            # columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==0))
            if True  in judge_flag:
                type_index.append(judge_flag.index(True))
                break
        if(len(type_index)==0):
            type_index.append(-1)
        

    for words in amt_pos_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
        if True  in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==1):
        type_index.append(-1)
    
    for words in counter_name_pos_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
        if True  in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==2):
        type_index.append(-1)
    
    for words in time_length_pos_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
        if True  in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==3):
        type_index.append(-1)
    
    for words in purchaser_pos_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
        if True  in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==4):
        type_index.append(-1)

    for words in subscription_dt_pos_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
        if True  in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==5):
        type_index.append(-1)

    for words in announcement_dt_pos_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
        if True  in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==6):
        type_index.append(-1)

    for words in coupon_dt_pos_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
        if True  in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==7):
        type_index.append(-1)


    for words in amt_neg_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
        if True  in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==8):
        type_index.append(-1)
    
    for words in curr_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(x=="单位"))
        if True  in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==9):
        type_index.append(-1)
    
    
    return type_index
def get_result_matrix(result_matrix,time_list,sample_id=None):

    temp_single={}
    temp_single['认购日期'] = []
    temp_single['产品起息日'] = []
    temp_single['产品到息日'] = []
    temp_single['产品期限'] = []
    temp_single['认购金额(万元)'] = []
    temp_single['产品发行方名称'] = []
    temp_single['理财产品名称'] = []
    temp_single['sample_id'] = []
    temp_single["实际购买公司名称"]=  []
    temp_single["公告日期"]=[]

    if(sample_id is not None):
        result_matrix=result_matrix[result_matrix["sample_id"]==sample_id]
    for sample_id,columns,product_df in tqdm(result_matrix[["sample_id","columns","product_df"]].values):
        
        type_index=judge_type(columns)
        # product_df
        # columns
        # type_index
        for index in product_df.index:
            tmp_df=product_df.loc[index].fillna("")
            if(len(tmp_df.shape) ==2 ):
                tmp_df=tmp_df.reset_index(drop=True).loc[0]
            product_name=""#理财产品名称
            amt=""#认购金额(万元)
            counter_name=""#产品发行方
            pur_dt=""#6认购日期
            val_dt=""#起息日期
            coupon_dt=""#8到息日期
            time_limit=""#期限
            purchaser_name=""#5实际购买公司
            announcement_dt=""#7公告日期
            curr=""#9金额单位


            #产品名
            if(type_index[0]!=-1):
                if (str(grocery.predict(tmp_df.loc[type_index[0]])) == "理财产品"):
                    product_name=tmp_df.loc[type_index[0]]
            else:
                candidate_list={}
                candidate_list["理财产品"]=[]
                candidate_list["发行方"]=[]
                candidate_list["其它"]=[]
                candidate_list["购买公司"]=[]
                for each_word in tmp_df:
                    if not (is_number(each_word)):
                        candidate_list[str(grocery.predict(each_word))].append(each_word)
                if(len(candidate_list["理财产品"])!=0):
                    product_name=candidate_list["理财产品"][0]
            #金额
            if(type_index[1]!=-1):
                amt=tmp_df.loc[type_index[1]].replace("（","").replace("）","").replace("(","").replace("(","").replace("元","").replace("圆","")
                type_amt=0
                if("万" in amt or "万" in columns.loc[type_index[1]] or (type_index[9]!=-1 and  "万" in columns.loc[type_index[9]])):
                    type_amt=1
                if("亿" in amt or "亿" in columns.loc[type_index[1]] or (type_index[9]!=-1 and  "亿" in columns.loc[type_index[9]])):
                    type_amt=2
                amt=re.sub("[^0-9.]","",amt)
                amt=re.sub("[^0-9.]","",amt)
                if(is_number(amt)):
                    amt=float(amt)
                    if(type_amt==0 and amt/10000 >float(50)):
                        amt/=10000
                    if(type_amt==2):
                        amt*=10000
                # print(amt)
            else:
                candidate_list=[]
                value_list=list(tmp_df)
                for item in value_list:
                    tmp=str(item).replace("（","").replace("）","").replace("(","").replace("(","").replace("元","").replace("圆","").replace("亿","").replace("万","")
                    tmp=re.sub("[^0-9.]*额[^0-9.]*","",tmp)
                    tmp=re.sub("[^0-9.]*币[^0-9.]*","",tmp)
                    if(is_number(tmp)):
                        if(type_index[8]!=-1 and re.search(tmp,tmp_df.loc[type_index[8]]) is None):
                            candidate_list.append(float(tmp))
                        else:
                            pass

                if len(candidate_list)>0:
                    real_tmp=sorted(candidate_list,reverse=True)[0]

                    for item in value_list:
                        tmp=str(item).replace("（","").replace("）","").replace("(","").replace("(","").replace("元","").replace("圆","").replace("亿","").replace("万","")
                        tmp=re.sub("[^0-9.]*额[^0-9.]*","",tmp)
                        tmp=re.sub("[^0-9.]*币[^0-9.]*","",tmp)
                        if(is_number(tmp) and float(tmp)==real_tmp):
                            amt=item
                            type_amt=0
                        else:
                            continue
                        if("万" in amt or (type_index[9]!=-1 and  "万" in columns.loc[type_index[9]])):
                            type_amt=1
                        if("亿" in amt or (type_index[9]!=-1 and  "亿" in columns.loc[type_index[9]])):
                            type_amt=2
                        amt=re.sub("[^0-9.]","",amt)
                        if(is_number(amt)):
                            amt=float(amt)
                            # print(amt)
                            if(type_amt==0 and amt/10000 >float(50)):
                                amt/=10000
                            if(type_amt==2):
                                amt*=10000
                        if(amt !="" or amt!=np.nan):
                            break
            #发行方
            if(type_index[2]!=-1):
                if (str(grocery.predict(tmp_df.loc[type_index[2]])) == "发行方"):
                    counter_name=tmp_df.loc[type_index[2]]
            else:
                candidate_list={}
                candidate_list["理财产品"]=[]
                candidate_list["发行方"]=[]
                candidate_list["其它"]=[]
                candidate_list["购买公司"]=[]
                for each_word in tmp_df:
                    if not (is_number(each_word)):
                        each_word=each_word.replace("^","").replace("\n","").replace(" ","")
                        candidate_list[str(grocery.predict(each_word))].append(each_word)
                # print(candidate_list)
                if(len(candidate_list["发行方"])!=0):
                    counter_name=candidate_list["发行方"][0]
            
            #期限
            if(type_index[3]!=-1):
                text=str(tmp_df.loc[type_index[3]])
                # print(text)
                a=re.search("\d+?[天]+?",text)
                if (a is None):
                    a=re.search("\d+?[个]+[月]+?",text)
                if a is None:
                    a=re.search("[^\d]\d[年]+?",text)
                    if(a is not None):
                        a=re.search("\d[年]+?",a.group())
                if a is None:
                    a=re.search("^\d[年]+?",text)
                if a is not None:
                    time_limit=a.group().replace("（","").replace("）","").replace("(","").replace("(","")
                else:
                    if(is_number(text) and type(text) is not float and str(text)!="nan"):
                        time_limit=str(text)+"天"
                        # print(time_limit)
                    else:
                        time_limit=""

            #实际购买公司名称
            if(type_index[4]!=-1):
                if (str(grocery.predict(tmp_df.loc[type_index[4]])) == "购买公司"):
                    purchaser_name=tmp_df.loc[type_index[4]]
                else:
                    candidate_list={}
                    candidate_list["理财产品"]=[]
                    candidate_list["发行方"]=[]
                    candidate_list["其它"]=[]
                    candidate_list["购买公司"]=[]
                for each_word in tmp_df:
                    if not (is_number(each_word)):
                        each_word=each_word.replace("^","").replace("\n","").replace(" ","")
                        candidate_list[str(grocery.predict(each_word))].append(each_word)
                # print(candidate_list)
                if(len(candidate_list["购买公司"])!=0):
                    counter_name=candidate_list["购买公司"][0]
            
            #到息日期
            if(type_index[7]!=-1):
                t = TimeFinder()
                time_all=t.find_time(tmp_df.loc[type_index[7]])
                if( time_all is not None and len(time_all)==1):
                    coupon_dt=time_all[0]
                    tmp_df.loc[type_index[7]]=""
            
            #公告日期
            if(type_index[6]!=-1):
                t = TimeFinder()
                time_all=t.find_time(tmp_df.loc[type_index[6]])
                if(time_all is not None and len(time_all)==1):
                    announcement_dt=time_all[0]
                    tmp_df.loc[type_index[6]]=""

            #认购日期
            if(type_index[5]!=-1):
                t = TimeFinder()
                time_all=t.find_time(tmp_df.loc[type_index[2]])
                if(time_all is not None and len(time_all)==1):
                    pur_dt=time_all[0]
                    tmp_df.loc[type_index[5]]=""
            
            #三个日期
            # tmp_df
            value_list=[]
            noshow=tmp_df.map(lambda x:value_list.append(str(x)))
            sum_value=(" and ").join(i for i in value_list)
            t = TimeFinder()
            time_all = t.find_time(sum_value)
            # print(time_all)
            if(time_all is not None):
                time_all=sorted(list(set(time_all)),reverse=True)
                # print(len(time_all))
                # print(sum_value)
                # print(product_name)
                if(len(time_all)==1):
                    pur_dt=time_all[0]
                    val_dt=pur_dt
                elif(len(time_all)==2):
                    # time_all=sorted(list(set(time_all)),reverse=True)
                    if(re.search("随时",sum_value) is not None or re.search("工作日",sum_value)):
                        time_limit=""
                        pur_dt=time_all[1]
                        val_dt=time_all[0]
                        coupon_dt = ""
                    else:
                        if(pur_dt !=""):
                            pur_dt=time_all[1]
                        val_dt=time_all[1]
                        coupon_dt = time_all[0]
                        try:
                            # 相减
                            if(type_index[3]==-1 or time_limit==""):
                                d1 = datetime.datetime.strptime(val_dt, '%Y-%m-%d')
                                d2 = datetime.datetime.strptime(coupon_dt, '%Y-%m-%d')
                                d = d2 - d1
                                time_limit = str(d.days) + '天'
                        except Exception:
                            coupon_dt = ""
                            time_limit = ""
                elif(len(time_all)==3):
                        # print(time_all)
                        pur_dt=time_all[2]
                        val_dt=time_all[1]
                        coupon_dt = time_all[0]
                        try:
                            # 相减
                            d1 = datetime.datetime.strptime(pur_dt, '%Y-%m-%d')
                            d2 = datetime.datetime.strptime(val_dt, '%Y-%m-%d')
                            d = d2 - d1
                            if str(d.days)=="1":
                                d1 = datetime.datetime.strptime(val_dt, '%Y-%m-%d')
                                d2 = datetime.datetime.strptime(coupon_dt, '%Y-%m-%d')
                                d = d2 - d1
                                time_limit=str(d.days)+"天"
                            else:
                                pur_dt=time_all[2]
                                val_dt=pur_dt
                                coupon_dt=""
                        except Exception:
                            coupon_dt = ""
                            time_limit = ""
                elif(len(time_all)>4):
                        time_all=sorted(time_all)
                        pur_dt=time_all[0]
                        val_dt=""
                        coupon_dt=""
                if pur_dt!="" and coupon_dt=="" and time_limit!="":
                    try:
                        if "天" in time_limit:
                            coupon_dt=datetime.datetime.strftime(datetime.datetime.strptime(pur_dt, '%Y-%m-%d')+datetime.timedelta(days=int(re.search("\d*",time_limit).group())), '%Y-%m-%d')
                        elif "月" in time_limit:
                            coupon_dt=datetime.datetime.strftime(datetime.datetime.strptime(pur_dt, '%Y-%m-%d')+datetime.timedelta(months=int(re.search("\d*",time_limit).group())), '%Y-%m-%d')
                        elif "年" in time_limit:
                            coupon_dt=datetime.datetime.strftime(datetime.datetime.strptime(pur_dt, '%Y-%m-%d')+datetime.timedelta(years=int(re.search("\d*",time_limit).group())), '%Y-%m-%d')
                    except:
                        pass
            
            if(announcement_dt==""):
                announcement_dt=time_list[time_list["sample_id"]==sample_id]["公告日期"].values
            
            temp_single['sample_id'].append(sample_id)
            temp_single['认购日期'].append(pur_dt)
            temp_single['产品起息日'].append(val_dt)
            temp_single['产品到息日'].append(coupon_dt)
            temp_single['产品期限'] .append(time_limit)
            temp_single['认购金额(万元)'].append(amt)
            temp_single['产品发行方名称'] .append(counter_name)
            temp_single['理财产品名称'] .append(product_name)
            temp_single["实际购买公司名称"].append(purchaser_name)
            temp_single['公告日期'].append(announcement_dt)
    
        
    temp_single=pd.DataFrame(temp_single)
    return temp_single

val_result_matrix=get_result_matrix(val_table_column,val_time)
# test_result_matrix=get_result_matrix(test_table_column)

1%|          | 71/7797 [01:30<2:43:19,  1.27s/it]


KeyboardInterrupt: 

In [59]:
a=[1,2,3,4,5,6,7,8]
a=pd.DataFrame(a)
a.iloc[2]=10
a

Unnamed: 0,0
0,1
1,2
2,10
3,4
4,5
5,6
6,7
7,8


In [None]:
def reduction_func(result_df):

    #弃非空字段的空值
    result_df=result_df.dropna(subset=["认购日期","理财产品名称","认购金额(万元)"])
    #弃重复值
    result_df=result_df.sort_values(by=["公告日期"],axis=0)
    result_df=result_df.drop_duplicates(subset=["实际","理财产品名称","认购金额(万元)","产品起息日","产品到息日","产品期限"],keep=first)

    return result_df.reset_index(True)

val_history_result=reduction_func(val_result_matrix)

In [None]:

def get_table_answer_result(result_df,val_gm):

    drop_list=[]
    index=-1
    for coupon_dt,announcement_dt in tqdm(result_df[["产品到息日","公告日期"]].values):
        index+=1
        if (coupon_dt !="" and announcement_dt!=""):
            d1 = datetime.datetime.strptime(coupon_dt, '%Y-%m-%d')
            d2 = datetime.datetime.strptime(announcement_dt, '%Y-%m-%d')
            d = d2 - d1
            if(int(d.days) > 90):
                drop_list.append(index)
        
    result_df=result_df.drop(drop_list).reset_index(drop=True)

    result_df_counter_none=result_df[result_df["实际购买公司名称"]==""]

    result_df=result_df[result_df["实际购买公司名称"]!=""]

    result_df_counter_none=result_df_counter_none.drop(subset=["实际购买公司名称"],axis=1)

    result_df_counter_none=pd.merge(result_df_counter_none,val_gm,on=["sample_id"])

    result_df=pd.concat([result_df,result_df_counter_none])
     
    return result_df

val_answer_result=get_table_answer_result(val_history_result,val_gm)

#### 4.抽取的是单独的数据包含
#### 起息日，到息日， 金额，认购日期，产品发行方，理财产品

In [10]:
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        pass
 
    try:
        import unicodedata
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass
    return False

def judge_type(columns):
    type_index=[]#1:产品名,2:金额,3:发行方，4:期限
    columns=columns.map(lambda x:x.replace("（","(").replace("）",")"))
    product_name_pos_words=["产品名称","产品名册","产品名","理财产品","项目名","回购名","回购品","标的名","金融产","投资项"]#"存款种类","基金类型"#不能为空
    # product_name_neg_words=["编号","代码"]
    amt_pos_words=["存款金","认购金","投资金","投入金","受托金","理财金","金额","（元","(元","(万元","（万元","(亿元","（亿元","人民币","投资规","认购规","存款规","投入规","理财规"]
    counter_name_pos_words=["受托方","银行机","机构名","合作方名","合作银","合作机","受托人","发行主","签约方","协议方","受托机","受托银","认购银","签约银","签约机","协议机","发生主","存放银","存款银","存款机","存放机","购买银","购买机","管理人","管理银","管理机","银行名","发行机","发行主","发行人","对手方","开户银","开户行","开户机"]#可以为空
    time_length_pos_words=["期限","(天)","持有时间"]
    for words in product_name_pos_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
        # columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==0))
        if True in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==0):
        for words in ["种类","类型","类别"]:
            judge_flag=[]
            columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
            # columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==0))
            if True  in judge_flag:
                type_index.append(judge_flag.index(True))
                break
        if(len(type_index)==0):
            type_index.append(-1)
        

    for words in amt_pos_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
        if True  in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==1):
        type_index.append(-1)
    
    for words in counter_name_pos_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
        if True  in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==2):
        type_index.append(-1)
    
    for words in time_length_pos_words:
        judge_flag=[]
        columns.map(lambda x:judge_flag.append(fuzz.partial_ratio(words,x)==100))
        if True  in judge_flag:
            type_index.append(judge_flag.index(True))
            break
    if(len(type_index)==3):
        type_index.append(-1)
    
    
    return type_index
def get_answer_matrix(result_matrix,sample_id=None):

    temp_single={}
    temp_single['认购日期'] = []
    temp_single['产品起息日'] = []
    temp_single['产品到息日'] = []
    temp_single['产品期限'] = []
    temp_single['认购金额(万元)'] = []
    temp_single['产品发行方名称'] = []
    temp_single['理财产品名称'] = []
    temp_single['sample_id'] = []

    if(sample_id is not None):
        result_matrix=result_matrix[result_matrix["sample_id"]==sample_id]
    for sample_id,columns,product_df in tqdm(result_matrix[["sample_id","columns","product_df"]].values):
        
        type_index=judge_type(columns)
        # product_df
        # columns
        # type_index
        for index in product_df.index:
            tmp_df=product_df.loc[index]
            if(len(tmp_df.shape) ==2 ):
                tmp_df=tmp_df.reset_index(drop=True).loc[0]
            product_name=""
            amt=""
            counter_name=""
            pur_dt=""
            val_dt=""
            coupon_dt=""
            time_limit=""
            #产品名
            if(type_index[0]!=-1):
                if (str(grocery.predict(tmp_df.loc[type_index[0]])) == "理财产品"):
                    product_name=tmp_df.loc[type_index[0]]
            else:
                candidate_list={}
                candidate_list["理财产品"]=[]
                candidate_list["发行方"]=[]
                candidate_list["其它"]=[]
                candidate_list["购买公司"]=[]
                for each_word in tmp_df.head(1):
                    if not (is_number(each_word)):
                        candidate_list[str(grocery.predict(each_word))].append(each_word)
                if(len(candidate_list["理财产品"])!=0):
                    product_name=candidate_list["理财产品"][0]
            #金额
            if(type_index[1]!=-1):
                amt=tmp_df.loc[type_index[1]].replace("（","").replace("）","").replace("(","").replace("(","").replace("元","").replace("圆","")
                type_amt=0
                if("万" in amt or "万" in columns.loc[type_index[1]]):
                    type_amt=1
                if("亿" in amt or "亿" in columns.loc[type_index[1]]):
                    type_amt=2
                amt=re.sub("[^0-9.]","",amt)
                amt=re.sub("[^0-9.]","",amt)
                if(is_number(amt)):
                    amt=float(amt)
                    if(type_amt==0 and amt/10000 >float(50)):
                        amt/=10000
                    if(type_amt==2):
                        amt*=10000
                # print(amt)
            else:
                candidate_list=[]
                value_list=list(tmp_df)
                for item in value_list:
                    tmp=str(item).replace("（","").replace("）","").replace("(","").replace("(","").replace("元","").replace("圆","").replace("亿","").replace("万","")
                    tmp=re.sub("[^0-9.]*额[^0-9.]*","",tmp)
                    tmp=re.sub("[^0-9.]*币[^0-9.]*","",tmp)
                    if(is_number(tmp)):
                        candidate_list.append(float(tmp))

                if len(candidate_list)>0:
                    real_tmp=sorted(candidate_list,reverse=True)[0]

                    for item in value_list:
                        tmp=str(item)
                        tmp=re.sub("[^0-9.]*额[^0-9.]*","",tmp)
                        tmp=re.sub("[^0-9.]*币[^0-9.]*","",tmp)
                        if(is_number(tmp) and float(tmp)==real_tmp):
                            amt=item
                            type_amt=0
                        else:
                            continue
                        if("万" in amt ):
                            type_amt=1
                        if("亿" in amt ):
                            type_amt=2
                        amt=re.sub("[^0-9.]","",amt)
                        if(is_number(amt)):
                            amt=float(amt)
                            # print(amt)
                            if(type_amt==0 and amt/10000 >float(50)):
                                amt/=10000
                            if(type_amt==2):
                                amt*=10000
                        if(amt !="" or amt!=np.nan):
                            break
            #发行方
            if(type_index[2]!=-1):
                if (str(grocery.predict(tmp_df.loc[type_index[2]])) == "发行方"):
                    counter_name=tmp_df.loc[type_index[2]]
            else:
                candidate_list={}
                candidate_list["理财产品"]=[]
                candidate_list["发行方"]=[]
                candidate_list["其它"]=[]
                candidate_list["购买公司"]=[]
                for each_word in tmp_df:
                    if not (is_number(each_word)):
                        each_word=each_word.replace("^","").replace("\n","").replace(" ","")
                        candidate_list[str(grocery.predict(each_word))].append(each_word)
                # print(candidate_list)
                if(len(candidate_list["发行方"])!=0):
                    counter_name=sorted(candidate_list["发行方"],reverse=True)[0]
            
            #期限
            if(type_index[3]!=-1):
                text=str(tmp_df.loc[type_index[3]])
                # print(text)
                a=re.search("\d+?[天]+?",text)
                if (a is None):
                    a=re.search("\d+?[个]+[月]+?",text)
                if a is None:
                    a=re.search("[^\d]\d[年]+?",text)
                    if(a is not None):
                        a=re.search("\d[年]+?",a.group())
                if a is None:
                    a=re.search("^\d[年]+?",text)
                if a is not None:
                    time_limit=a.group().replace("（","").replace("）","").replace("(","").replace("(","")
                else:
                    if(is_number(text) and type(text) is not float):
                        time_limit=str(text)+"天"
                        # print(time_limit)
                    else:
                        time_limit=""

            #三个日期
            # tmp_df
            value_list=[]
            noshow=tmp_df.map(lambda x:value_list.append(str(x)))
            sum_value=(" and ").join(i for i in value_list)
            t = TimeFinder()
            time_all = t.find_time(sum_value)
            # print(time_all)
            if(time_all is not None):
                time_all=sorted(list(set(time_all)),reverse=True)
                # print(len(time_all))
                # print(sum_value)
                # print(product_name)
                if(len(time_all)==1):
                    pur_dt=time_all[0]
                    val_dt=pur_dt
                elif(len(time_all)==2):
                    # time_all=sorted(list(set(time_all)),reverse=True)
                    if(re.search("随时",sum_value) is not None or re.search("工作日",sum_value)):
                        time_limit=""
                        pur_dt=time_all[1]
                        val_dt=time_all[0]
                        coupon_dt = ""
                    else:
                        pur_dt=time_all[1]
                        val_dt=time_all[1]
                        coupon_dt = time_all[0]
                        try:
                            # 相减
                            if(type_index[3]==-1 or time_limit==""):
                                d1 = datetime.datetime.strptime(val_dt, '%Y-%m-%d')
                                d2 = datetime.datetime.strptime(coupon_dt, '%Y-%m-%d')
                                d = d2 - d1
                                time_limit = str(d.days) + '天'
                        except Exception:
                            coupon_dt = ""
                            time_limit = ""
                elif(len(time_all)==3):
                        # print(time_all)
                        pur_dt=time_all[2]
                        val_dt=time_all[1]
                        coupon_dt = time_all[0]
                        try:
                            # 相减
                            d1 = datetime.datetime.strptime(pur_dt, '%Y-%m-%d')
                            d2 = datetime.datetime.strptime(val_dt, '%Y-%m-%d')
                            d = d2 - d1
                            if str(d.days)=="1":
                                d1 = datetime.datetime.strptime(val_dt, '%Y-%m-%d')
                                d2 = datetime.datetime.strptime(coupon_dt, '%Y-%m-%d')
                                d = d2 - d1
                                time_limit=str(d.days)+"天"
                            else:
                                pur_dt=time_all[2]
                                val_dt=pur_dt
                                coupon_dt=""
                        except Exception:
                            coupon_dt = ""
                            time_limit = ""
                elif(len(time_all)>4):
                        time_all=sorted(time_all)
                        pur_dt=time_all[0]
                        val_dt=""
                        coupon_dt=""
                if pur_dt!="" and coupon_dt=="" and time_limit!="":
                    try:
                        coupon_dt=datetime.datetime.strftime(datetime.datetime.strptime(pur_dt, '%Y-%m-%d')+datetime.timedelta(days=int(re.search("\d*",time_limit).group())), '%Y-%m-%d')
                    except:
                        pass

            temp_single['认购日期'].append(pur_dt)
            temp_single['产品起息日'].append(val_dt)
            temp_single['产品到息日'].append(coupon_dt)
            temp_single['产品期限'] .append(time_limit)
            temp_single['认购金额(万元)'].append(amt)
            temp_single['产品发行方名称'] .append(counter_name)
            temp_single['理财产品名称'] .append(product_name)
            temp_single['sample_id'].append(sample_id)
        
    temp_single=pd.DataFrame(temp_single)
    return temp_single

val_temp_single=get_answer_matrix(val_result_matrix)
test_temp_single=get_answer_matrix(test_result_matrix)

100%|██████████| 1122/1122 [04:45<00:00,  3.93it/s]
100%|██████████| 5481/5481 [12:16<00:00,  7.45it/s]


#### 5.汇总整理数据

temp_single裁剪

In [11]:
def drop_judge(judge_title_result,result,score_limit=31):
    # global judge_title_result
    drop_list=[]
    index=0
    for sample_id,product_name in tqdm(result[["sample_id","理财产品名称"]].values):
        # print(sample_id)
        score_list=[]
        for text in judge_title_result[judge_title_result["sample_id"]==int(sample_id)][4].values:
            # print(text)
            score_list.append(fuzz.partial_token_sort_ratio(product_name,text))
        # print(score_list)
        if  len(score_list)>0 and np.max(pd.DataFrame(score_list)[0])<=score_limit:
            drop_list.append(index)
        index+=1 
    
    return result.copy().reset_index(drop=True).drop(drop_list)

# drop_judge(r)

In [28]:

val_time["sample_id"]=val_time["sample_id"].astype(str)
val_gm["sample_id"]=val_gm["sample_id"].astype(str)
test_time["sample_id"]=test_time["sample_id"].astype(str)
test_gm["sample_id"]=test_gm["sample_id"].astype(str)

def get_result(judge_title_result,time_list,gm_list,result_matrix):
    tmp_result=pd.merge(time_list,gm_list,on=["sample_id"])
    tmp_result["sample_id"]=tmp_result["sample_id"].astype(str)
    r=result_matrix.fillna("").reset_index(drop=True)
    i=0
    i_list=[]
    for index in r.index:
        if r.loc[index].dropna().shape[0]<=5 or type(r.loc[index]["理财产品名称"]) is float or len(r.loc[index]["理财产品名称"])<2:
            i_list.append(i)
        i+=1
    r=r.drop(i_list)
    r=r.fillna("").applymap(lambda x:str(x).replace(" ",""))
    r=drop_judge(judge_title_result,r)


    result=pd.merge(tmp_result,r,on=["sample_id"]).reset_index(drop=True)
    return result

val_result=get_result(val_judge_title_result,val_time,val_gm,val_temp_single)
test_result=get_result(test_judge_title_result,test_time,test_gm,test_temp_single)

100%|██████████| 6022/6022 [00:18<00:00, 328.01it/s]
100%|██████████| 15718/15718 [00:47<00:00, 334.33it/s]


In [29]:
  def get_F1(val_pred, val_true):
      val_pred = list(val_pred)
      val_true = list(val_true)
      curr = list(set(val_pred).intersection(set(val_true)))
      R = len(curr)/len(val_true)
      P = len(curr)/len(val_pred)
      return 2*P*R/(P+R)

  r = pd.merge(val_df[['sample_id']], train_outputs, on='sample_id', how='left')
  val_true = r['sample_id'].astype(str) + r['理财产品名称'].astype(str) + r['认购金额(万元)'].astype(str) +r['产品发行方名称'].astype(str) + r['认购日期'].astype(str) + r['产品起息日'].astype(str)+ r['产品到息日'].astype(str) + r['产品期限'].astype(str)+ r['公告日期'].astype(str)+ r['实际购买公司名称'].astype(str)
    # r.to_excel("result_after_drop.xlsx",index=None)
  
r=val_result


val_pred = r['sample_id'].astype(str) + r['理财产品名称'].astype(str) + r['认购金额(万元)'].astype(str) +r['产品发行方名称'].astype(str) + r['认购日期'].astype(str) + r['产品起息日'].astype(str)+ r['产品到息日'].astype(str) + r['产品期限'].astype(str)+ r['公告日期'].astype(str)+ r['实际购买公司名称'].astype(str)
score = get_F1(val_pred, val_true)
score

0.060510557831705

# 建模过程（预处理模型）

### 1.采用LSTM网络用提取好的部分跟pdf中的text做交互预测
#### 理财类型、资金来源、实际购买公司和上市公司关系、买卖方是否有关联关系

In [14]:
# 最后一部分字段采用预测好的部分，跟提取的text做交互采用双输入lstm在dense层做交互预测最后几个字段
train_lstm_input = pd.merge(train_df, train_outputs, on='sample_id', how='left')
train_lstm_input = train_lstm_input.fillna('否')
# label_1理财类型-10  label_2资金来源-3 label_3实际购买公司和上市公司关系-3 label_4买卖方是否有关联关系-2
from sklearn.preprocessing import LabelEncoder
label_1 = LabelEncoder()
label_2 = LabelEncoder()
label_3 = LabelEncoder()
label_4 = LabelEncoder()

train_data = pd.DataFrame()
train_data['text_1'] = train_lstm_input['理财产品名称'].astype(str) + '_' + train_lstm_input['产品发行方名称'].astype(str)
train_data['text_2'] = train_lstm_input['text'].astype(str)

train_data['label_1'] = label_1.fit_transform(train_lstm_input['理财类型'])
train_data['label_2'] = label_2.fit_transform(train_lstm_input['资金来源'])
train_data['label_3'] = label_3.fit_transform(train_lstm_input['实际购买公司和上市公司关系'])
train_data['label_4'] = label_4.fit_transform(train_lstm_input['买卖方是否有关联关系'])


In [15]:
# 导入相关库
import os
import pandas as pd
from tqdm.autonotebook import *
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import accuracy_score
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import StratifiedKFold
from gensim.models import FastText, Word2Vec
import re
from keras.layers import *
from keras.models import *
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.callbacks import *
from keras.layers.advanced_activations import LeakyReLU, PReLU
import keras.backend as K
from keras.optimizers import *
from keras.utils import to_categorical
import tensorflow as tf
import random as rn
import gc
import logging
import gensim
import jieba
tqdm.pandas()
os.environ['PYTHONHASHSEED'] = '0'
# 显卡使用（如没显卡需要注释掉）
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
np.random.seed(1024)
rn.seed(1024)
tf.random.set_seed(1024)

In [16]:
train_data['text_1'] = train_data['text_1'].progress_apply(lambda row:' '.join(jieba.lcut(str(row))))
train_data['text_2'] = train_data['text_2'].progress_apply(lambda row:' '.join(jieba.lcut(str(row))))
train_data.head(5)

100%|██████████| 27154/27154 [00:03<00:00, 7910.89it/s]
100%|██████████| 27154/27154 [12:21<00:00, 36.61it/s]


Unnamed: 0,text_1,text_2,label_1,label_2,label_3,label_4
0,中银 保本 理财 - 人民币 按期 开放 理财产品 _ 中国银行 股份 有限公司 广州 东山 支行,[ ' 证券 代码 ： 600728 证券 简称 ： 佳 都 科技 公告 编号 ： 2016 - 099 \ n \...,9,1,0,0
1,中银 保本 理财 - 人民币 按期 开放 理财产品 _ 中国银行 股份 有限公司 广州 东山 支行,[ ' 证券 代码 ： 600728 证券 简称 ： 佳 都 科技 公告 编号 ： 2016 - 099 \ n \...,9,1,0,0
2,与 利率 挂钩 的 结构性 产品 _ 中国民生银行 股份 有限公司,[ ' 证券 代码 ： 600211 证券 简称 ： 西藏药业 公告 编号 ： 20...,9,0,0,0
3,广发 银行 “ 薪 加薪 ” 16 号 XJXCKJ2578 _ 广发 银行 股份 有限公司 清远 分行,[ ' 证券 代码 ： 002171 证券 简称 ： 楚江 新材 公告 ...,9,1,2,0
4,兴业银行 “ 金 雪球 - 优悦 ” 保本 开放式 人民币 理财产品 ( 2M ) _ 兴业银行 股份 有限公司 芜湖 分行,[ ' 证券 代码 ： 002171 证券 简称 ： 楚江 新材 公告 ...,9,1,0,0


In [17]:
### Tokenizer 序列化文本
def set_tokenizer(docs, split_char=' ', max_len=100):
    '''
    输入
    docs:文本列表
    split_char:按什么字符切割
    max_len:截取的最大长度
    
    输出
    X:序列化后的数据
    word_index:文本和数字对应的索引
    '''
    tokenizer = Tokenizer(lower=False, char_level=False, split=split_char)
    tokenizer.fit_on_texts(docs)
    X = tokenizer.texts_to_sequences(docs)
    maxlen = max_len
    X = pad_sequences(X, maxlen=maxlen, value=0)
    word_index=tokenizer.word_index
    return X, word_index, tokenizer

### 做embedding 这里采用word2vec 可以换成其他例如（glove词向量）
def trian_save_word2vec(docs, embed_size=300, save_name='w2v.txt', split_char=' '):
    '''
    输入
    docs:输入的文本列表
    embed_size:embed长度
    save_name:保存的word2vec位置
    
    输出
    w2v:返回的模型
    '''
    input_docs = []
    for i in docs:
        input_docs.append(i.split(split_char))
    logging.basicConfig(
    format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
    w2v = Word2Vec(input_docs, size=embed_size, sg=1, window=8, seed=1017, workers=24, min_count=1, iter=10)
    w2v.save(save_name)
    print("w2v model done")
    return w2v

# 得到embedding矩阵
def get_embedding_matrix(word_index, embed_size=300, Emed_path="w2v_300.txt"):
    embeddings_index = Word2Vec.load(Emed_path)
    nb_words = len(word_index)+1
    embedding_matrix = np.zeros((nb_words, embed_size))
    count = 0
    for word, i in tqdm(word_index.items()):
        if i >= nb_words:
            continue
        try:
            embedding_vector = embeddings_index[word]
        except:
            embedding_vector = np.zeros(embed_size)
            count += 1
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector  
    print("null cnt",count)
    return embedding_matrix

# 得到fasttext矩阵
def load_fasttext(word_index, path):  
    count=0
    null_list=[]
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(path, encoding='utf-8') if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words =  len(word_index)+1
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: 
            embedding_matrix[i] = embedding_vector
        else:
            null_list.append(word)
            count+=1
    print("null cnt:",count)
    return embedding_matrix

def get_embedding_matrix_txt(word_index,embed_size=200,Emed_path="w2v_300.txt"):
    embeddings_index = gensim.models.KeyedVectors.load_word2vec_format(
        Emed_path, binary=False)
    nb_words = len(word_index)+1
    embedding_matrix = np.zeros((nb_words, embed_size))
    count = 0
    for word, i in tqdm(word_index.items()):
        if i >= nb_words:
            continue
        try:
            embedding_vector = embeddings_index[word]
        except:
            embedding_vector = np.zeros(embed_size)
            count += 1
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    print("null cnt",count)
    return embedding_matrix

### 2.训练得到word2vec

In [18]:
text_1_list = np.unique(train_data['text_1'])
text_3_list = np.unique(train_data['text_2'])

print('开始序列化')
x1, index_1, token_1 = set_tokenizer(train_data['text_1'], split_char=' ', max_len=30)
x3, index_3, token_3 = set_tokenizer(train_data['text_2'], split_char=' ', max_len=600)
print('序列化完成')
gc.collect()

trian_save_word2vec(text_1_list, save_name='models/w2v_300_1.txt', split_char=' ')
gc.collect()
trian_save_word2vec(text_3_list, save_name='models/w2v_300_3.txt', split_char=' ')
gc.collect()

# 得到emb矩阵
emb1 = get_embedding_matrix(index_1, Emed_path='models/w2v_300_1.txt')
emb3 = get_embedding_matrix(index_3, Emed_path='models/w2v_300_3.txt')
gc.collect()

开始序列化
序列化完成


27

50:INFO:worker thread finished; awaiting finish of 16 more threads
2020-09-11 04:06:21,667:INFO:worker thread finished; awaiting finish of 15 more threads
2020-09-11 04:06:21,676:INFO:worker thread finished; awaiting finish of 14 more threads
2020-09-11 04:06:21,680:INFO:worker thread finished; awaiting finish of 13 more threads
2020-09-11 04:06:21,683:INFO:worker thread finished; awaiting finish of 12 more threads
2020-09-11 04:06:21,694:INFO:worker thread finished; awaiting finish of 11 more threads
2020-09-11 04:06:21,703:INFO:worker thread finished; awaiting finish of 10 more threads
2020-09-11 04:06:21,706:INFO:worker thread finished; awaiting finish of 9 more threads
2020-09-11 04:06:21,708:INFO:worker thread finished; awaiting finish of 8 more threads
2020-09-11 04:06:21,713:INFO:worker thread finished; awaiting finish of 7 more threads
2020-09-11 04:06:21,719:INFO:worker thread finished; awaiting finish of 6 more threads
2020-09-11 04:06:21,728:INFO:worker thread finished; awai

<gensim.models.word2vec.Word2Vec at 0x21a42f25c70>

0

 04:11:16,814:INFO:EPOCH 8 - PROGRESS: at 5.75% examples, 244475 words/s, in_qsize 47, out_qsize 1
2020-09-11 04:11:17,822:INFO:EPOCH 8 - PROGRESS: at 7.75% examples, 256251 words/s, in_qsize 47, out_qsize 0
2020-09-11 04:11:18,836:INFO:EPOCH 8 - PROGRESS: at 10.20% examples, 259623 words/s, in_qsize 47, out_qsize 0
2020-09-11 04:11:19,898:INFO:EPOCH 8 - PROGRESS: at 12.50% examples, 260836 words/s, in_qsize 48, out_qsize 0
2020-09-11 04:11:20,929:INFO:EPOCH 8 - PROGRESS: at 15.52% examples, 263385 words/s, in_qsize 47, out_qsize 0
2020-09-11 04:11:21,974:INFO:EPOCH 8 - PROGRESS: at 18.32% examples, 263234 words/s, in_qsize 47, out_qsize 0
2020-09-11 04:11:22,996:INFO:EPOCH 8 - PROGRESS: at 20.80% examples, 266654 words/s, in_qsize 47, out_qsize 0
2020-09-11 04:11:24,009:INFO:EPOCH 8 - PROGRESS: at 22.66% examples, 263391 words/s, in_qsize 47, out_qsize 1
2020-09-11 04:11:25,071:INFO:EPOCH 8 - PROGRESS: at 25.81% examples, 266710 words/s, in_qsize 47, out_qsize 0
2020-09-11 04:11:26,07

<gensim.models.word2vec.Word2Vec at 0x21a3da85e50>

0

2020-09-11 04:13:16,007:INFO:loading Word2Vec object from models/w2v_300_1.txt
2020-09-11 04:13:16,195:INFO:loading wv recursively from models/w2v_300_1.txt.wv.* with mmap=None
2020-09-11 04:13:16,197:INFO:setting ignored attribute vectors_norm to None
2020-09-11 04:13:16,198:INFO:loading vocabulary recursively from models/w2v_300_1.txt.vocabulary.* with mmap=None
2020-09-11 04:13:16,199:INFO:loading trainables recursively from models/w2v_300_1.txt.trainables.* with mmap=None
2020-09-11 04:13:16,200:INFO:setting ignored attribute cum_table to None
2020-09-11 04:13:16,202:INFO:loaded models/w2v_300_1.txt
100%|██████████| 7189/7189 [00:00<00:00, 53306.27it/s]
2020-09-11 04:13:16,377:INFO:loading Word2Vec object from models/w2v_300_3.txt
2020-09-11 04:13:16,499:INFO:loading wv recursively from models/w2v_300_3.txt.wv.* with mmap=None
2020-09-11 04:13:16,500:INFO:loading vectors from models/w2v_300_3.txt.wv.vectors.npy with mmap=None
2020-09-11 04:13:16,543:INFO:setting ignored attribute v

31

# 构建抽取模型

#### 1.网络结构

In [19]:
from keras.initializers import *

def model_conv(emb1, emb3):
    '''
    注意这个inputs
    seq1、seq2分别是两个输入
    是否做emb可选可不选，
    这个就是我们之前训练已经得到的用于embedding的（embedding_matrix1， embedding_matrix2）
    '''
    K.clear_session()

    emb_layer_1 = Embedding(
        input_dim=emb1.shape[0],
        output_dim=emb1.shape[1],
        weights=[emb1],
        input_length=30,
        trainable=False
    )
    
    emb_layer_3 = Embedding(
        input_dim=emb3.shape[0],
        output_dim=emb3.shape[1],
        weights=[emb3],
        input_length=600,
        trainable=False
    )
    
    
    seq1 = Input(shape=(30,))
    seq3 = Input(shape=(600,))    
    
    x1 = emb_layer_1(seq1)
    x3 = emb_layer_3(seq3)
    
    sdrop=SpatialDropout1D(rate=0.2)

    x1 = sdrop(x1)
    x3 = sdrop(x3)
     
    x = Dropout(0.2)(Bidirectional(GRU(128, return_sequences=True))(x1))
    semantic = TimeDistributed(Dense(100, activation="tanh"))(x)
    merged_1 = Lambda(lambda x: K.max(x, axis=1), output_shape=(100,))(semantic)
    
    x = Dropout(0.2)(Bidirectional(GRU(128, return_sequences=True))(x3))
    semantic = TimeDistributed(Dense(100, activation="tanh"))(x)
    merged_3 = Lambda(lambda x: K.max(x, axis=1), output_shape=(100,))(semantic)
    
    
    x = Multiply()([merged_1, merged_3])
    
    x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(x))))
    x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
    pred_1 = Dense(10, activation='softmax')(x)
    pred_2 = Dense(3, activation='softmax')(x)
    pred_3 = Dense(3, activation='softmax')(x)
    pred_4 = Dense(2, activation='softmax')(x)
    model = Model(inputs=[seq1, seq3], outputs=[pred_1, pred_2, pred_3, pred_4])
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=0.0001),metrics=["accuracy"])
    return model
gc.collect()

0

In [20]:
model = model_conv(emb1, emb3)
model.summary()
l1 = to_categorical(train_data['label_1'], 10)
l2 = to_categorical(train_data['label_2'], 3)
l3 = to_categorical(train_data['label_3'], 3)
l4 = to_categorical(train_data['label_4'], 2)
model.fit([x1, x3],[l1, l2, l3, l4], batch_size=256, epochs=8, verbose=1, shuffle=True)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 30)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 600)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 30, 300)      2157000     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 600, 300)     8537700     input_2[0][0]                    
_______________________________________________________________________________________

<tensorflow.python.keras.callbacks.History at 0x21a6d959a60>

In [21]:
#保存权重
model.save_weights('models/lstm_model.h5')

#### 2.保存结果

In [22]:
# 预测验证集
val_df["sample_id"]=val_df["sample_id"].astype(str)
val_result_for_pred = pd.merge(val_result, val_df, on='sample_id', how='left')
val_result_for_pred['text_1'] = val_result_for_pred['理财产品名称'].astype(str) + '_' + val_result_for_pred['产品发行方名称'].astype(str)
val_result_for_pred['text_2'] = val_result_for_pred['text'].astype(str)

val_result_for_pred['text_1'] = val_result_for_pred['text_1'].progress_apply(lambda row:' '.join(jieba.lcut(str(row))))
val_result_for_pred['text_2'] = val_result_for_pred['text_2'].progress_apply(lambda row:' '.join(jieba.lcut(str(row))))

x1 = token_1.texts_to_sequences(val_result_for_pred['text_1'])
x1 = pad_sequences(x1, maxlen=30, value=0)
x3 = token_3.texts_to_sequences(val_result_for_pred['text_2'])
x3 = pad_sequences(x3, maxlen=600, value=0)
pred_result = model.predict([x1, x3], batch_size=1024, verbose=1)
pred_1 = label_1.inverse_transform(np.argmax(pred_result[0], axis=1))
pred_2 = label_2.inverse_transform(np.argmax(pred_result[1], axis=1))
pred_3 = label_3.inverse_transform(np.argmax(pred_result[2], axis=1))
pred_4 = label_4.inverse_transform(np.argmax(pred_result[3], axis=1))


ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

In [23]:
val_result['理财类型'] = pred_1
val_result['资金来源'] = pred_2
val_result['实际购买公司和上市公司关系'] = pred_3
val_result['买卖方是否有关联关系'] = pred_4

NameError: name 'pred_1' is not defined

#### 3.离线验证评估

In [24]:
# 计算线下f1
# R（召回率）=抽取正确的记录数量/（抽取正确的目标数量+漏抽取的记录数量）
# P（准确率）=抽取正确的记录数量/（抽取错误的记录数量+抽取正确的记录数量）
def get_F1(val_pred, val_true):
    val_pred = list(val_pred)
    val_true = list(val_true)
    curr = list(set(val_pred).intersection(set(val_true)))
    R = len(curr)/len(val_true)
    P = len(curr)/len(val_pred)
    return 2*P*R/(P+R)

train_outputs["sample_id"]=train_outputs["sample_id"].astype(str)
r = pd.merge(val_df[['sample_id']], train_outputs, on='sample_id', how='left')
val_true = r['sample_id'].astype(str) + r['认购日期'].astype(str) + r['理财产品名称'].astype(str) + r['理财类型'].astype(str) + r['认购金额(万元)'].astype(str) + r['产品起息日'].astype(str)+ r['产品到息日'].astype(str) + r['产品期限'].astype(str) + r['资金来源'].astype(str) + r['实际购买公司名称'].astype(str) + r['实际购买公司和上市公司关系'].astype(str) + r['买卖方是否有关联关系'].astype(str) + r['公告日期'].astype(str)

r = val_result
val_pred = r['sample_id'].astype(str) + r['认购日期'].astype(str) + r['理财产品名称'].astype(str) + r['理财类型'].astype(str) + r['认购金额(万元)'].astype(str) + r['产品起息日'].astype(str)+ r['产品到息日'].astype(str) + r['产品期限'].astype(str) + r['资金来源'].astype(str) + r['实际购买公司名称'].astype(str) + r['实际购买公司和上市公司关系'].astype(str) + r['买卖方是否有关联关系'].astype(str) + r['公告日期'].astype(str)

score = get_F1(val_pred, val_true)
score


KeyError: '认购日期'

In [25]:
val_true_file=pl.Path("results/val_true.csv")
val_pred_file=pl.Path("results/val_pred.csv")

tr_true_file=pl.Path("results/tr_true.csv")
val_result.copy().sort_values(by="sample_id").sort_index(axis=1).to_csv(val_pred_file,index=None)
pd.merge(val_df[['sample_id']], train_outputs, on='sample_id', how='left').sort_index(axis=1).sort_values(by="sample_id").to_csv(val_true_file,index=None)
pd.merge(train_df[['sample_id']], train_outputs, on='sample_id', how='left').sort_index(axis=1).sort_values(by="sample_id").to_csv(tr_true_file,index=None)

ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

#### 4.最终输出结果

In [26]:
# 预测测试集
test_df["sample_id"]=test_df["sample_id"].astype(str)
test_result_for_pred = pd.merge(test_result, test_df, on='sample_id', how='left')
test_result_for_pred['text_1'] = test_result_for_pred['理财产品名称'].astype(str) + '_' + test_result_for_pred['产品发行方名称'].astype(str)
test_result_for_pred['text_2'] = test_result_for_pred['text'].astype(str)

test_result_for_pred['text_1'] = test_result_for_pred['text_1'].progress_apply(lambda row:' '.join(jieba.lcut(str(row))))
test_result_for_pred['text_2'] = test_result_for_pred['text_2'].progress_apply(lambda row:' '.join(jieba.lcut(str(row))))

x1 = token_1.texts_to_sequences(test_result_for_pred['text_1'])
x1 = pad_sequences(x1, maxlen=30, value=0)
x3 = token_3.texts_to_sequences(test_result_for_pred['text_2'])
x3 = pad_sequences(x3, maxlen=600, value=0)
pred_result = model.predict([x1, x3], batch_size=1024, verbose=1)
pred_1 = label_1.inverse_transform(np.argmax(pred_result[0], axis=1))
pred_2 = label_2.inverse_transform(np.argmax(pred_result[1], axis=1))
pred_3 = label_3.inverse_transform(np.argmax(pred_result[2], axis=1))
pred_4 = label_4.inverse_transform(np.argmax(pred_result[3], axis=1))

test_result['理财类型'] = pred_1
test_result['资金来源'] = pred_2
test_result['实际购买公司和上市公司关系'] = pred_3
test_result['买卖方是否有关联关系'] = pred_4

NameError: name 'test_result' is not defined

In [27]:
test_result.to_csv('results/re_lstm_base.csv',encoding="utf8", index=False)
test_result.to_excel('results/re_lstm_base.xlsx',encoding="gb18030", index=False)

NameError: name 'test_result' is not defined

files_num=int(test_df.shape[0]/4)
test_time["sample_id"]=test_time["sample_id"].astype(str)

# test_time.shape
# test_time.dropna(subset=["公告日期"]).shape
import random
for i in range(1,20):
    random_seed=random.randint(0,10000)
    random_test_df = test_df.sample(frac=1, random_state=random_seed)
    change_df=random_test_df[:files_num]
    stable_df=random_test_df[files_num:]
    random_test_time=pd.merge(change_df,test_time,on=["sample_id"])
    random_test_time.shape
    random_test_time=random_test_time.dropna(subset=["公告日期"])
    random_test_time.shape
    random_test_time["公告日期"]=random_test_time["公告日期"].map(lambda x:datetime.datetime.strftime(datetime.datetime.strptime(str(x.replace("\r","")), '%Y-%m-%d'),'%Y-%m-%d'))


In [38]:
sample_id=4663
# sample_id=7123
tabel=val_df[val_df["sample_id"]==sample_id]["tabel"].iloc[0]
table_result,start_rows_list,first_line_list,product_df_list=row_combine(sample_id,tabel)
index=0
table_result[index]
product_df_list_ele=product_df_list[index].reset_index(drop=True)
product_df_list_ele
# product_df_list_ele
each_sum_rows=get_each_product_row(table_result[index],product_df_list_ele.reset_index(drop=True))

each_sum_rows2=result_matrix[result_matrix["sample_id"]==sample_id]["product_df"].iloc[0]
# # each_sum_rows
each_sum_rows

IndexError: single positional indexer is out-of-bounds