# 导入相关包

In [2]:
# 导入相关包
import os
import pathlib as pl
import pandas as pd
import numpy as np
import re
from io import StringIO
from datetime import datetime,timedelta
import time
from IPython.core.interactiveshell import InteractiveShell
from tqdm.autonotebook import *
import pdfplumber
tqdm.pandas()
InteractiveShell.ast_node_interactivity = "all"
sys.path.append("..")

from tgrocery import Grocery
import jieba

# PDF解析原始数据 
## 加载数据并采用pdfplumber抽取PDF中的文字和表格


In [3]:
# 数据准备(train_output文件中格式有点问题，需要提前用excel或者wps打开然后另存为excel文件)
train_outputs = pd.read_excel('../datasets/train_output.xlsx')

# 获取pdf中文字和表格
def extract_pdf_content(pdf_path):
    text_list = []
    table_list = []
    with pdfplumber.open(pdf_path) as pdf:
        for index_page in np.arange(0, len(pdf.pages), 1):
            # 读取多页
            page = pdf.pages[index_page]   # 第n页的信息
            text = page.extract_text()
            text_list.append(text)
            table = page.extract_tables()
            for t in table:
                table_list.append(t)
    return text_list, table_list

def get_dir_file(path):
    '''
    输入文件夹位置，输出整理好的dataframe
    '''
    path_list = os.listdir(path)
    id_list = []
    file_path_list = []
    text_list = []
    table_list = []
    for i in tqdm(path_list):
        if '.PDF' in i:
            file_path = path + i
            id_list.append(int(i.split('.')[0]))
            file_path_list.append(file_path)
            try:
                text_temp, table_temp = extract_pdf_content(file_path)
            except Exception:
                print('此pdf无法读取')
                text_temp, table_temp = [], []
            text_list.append(text_temp)
            table_list.append(table_temp)
            
    df = pd.DataFrame()
    df['sample_id'] = id_list
    df['file_path'] = file_path_list
    df['text'] = text_list
    df['tabel'] = table_list
    df = df.sort_values('sample_id')
    return df

# 文件处理太慢，可持续化保存文件
train_path = '../datasets/train.csv'
if os.path.exists(train_path):
    train_df = pd.read_csv(train_path)
else:
    train_df = get_dir_file('datasets/train_data/')
    train_df.to_csv(train_path,index=False)
    train_df = pd.read_csv(train_path)

test_path =  '../datasets/test.csv'
if os.path.exists(test_path):
    test_df = pd.read_csv(test_path)
else:
    test_df = get_dir_file('datasets/test_data/')
    test_df.to_csv(test_path,index=False)
    test_df = pd.read_csv(test_path)

train_outputs.head(2)
train_df.head(2)
test_df.head(2)

Unnamed: 0,sample_id,认购日期,理财产品名称,产品发行方名称,理财类型,认购金额(万元),产品起息日,产品到息日,产品期限,资金来源,实际购买公司名称,实际购买公司和上市公司关系,买卖方是否有关联关系,公告日期
0,1,2019-03-27,汇聚金1号,中融国际信托有限公司,信托,10000.0,2019-03-27,2019-09-23,180天,自有资金,恒生电子股份有限公司,公司本身,否,2019-04-25
1,1,2019-03-27,招商银行步步生金8699,招商银行,银行理财产品,200.0,2019-03-27,NaT,,自有资金,恒生电子股份有限公司,公司本身,否,2019-04-25


Unnamed: 0,sample_id,file_path,text,tabel
0,1,datasets/train_data/1.PDF,[' ...,"[[['', None, None, '', None, None, '', None, N..."
1,2,datasets/train_data/2.PDF,[' ...,"[[['', None, None, '', None, None, '', None, N..."


Unnamed: 0,sample_id,file_path,text,tabel
0,11188,datasets/test_data/11188.PDF,['北京京西文化旅游股份有限公司监事会\n \n \n关于使用部分闲置募集资金购买理财产品的...,[]
1,11189,datasets/test_data/11189.PDF,['北京京西文化旅游股份有限公司 \n监事会关于使用部分自有资金购买理财产品的意见 \n根据...,[]


In [4]:
# 构造训练集验证集
train_df = train_df.sample(frac=1, random_state=1017)
val_df = train_df[:1800]
train_df = train_df[1800:]

In [106]:


def get_title(text):
    global title_num_char
    title_list=[]
    title_type_list=[]
    text_start_iter_list=[]
    text_end_iter_list=[]
    for item in title_num_char:
        pattern = re.compile(item+r"[ ]*?[^ ]+?[ ]")
        tmp=pattern.finditer(text)
        for i in tmp:
            title_list.append(i.group())
            text_start_iter_list.append(i.span(0)[0])
            title_type_list.append(1)
            # text_end_iter_list.append(i.span(0)[1]) #把标题纳入text
            text_end_iter_list.append(i.span(0)[0])
    
    # for item in title_list:
    for item in s_title_num_char:
        pattern = re.compile(item+r"[ ]*?[^ ]+?[ ]")
        tmp=pattern.finditer(text)
        for i in tmp:
            title_list.append(i.group())
            text_start_iter_list.append(i.span(0)[0])
            title_type_list.append(2)
            # text_end_iter_list.append(i.span(0)[1]) #把标题纳入text
            text_end_iter_list.append(i.span(0)[0])

    title_list.append("引言")
    title_type_list.append(1)
    text_start_iter_list.append(0)
    text_end_iter_list.append(0)

    result_df=pd.DataFrame([title_list,title_type_list,text_start_iter_list,text_end_iter_list]).T.sort_values(by=2).reset_index(drop=True)
    # print(result_df)
    return result_df

def get_title_text(text,title_df):
    # print(title_df)
    title_1_df=title_df[title_df[1]==1]
    text_iter_list=[]
    text_list=[]
    # print(title_1_df)
    for iter1,iter2 in title_1_df[[2,3]].values:
        # print(iter1)
        if(len(text_iter_list)!=0):
            text_iter_list.append(iter1)
        text_iter_list.append(iter2)
    # text_iter_list.append(text_iter_list[len(text_iter_list)-1])
    text_iter_list.append(len(text))
    for index in range(int(len(text_iter_list)/2)):
        text_list.append(text[text_iter_list[2*index]:text_iter_list[2*index+1]])
    
    if len(text_list)!=0:
        title_1_df[4]=text_list
    else:
        text_list.append(text)
        title_1_df[4]=text_list

    return title_1_df.reset_index(drop=True)

from fuzzywuzzy import fuzz
def judge_title(sample_id=0,text=r"test\n"):
    # print(text)
    text=text.replace(r"\^","")
    # count=re.findall(r".*?\^.*?",text)
    # if (count is not None and len(count)>0):
    #     print(len(count))
    #     print(sample_id)
    #     print("————————————————")
    text=text.replace(r"\n","^").replace("（","(").replace("）",")")
    text=re.sub("[ ]+?[ ]+","",text)
    title_df=get_title(text)
    # count=re.findall(r".*?\^.*?",text)
    # if (count is not None and len(count)>0):
    #     print(len(count))
    #     print(sample_id)
    #     print("————————————————")
    title_df["sample_id"]=[sample_id for x in range(title_df.shape[0])]
    # print(title_df)
    title_1_df=get_title_text(text,title_df)[["sample_id",0,1,2,3,4]]

    

    global val_df
    global train_outputs
    val_true_name=train_outputs[train_outputs["sample_id"]==sample_id]["理财产品名称"]
    
    index=0
    neg_index=[]
    for title_des in title_1_df[0].values:
        for item in title_neg_words:
            if re.search(item,title_des) is not None:
                neg_index.append(index)
                break
        index+=1


    return title_1_df.drop(neg_index)
    # print(title_list)

def get_judge_title_result(val_df):

    judge_title_result=None


    for sample_id,text in tqdm(val_df[["sample_id","text"]].values):
        # print(sample_id)
        # print(text)
        judge_title_result= judge_title(sample_id,text) if judge_title_result is None else pd.concat([judge_title_result,judge_title(sample_id,text)])
    
    return judge_title_result

title_num_char=["一、","二、","三、","四、","五、","六、","七、","八、","九、","十、","十一、","十二、","十三、","十四、","十五、"]
s_title_num_char=["（一）","（二）","（三）","（四）","（五）","（六）","（七）","（八）","（九）","（十）","（十一）","（十二）","（十三）","（十四）","（十五）"]
s_title_num_char.extend(["[(]一[)]","[(]二[)]","[(]三[)]","[(]四[)]","[(]五[)]","[(]六[)]","[(]七[)]","[(]八[)]","[(]九[)]","[(]十[)]","[(]十一[)]","[(]十二[)]","[(]十三[)]","[(]十四[)]","[(]十五[)]"])

title_pos_words=[]
title_neg_words=["备查","日前","过去","履行","审批","程序","风险","措施","影响","累计","赎回","到期","截至","意见","十二个月内","公告前","报备文件","前期"]

val_judge_title_result=get_judge_title_result(val_df)
train_judge_title_result=get_judge_title_result(train_df)
# test_judge_title_result=get_judge_title_result(test_df)
# judge_title_result.to_excel("训练集段落标题分类结果.xlsx",index=None)


100%|██████████| 1800/1800 [00:38<00:00, 46.51it/s]
100%|██████████| 7217/7217 [01:51<00:00, 65.02it/s]


# 发行方\实际购买公司\上下文分类器

In [109]:
# 提取公司
# train_lstm_input = pd.merge(train_df, train_outputs, on='sample_id', how='left')
# result_matrix
train_lstm_input = pd.merge(train_df, train_outputs, on='sample_id', how='left')

train_lstm_input = train_lstm_input.fillna('否')

# label_1理财类型-10  label_2资金来源-3 label_3实际购买公司和上市公司关系-3 label_4买卖方是否有关联关系-2
# label_2 = LabelEncoder()
# label_3 = LabelEncoder()
# label_4 = LabelEncoder()

train_data = pd.DataFrame()
tmp=pd.DataFrame()
# train_data['sample_id']=train_lstm_input['sample_id'].astype(str) 
# train_data['text_1'] = train_lstm_input['理财产品名称'].astype(str) 

# # train_data['text_1'] = train_lstm_input['理财产品名称'].astype(str) + '_' + train_lstm_input['产品发行方名称'].astype(str)

# # train_data['text_2'] = train_lstm_input['text'].astype(str)

# # train_lstm_input["文本类别"]="理财产品"

# train_data['label_1'] = "理财产品"


train_data2=train_lstm_input[train_lstm_input["产品发行方名称"]!="否"].reset_index(drop=True)

# train_data2["文本类别"]="发行方"
tmp['sample_id']=train_data2['sample_id'].astype(str) 
tmp['text_1']=train_data2["产品发行方名称"].astype(str)

# tmp['text_2']= train_data2["text"].astype(str)

tmp['label_1']="发行方"

train_data = pd.concat([train_data,tmp]).reset_index(drop=True)

train_data2=train_lstm_input[train_lstm_input["实际购买公司名称"]!="否"].reset_index(drop=True)

# train_data2["文本类别"]="发行方"
tmp['sample_id']=train_data2['sample_id'].astype(str) 
tmp['text_1']=train_data2["实际购买公司名称"].astype(str)

# tmp['text_2']= train_data2["text"].astype(str)

tmp['label_1']="购买公司"

train_data = pd.concat([train_data,tmp]).reset_index(drop=True)

train_data=train_data.drop_duplicates(subset=["sample_id","text_1"])
# train_data
sample_text={}
sample_text["sample_id"]=[]
sample_text["text"]=[]
for sample_id in train_judge_title_result["sample_id"].unique():
    text_list=[]
    for item in train_judge_title_result[train_judge_title_result["sample_id"]==sample_id][4].values:
        text_list.append(item)
    sample_text["text"].append(("").join(i for i in text_list))
    sample_text["sample_id"].append(sample_id)

sample_text=pd.DataFrame(sample_text)
sample_text["sample_id"]=sample_text["sample_id"].astype(str)

train_data["sample_id"]=train_data["sample_id"].astype(str)
# for  item in 

# train_src=[]
# for text,label in train_data[["text_1","label_1"]].values:
#     train_src.append([label,text])


# grocery_word_selector=Grocery("wordSelector")


# grocery_word_selector.train(train_src)

# grocery_word_selector.save()



In [110]:
neg_train_data={}
neg_train_data["sample_id"]=[]
neg_train_data["text_1"]=[]
for sample_id,item in tqdm(train_data[["sample_id","text_1"]].values):
    # sample_id
    # item
    try:
        text=sample_text[sample_text["sample_id"]==sample_id]["text"].iloc[0]
        raw_word=item
        char_list=[]
        for char in item:
            char_list.append("["+char+"]")
        word=("[\^ ]*?").join(i for i in char_list)
        # word
        word_pattern=re.compile(word)
        tmp_text=text
        for i in word_pattern.finditer(text):
            len_word=i.span(0)[1]-i.span(0)[0]
            # i.group()
            tmp=re.search("[A-Za-z\u4e00-\u9fa5\^()（）]*"+word,tmp_text)
            if(len(tmp.group()[:len(tmp.group())-len_word])>0):
                neg_train_data["sample_id"].append(sample_id)
                neg_train_data["text_1"].append(tmp.group()[:len(tmp.group())-len_word])
            tmp_text=text[i.span(0)[1]+1:]

        tmp_text=text
        for i in word_pattern.finditer(text):
            len_word=i.span(0)[1]-i.span(0)[0]
            # i.group()
            tmp=re.search(word+"[A-Za-z\u4e00-\u9fa5\^()（）]*",tmp_text)
            if(len(tmp.group()[len_word:])>0):
                neg_train_data["sample_id"].append(sample_id)
                neg_train_data["text_1"].append(tmp.group()[len_word:])
            tmp_text=text[i.span(0)[1]+1:]
    except:
        pass
        sample_id
        item

neg_train_data=pd.DataFrame(neg_train_data)
neg_train_data["label_1"]="上下文"
neg_train_data
        # text
    # firm_pattern=re.compile("[0-9A-Za-z\u4e00-\u9fa5\^()（）]+?"+word+"[0-9A-Za-z\u4e00-\u9fa5\^()（）]*")
    # re_result=firm_pattern.findall(text)

    

  5%|▍         | 1050/22432 [00:07<02:05, 170.41it/s]

'2448'

'江苏银行'

  7%|▋         | 1599/22432 [00:11<02:12, 157.34it/s]

'5942'

'山东沂源农村商业银行股份有限公司'

  8%|▊         | 1820/22432 [00:12<02:03, 166.45it/s]

'5935'

'山东沂源农村商业银行股份有限公司'

'5952'

'交通银行股份有限公司'

8%|▊         | 1840/22432 [00:12<02:00, 170.35it/s]

'5952'

'山东沂源农村商业银行股份有限公司'

 14%|█▍        | 3091/22432 [00:21<02:08, 149.96it/s]

'5934'

'山东沂源农村商业银行股份有限公司'

 18%|█▊        | 4105/22432 [00:30<03:11, 95.59it/s]

'5924'

'山东沂源农村商业银行股份有限公司'

 18%|█▊        | 4148/22432 [00:30<02:38, 115.04it/s]

'5941'

'山东沂源农村商业银行股份有限公司'

 22%|██▏       | 4827/22432 [00:35<02:11, 134.06it/s]

'4515'

'长江证券'

 25%|██▍       | 5509/22432 [00:40<01:51, 151.50it/s]

'5947'

'山东沂源农村商业银行股份有限公司'

 28%|██▊       | 6196/22432 [00:45<02:21, 114.68it/s]

'4514'

'长江证券'

 29%|██▉       | 6565/22432 [00:47<01:59, 133.13it/s]

'1715'

'中国银行彭州支行'

 30%|███       | 6738/22432 [00:48<01:33, 167.00it/s]

'5953'

'山东沂源农村商业银行股份有限公司'

 31%|███       | 6929/22432 [00:50<02:16, 113.69it/s]

'5918'

'山东沂源农村商业银行股份有限公司'

 32%|███▏      | 7080/22432 [00:51<02:01, 125.96it/s]

'5951'

'山东沂源农村商业银行股份有限公司'

 44%|████▍     | 9943/22432 [01:15<01:29, 140.31it/s]

'4516'

'长江证券'

 51%|█████     | 11397/22432 [01:25<01:16, 144.91it/s]

'6855'

'九江银行合肥分行'

 51%|█████     | 11428/22432 [01:25<01:22, 134.15it/s]

'5928'

'山东沂源农村商业银行股份有限公司'

 53%|█████▎    | 11867/22432 [01:28<01:17, 136.18it/s]

'4523'

'平安证券'

 57%|█████▋    | 12876/22432 [01:35<00:59, 160.14it/s]

'4543'

'平安证券'

100%|██████████| 22432/22432 [03:05<00:00, 120.62it/s]


Unnamed: 0,sample_id,text_1,label_1
0,6761,公司与,上下文
1,6761,不存在关联关系,上下文
2,7442,公司与,上下文
3,7442,(以下^简称,上下文
4,10746,日购买了,上下文
...,...,...,...
60533,8778,(以下简称,上下文
60534,5808,^,上下文
60535,5808,^,上下文
60536,5808,(,上下文


In [111]:
train_src=[]
for text,label in pd.concat([train_data,neg_train_data])[["text_1","label_1"]].values:
    train_src.append([label,text])


grocery_word_selector=Grocery("wordSelector")


grocery_word_selector.train(train_src)

grocery_word_selector.save()

<tgrocery.Grocery at 0x20e5f03c3a0>

# 产品名称\发行方\其他字段分类器 

In [329]:
# 最后一部分字段采用预测好的部分，跟提取的text做交互采用双输入lstm在dense层做交互预测最后几个字段

# train_lstm_input = pd.merge(train_df, train_outputs, on='sample_id', how='left')
# result_matrix
# from tgrocery import Grocery
train_lstm_input = pd.merge(train_df, train_outputs, on='sample_id', how='left')

train_lstm_input = train_lstm_input.fillna('否')

# label_1理财类型-10  label_2资金来源-3 label_3实际购买公司和上市公司关系-3 label_4买卖方是否有关联关系-2
# from sklearn.preprocessing import LabelEncoder
# label_2 = LabelEncoder()
# label_3 = LabelEncoder()
# label_4 = LabelEncoder()

train_data = pd.DataFrame()
tmp=pd.DataFrame()
train_data['text_1'] = train_lstm_input['理财产品名称'].astype(str) 

# train_data['text_1'] = train_lstm_input['理财产品名称'].astype(str) + '_' + train_lstm_input['产品发行方名称'].astype(str)

# train_data['text_2'] = train_lstm_input['text'].astype(str)

# train_lstm_input["文本类别"]="理财产品"

train_data['label_1'] = "理财产品"


train_data2=train_lstm_input[train_lstm_input["产品发行方名称"]!="否"].reset_index(drop=True)

# train_data2["文本类别"]="发行方"

tmp['text_1']=train_data2["产品发行方名称"].astype(str)

# tmp['text_2']= train_data2["text"].astype(str)

tmp['label_1']="发行方"

train_data = pd.concat([train_data,tmp]).reset_index(drop=True)

# train_data2=train_lstm_input[train_lstm_input["实际购买公司名称"]!="否"].reset_index(drop=True)

# # train_data2["文本类别"]="发行方"

# tmp['text_1']=train_data2["实际购买公司名称"].astype(str)

# # tmp['text_2']= train_data2["text"].astype(str)

# tmp['label_1']="购买公司"

# train_data = pd.concat([train_data,tmp]).reset_index(drop=True)


other_columns_list=["认购金额(万元)","认购日期","资金来源","实际购买公司和上市公司关系","实际购买公司名称"]

for item in other_columns_list:
    train_lstm_input[item]=train_lstm_input[item].astype(str)

    train_data2=train_lstm_input[train_lstm_input[item]!="否"].reset_index(drop=True)

    # train_data2["文本类别"]=item

    tmp['text_1']=train_data2[item].astype(str)

    # tmp['text_2']= train_data2["text"].astype(str)

    tmp['label_1']="其它"

    
    train_data = pd.concat([train_data,tmp]).reset_index(drop=True)



# train_data['label_2'] = label_2.fit_transform(train_lstm_input['资金来源'])
# train_data['label_3'] = label_3.fit_transform(train_lstm_input['实际购买公司和上市公司关系'])
# train_data['label_4'] = label_4.fit_transform(train_lstm_input['买卖方是否有关联关系'])
train_data

train_src=[]
for text,label in train_data[["text_1","label_1"]].values:
    train_src.append([label,text])


grocery=Grocery("productOrcounter")


grocery.train(train_src)

grocery.save()



Unnamed: 0,text_1,label_1
0,中银保本理财-人民币按期开放理财产品,理财产品
1,中银保本理财-人民币按期开放理财产品,理财产品
2,与利率挂钩的结构性产品,理财产品
3,广发银行“薪加薪”16号XJXCKJ2578,理财产品
4,兴业银行“金雪球-优悦”保本开放式人民币理财产品(2M),理财产品
...,...,...
185903,上海浦兴投资发展有限公司,其它
185904,上海浦兴投资发展有限公司,其它
185905,上海浦兴投资发展有限公司,其它
185906,上海浦兴投资发展有限公司,其它


<tgrocery.Grocery at 0x20e60ec1a00>

In [6]:
import jieba.analyse
import jieba

text_list=[]
for product_name in tqdm(train_outputs["产品发行方名称"].values):
    text_list.append(str(product_name))

# text_list

a_list=[]

for x in jieba.analyse.extract_tags((",").join(i for i in text_list),topK=100):#可以再添加一个参数指定输出个数
    a_list.append(x)#直接输出关键词和词频

# a_list

# text_list=[]
# for product_name in tqdm(train_outputs["理财产品名称"].values):
#     text_list.append(str(product_name))

# b_list=[]

# for x in jieba.analyse.extract_tags((",").join(i for i in text_list)):#可以再添加一个参数指定输出个数
#     b_list.append(x)

# set(b_list).difference(a_list)

100%|██████████| 32818/32818 [00:00<00:00, 596697.94it/s]


In [157]:
# a="^潜江永安药业股份有限公司"
# firm=['公司', '银行', '分行', '支行', '中心', '业部', '商行', '建行']
# for i in firm:
#     i
#     grocery_word_selector.predict(i).dec_values

# a="信赢    步步高升4号(B15C0873)"
# a=jieba.cut(a)
# print(list(a))
a="123456:123567"
b="123456123567"
index=a.index(":")
a[index+1:]

set(b).issubset(a)

'123567'

True

# firm=['公司', '银行', '分行', '支行', '中心', '业部', '商行', '建行']


delta_limit=0.2
value_limit=0.7

def item_combine(raw_word,item_list):
    global grocery_word_selector
    # print(raw_word)
    # print(item_list)
    key_location_list=[]
    while(" " in item_list):
        item_list.remove(" ")
    for index in range(len(item_list)):
        if(re.search(raw_word,item_list[index]) is not None):
            key_location_list.append(index)
    
    result_dict={}
    result_dict["发行方"]=[]
    result_dict["购买公司"]=[]
    result_dict["上下文"]=[]

    # print(key_location_list)
    
    for location in key_location_list:
        complete_word=[item_list[location]]
        low_location=location-1

        dec_values=None

        while(low_location>=0):
            # print(item_list[low_location])
            a=[item_list[low_location]]
            a.extend(complete_word)
            
            tmp_word=("").join(i for i in a)
            # print(tmp_word)
            # print(grocery_word_selector.predict(tmp_word).dec_values)
            
            if(dec_values is not None):
                delta=grocery_word_selector.predict(tmp_word).dec_values["上下文"]-dec_values
                dec_values=grocery_word_selector.predict(tmp_word).dec_values["上下文"]
                tmp_label=grocery_word_selector.predict(tmp_word)
                if(len(complete_word)>2):
                    if(str(tmp_label)=="上下文" or (delta>delta_limit and tmp_label.dec_values[str(tmp_label)]<value_limit)):
                        break
                    else:
                        complete_word=a
                complete_word=a
            else:
                dec_values=grocery_word_selector.predict(tmp_word).dec_values["上下文"]
                complete_word=a
            low_location-=1
        # dec_values=None
        high_location=location+1
        while(high_location<len(item_list)):
            complete_word.append(item_list[high_location])
            tmp_word=("").join(i for i in complete_word)
            # print(tmp_word)
            # print(grocery_word_selector.predict(tmp_word).dec_values)

            if(dec_values is not None):
                delta=grocery_word_selector.predict(tmp_word).dec_values["上下文"]-dec_values
                dec_values=grocery_word_selector.predict(tmp_word).dec_values["上下文"]
                if(len(complete_word)>2):
                    tmp_label=grocery_word_selector.predict(tmp_word)
                    if(str(tmp_label)=="上下文" or (delta>delta_limit and tmp_label.dec_values[str(tmp_label)]<value_limit)):
                        complete_word.pop()
                        break
                    else:
                        pass
                pass
            else:
                dec_values=grocery_word_selector.predict(tmp_word).dec_values["上下文"]
            high_location+=1
        
        result_word=("").join(i for i in complete_word)

        result_label=str(grocery_word_selector.predict(result_word))
        result_dict[result_label].append(result_word)
    
    return result_dict    


def get_firm_and_counter(sample_id,val_judge_title_result):
    text_list=[]
    for item in val_judge_title_result[val_judge_title_result["sample_id"]==sample_id][4].values:
        text_list.append(item)

    text=("").join(i for i in text_list)
    text

    firm=['公司', '银行', '分行', '支行', '中心', '业部', '商行', '建行',"中行","股份有限"]
    firm_list=[]
    result_dict={}
    result_dict["发行方"]=[]
    result_dict["购买公司"]=[]
    for word in firm:
        raw_word=word
        char_list=[]
        for char in word:
            char_list.append("["+char+"]")
        word=("").join(i for i in char_list)
        firm_pattern=re.compile("[A-Za-z\u4e00-\u9fa5\^()（）]+?"+word+"[A-Za-z\u4e00-\u9fa5\^()（）]*")
        re_result=firm_pattern.findall(text)

        if re_result is None:
            break

        tmp_score=grocery_word_selector.predict(item).dec_values
        result={}

        for key in tmp_score:
            result[key]=[]
        
        for item in re_result:
            w_s_result=grocery_word_selector.predict(item)
            key=str(w_s_result)
            result[key].append(item)

        # result
        for item in result["上下文"]:
            tmp=list(jieba.cut(re.sub("[ ]+[ ]+"," ",item.replace("^",""))))
            # if("向" in tmp):
            #     tmp.remove("向")
            # if("与" in tmp):
            #     tmp.remove("与")
            tmp_dict=item_combine(raw_word,tmp)
            result_dict["发行方"].extend(tmp_dict["发行方"])
            result_dict["购买公司"].extend(tmp_dict["购买公司"])
    result_dict


    word_clear=["^.{0,2}[)]","[(].{0,3}$","^.*?孙公司","^.*?子公司","^.*?[与]","^.*?[向]","^.*?万元","^.*购买","之间.*?$","负责.*?$","使用.*?$","无.{0,3}$","企业金融.*?$","提前.*?$","^.*?如果","合计.*?$",".*保本.*",".*存款.*","将.{0,3}$",".*办理.*","的.{0,3}$","不存在.{0,3}$","利多多.{0,3}$","认购{0,3}$","不.{0,2}$","^.{0,1}公司","^.{0,1}情况","签署.{0,2}$",".*产品名称.*","^.{0,1}收到","支付.{0,3}$","^.*?对方为","签订.{0,3}$","按照.{0,3}$","委托.{0,3}$","保持.{0,3}$","^.{0,1}了","均.{0,2}$","开立.{0,2}$","^.{0,2}在","募集.{0,2}$"]

    for index in range(len(result_dict["发行方"])):
        tmp_item=result_dict["发行方"][index]
        for i in word_clear:
            # print(i)
            tmp_item=re.sub(i,"",tmp_item)
        result_dict["发行方"][index]=tmp_item

    for index in range(len(result_dict["购买公司"])):
        tmp_item=result_dict["购买公司"][index]
        for i in word_clear:
            # print(i)
            tmp_item=re.sub(i,"",tmp_item)
        result_dict["购买公司"][index]=tmp_item

    for key in result_dict:
        result_dict[key]=list(set(result_dict[key]))

    result_dict

    tmp_list=[]
    for key in result_dict:
        for item in result_dict[key]:
            tmp_list.append(item)

    result_df=pd.DataFrame()
    result_df["item"]=tmp_list
    result_df["label"]=result_df["item"].map(lambda x:str(grocery_word_selector.predict(x)))
    result_df["value"]=result_df["item"].map(lambda x:grocery_word_selector.predict(x).dec_values[str(grocery_word_selector.predict(x))])

    print(result_df)

    drop_list=[]
    index=-1
    for item,label,value in result_df.values:

        index+=1
        if(re.search("中国银行",item) is not None):
            continue
        flag=True
        n=-1
        for i,l,v in result_df.values:
            n+=1
            if(re.search("中国银行",i) is not None):
                continue
            if item == i:
                continue
            else:
                if (set(item).issubset(i)):
                    if(value>v+0.35 and abs(len(item)-len(i))<6):
                        drop_list.append(n)
                    else:
                        flag=False
                    break
                if set(i).issubset(item) :
                    if(value+0.35<v and (abs(len(item)-len(i))<6)):
                        drop_list.append(index)
                        flag=False
                    break
        if not flag or len(item)>28:
            drop_list.append(index)

    
    result_df=result_df.drop(list(set(drop_list))).reset_index(drop=True)
    print(result_df)
    drop_list=[]
    index=-1
    for item,label,value in result_df.values:
        index+=1
        if str(grocery.predict(item))=="理财产品":
            drop_list.append(index)
        if(re.search("中国银行",item) is not None):
            continue
        flag=True
        n=-1
        for i,l,v in result_df.values:
            n+=1
            if item == i:
                continue
            else:
                if (set(item).issubset(i)):
                    # print(item)
                    flag=False
                    break
        if not flag:
            drop_list.append(index)

    # print(drop_list)
    result_df=result_df.drop(list(set(drop_list))).reset_index(drop=True)
    result_df["sample_id"]=str(sample_id)
    return result_df

result_df=get_firm_and_counter(446,val_judge_title_result)
result_df
# for item

    # for item in  

In [228]:
# a="^潜江永安药业股份有限公司"
# firm=['公司', '银行', '分行', '支行', '中心', '业部', '商行', '建行']
# for i in firm:
#     i
i='建设银行股份有限公司洛阳分行'
grocery_word_selector.predict(i).dec_values

i='建设银行股份有限公司洛阳'
grocery_word_selector.predict(i).dec_values

i='建设银行股份有限公司洛阳分行无关联'
grocery_word_selector.predict(i).dec_values
# a="信赢    步步高升4号(B15C0873)"
# a=jieba.cut(a)
# print(list(a))
a="123456:123567"
b="1234561235678"
index=a.index(":")
a[index+1:]

set(b).issubset(a)

{'发行方': 0.8192201441132655,
 '购买公司': -0.5656924144816621,
 '上下文': -0.2535277296316181}

{'发行方': 0.8069655438003583,
 '购买公司': -0.14412617842953895,
 '上下文': -0.6628393653708312}

{'发行方': 0.4805379891207882,
 '购买公司': -0.7132887882562079,
 '上下文': 0.2327507991354057}

'123567'

False

In [42]:
a=train_outputs
a["len_firm"]=train_outputs["实际购买公司名称"].map(lambda x:len(x) if type(x) is str else 0)
np.max(a["len_firm"])
a["len_counter"]=train_outputs["产品发行方名称"].map(lambda x:len(x) if type(x) is str else 0)
np.max(a["len_counter"])

26

28

#### 1.抽取公告时间

In [None]:
# 首先针对任务抽取时间（每个时间跟每个id是一一对应的）
# 要不是取第一个时间，要不就是取最后一个时间（或者时间加一）这里可以建立一个模型预测
# base这里面直接取最后一个时间作为发布日期

CN_NUM = {
    u'〇': 0, u'一': 1, u'二': 2, u'三': 3,
    u'四': 4, u'五': 5, u'六': 6, u'七': 7,
    u'八': 8, u'九': 9, u'零': 0, u'壹': 1,
    u'贰': 2, u'叁': 3, u'肆': 4, u'伍': 5,
    u'陆': 6, u'柒': 7, u'捌': 8, u'玖': 9,
    u'貮': 2, u'两': 2,
}


def get_put_time_from_text(row):
    row = row.replace(' ', '').replace('\\n', '')
    for key in CN_NUM:
        row = row.replace(key, str(CN_NUM[key]))   
    r = row.replace("年", "-").replace("月", "-").replace("日", " ").replace("/", "-").strip()
    regex = "(\d{4}-\d{1,2}-\d{1,2})"
    r = re.findall(regex, r)
    if len(r)==0:
        return np.nan
    time_str = r[-1]
    first = time_str.split('-')[0]
    second = time_str.split('-')[1]
    last = time_str.split('-')[-1]
    second = str.zfill(second, 2)
    last = str.zfill(last, 2)
    r = '-'.join([first, second, last])
    return r

val_result = pd.DataFrame()
val_result['sample_id'] = val_df['sample_id']
val_result['predict_time'] = val_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)
test_gg = train_outputs.groupby('sample_id').apply(lambda row:list(row['公告日期'])[0]).reset_index()
test_gg.columns = ['sample_id', 'time']
val_result = pd.merge(val_result, test_gg, on='sample_id', how='left')

# 判断验证集的准确率
np.sum(val_result['predict_time'].astype(str) == val_result['time'].astype(str))/len(val_result)

val_time = val_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)
# test_time = test_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)

#### 2.抽取实际购买公司

In [None]:
# 抽取购买公司
# 前几句话出现
# 将其按照\\n 和空格切割
def get_gm(row):
    result = re.split('[\\\\n ]',row)
    for i in result:
        if '公司' in i:
            return i

val_gm = val_df.progress_apply(lambda row:get_gm(row['text']), axis=1)
# test_gm = test_df.progress_apply(lambda row:get_gm(row['text']), axis=1)

In [None]:
# 最后一部分字段采用预测好的部分，跟提取的text做交互采用双输入lstm在dense层做交互预测最后几个字段

# train_lstm_input = pd.merge(train_df, train_outputs, on='sample_id', how='left')
result_matrix
train_lstm_input = pd.merge(train_table_df, train_outputs, on='sample_id', how='left')

train_lstm_input = train_lstm_input.fillna('否')

# label_1理财类型-10  label_2资金来源-3 label_3实际购买公司和上市公司关系-3 label_4买卖方是否有关联关系-2
from sklearn.preprocessing import LabelEncoder
label_1 = LabelEncoder()
# label_2 = LabelEncoder()
# label_3 = LabelEncoder()
# label_4 = LabelEncoder()

train_data = pd.DataFrame()
tmp=pd.DataFrame()
train_data['text_1'] = train_lstm_input['理财产品名称'].astype(str) 

# train_data['text_1'] = train_lstm_input['理财产品名称'].astype(str) + '_' + train_lstm_input['产品发行方名称'].astype(str)

train_data['text_2'] = train_lstm_input['text'].astype(str)

train_lstm_input["文本类别"]="理财产品"

train_data['label_1'] = label_1.fit_transform(train_lstm_input["文本类别"])


train_data2=train_lstm_input[train_lstm_input["产品发行方名称"]!="无"].reset_index(drop=True)

train_data2["文本类别"]="发行方"

tmp['text_1']=train_data2["产品发行方名称"].astype(str)

tmp['text_2']= train_data2["text"].astype(str)

tmp['label_1']=label_1.fit_transform(train_data2["文本类别"])

train_data = pd.concat([train_data,tmp]).reset_index(drop=True)


other_columns_list=["认购金额(万元)","认购日期"]

for item in other_columns_list:

    train_data2=train_lstm_input[train_lstm_input[item]!="无"].reset_index(drop=True)

    train_data2["文本类别"]="其他"

    tmp['text_1']=train_data2[item].astype(str)

    tmp['text_2']= train_data2["text"].astype(str)

    tmp['label_1']=label_1.fit_transform(train_data2["文本类别"])

    
    train_data = pd.concat([train_data,tmp]).reset_index(drop=True)



# train_data['label_2'] = label_2.fit_transform(train_lstm_input['资金来源'])
# train_data['label_3'] = label_3.fit_transform(train_lstm_input['实际购买公司和上市公司关系'])
# train_data['label_4'] = label_4.fit_transform(train_lstm_input['买卖方是否有关联关系'])
train_data
