# 导入相关包

In [2]:
# 导入相关包
import os
import pathlib as pl
import pandas as pd
import numpy as np
import re
from io import StringIO
from datetime import datetime,timedelta
import time
from IPython.core.interactiveshell import InteractiveShell
from tqdm.autonotebook import *
import pdfplumber
tqdm.pandas()
InteractiveShell.ast_node_interactivity = "all"
sys.path.append("..")

from tgrocery import Grocery


# PDF解析原始数据 
## 加载数据并采用pdfplumber抽取PDF中的文字和表格


In [3]:
# 数据准备(train_output文件中格式有点问题，需要提前用excel或者wps打开然后另存为excel文件)
train_outputs = pd.read_excel('../datasets/train_output.xlsx')

# 获取pdf中文字和表格
def extract_pdf_content(pdf_path):
    text_list = []
    table_list = []
    with pdfplumber.open(pdf_path) as pdf:
        for index_page in np.arange(0, len(pdf.pages), 1):
            # 读取多页
            page = pdf.pages[index_page]   # 第n页的信息
            text = page.extract_text()
            text_list.append(text)
            table = page.extract_tables()
            for t in table:
                table_list.append(t)
    return text_list, table_list

def get_dir_file(path):
    '''
    输入文件夹位置，输出整理好的dataframe
    '''
    path_list = os.listdir(path)
    id_list = []
    file_path_list = []
    text_list = []
    table_list = []
    for i in tqdm(path_list):
        if '.PDF' in i:
            file_path = path + i
            id_list.append(int(i.split('.')[0]))
            file_path_list.append(file_path)
            try:
                text_temp, table_temp = extract_pdf_content(file_path)
            except Exception:
                print('此pdf无法读取')
                text_temp, table_temp = [], []
            text_list.append(text_temp)
            table_list.append(table_temp)
            
    df = pd.DataFrame()
    df['sample_id'] = id_list
    df['file_path'] = file_path_list
    df['text'] = text_list
    df['tabel'] = table_list
    df = df.sort_values('sample_id')
    return df

# 文件处理太慢，可持续化保存文件
train_path = '../datasets/train.csv'
if os.path.exists(train_path):
    train_df = pd.read_csv(train_path)
else:
    train_df = get_dir_file('datasets/train_data/')
    train_df.to_csv(train_path,index=False)
    train_df = pd.read_csv(train_path)

test_path =  '../datasets/test.csv'
if os.path.exists(test_path):
    test_df = pd.read_csv(test_path)
else:
    test_df = get_dir_file('datasets/test_data/')
    test_df.to_csv(test_path,index=False)
    test_df = pd.read_csv(test_path)

train_outputs.head(2)
train_df.head(2)
test_df.head(2)

Unnamed: 0,sample_id,认购日期,理财产品名称,产品发行方名称,理财类型,认购金额(万元),产品起息日,产品到息日,产品期限,资金来源,实际购买公司名称,实际购买公司和上市公司关系,买卖方是否有关联关系,公告日期
0,1,2019-03-27,汇聚金1号,中融国际信托有限公司,信托,10000.0,2019-03-27,2019-09-23,180天,自有资金,恒生电子股份有限公司,公司本身,否,2019-04-25
1,1,2019-03-27,招商银行步步生金8699,招商银行,银行理财产品,200.0,2019-03-27,NaT,,自有资金,恒生电子股份有限公司,公司本身,否,2019-04-25


Unnamed: 0,sample_id,file_path,text,tabel
0,1,datasets/train_data/1.PDF,[' ...,"[[['', None, None, '', None, None, '', None, N..."
1,2,datasets/train_data/2.PDF,[' ...,"[[['', None, None, '', None, None, '', None, N..."


Unnamed: 0,sample_id,file_path,text,tabel
0,11188,datasets/test_data/11188.PDF,['北京京西文化旅游股份有限公司监事会\n \n \n关于使用部分闲置募集资金购买理财产品的...,[]
1,11189,datasets/test_data/11189.PDF,['北京京西文化旅游股份有限公司 \n监事会关于使用部分自有资金购买理财产品的意见 \n根据...,[]


In [4]:
# 构造训练集验证集
train_df = train_df.sample(frac=1, random_state=1017)
val_df = train_df[:1800]
train_df = train_df[1800:]

# 数据处理
## 抽取整体数据（一个sampleid内此字段内容都相同）
## 公告时间，实际购买公司

In [5]:
# 提取公司
# train_lstm_input = pd.merge(train_df, train_outputs, on='sample_id', how='left')
# result_matrix
train_lstm_input = pd.merge(train_df, train_outputs, on='sample_id', how='left')

train_lstm_input = train_lstm_input.fillna('否')

# label_1理财类型-10  label_2资金来源-3 label_3实际购买公司和上市公司关系-3 label_4买卖方是否有关联关系-2
from sklearn.preprocessing import LabelEncoder
label_1 = LabelEncoder()
# label_2 = LabelEncoder()
# label_3 = LabelEncoder()
# label_4 = LabelEncoder()

train_data = pd.DataFrame()
tmp=pd.DataFrame()
train_data['text_1'] = train_lstm_input['理财产品名称'].astype(str) 

# train_data['text_1'] = train_lstm_input['理财产品名称'].astype(str) + '_' + train_lstm_input['产品发行方名称'].astype(str)

# train_data['text_2'] = train_lstm_input['text'].astype(str)

# train_lstm_input["文本类别"]="理财产品"

train_data['label_1'] = "理财产品"


train_data2=train_lstm_input[train_lstm_input["产品发行方名称"]!="否"].reset_index(drop=True)

# train_data2["文本类别"]="发行方"

tmp['text_1']=train_data2["产品发行方名称"].astype(str)

# tmp['text_2']= train_data2["text"].astype(str)

tmp['label_1']="发行方"

train_data = pd.concat([train_data,tmp]).reset_index(drop=True)

train_data2=train_lstm_input[train_lstm_input["实际购买公司名称"]!="否"].reset_index(drop=True)

# train_data2["文本类别"]="发行方"

tmp['text_1']=train_data2["实际购买公司名称"].astype(str)

# tmp['text_2']= train_data2["text"].astype(str)

tmp['label_1']="购买公司"

train_data = pd.concat([train_data,tmp]).reset_index(drop=True)


other_columns_list=["认购金额(万元)","认购日期","资金来源","实际购买公司和上市公司关系"]

for item in other_columns_list:

    train_data2=train_lstm_input[train_lstm_input[item]!="否"].reset_index(drop=True)

    # train_data2["文本类别"]=item

    tmp['text_1']=train_data2[item].astype(str)

    # tmp['text_2']= train_data2["text"].astype(str)

    tmp['label_1']="其它"

    
    train_data = pd.concat([train_data,tmp]).reset_index(drop=True)



# train_data['label_2'] = label_2.fit_transform(train_lstm_input['资金来源'])
# train_data['label_3'] = label_3.fit_transform(train_lstm_input['实际购买公司和上市公司关系'])
# train_data['label_4'] = label_4.fit_transform(train_lstm_input['买卖方是否有关联关系'])
train_data

train_src=[]
for text,label in train_data[["text_1","label_1"]].values:
    train_src.append([label,text])


grocery_word_selector=Grocery("wordSelector")


grocery_word_selector.train(train_src)

grocery_word_selector.save()



Unnamed: 0,text_1,label_1
0,中银保本理财-人民币按期开放理财产品,理财产品
1,中银保本理财-人民币按期开放理财产品,理财产品
2,与利率挂钩的结构性产品,理财产品
3,广发银行“薪加薪”16号XJXCKJ2578,理财产品
4,兴业银行“金雪球-优悦”保本开放式人民币理财产品(2M),理财产品
...,...,...
185903,控股参股公司,其它
185904,控股参股公司,其它
185905,控股参股公司,其它
185906,控股参股公司,其它


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\lsqlh\AppData\Local\Temp\jieba.cache
Loading model cost 1.063 seconds.
Prefix dict has been built successfully.


<tgrocery.Grocery at 0x133c4393e20>

In [8]:
import jieba.analyse

text_list=[]
for product_name in tqdm(train_outputs["产品发行方名称"].values):
    text_list.append(str(product_name))

# text_list

a_list=[]

for x in jieba.analyse.extract_tags((",").join(i for i in text_list),topK=100):#可以再添加一个参数指定输出个数
    a_list.append(x)#直接输出关键词和词频

a_list

# text_list=[]
# for product_name in tqdm(train_outputs["理财产品名称"].values):
#     text_list.append(str(product_name))

# b_list=[]

# for x in jieba.analyse.extract_tags((",").join(i for i in text_list)):#可以再添加一个参数指定输出个数
#     b_list.append(x)

# set(b_list).difference(a_list)

100%|██████████| 32818/32818 [00:00<00:00, 456291.91it/s]


['支行',
 '有限公司',
 '股份',
 '分行',
 '兴业银行',
 '银行',
 '招商银行',
 '交通银行',
 '上海浦东发展银行',
 '厦门',
 '中国银行',
 '中信银行',
 '中国民生银行',
 '中国工商银行',
 '中国农业银行',
 'nan',
 '宁波',
 '台州',
 '平安',
 '上海',
 '证券',
 '浦发银行',
 '杭州',
 '广发',
 '中国光大银行',
 '民生银行',
 '农业银行',
 '文滨',
 '深圳',
 '中国建设银行',
 '信托',
 '上海银行',
 '营业部',
 '合肥',
 '北京',
 '工商银行',
 '建设银行',
 '商业银行',
 '南京',
 '张家港',
 '无锡',
 '广州',
 '北京分行',
 '国际',
 '江苏',
 '农村',
 '开发区',
 '光大银行',
 '成都',
 '福州',
 '长沙',
 '松江',
 '椒江',
 '湖州',
 '华夏银行',
 '苏州',
 '台州市',
 '有限责任',
 '管理',
 '临海',
 '东莞',
 '浙商',
 '天水',
 '集美',
 '中山',
 '中融',
 '分理处',
 '中信证券',
 '海通',
 '汕头',
 '中国',
 '闵行',
 '农商',
 '嘉善',
 '坂田',
 '淮安',
 '海沧',
 '资产',
 '农行',
 '交行',
 '青岛',
 '国泰君安证券股份有限公司',
 '哈尔滨',
 '财通',
 '巴南',
 '绍兴',
 '惠州',
 '玉环',
 '吴江',
 '临安',
 '烟台',
 '灵桥',
 '徽商',
 '银河证券',
 '天津',
 '中行',
 '民生',
 '高新',
 '建投',
 '益阳']

#### 1.抽取公告时间

In [4]:
# 首先针对任务抽取时间（每个时间跟每个id是一一对应的）
# 要不是取第一个时间，要不就是取最后一个时间（或者时间加一）这里可以建立一个模型预测
# base这里面直接取最后一个时间作为发布日期

CN_NUM = {
    u'〇': 0, u'一': 1, u'二': 2, u'三': 3,
    u'四': 4, u'五': 5, u'六': 6, u'七': 7,
    u'八': 8, u'九': 9, u'零': 0, u'壹': 1,
    u'贰': 2, u'叁': 3, u'肆': 4, u'伍': 5,
    u'陆': 6, u'柒': 7, u'捌': 8, u'玖': 9,
    u'貮': 2, u'两': 2,
}


def get_put_time_from_text(row):
    row = row.replace(' ', '').replace('\\n', '')
    for key in CN_NUM:
        row = row.replace(key, str(CN_NUM[key]))   
    r = row.replace("年", "-").replace("月", "-").replace("日", " ").replace("/", "-").strip()
    regex = "(\d{4}-\d{1,2}-\d{1,2})"
    r = re.findall(regex, r)
    if len(r)==0:
        return np.nan
    time_str = r[-1]
    first = time_str.split('-')[0]
    second = time_str.split('-')[1]
    last = time_str.split('-')[-1]
    second = str.zfill(second, 2)
    last = str.zfill(last, 2)
    r = '-'.join([first, second, last])
    return r

val_result = pd.DataFrame()
val_result['sample_id'] = val_df['sample_id']
val_result['predict_time'] = val_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)
test_gg = train_outputs.groupby('sample_id').apply(lambda row:list(row['公告日期'])[0]).reset_index()
test_gg.columns = ['sample_id', 'time']
val_result = pd.merge(val_result, test_gg, on='sample_id', how='left')

# 判断验证集的准确率
np.sum(val_result['predict_time'].astype(str) == val_result['time'].astype(str))/len(val_result)

val_time = val_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)
# test_time = test_df.progress_apply(lambda row: get_put_time_from_text(row['text']), axis=1)

100%|██████████| 1800/1800 [00:00<00:00, 4269.77it/s]


0.4583333333333333

100%|██████████| 1800/1800 [00:00<00:00, 4977.69it/s]


#### 2.抽取实际购买公司

In [5]:
# 抽取购买公司
# 前几句话出现
# 将其按照\\n 和空格切割
def get_gm(row):
    result = re.split('[\\\\n ]',row)
    for i in result:
        if '公司' in i:
            return i

val_gm = val_df.progress_apply(lambda row:get_gm(row['text']), axis=1)
# test_gm = test_df.progress_apply(lambda row:get_gm(row['text']), axis=1)

100%|██████████| 1800/1800 [00:00<00:00, 4470.96it/s]


In [None]:
# 最后一部分字段采用预测好的部分，跟提取的text做交互采用双输入lstm在dense层做交互预测最后几个字段

# train_lstm_input = pd.merge(train_df, train_outputs, on='sample_id', how='left')
result_matrix
train_lstm_input = pd.merge(train_table_df, train_outputs, on='sample_id', how='left')

train_lstm_input = train_lstm_input.fillna('否')

# label_1理财类型-10  label_2资金来源-3 label_3实际购买公司和上市公司关系-3 label_4买卖方是否有关联关系-2
from sklearn.preprocessing import LabelEncoder
label_1 = LabelEncoder()
# label_2 = LabelEncoder()
# label_3 = LabelEncoder()
# label_4 = LabelEncoder()

train_data = pd.DataFrame()
tmp=pd.DataFrame()
train_data['text_1'] = train_lstm_input['理财产品名称'].astype(str) 

# train_data['text_1'] = train_lstm_input['理财产品名称'].astype(str) + '_' + train_lstm_input['产品发行方名称'].astype(str)

train_data['text_2'] = train_lstm_input['text'].astype(str)

train_lstm_input["文本类别"]="理财产品"

train_data['label_1'] = label_1.fit_transform(train_lstm_input["文本类别"])


train_data2=train_lstm_input[train_lstm_input["产品发行方名称"]!="无"].reset_index(drop=True)

train_data2["文本类别"]="发行方"

tmp['text_1']=train_data2["产品发行方名称"].astype(str)

tmp['text_2']= train_data2["text"].astype(str)

tmp['label_1']=label_1.fit_transform(train_data2["文本类别"])

train_data = pd.concat([train_data,tmp]).reset_index(drop=True)


other_columns_list=["认购金额(万元)","认购日期"]

for item in other_columns_list:

    train_data2=train_lstm_input[train_lstm_input[item]!="无"].reset_index(drop=True)

    train_data2["文本类别"]="其他"

    tmp['text_1']=train_data2[item].astype(str)

    tmp['text_2']= train_data2["text"].astype(str)

    tmp['label_1']=label_1.fit_transform(train_data2["文本类别"])

    
    train_data = pd.concat([train_data,tmp]).reset_index(drop=True)



# train_data['label_2'] = label_2.fit_transform(train_lstm_input['资金来源'])
# train_data['label_3'] = label_3.fit_transform(train_lstm_input['实际购买公司和上市公司关系'])
# train_data['label_4'] = label_4.fit_transform(train_lstm_input['买卖方是否有关联关系'])
train_data


In [51]:
# 导入相关库
import os
import pandas as pd
from tqdm.autonotebook import *
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import accuracy_score
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import StratifiedKFold
from gensim.models import FastText, Word2Vec
import re
from keras.layers import *
from keras.models import *
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.callbacks import *
from keras.layers.advanced_activations import LeakyReLU, PReLU
import keras.backend as K
from keras.optimizers import *
from keras.utils import to_categorical
import tensorflow as tf
import random as rn
import gc
import logging
import gensim
import jieba
tqdm.pandas()
os.environ['PYTHONHASHSEED'] = '0'
# 显卡使用（如没显卡需要注释掉）
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
np.random.seed(1024)
rn.seed(1024)
tf.random.set_seed(1024)

In [52]:
train_data['text_1'] = train_data['text_1'].progress_apply(lambda row:' '.join(jieba.lcut(str(row))))
train_data['text_2'] = train_data['text_2'].progress_apply(lambda row:' '.join(jieba.lcut(str(row))))
train_data.head(5)

100%|██████████| 33576/33576 [00:01<00:00, 24915.15it/s]
100%|██████████| 33576/33576 [02:29<00:00, 224.39it/s]


Unnamed: 0,text_1,text_2,label_1
0,结构性 存款,"联动 联动 银行 保证 收益 10 , 0004.35% 91 天 2017 / 8 / 2...",0
1,单位 大额 存单,兴业银行 股份 有限公司 兴业银行 股份 有限公司 兴业银行 股份 有限公司 广东 华兴 银...,0
2,“ 乾元 - 福顺盈 ” 开放式 资产 组合型 理财产品,兴业银行 股份 有限公司 兴业银行 股份 有限公司 兴业银行 股份 有限公司 广东 华兴 银...,0
3,兴业银行 企业 金融 结构性 存款 ( 封闭式 ),兴业银行 股份 有限公司 兴业银行 股份 有限公司 兴业银行 股份 有限公司 广东 华兴 银...,0
4,兴证资 管鑫利 5 号 集合 资产 管理 计划,12341234 华懋 科技 华懋 科技 华懋 科技 华懋 科技 中国银行 股份 有限公司 ...,0


In [53]:
### Tokenizer 序列化文本
def set_tokenizer(docs, split_char=' ', max_len=100):
    '''
    输入
    docs:文本列表
    split_char:按什么字符切割
    max_len:截取的最大长度
    
    输出
    X:序列化后的数据
    word_index:文本和数字对应的索引
    '''
    tokenizer = Tokenizer(lower=False, char_level=False, split=split_char)
    tokenizer.fit_on_texts(docs)
    X = tokenizer.texts_to_sequences(docs)
    maxlen = max_len
    X = pad_sequences(X, maxlen=maxlen, value=0)
    word_index=tokenizer.word_index
    return X, word_index, tokenizer

### 做embedding 这里采用word2vec 可以换成其他例如（glove词向量）
def trian_save_word2vec(docs, embed_size=300, save_name='w2v.txt', split_char=' '):
    '''
    输入
    docs:输入的文本列表
    embed_size:embed长度
    save_name:保存的word2vec位置
    
    输出
    w2v:返回的模型
    '''
    input_docs = []
    for i in docs:
        input_docs.append(i.split(split_char))
    logging.basicConfig(
    format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
    w2v = Word2Vec(input_docs, size=embed_size, sg=1, window=8, seed=1017, workers=24, min_count=1, iter=10)
    w2v.save(save_name)
    print("w2v model done")
    return w2v

# 得到embedding矩阵
def get_embedding_matrix(word_index, embed_size=300, Emed_path="w2v_300.txt"):
    embeddings_index = Word2Vec.load(Emed_path)
    nb_words = len(word_index)+1
    embedding_matrix = np.zeros((nb_words, embed_size))
    count = 0
    for word, i in tqdm(word_index.items()):
        if i >= nb_words:
            continue
        try:
            embedding_vector = embeddings_index[word]
        except:
            embedding_vector = np.zeros(embed_size)
            count += 1
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector  
    print("null cnt",count)
    return embedding_matrix

# 得到fasttext矩阵
def load_fasttext(word_index, path):  
    count=0
    null_list=[]
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(path, encoding='utf-8') if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words =  len(word_index)+1
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: 
            embedding_matrix[i] = embedding_vector
        else:
            null_list.append(word)
            count+=1
    print("null cnt:",count)
    return embedding_matrix

def get_embedding_matrix_txt(word_index,embed_size=200,Emed_path="w2v_300.txt"):
    embeddings_index = gensim.models.KeyedVectors.load_word2vec_format(
        Emed_path, binary=False)
    nb_words = len(word_index)+1
    embedding_matrix = np.zeros((nb_words, embed_size))
    count = 0
    for word, i in tqdm(word_index.items()):
        if i >= nb_words:
            continue
        try:
            embedding_vector = embeddings_index[word]
        except:
            embedding_vector = np.zeros(embed_size)
            count += 1
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    print("null cnt",count)
    return embedding_matrix

In [54]:
text_1_list = np.unique(train_data['text_1'])
text_3_list = np.unique(train_data['text_2'])

print('开始序列化')
x1, index_1, token_1 = set_tokenizer(train_data['text_1'], split_char=' ', max_len=30)
x3, index_3, token_3 = set_tokenizer(train_data['text_2'], split_char=' ', max_len=600)
print('序列化完成')
gc.collect()

trian_save_word2vec(text_1_list, save_name='../models/w2v_300_1.txt', split_char=' ')
gc.collect()
trian_save_word2vec(text_3_list, save_name='../models/w2v_300_3.txt', split_char=' ')
gc.collect()

# 得到emb矩阵
emb1 = get_embedding_matrix(index_1, Emed_path='../models/w2v_300_1.txt')
emb3 = get_embedding_matrix(index_3, Emed_path='../models/w2v_300_3.txt')
gc.collect()

开始序列化
序列化完成


0

-05 17:19:12,808:INFO:worker thread finished; awaiting finish of 16 more threads
2020-09-05 17:19:12,810:INFO:worker thread finished; awaiting finish of 15 more threads
2020-09-05 17:19:12,812:INFO:worker thread finished; awaiting finish of 14 more threads
2020-09-05 17:19:12,815:INFO:worker thread finished; awaiting finish of 13 more threads
2020-09-05 17:19:12,819:INFO:worker thread finished; awaiting finish of 12 more threads
2020-09-05 17:19:12,820:INFO:worker thread finished; awaiting finish of 11 more threads
2020-09-05 17:19:12,822:INFO:worker thread finished; awaiting finish of 10 more threads
2020-09-05 17:19:12,824:INFO:worker thread finished; awaiting finish of 9 more threads
2020-09-05 17:19:12,824:INFO:worker thread finished; awaiting finish of 8 more threads
2020-09-05 17:19:12,825:INFO:worker thread finished; awaiting finish of 7 more threads
2020-09-05 17:19:12,826:INFO:worker thread finished; awaiting finish of 6 more threads
2020-09-05 17:19:12,827:INFO:worker thread 

<gensim.models.word2vec.Word2Vec at 0x1ca6a1cf730>

0

 threads
2020-09-05 17:19:17,405:INFO:worker thread finished; awaiting finish of 17 more threads
2020-09-05 17:19:17,407:INFO:worker thread finished; awaiting finish of 16 more threads
2020-09-05 17:19:17,415:INFO:worker thread finished; awaiting finish of 15 more threads
2020-09-05 17:19:17,417:INFO:worker thread finished; awaiting finish of 14 more threads
2020-09-05 17:19:17,419:INFO:worker thread finished; awaiting finish of 13 more threads
2020-09-05 17:19:17,421:INFO:worker thread finished; awaiting finish of 12 more threads
2020-09-05 17:19:17,423:INFO:worker thread finished; awaiting finish of 11 more threads
2020-09-05 17:19:17,426:INFO:worker thread finished; awaiting finish of 10 more threads
2020-09-05 17:19:17,429:INFO:worker thread finished; awaiting finish of 9 more threads
2020-09-05 17:19:17,439:INFO:worker thread finished; awaiting finish of 8 more threads
2020-09-05 17:19:17,440:INFO:worker thread finished; awaiting finish of 7 more threads
2020-09-05 17:19:17,446:IN

<gensim.models.word2vec.Word2Vec at 0x1ca0fcd1190>

0

2020-09-05 17:19:22,753:INFO:loading Word2Vec object from ../models/w2v_300_1.txt
2020-09-05 17:19:22,818:INFO:loading wv recursively from ../models/w2v_300_1.txt.wv.* with mmap=None
2020-09-05 17:19:22,819:INFO:setting ignored attribute vectors_norm to None
2020-09-05 17:19:22,819:INFO:loading vocabulary recursively from ../models/w2v_300_1.txt.vocabulary.* with mmap=None
2020-09-05 17:19:22,820:INFO:loading trainables recursively from ../models/w2v_300_1.txt.trainables.* with mmap=None
2020-09-05 17:19:22,820:INFO:setting ignored attribute cum_table to None
2020-09-05 17:19:22,821:INFO:loaded ../models/w2v_300_1.txt
100%|██████████| 2886/2886 [00:00<00:00, 74076.01it/s]
2020-09-05 17:19:22,871:INFO:loading Word2Vec object from ../models/w2v_300_3.txt
null cnt 347
2020-09-05 17:19:23,056:INFO:loading wv recursively from ../models/w2v_300_3.txt.wv.* with mmap=None
2020-09-05 17:19:23,057:INFO:setting ignored attribute vectors_norm to None
2020-09-05 17:19:23,058:INFO:loading vocabulary

9

In [58]:
from keras.initializers import *

def model_conv(emb1, emb3):
    '''
    注意这个inputs
    seq1、seq2分别是两个输入
    是否做emb可选可不选，
    这个就是我们之前训练已经得到的用于embedding的（embedding_matrix1， embedding_matrix2）
    '''
    K.clear_session()

    emb_layer_1 = Embedding(
        input_dim=emb1.shape[0],
        output_dim=emb1.shape[1],
        weights=[emb1],
        input_length=30,
        trainable=False
    )
    
    emb_layer_3 = Embedding(
        input_dim=emb3.shape[0],
        output_dim=emb3.shape[1],
        weights=[emb3],
        input_length=600,
        trainable=False
    )
    
    
    seq1 = Input(shape=(30,))
    seq3 = Input(shape=(600,))    
    
    x1 = emb_layer_1(seq1)
    x3 = emb_layer_3(seq3)
    
    sdrop=SpatialDropout1D(rate=0.2)

    x1 = sdrop(x1)
    x3 = sdrop(x3)
     
    x = Dropout(0.2)(Bidirectional(GRU(128, return_sequences=True))(x1))
    semantic = TimeDistributed(Dense(100, activation="tanh"))(x)
    merged_1 = Lambda(lambda x: K.max(x, axis=1), output_shape=(100,))(semantic)
    
    x = Dropout(0.2)(Bidirectional(GRU(128, return_sequences=True))(x3))
    semantic = TimeDistributed(Dense(100, activation="tanh"))(x)
    merged_3 = Lambda(lambda x: K.max(x, axis=1), output_shape=(100,))(semantic)
    
    
    x = Multiply()([merged_1, merged_3])
    
    x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(x))))
    x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
    pred_1 = Dense(3, activation='softmax')(x)
    # pred_2 = Dense(3, activation='softmax')(x)
    # pred_3 = Dense(3, activation='softmax')(x)
    # pred_4 = Dense(2, activation='softmax')(x)
    model = Model(inputs=[seq1, seq3], outputs=[pred_1])
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=0.0001),metrics=["accuracy"])
    return model
gc.collect()

81006

In [59]:
model = model_conv(emb1, emb3)
model.summary()
l1 = to_categorical(train_data['label_1'], 3)
# l2 = to_categorical(train_data['label_2'], 3)
# l3 = to_categorical(train_data['label_3'], 3)
# l4 = to_categorical(train_data['label_4'], 2)
model.fit([x1, x3],[l1], batch_size=256, epochs=8, verbose=1, shuffle=True)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 30)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 600)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 30, 300)      866100      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 600, 300)     1753200     input_2[0][0]                    
_______________________________________________________________________________________

AttributeError: in user code:

    C:\Users\lsqlh\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:806 train_function  *
        return step_function(self, iterator)
    C:\Users\lsqlh\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\lsqlh\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\lsqlh\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\lsqlh\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\lsqlh\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:789 run_step  **
        outputs = model.train_step(data)
    C:\Users\lsqlh\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:759 train_step
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    C:\Users\lsqlh\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\compile_utils.py:388 update_state
        self.build(y_pred, y_true)
    C:\Users\lsqlh\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\compile_utils.py:318 build
        self._metrics = nest.map_structure_up_to(y_pred, self._get_metric_objects,
    C:\Users\lsqlh\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\util\nest.py:1135 map_structure_up_to
        return map_structure_with_tuple_paths_up_to(
    C:\Users\lsqlh\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\util\nest.py:1234 map_structure_with_tuple_paths_up_to
        results = [func(*args, **kwargs) for args in zip(flat_path_list,
    C:\Users\lsqlh\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\util\nest.py:1234 <listcomp>
        results = [func(*args, **kwargs) for args in zip(flat_path_list,
    C:\Users\lsqlh\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\util\nest.py:1137 <lambda>
        lambda _, *values: func(*values),  # Discards the path arg.
    C:\Users\lsqlh\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\compile_utils.py:419 _get_metric_objects
        return [self._get_metric_object(m, y_t, y_p) for m in metrics]
    C:\Users\lsqlh\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\compile_utils.py:419 <listcomp>
        return [self._get_metric_object(m, y_t, y_p) for m in metrics]
    C:\Users\lsqlh\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\compile_utils.py:440 _get_metric_object
        y_t_rank = len(y_t.shape.as_list())

    AttributeError: 'tuple' object has no attribute 'shape'


In [None]:
#保存权重
model.save_weights('models/lstm_model.h5')

In [None]:
# 预测验证集
val_result_for_pred = pd.merge(val_result, val_df, on='sample_id', how='left')
val_result_for_pred['text_1'] = val_result_for_pred['理财产品名称'].astype(str) + '_' + val_result_for_pred['产品发行方名称'].astype(str)
val_result_for_pred['text_2'] = val_result_for_pred['text'].astype(str)

val_result_for_pred['text_1'] = val_result_for_pred['text_1'].progress_apply(lambda row:' '.join(jieba.lcut(str(row))))
val_result_for_pred['text_2'] = val_result_for_pred['text_2'].progress_apply(lambda row:' '.join(jieba.lcut(str(row))))

x1 = token_1.texts_to_sequences(val_result_for_pred['text_1'])
x1 = pad_sequences(x1, maxlen=30, value=0)
x3 = token_3.texts_to_sequences(val_result_for_pred['text_2'])
x3 = pad_sequences(x3, maxlen=600, value=0)
pred_result = model.predict([x1, x3], batch_size=1024, verbose=1)
pred_1 = label_1.inverse_transform(np.argmax(pred_result[0], axis=1))
pred_2 = label_2.inverse_transform(np.argmax(pred_result[1], axis=1))
pred_3 = label_3.inverse_transform(np.argmax(pred_result[2], axis=1))
pred_4 = label_4.inverse_transform(np.argmax(pred_result[3], axis=1))


In [None]:
val_result['理财类型'] = pred_1
val_result['资金来源'] = pred_2
val_result['实际购买公司和上市公司关系'] = pred_3
val_result['买卖方是否有关联关系'] = pred_4