In [1]:
import tensorflow as tf
from tensorflow.python.ops import rnn,rnn_cell

  from ._conv import register_converters as _register_converters


# 模型 双向LSTM + 评论特征向量attention

In [2]:
class HUAPA_ONE_LAYER(object):
    """Review classifier: bidirectional LSTM encoder with feature-conditioned attention.

    The constructor only creates the placeholders, the trainable parameters and
    the embedding lookup. Call ``build_model()`` afterwards to add the encoder,
    attention, loss and metric ops to the current default graph.
    """

    def __init__(self,max_len,class_num,embedding_file,embedding_dim,hidden_size,feature_dim,attention_dim,train,regularizer):
        """Store hyper-parameters and create inputs, weights and the embedding lookup.

        Args:
            max_len: number of word slots per review (second dim of ``input_x``).
            class_num: number of target classes.
            embedding_file: pre-trained embedding matrix; it is wrapped in a
                ``tf.constant`` and therefore not trained.
            embedding_dim: width of one word embedding.
            hidden_size: LSTM hidden size per direction.
            feature_dim: width of the per-review feature vector.
            attention_dim: width of the attention projection space.
            train: training-mode flag; only stored, no op built by this class reads it.
            regularizer: callable mapping a weight tensor to a penalty tensor,
                or None to disable regularization.
        """
        # Number of word slots per review
        self.max_len = max_len
        # Number of review classes
        self.class_num = class_num
        # Pre-trained word embedding matrix
        self.embedding_file = embedding_file
        # Word embedding width
        self.embedding_dim = embedding_dim
        # Hidden state width (per LSTM direction)
        self.hidden_size = hidden_size
        # Width of the review feature vector
        self.feature_dim = feature_dim
        # Width of the attention projection
        self.attention_dim = attention_dim
        # Training-mode flag (stored only)
        self.train = train
        # Regularizer callable, or None
        self.regularizer= regularizer

        with tf.name_scope('input'):
            # Per-review feature vector: [batch, feature_dim]
            self.doc_feature = tf.placeholder(tf.float32,[None,self.feature_dim],name='doc_feature')
            # Word indices: [batch, max_len]
            self.input_x = tf.placeholder(tf.int32,[None,self.max_len],name='input_x')
            # One-hot labels: [batch, class_num]
            self.input_y = tf.placeholder(tf.float32,[None,self.class_num],name='input_y')
            # Number of valid words in each review: [batch]
            self.doc_len = tf.placeholder(tf.int32,[None],name='doc_len')

        # NOTE: variables get auto-generated names, so the creation order below
        # determines the names stored in checkpoints; keep it stable.
        with tf.name_scope('weights'):
            self.weights = {
                # Output (classification) layer
                'softmax' : tf.Variable(tf.random_uniform([2*hidden_size,self.class_num],-0.01,0.01)),
                # Attention layer parameters
                'wh' : tf.Variable(tf.random_uniform([2*hidden_size,self.attention_dim],-0.01,0.01)),
                'v' : tf.Variable(tf.random_uniform([self.attention_dim,1],-0.01,0.01)),
                # Projection of the review feature vector into attention space
                'wf' : tf.Variable(tf.random_uniform([self.feature_dim,self.attention_dim],-0.01,0.01))
            }

        with tf.name_scope('biases'):
            self.biases = {
                'softmax' : tf.Variable(tf.random_uniform([self.class_num],-0.01,0.01)),
                'wh' : tf.Variable(tf.random_uniform([self.attention_dim],-0.01,0.01))
            }

        with tf.name_scope('embedding'):
            # Embedding matrix held as a constant, so it is never updated by training
            self.word_embedding = tf.constant(self.embedding_file,name='word_embedding',dtype=tf.float32)
            # Result of the lookup on input_x: [batch, max_len, embedding width]
            self.x = tf.nn.embedding_lookup(self.word_embedding,self.input_x)

    def softmax(self, inputs, length, max_length):
        """Length-masked softmax over the last axis of a rank-3 tensor.

        Args:
            inputs: attention scores; the caller passes [batch, 1, max_length].
            length: number of valid positions per row; flattened to 1-D.
            max_length: size of the masked (last) axis.

        Returns:
            Tensor shaped like ``inputs``: exp(inputs) with positions at or
            beyond ``length`` zeroed, divided by the row sum plus 1e-9.
        """
        inputs = tf.cast(inputs, tf.float32)
        inputs = tf.exp(inputs)
        length = tf.reshape(length, [-1])
        mask = tf.reshape(tf.cast(tf.sequence_mask(length, max_length), tf.float32), tf.shape(inputs))
        inputs *= mask
        # `axis`/`keepdims` replace the deprecated `reduction_indices`/`keep_dims`.
        # The epsilon keeps the division finite when a row is fully masked.
        _sum = tf.reduce_sum(inputs, axis=2, keepdims=True) + 1e-9
        return inputs / _sum

    def feature_attention(self):
        """Add encoder, attention, classifier, loss and metric ops to the graph.

        Sets ``self.doc``, ``self.scores``, ``self.predictions``, ``self.loss``,
        ``self.correct_num``, ``self.accuracy`` and ``self.mse``.
        """
        # inputs: [batch, max_len, embedding_dim]
        inputs = tf.reshape(self.x, [-1, self.max_len, self.embedding_dim])
        # Bidirectional LSTM encoder
        with tf.name_scope('word_encode'):
            outputs, state = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=tf.nn.rnn_cell.LSTMCell(self.hidden_size, forget_bias=1.0),
                cell_bw=tf.nn.rnn_cell.LSTMCell(self.hidden_size, forget_bias=1.0),
                inputs=inputs,
                sequence_length=self.doc_len,
                dtype=tf.float32,
                scope='word'
            )
            # Concatenate forward/backward outputs: [batch, max_len, 2 * hidden_size]
            outputs = tf.concat(outputs,2)

        # Attention layer
        batch_size = tf.shape(outputs)[0]
        with tf.name_scope('word_attention'):
            # Flatten time steps: [batch * max_len, 2 * hidden_size]
            output = tf.reshape(outputs, [-1, 2 * self.hidden_size])
            u = tf.matmul(output, self.weights['wh']) + self.biases['wh']
            # u: [batch, max_len, attention_dim]
            u = tf.reshape(u, [-1, self.max_len, self.attention_dim])
            # Add the projected review feature to every time step (broadcast over max_len)
            u += tf.matmul(self.doc_feature, self.weights['wf'])[:,None,:]
            u = tf.tanh(u)
            u = tf.reshape(u, [-1, self.attention_dim])
            # alpha: [batch, 1, max_len]
            alpha = tf.reshape(tf.matmul(u, self.weights['v']),
                               [batch_size, 1, self.max_len])
            alpha = self.softmax(alpha, self.doc_len, self.max_len)
            # Attention-weighted sum of encoder outputs: [batch, 1, 2 * hidden_size]
            outputs = tf.matmul(alpha, outputs)

        with tf.name_scope('softmax'):
            self.doc = tf.reshape(outputs, [batch_size, 2 * self.hidden_size],name='doc_vectors')
            self.scores = tf.matmul(self.doc, self.weights['softmax']) + self.biases['softmax']
            # Register the penalty on the output weights only when a regularizer
            # is configured and it actually yields a tensor.
            if self.regularizer is not None:
                penalty = self.regularizer(self.weights['softmax'])
                if penalty is not None:
                    tf.add_to_collection('losses', penalty)
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        with tf.name_scope("loss"):
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.scores, labels=tf.argmax(self.input_y,1))
            self.loss = tf.reduce_mean(losses)
            # tf.add_n rejects an empty list, so add penalties only if any exist.
            reg_losses = tf.get_collection('losses')
            if reg_losses:
                self.loss = self.loss + tf.add_n(reg_losses)

        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.correct_num = tf.reduce_sum(tf.cast(correct_predictions, dtype=tf.int32),name='correct_num')
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

        with tf.name_scope('metrics'):
            # Sum (not mean) of squared class-index errors; callers divide by the sample count.
            self.mse = tf.reduce_sum(tf.square(self.predictions - tf.argmax(self.input_y, 1)),name='mse')

    def build_model(self):
        """Build the full graph (currently just ``feature_attention``)."""
        self.feature_attention()

# 工具类

In [3]:
# 词嵌入向量
def load_embedding(embedding_file_path, corpus, embedding_dim):
    """Load the pre-trained vectors of the words that occur in ``corpus``.

    Args:
        embedding_file_path: text file with one ``word v1 ... vN`` entry per
            line, whitespace separated.
        corpus: iterable of document strings; words are split on whitespace
            and lower-cased before being collected.
        embedding_dim: expected number of vector components per word.

    Returns:
        (embedding_matrix, words_dict). Row 0 of the matrix is an all-zero
        vector mapped to the ``'$EOF$'`` key; every accepted word maps to
        the index of its row.
    """
    # Lower-cased vocabulary of the corpus. Words in the embedding file are
    # matched against it exactly as written there.
    wordset = {w.lower() for line in corpus for w in line.strip().split()}

    words_dict = {'$EOF$': 0}  # index 0 is the all-zero EOF/padding vector
    word_embedding = [np.zeros(embedding_dim)]
    with open(embedding_file_path, 'r') as f:
        for line in f:
            fields = line.strip().split()
            # Keep only well-formed entries (word + embedding_dim values).
            # This drops header lines, blank lines and truncated or over-long
            # rows that would otherwise make the matrix ragged.
            if len(fields) != embedding_dim + 1:
                continue
            word = fields[0]
            if word not in wordset:
                continue
            word_embedding.append([float(s) for s in fields[1:]])
            words_dict[word] = len(word_embedding) - 1
    return np.asarray(word_embedding), words_dict

#将文本转化为词索引表示
def fit_transform(x_text,word_dict,max_len):
    """Encode documents as fixed-width rows of word indices.

    Each document is split on whitespace; words missing from ``word_dict``
    are dropped, the rest are mapped to their index and truncated to
    ``max_len`` entries. Unused trailing slots stay 0.

    Returns:
        (index_matrix, lengths) as numpy arrays, where ``lengths[k]`` is the
        number of slots actually filled for document ``k``.
    """
    encoded_docs = []
    lengths = []
    for doc in x_text:
        known_ids = [word_dict[token] for token in doc.strip().split() if token in word_dict]
        kept = known_ids[:max_len]
        row = np.zeros((max_len),dtype=int)
        for pos, word_id in enumerate(kept):
            row[pos] = word_id
        encoded_docs.append(row)
        # Number of in-vocabulary words written for this document
        lengths.append(len(kept))
    return np.asarray(encoded_docs),np.asarray(lengths)

class Dataset(object):
    """Reviews, labels and per-review feature vectors loaded from two aligned text files.

    Line k of ``data_file`` and line k of ``feature_file`` are treated as
    describing the same review (they are sliced with identical indices).
    """

    def __init__(self,data_file,feature_file):
        """Read labels/texts from ``data_file`` and features from ``feature_file``.

        ``data_file`` is tab separated: column 7 holds an integer label (stored
        shifted by +1) and column 4 the review text (stored lower-cased).
        ``feature_file`` is tab separated: column 1 holds a comma-separated
        list of numbers wrapped in one leading and one trailing character.
        """
        self.t_feature = []
        self.t_label = []
        self.t_docs = []
        with open(data_file,'r') as f:
            for line in f:
                fields = line.strip().split('\t')
                # presumably raw labels are -1/0/1 so +1 gives class ids 0..2 — TODO confirm
                self.t_label.append(int(fields[7])+1)
                self.t_docs.append(fields[4].lower())
        with open(feature_file,'r') as f:
            for line in f:
                fields = line.strip().split('\t')
                # Drop the surrounding bracket characters, then parse the
                # values as numbers instead of keeping them as strings.
                raw_values = fields[1][1:-1].split(',')
                values = [float(v) for v in raw_values if v.strip()]
                self.t_feature.append(np.asarray(values, dtype=np.float32))
        self.data_size = len(self.t_docs)

    def getAllData(self,wordsdict,max_len,n_class):
        """Encode the whole dataset at once (used to produce document vectors).

        Sets ``all_labels`` (one-hot), ``all_docs``, ``all_docs_len`` and
        ``all_features``.
        """
        self.all_labels = np.eye(n_class,dtype=np.float32)[self.t_label]
        self.all_docs ,self.all_docs_len = fit_transform(self.t_docs,wordsdict,max_len)
        self.all_features = self.t_feature

    def genBatch(self,wordsdict,batch_size,max_len,n_class):
        """Pre-compute all batches in file order.

        Sets ``epoch`` (the number of batches, despite its name) and the
        per-batch lists ``features``, ``labels``, ``docs`` and ``doc_len``.
        """
        # Ceiling division: the last batch may be smaller than batch_size.
        self.epoch = (len(self.t_docs) + batch_size - 1) // batch_size
        self.labels = []
        self.docs = []
        self.doc_len = []
        self.features = []

        for i in range(self.epoch):
            start, end = i*batch_size, (i+1)*batch_size
            self.features.append(np.asarray(self.t_feature[start:end]))
            self.labels.append(np.eye(n_class,dtype=np.float32)[self.t_label[start:end]])
            b_docs,b_doc_lens = fit_transform(self.t_docs[start:end],wordsdict,max_len)
            self.docs.append(b_docs)
            self.doc_len.append(b_doc_lens)

    def batch_iter(self,wordsdict,n_class,batch_size,num_epochs,max_len,shuffle=True):
        """Yield training batches for ``num_epochs`` passes over the data.

        Each yielded item is a ``zip`` of (feature, word_indices, one_hot_label,
        doc_len) tuples, one per example.

        Side effect: once iteration starts, ``t_label``, ``t_docs`` and
        ``t_feature`` are converted to numpy arrays and, when ``shuffle`` is
        true, re-ordered in place (with one shared permutation) every epoch.
        """
        data_size = len(self.t_docs)
        num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
        self.t_label = np.asarray(self.t_label)
        self.t_docs = np.asarray(self.t_docs)
        self.t_feature = np.asarray(self.t_feature)

        for epoch in range(num_epochs):
            if shuffle:
                # One permutation for all three arrays keeps examples aligned.
                shuffle_indices = np.random.permutation(np.arange(data_size))
                self.t_label = self.t_label[shuffle_indices]
                self.t_docs = self.t_docs[shuffle_indices]
                self.t_feature = self.t_feature[shuffle_indices]

            for batch_num in range(num_batches_per_epoch):
                start = batch_num * batch_size
                end = min((batch_num + 1) * batch_size, data_size)
                label = np.eye(n_class, dtype=np.float32)[self.t_label[start:end]]
                features = self.t_feature[start:end]
                b_docs,b_doc_lens = fit_transform(self.t_docs[start:end],wordsdict,max_len)
                batch_data = zip(features, b_docs, label, b_doc_lens)
                yield batch_data

# 划分训练集,验证集，测试集

In [4]:
#总共数据量
n = 15433
#训练集60% 验证集20% 测试集20%
n_train = (int)(n * 0.6)
n_validate = n_train + (int)(n * 0.2)
with open('./Video_text.txt') as f:
    l = f.readlines()
    with open('./Video_text_train.txt','w') as a:
        a.writelines(l[0:n_train])
    with open('./Video_text_validate.txt','w') as b:
        b.writelines(l[n_train:n_validate])
    with open('./Video_text_test.txt','w') as c:
        c.writelines(l[n_validate:])
with open('./Video_simple_feature.txt') as f:
    l = f.readlines()
    with open('./Video_simple_feature_train.txt','w') as a:
        a.writelines(l[0:n_train])
    with open('./Video_simple_feature_validate.txt','w') as b:
        b.writelines(l[n_train:n_validate])
    with open('./Video_simple_feature_test.txt','w') as c:
        c.writelines(l[n_validate:])

# 训练

In [5]:
import os,datetime,time, pickle
import numpy as np

In [6]:
# Data loading params
# Command-line style flags, declared as (definer, name, default, help) rows
# and registered in one loop.
_FLAG_SPECS = [
    # Data loading params
    (tf.flags.DEFINE_integer, "n_class", 3, "Numbers of class"),
    # Model hyperparameters
    (tf.flags.DEFINE_integer, "embedding_dim", 200, "Dimensionality of character embedding"),
    (tf.flags.DEFINE_integer, "hidden_size", 50, "hidden_size of rnn"),
    (tf.flags.DEFINE_integer, "max_len", 500, "the max number of words in a review"),
    (tf.flags.DEFINE_float, "lr", 0.005, "Learning rate"),
    (tf.flags.DEFINE_float, "rr", 0.01, 'regulariztion rate'),
    (tf.flags.DEFINE_integer, 'attention_dim', 100, 'Dimensionality of attention layer'),
    # Training parameters
    (tf.flags.DEFINE_integer, "batch_size", 100, "Batch Size"),
    (tf.flags.DEFINE_integer, "num_epochs", 1000, "Number of training epochs"),
    (tf.flags.DEFINE_integer, "evaluate_every", 25, "Evaluate model on dev set after this many steps"),
    # Misc parameters
    (tf.flags.DEFINE_boolean, "allow_soft_placement", True, "Allow device soft device placement"),
    (tf.flags.DEFINE_boolean, "log_device_placement", False, "Log placement of ops on devices"),
]
for _define, _flag_name, _flag_default, _flag_help in _FLAG_SPECS:
    _define(_flag_name, _flag_default, _flag_help)

In [7]:
# Print every flag's default value, then rebind FLAGS to the underlying
# name -> flag-object dict (the rest of the cell reads FLAGS[name].default).
FLAGS = tf.flags.FLAGS
FLAGS.flag_values_dict()
_flag_objects = FLAGS.__flags
print("\nParameters:")
for attr in sorted(_flag_objects):
    print("{}={}".format(attr.upper(), _flag_objects[attr].default))
print("")
FLAGS = _flag_objects

# Load the three dataset splits
print('Loading data...')
trainset = Dataset('./Video_text_train.txt','./Video_simple_feature_train.txt')
devset = Dataset('./Video_text_validate.txt','./Video_simple_feature_validate.txt')
testset = Dataset('./Video_text_test.txt','./Video_simple_feature_test.txt')

embeddingpath = './yelp-2014-embedding-200d.txt'
# Plain list concatenation: load_embedding only iterates over the documents,
# and a numpy string array would pad every review to the longest one.
alldata = list(trainset.t_docs) + list(devset.t_docs) + list(testset.t_docs)
embeddingfile, wordsdict = load_embedding(embeddingpath,alldata, FLAGS['embedding_dim'].default)
del alldata
print("Loading data finished...")

# Persist the word -> index mapping (read back by the vector-generation cell)
with open("./wordsdict.txt", 'wb') as f:
    pickle.dump(wordsdict, f, 0)
# Persist the embedding matrix
with open("./embeddingfile.txt",'wb') as f:
    pickle.dump(embeddingfile,f,0)

# Training batches are produced lazily; dev/test batches are precomputed once.
trainbatches = trainset.batch_iter(wordsdict, FLAGS['n_class'].default, FLAGS['batch_size'].default,
                                 FLAGS['num_epochs'].default, FLAGS['max_len'].default)
devset.genBatch(wordsdict,FLAGS['batch_size'].default,FLAGS['max_len'].default,FLAGS['n_class'].default)
testset.genBatch(wordsdict,FLAGS['batch_size'].default,FLAGS['max_len'].default,FLAGS['n_class'].default)


# Build the model in a fresh graph and train it, evaluating on dev/test every
# `evaluate_every` steps and checkpointing whenever dev accuracy does not drop.
with tf.Graph().as_default():
    session_config = tf.ConfigProto(
        allow_soft_placement=FLAGS['allow_soft_placement'].default,
        log_device_placement=FLAGS['log_device_placement'].default
    )
    session_config.gpu_options.allow_growth = True
    # Using the session as a context manager makes it the default session and
    # closes it on exit, including when training is interrupted.
    with tf.Session(config=session_config) as sess:
        huapa = HUAPA_ONE_LAYER(
            max_len = FLAGS['max_len'].default,
            class_num = FLAGS['n_class'].default,
            embedding_file = embeddingfile,
            embedding_dim = FLAGS['embedding_dim'].default,
            hidden_size = FLAGS['hidden_size'].default,
            feature_dim = 10,
            attention_dim = FLAGS['attention_dim'].default,
            train = True,
            regularizer = tf.contrib.layers.l2_regularizer(FLAGS['rr'].default)
        )
        huapa.build_model()
        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(FLAGS['lr'].default)
        grads_and_vars = optimizer.compute_gradients(huapa.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        # Checkpoints go to ./checkpoints/<unix timestamp>/model-<step>
        timestamp = str(int(time.time()))
        checkpoint_dir = os.path.abspath("./checkpoints/"+timestamp)
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

        sess.run(tf.global_variables_initializer())

        def train_step(batch):
            """Run one optimizer step on ``batch`` and print step, loss and accuracy."""
            features, b_docs, label, doc_len = zip(*batch)
            # Mark the model as training (attribute only; the graph is unchanged)
            huapa.train = True
            feed_dict = {
                huapa.doc_feature: features,
                huapa.input_x: b_docs,
                huapa.input_y: label,
                huapa.doc_len: doc_len
            }
            _, step, loss, accuracy = sess.run(
                [train_op, global_step, huapa.loss, huapa.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))

        def predict_step(features,b_docs,label,doc_len, name=None):
            """Evaluate one batch; return (correct_num, accuracy, summed squared error)."""
            feed_dict = {
                huapa.doc_feature: features,
                huapa.input_x: b_docs,
                huapa.input_y: label,
                huapa.doc_len: doc_len
            }
            # Only the metrics that are returned are fetched.
            accuracy, correct_num, mse = sess.run(
                [huapa.accuracy, huapa.correct_num, huapa.mse],
                feed_dict)
            return correct_num, accuracy, mse

        def predict(dataset, name=None):
            """Return (accuracy, RMSE) over all precomputed batches of ``dataset``."""
            huapa.train = False
            acc = 0
            rmse = 0.
            for i in range(dataset.epoch):
                correct_num, _, mse = predict_step(dataset.features[i], dataset.docs[i], dataset.labels[i],
                                                   dataset.doc_len[i], name)
                acc += correct_num
                rmse += mse
            # Per-batch sums are normalised by the dataset size at the end.
            acc = acc * 1.0 / dataset.data_size
            rmse = np.sqrt(rmse / dataset.data_size)
            return acc, rmse

        topacc = 0.
        toprmse = 0.
        better_dev_acc = 0.
        predict_round = 0

        # Training loop. For each batch...
        for tr_batch in trainbatches:
            train_step(tr_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % FLAGS['evaluate_every'].default == 0:
                predict_round += 1
                print("\nEvaluation round %d:" % (predict_round))

                dev_acc, dev_rmse = predict(devset, name="dev")
                print("dev_acc: %.4f    dev_RMSE: %.4f" % (dev_acc, dev_rmse))
                test_acc, test_rmse = predict(testset, name="test")
                print("test_acc: %.4f    test_RMSE: %.4f" % (test_acc, test_rmse))

                # Report the test metrics measured at the best dev accuracy so far
                if dev_acc >= better_dev_acc:
                    better_dev_acc = dev_acc
                    topacc = test_acc
                    toprmse = test_rmse
                    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))
                print("topacc: %.4f   RMSE: %.4f" % (topacc, toprmse))
            


Parameters:
ALLOW_SOFT_PLACEMENT=True
ATTENTION_DIM=100
BATCH_SIZE=100
EMBEDDING_DIM=200
EVALUATE_EVERY=25
HIDDEN_SIZE=50
LOG_DEVICE_PLACEMENT=False
LR=0.005
MAX_LEN=500
N_CLASS=3
NUM_EPOCHS=1000
RR=0.01

Loading data...
Loading data finished...
Instructions for updating:
keep_dims is deprecated, use keepdims instead
2019-03-08T11:48:21.314076: step 1, loss 1.09321, acc 0.6
2019-03-08T11:48:22.224203: step 2, loss 1.04977, acc 0.63
2019-03-08T11:48:23.068660: step 3, loss 1.02903, acc 0.52
2019-03-08T11:48:23.882710: step 4, loss 1.0286, acc 0.52
2019-03-08T11:48:24.717156: step 5, loss 0.84269, acc 0.68
2019-03-08T11:48:25.544335: step 6, loss 1.07668, acc 0.52
2019-03-08T11:48:26.334584: step 7, loss 0.867726, acc 0.65
2019-03-08T11:48:27.107432: step 8, loss 0.957349, acc 0.58
2019-03-08T11:48:27.884750: step 9, loss 0.845242, acc 0.7
2019-03-08T11:48:28.658709: step 10, loss 1.00922, acc 0.55
2019-03-08T11:48:29.426324: step 11, loss 0.966212, acc 0.58
2019-03-08T11:48:30.185290: 

KeyboardInterrupt: 

# 生成向量表示

In [8]:
# Print every flag's default value, then rebind FLAGS to the underlying
# name -> flag-object dict (read below as FLAGS[name].default).
FLAGS = tf.flags.FLAGS
FLAGS.flag_values_dict()
_flag_objects = FLAGS.__flags
print("\nParameters:")
for attr in sorted(_flag_objects):
    print("{}={}".format(attr.upper(), _flag_objects[attr].default))
print("")
FLAGS = _flag_objects

# Checkpoint to restore
checkpoint_file = './checkpoints/1552022238/model-650'
# Load the complete (unsplit) dataset and the saved word -> index mapping
testset = Dataset('./Video_text.txt','./Video_simple_feature.txt')
with open('./wordsdict.txt','rb') as f:
    wordsdict = pickle.load(f)
testset.getAllData(
    wordsdict,
    FLAGS['max_len'].default,
    FLAGS['n_class'].default,
)
    
def evaluate():
    """Restore the checkpoint, score the whole dataset and dump its document vectors.

    Reads the module-level ``checkpoint_file``, ``testset`` and ``FLAGS``.
    Prints accuracy and RMSE, and writes the document vectors to
    ``doc_vectors.txt`` with ``np.savetxt``.
    """
    graph = tf.Graph()
    with graph.as_default():
        session_config = tf.ConfigProto(
            allow_soft_placement=FLAGS['allow_soft_placement'].default,
            log_device_placement=FLAGS['log_device_placement'].default
        )
        session_config.gpu_options.allow_growth = True
        # Context-manager form: the session becomes the default session and is
        # closed when the block exits, even on error.
        with tf.Session(config=session_config) as sess:
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Input placeholders, looked up by the names given in the model's 'input' scope
            input_x = graph.get_operation_by_name("input/input_x").outputs[0]
            input_y = graph.get_operation_by_name("input/input_y").outputs[0]
            doc_len = graph.get_operation_by_name("input/doc_len").outputs[0]
            doc_feature = graph.get_operation_by_name("input/doc_feature").outputs[0]

            # Output tensors: document vectors and metrics
            hupa_doc_vectors = graph.get_operation_by_name("softmax/doc_vectors").outputs[0]
            hupa_correct_num = graph.get_operation_by_name("accuracy/correct_num").outputs[0]
            hupa_accuracy = graph.get_operation_by_name("accuracy/accuracy").outputs[0]
            hupa_mse = graph.get_operation_by_name("metrics/mse").outputs[0]

            # The whole dataset is fed as a single batch.
            validate_feed = {
                input_x : testset.all_docs,
                input_y: testset.all_labels,
                doc_len : testset.all_docs_len,
                doc_feature : testset.all_features
            }
            doc_vectors,accuracy,correct_num,mse = sess.run([hupa_doc_vectors,hupa_accuracy,hupa_correct_num, hupa_mse],validate_feed)
            acc = correct_num * 1.0 / testset.data_size
            # `mse` is a sum of squared errors, so normalise before the square root.
            rmse = np.sqrt(mse / testset.data_size)

            print("\ntest_acc: %.4f    test_RMSE: %.4f\n" % (acc, rmse))
            print(accuracy)
            print(doc_vectors.shape)
            np.savetxt('doc_vectors.txt',doc_vectors)
evaluate()


Parameters:
ALLOW_SOFT_PLACEMENT=True
ATTENTION_DIM=100
BATCH_SIZE=100
EMBEDDING_DIM=200
EVALUATE_EVERY=25
HIDDEN_SIZE=50
LOG_DEVICE_PLACEMENT=False
LR=0.005
MAX_LEN=500
N_CLASS=3
NUM_EPOCHS=1000
RR=0.01

INFO:tensorflow:Restoring parameters from ../checkpoints/1552022238/model-650

test_acc: 0.7172    test_RMSE: 0.6300

0.7171645
(15433, 100)


In [None]:
# test_acc: 0.6355    test_RMSE: 0.7299

In [1]:
import torch

In [12]:
# Scratch: random 3x3 tensor (unseeded, so values differ between runs); used by the index_select cell.
a = torch.rand(3,3)
a

tensor([[0.1028, 0.4757, 0.1334],
        [0.9818, 0.5819, 0.2551],
        [0.6113, 0.6436, 0.4179]])

In [13]:
# Scratch: torch.index_select experiment.
# `b` is flattened to 1-D because index_select requires a 1-D index tensor.
b = torch.tensor([[0],[1]])
b = b.reshape(2)
c = torch.tensor([[1,0],[0,1]])
# Rows 0 and 1 of `a`
d = torch.index_select(a,0,b)
# NOTE(review): the original call passed no index tensor and raised TypeError.
# Assuming columns 0 and 1 (the same index `b`) were intended — confirm.
e = torch.index_select(d,1,b)

tensor([[0.1028, 0.4757, 0.1334],
        [0.9818, 0.5819, 0.2551]])

In [1]:
import numpy as np
import torch

In [15]:
# Scratch: arithmetic mean of a small list, via the ndarray method.
a = np.asarray([1, 2, 3]).mean()
a

2.0

In [2]:
# Scratch: 2x4 float tensor with some zero entries.
a = torch.tensor([
    [1., 0., 2., 0.],
    [3., 2., 0., 0.],
])
a

tensor([[1., 0., 2., 0.],
        [3., 2., 0., 0.]])

In [6]:
# Scratch: flat indices of the non-zero entries of `a`, and how many there are.
_a_flat = np.ravel(a.cpu().data.numpy())
c = np.nonzero(_a_flat)[0]
d = c.size
d

4

In [7]:
# Scratch: display the first row of `a`.
a[0]

tensor([1., 0., 2., 0.])