In [13]:
import tensorflow as tf
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Dropout, Flatten
import tensorflow.keras.backend as K

# 自注意力层
class Self_Attention(Layer):
    # input:  [None, n, k]输入为n个维度为k的词向量
    # mask:   [None, n]表示填充词位置的mask
    # output: [None, k]输出n个词向量的加权和
    def __init__(self, dropout_rate=0.0):
        super(Self_Attention, self).__init__()
        self.dropout_layer = Dropout(dropout_rate)

    def build(self, input_shape):
        self.k = input_shape[0][-1]  #词向量维度
        self.W_layer = Dense(self.k, activation='tanh', use_bias=True) #对h的映射
        self.U_weight = self.add_weight(name='U', shape=(self.k, 1),   #U记忆矩阵
                                        initializer=tf.keras.initializers.glorot_uniform(),
                                        trainable=True)

    def call(self, inputs, **kwargs):
        input, mask = inputs #输入有两部分[input, mask]
        if K.ndim(input) != 3:
            raise ValueError("The dim of inputs is required 3 but get {}".format(K.ndim(input)))

        # 计算score
        x = self.W_layer(input)              # [None, n, k]
        score = tf.matmul(x, self.U_weight)  # [None, n, 1]
        score = self.dropout_layer(score)    # 随机dropout(也可不要)

        # softmax之前进行mask
        mask = tf.expand_dims(mask, axis=-1)  # [None, n, 1]
        padding = tf.cast(tf.ones_like(mask)*(-2**31+1), tf.float32) #mask的位置填充很小的负数
        score = tf.where(tf.equal(mask, 0), padding, score)
        score = tf.nn.softmax(score, axis=1)  # [None, n, 1] mask之后计算softmax

        # 向量加权和
        output = tf.matmul(input, score, transpose_a=True)   
        output /= self.k**0.5                                
        output = tf.squeeze(output, axis=-1)                 
        return output


In [14]:
class Image_Text_Attention(Layer):
    # 该层的输入有三部分image_emb、seq_emb、mask
    # image_emb: [None, M, 4096]对应M个4096维的图像向量(由vgg16提取得到)，每条评论的M可以不一致
    # seq_emb:   [None, L, k]表示L个维度为k的句向量
    # mask:      [None, L]表示L个句子的mask(因为存在句子数不足L的文档，有被padding的句子)
    # output:    [None, M, k]输出为M个图像对应的文档向量表示
    def __init__(self, dropout_rate=0.0):
        super(Image_Text_Attention, self).__init__()
        self.dropout_layer = Dropout(dropout_rate)

    def build(self, input_shape):
        self.l = input_shape[1][1]   # 句子个数
        self.k = input_shape[1][-1]  # 句向量维度
        self.img_layer = Dense(1, activation='tanh', use_bias=True)  # 将image_emb映射到1维
        self.seq_layer = Dense(1, activation='tanh', use_bias=True)  # 将seq_emb也映射到1维(方便内积)
        self.V_weight = self.add_weight(name='V', shape=(self.l, self.l),
                                        initializer=tf.keras.initializers.glorot_uniform(),
                                        trainable=True)

    def call(self, inputs, **kwargs):
        image_emb, seq_emb, mask = inputs  # 输入为三部分[image_emb, seq_emb, mask]

        # 线性映射
        p = self.img_layer(image_emb)  # [None, M, 1]
        q = self.seq_layer(seq_emb)    # [None, L, 1]

        # 内积+映射(计算score)
        emb = tf.matmul(p, q, transpose_b=True)   # [None, M, L]
        emb = emb + tf.transpose(q, [0, 2, 1])    # [None, M, L]
        emb = tf.matmul(emb, self.V_weight)       # [None, M, L]
        score = self.dropout_layer(emb)           # 随机dropout(也可不要)

        # mask
        mask = tf.tile(tf.expand_dims(mask, axis=1), [1, score.shape[1], 1])  # [None, M, L]，将mask矩阵复制到与score相同的形状
        padding = tf.cast(tf.ones_like(mask) * (-2 ** 31 + 1), tf.float32)
        score = tf.where(tf.equal(mask, 0), padding, score)
        score = tf.nn.softmax(score, axis=-1)      # [None, M, L]

        # 向量加权和
        output = tf.matmul(score, seq_emb)   # [None, M, k]
        output /= self.k**0.5                # 归一化
        return output

In [15]:
import torch.nn as nn
class Vgg16Net(nn.Module):
    def __init__(self):
        super(Vgg16Net, self).__init__()

        self.layer1 = nn.Sequential(
            # （输入通道，输出通道，卷积核大小） 例：32*32*3 —> (32+2*1-3)/1+1 = 32，输出：32*32*64
            nn.Conv2d(3, 64, 3, padding=1), 
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            # （输入通道，输出通道，卷积核大小） 输入：32*32*64，卷积：3*64*64，输出：32*32*64
            nn.Conv2d(64, 64, 3, padding=1), 
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),

            nn.MaxPool2d(kernel_size=2, stride=2)   # 输入：32*32*64，输出：6*16*64
        )

        self.layer2 = nn.Sequential(
            nn.Conv2d(64, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),

            nn.Conv2d(128, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),

            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.layer3 = nn.Sequential(
            nn.Conv2d(128, 256, 3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),

            nn.Conv2d(256, 256, 3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.layer4 = nn.Sequential(
            nn.Conv2d(256, 512, 3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.Conv2d(512, 512, 3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.layer5 = nn.Sequential(
            nn.Conv2d(512, 512, 3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.Conv2d(512, 512, 3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.Conv2d(512, 512, 3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.conv_layer = nn.Sequential(
            self.layer1,
            self.layer2,
            self.layer3,
            self.layer4,
            self.layer5,
        )

        self.fc = nn.Sequential(
            nn.Linear(512, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),

            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),

            nn.Linear(4096, 1000),
            nn.ReLU(inplace=True),
            nn.Dropout(),

            nn.Linear(1000, 82),
        )

    def forward(self, x):
        x = self.conv_layer(x)
        x = x.view(-1, 512)
        x = self.fc(x)

In [16]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GRU, Bidirectional

class VistaNet(Model):
    def __init__(self, block_nums=[2,2,3,3,3], out_dim=4096, vgg_dropout=0.0, attention_dropout=0.0, gru_units=[64, 128], class_num=5):
        # block_nums: vgg16各层卷积的个数
        # out_dim: vgg16输出维度
        # dropout: 各层的dropout系数
        # gru_units: 两个单层双向GRU的输出维度
        # class_num： 模型最终输出维度
        super(VistaNet, self).__init__()
        self.vgg16 = Vgg16Net(block_nums, out_dim, vgg_dropout)       # VGG-16
        self.word_self_attention = Self_Attention(attention_dropout)# 第一层中的自注意力
        self.img_seq_attention = Image_Text_Attention(attention_dropout)  # 第二层中的Image-Text注意力
        self.doc_self_attention = Self_Attention(attention_dropout) # 第三层中的自注意力
        # 两个单层双向GRU层
        self.BiGRU_layer1 = Bidirectional(GRU(units=gru_units[0],
                                             kernel_regularizer=tf.keras.regularizers.l2(1e-5),
                                             recurrent_regularizer=tf.keras.regularizers.l2(1e-5),
                                             return_sequences=True),
                                          merge_mode='concat')
        self.BiGRU_layer2 = Bidirectional(GRU(units=gru_units[1],
                                             kernel_regularizer=tf.keras.regularizers.l2(1e-5),
                                             recurrent_regularizer=tf.keras.regularizers.l2(1e-5),
                                             return_sequences=True),
                                          merge_mode='concat')
        self.output_layer = Dense(class_num, activation='softmax') # 任务层

    def call(self, inputs, training=None, mask=None):
        # 输入inputs包含三部分：(假设batchsize为1，省略掉第一维None)
        # image_inputs: [M, 227, 227, 3]一条评论样本包含的M个图像
        # text_inputs:  [L, T, k]一条样本表示一个文档，所以输入张量为3维:[最大句子数，最大单词数， 词向量维度]
        # mask: [L, T]每句话中mask词的位置
        image_inputs, text_inputs, mask = inputs 

        # 获取图像emb向量
        image_emb = self.vgg16(image_inputs)       # [M, 224, 224, 3] -> [M, 4096]

        # 经过GRU层获取词向量word_emb
        word_emb = self.BiGRU_layer1(text_inputs)  # [L, T, k] -> [L, T, 2k]

        # 经过self_attention得到句向量seq_emb
        input = [word_emb, mask]                   # [L, T, 2k] & [L, T]
        seq_emb = self.word_self_attention(input)  # [L, T, 2k] -> [L, 2k]

        # 经过GRU层提取语义
        input = tf.expand_dims(seq_emb, axis=0)    # [1, L, 2k]
        seq_emb = self.BiGRU_layer2(input)         # [1, L, 2k] -> [1, L, 4k]

        # 经过img_seq_attention得到M个文档向量doc_emb
        image_emb = tf.expand_dims(image_emb, axis=0) # [1, M, 4096]
        mask = tf.argmax(mask, axis=1)                # [L, ]
        mask = tf.expand_dims(mask, axis=0)           # [1, L]
        input = [image_emb, seq_emb, mask]
        doc_emb = self.img_seq_attention(input)       # [1, M, 4k] M个文档向量表示

        # 经过self_attention得到最终的文档向量
        mask = tf.ones(shape=[1, doc_emb.shape[1]])   # [1, M],全为非0值，因为该注意力无需mask
        input = [doc_emb, mask]
        D_emb = self.doc_self_attention(input)        # [1, 4k]

        # output layer
        output = self.output_layer(D_emb)             # [1, class_num]
        return output


In [12]:
import os
from PIL import Image

model = VistaNet()

for i in range(1,5129):
    img = Image.open("C:/Users/15328/Desktop/information/大三下/当代人工智能/exp5data/实验五数据/实验五数据/data/"+str(i)+".jpg")
    path = r'C:/Users/15328/Desktop/information/大三下/当代人工智能/exp5data/实验五数据/实验五数据/data/'+str(i)+'.txt'
    file1 = open(path,'r')
    text_input = file1.read()
    mask = (i,i)
    input = [img, text_input, mask]
    pre = model(input)

AttributeError: Exception encountered when calling layer "vgg_net_2" (type VggNet).

'JpegImageFile' object has no attribute 'shape'

Call arguments received:
  • inputs=<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=551x325 at 0x159147A3850>
  • kwargs={'training': 'None'}