### Imports and TF configs

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.utils import Progbar

# Only show TensorFlow Python log records at ERROR level or above.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Build a TF1-compat session config that caps the fraction of GPU memory this
# process may use, then register a session created from it with the compat
# Keras backend.
config = tf.compat.v1.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.95 # Change this value as per requirement
tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))

In [None]:
import os, nltk, re, sys
import pandas as pd
import pickle as pkl
import numpy as np
import matplotlib.pyplot as plt
import jieba

from collections import defaultdict
from PIL import Image

### Data loading and processing part

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is closed as soon as it has been read.
    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    with open(path, 'rb') as handle:
        return pkl.load(handle)


def _id_frame(raw):
    """Turn a ``{tweet id: value}`` mapping into a DataFrame.

    Both keys and values are cast to ``int``; the result has the columns
    ``"values"`` and ``"tweet id"``.
    """
    return pd.DataFrame({
        "values": [int(value) for value in raw.values()],
        "tweet id": [int(key) for key in raw.keys()],
    })


# `validate_id` stays bound to the raw pickled mapping; the *_id frames are
# the tabular versions of the three splits.
validate_id = _load_pickle("./weibo/validate_id.pickle")

train_id = _id_frame(_load_pickle("./weibo/train_id.pickle"))
test_id = _id_frame(_load_pickle("./weibo/test_id.pickle"))
validation_id = _id_frame(validate_id)

# Per split: a Series indexed by tweet id whose values come from the pickles.
ids = {
    "train": train_id.set_index("tweet id")['values'],
    "test": test_id.set_index("tweet id")['values'],
    "validation": validation_id.set_index("tweet id")['values']
}

In [None]:
# Names of the '|'-separated fields on the first line of every tweet record,
# in file order. " original?" intentionally keeps its leading space.
columns = [
    "tweet id", "user name", "tweet url", "user url", "publish time", " original?",
    "retweet count", "comment count", "praise count", "user id",
    "user authentication type", "user fans count", "user follow count",
    "user tweet count", "publish platform",
]

In [None]:
def clean_str_sst(string):
    """
    Remove a fixed set of characters from ``string``, then trim surrounding
    whitespace and lower-case what is left.

    NOTE(review): inside the character class below, ``|-“`` is parsed as a
    range (U+007C through U+201C) and ``nbsp`` / ``O`` match the individual
    letters n, b, s, p and O, so those letters are deleted from Latin text as
    well. Behaviour is left untouched here; confirm whether it is intended.
    """
    removable = re.compile("[，。 :,.；|-“”——_/nbsp+&;@、《》～（）())#O！：【】]")
    without_symbols = removable.sub("", string)
    trimmed = without_symbols.strip()
    return trimmed.lower()

def stopwordslist(filepath = './weibo/stop_words.txt'):
    """
    Load the stop-word list stored one word per line in ``filepath``.

    Args:
        filepath: path of a UTF-8 text file.

    Returns:
        dict mapping every line (with surrounding whitespace stripped) to ``1``,
        so membership tests are constant-time.
    """
    # The context manager closes the file even if reading fails.
    with open(filepath, 'r', encoding='utf-8') as stop_file:
        return {line.strip(): 1 for line in stop_file}


In [None]:
# Stop words consulted while filtering segmented tweet text.
stop_words = stopwordslist()

# Image folder for each label (0: non-rumor, 1: rumor).
image_paths = {
    0: "./weibo/nonrumor_images/",
    1: "./weibo/rumor_images/"
}

# File names currently present in each image folder, keyed by the same labels.
image_filelist = {label: os.listdir(folder) for label, folder in image_paths.items()}

def load_tweets(split):
    """
    Build the tweet table for one data split.

    Every tweet file under ``./weibo/tweets/`` is read as consecutive
    three-line records: a ``|``-separated metadata line, a ``|``-separated
    line of image URLs and the tweet text. A record is kept when

    * its tweet id is listed in ``ids[split]``,
    * one of its image file names exists in the image folder of its label, and
    * its cleaned, segmented text is longer than 10 characters.

    Records without a matching image are skipped. Previously they were kept
    with a path that does not exist, or with the image name left over from
    the preceding record.

    Args:
        split: key of the module-level ``ids`` dict ("train", "test" or
            "validation").

    Returns:
        pandas.DataFrame with one row per kept tweet: the metadata fields
        named by ``columns`` plus ``image`` (file path), ``tweet_content``
        (space-joined tokens), ``event`` and ``label`` (0 for the
        ``*_nonrumor`` files, 1 for the ``*_rumor`` files).

    NOTE(review): ``event`` is a dense index assigned in order of first
    appearance within this call, so the same index in two different splits
    does not necessarily denote the same event.
    """
    pre_path = "./weibo/tweets/"
    split_ids = ids[split]
    file_list = [(0, pre_path + "test_nonrumor.txt"), (1, pre_path + "test_rumor.txt"),
                 (0, pre_path + "train_nonrumor.txt"), (1, pre_path + "train_rumor.txt")]

    # Sets turn the per-tweet "is this image on disk" check into an O(1) lookup.
    known_images = {lbl: set(names) for lbl, names in image_filelist.items()}

    map_id = {}
    tweet_data = []

    for label, path in file_list:
        with open(path, 'r', encoding='utf-8') as input_file:
            while True:
                record = [next(input_file, None) for _ in range(3)]
                if None in record:
                    # Fewer than three lines left: this file is exhausted.
                    print("End of file reached")
                    break

                l1, l2, l3 = (line.replace("\n", "") for line in record)
                fields = l1.split("|")
                tweet_id = int(fields[0])

                # Cheap filters first, so text segmentation only runs for
                # records that can still end up in the result.
                if tweet_id not in split_ids.index:
                    continue

                image_name = None
                for url in l2.split("|")[:-1]:
                    candidate = url.split("/")[-1]
                    if candidate in known_images[label]:
                        image_name = candidate
                        break
                if image_name is None:
                    continue

                l3 = clean_str_sst(l3)
                tokens = [word for word in jieba.cut_for_search(l3) if word not in stop_words]
                l3 = " ".join(tokens)

                # Keep only texts longer than 10 characters (separating spaces included).
                if len(l3) <= 10:
                    continue

                data = dict(zip(columns, fields))
                data['image'] = image_paths[label] + image_name
                data['tweet_content'] = l3
                # First occurrence of an event value gets the next free index.
                data['event'] = map_id.setdefault(split_ids.loc[tweet_id], len(map_id))
                data['label'] = label
                tweet_data.append(data)

    return pd.DataFrame.from_records(tweet_data)

In [None]:
# Columns kept from the table returned by load_tweets.
_DATASET_COLUMNS = ['tweet id', 'tweet_content', 'image', 'event', 'label']

train_dataset = load_tweets("train")[_DATASET_COLUMNS]
test_dataset = load_tweets('test')[_DATASET_COLUMNS]

# The event discriminator has 10 classes, but the test split contains 14 unique
# events, so test rows whose event index is greater than 9 are dropped.
test_dataset = test_dataset[test_dataset['event'] <= 9]

validation_dataset = load_tweets('validation')[_DATASET_COLUMNS]

# Texts of all three splits stacked into one Series. The Series are passed as
# separate list items: adding them with `+` would align them on the index and
# join the strings element-wise instead of stacking them.
all_text = pd.concat(
    [train_dataset['tweet_content'], test_dataset['tweet_content'], validation_dataset['tweet_content']],
    ignore_index=True,
).dropna()


In [None]:
# Image folders keyed by label (1: rumor, 0: non-rumor), without trailing slash.
# NOTE(review): nothing in the visible notebook reads this dict, and
# `image_paths` lists the same folders; confirm whether it is still needed.
image_roots = {
    1: "./weibo/rumor_images",
    0: "./weibo/nonrumor_images",
}
def load_image(path):
    """
    Load the image at ``path`` as a float32 RGB array for the model.

    The image is converted to RGB, resized to 256x256, centre-cropped to
    224x224 and scaled to the range [0, 1].

    Args:
        path: location of the image file.

    Returns:
        numpy.ndarray of shape (224, 224, 3) and dtype float32.
    """
    def center_crop(image, dim):
        """Return the central ``dim`` x ``dim`` region of ``image``."""
        width, height = image.size
        left = (width - dim) // 2
        top = (height - dim) // 2
        return image.crop((left, top, left + dim, top + dim))

    # convert() reads the pixel data, so the file can be closed right after.
    # Converting unconditionally also covers modes such as RGBA, LA or CMYK,
    # which would otherwise produce arrays without exactly three channels.
    with Image.open(path) as source:
        image = source.convert('RGB')

    image = image.resize((256, 256))
    image = center_crop(image, 224)
    return np.array(image, dtype=np.float32) / 255

In [None]:
from transformers import AutoTokenizer

### Chinese text tokenizer

In [None]:
embedding_path = "./weibo/w2v.pickle"

# Word -> vector lookup consumed by get_matrix. The file is closed once read.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open(embedding_path, 'rb') as embedding_file:
    w2v = pkl.load(embedding_file, encoding='latin1')
vocab = list(w2v.keys())

# Pre-trained Chinese BERT tokenizer; its vocabulary size sets the size of the
# model's Embedding layer (independent of the `vocab` list above).
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")
vocab_size = tokenizer.vocab_size

In [None]:
# Number of samples per batch in the tf.data pipelines.
BATCH_SIZE = 32
# Length every tokenized tweet is padded/truncated to; also the model's text input length.
SEQ_LENGTH = 28
# Row width of the matrix built by get_matrix.
VECTOR_DIM = 32

def tokenize(sentence):
    """Encode ``str(sentence)`` with the module-level tokenizer.

    The encoding is requested with padding to, and truncation at, SEQ_LENGTH;
    only the ``input_ids`` entry is returned.
    """
    encoded = tokenizer(
        str(sentence),
        max_length=SEQ_LENGTH,
        padding='max_length',
        truncation=True,
    )
    return encoded['input_ids']

def get_matrix(sentence):
    """
    Stack the ``w2v`` vectors of the first SEQ_LENGTH items of ``sentence``.

    Args:
        sentence: sequence of tokens used as keys into ``w2v``.

    Returns:
        float32 array of shape (SEQ_LENGTH, VECTOR_DIM). Rows beyond the end of
        ``sentence`` and rows of tokens missing from ``w2v`` stay all-zero.
    """
    vectors = np.zeros((SEQ_LENGTH, VECTOR_DIM), dtype=np.float32)
    for i, word in enumerate(sentence[:SEQ_LENGTH]):
        # Unknown tokens are treated like padding instead of raising KeyError.
        if word in w2v:
            vectors[i, :] = w2v[word]

    return vectors

def preprocess_image(text, image, event, label):
    """Swap the image path in a sample for the decoded image.

    ``image`` must expose ``.numpy()`` returning UTF-8 bytes of a file path
    (as the string tensors passed in by ``tf.py_function`` do); the other
    three values are passed through untouched.
    """
    image_path = image.numpy().decode('utf-8')
    return text, load_image(image_path), event, label

def dict_map(text, image, event, label):
    """Regroup a sample as ``(inputs, event, label)``.

    ``inputs`` is a dict holding ``text`` and ``image`` under the keys
    "text" and "image".
    """
    model_inputs = dict(text=text, image=image)
    return model_inputs, event, label

### Data pipelining

In [None]:
def _split_columns(dataset):
    """Return ``(texts, images, events, labels)`` extracted from a tweet table.

    ``texts`` is a float32 array of token ids (one row per tweet, produced by
    ``tokenize``); the other three are plain Python lists.
    """
    texts = np.array(dataset['tweet_content'].map(tokenize).to_list(), dtype=np.float32)
    return texts, dataset['image'].to_list(), dataset['event'].to_list(), dataset['label'].to_list()


def _decode_sample(text, image, event, label):
    """Run ``preprocess_image`` (plain Python) on one sample inside tf.data."""
    return tf.py_function(preprocess_image, [text, image, event, label],
                          [tf.float32, tf.float32, tf.int32, tf.int32])


def _build_pipeline(texts, images, events, labels, shuffle):
    """Create the batched, prefetched dataset for one split.

    When ``shuffle`` is true the samples are shuffled over the whole split
    before the images are decoded, so the shuffle buffer only holds token ids
    and file paths rather than decoded images.
    """
    pipeline = tf.data.Dataset.from_tensor_slices((texts, images, events, labels))
    if shuffle:
        # Buffer as large as the split gives a uniform shuffle on each iteration.
        pipeline = pipeline.shuffle(max(len(images), 1))
    return (pipeline
            .map(_decode_sample)
            .map(dict_map)
            .batch(BATCH_SIZE)
            .prefetch(tf.data.AUTOTUNE))


train_texts, train_images, train_events, train_labels = _split_columns(train_dataset)
train_ds = _build_pipeline(train_texts, train_images, train_events, train_labels, shuffle=True)

# Evaluation splits are iterated in table order; shuffling them is not needed.
test_texts, test_images, test_events, test_labels = _split_columns(test_dataset)
test_ds = _build_pipeline(test_texts, test_images, test_events, test_labels, shuffle=False)

validation_texts, validation_images, validation_events, validation_labels = _split_columns(validation_dataset)
validation_ds = _build_pipeline(validation_texts, validation_images, validation_events, validation_labels, shuffle=False)

### Model construction

In [None]:
# Width of the token embedding and of the hidden Dense layers of the model.
HIDDEN_DIMS = 32
# Number of filters of each Conv1D in the text branch.
NUM_FILTERS = 20
# Kernel sizes of the parallel Conv1D layers in the text branch.
WINDOW_SIZE = [1, 2, 3, 4]
# Number of epochs (assigned again, with the same value, in the training cell).
EPOCHS = 10

# p, alpha and beta are the inputs of lr_schedule.
p = np.linspace(0, 1, 10)
alpha = 10
beta = 0.75
# Passed as λ to the GradientReversal layer; also scales the event loss.
lmd = 1

In [None]:
# simple gradient reversal layer that is used in the paper
class GradientReversal(keras.layers.Layer):
    """Identity in the forward pass, gradient scaled by ``-λ`` in the backward pass.

    Args:
        λ: factor applied (negated) to the gradient flowing back through the
           layer. With the default of 1 the gradient is simply negated.
        **kwargs: forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, λ=1, **kwargs):
        super(GradientReversal, self).__init__(**kwargs)
        self.λ = λ

    @staticmethod
    @tf.custom_gradient
    def reverse_gradient(x, λ):
        """Return ``x`` unchanged together with its custom gradient function."""
        def grad(dy):
            # Reverse and scale the upstream gradient; λ itself receives none.
            return -dy * tf.cast(λ, dy.dtype), None

        return tf.identity(x), grad

    def call(self, x):
        return self.reverse_gradient(x, self.λ)

    def compute_mask(self, inputs, mask=None):
        # The layer does not alter values, so any mask passes through as-is.
        return mask

    def compute_output_shape(self, input_shape):
        return input_shape

    def get_config(self):
        # Dict unpacking instead of `|` keeps this working before Python 3.9.
        return {**super(GradientReversal, self).get_config(), 'λ': self.λ}

In [None]:
# Learning-rate schedule based on the paper. It does not work for a small
# number of epochs: in this task, with 10 epochs, the learning rate is ~0 by
# epoch 4 or 5 and the model fails to converge.
def lr_schedule(epoch, lr, p, alpha, beta):
    """Return ``lr / (1 + alpha * p[epoch]) ** beta``.

    Args:
        epoch: index into ``p``.
        lr: learning rate to be scaled down.
        p: indexable sequence of progress values.
        alpha: multiplier applied to the progress value.
        beta: exponent of the denominator.
    """
    progress = p[epoch]
    return lr / (1 + (alpha * progress)) ** beta

In [None]:
# Load VGG-19 without its classification head; it is used as a frozen image
# feature extractor for 224x224 RGB inputs.
# NOTE(review): load_image scales pixels to [0, 1]; no VGG-specific input
# preprocessing is applied anywhere in the visible code. Confirm this is intended.
vgg19 = keras.applications.VGG19(
    include_top=False,
    input_shape=(224,224,3)
)
# Rename VGG's input layer to "image" (via a private attribute), presumably so
# the "image" key of the dataset dicts is routed to it. TODO confirm.
vgg19.layers[0]._name = "image"
# Freeze all VGG weights.
vgg19.trainable = False

# Text branch input: SEQ_LENGTH token ids per sample, embedded into HIDDEN_DIMS.
text_input = layers.Input((SEQ_LENGTH,), name='text')
text_embeddings = layers.Embedding(vocab_size, HIDDEN_DIMS)(text_input)

# image feature extractor: flattened VGG output projected to HIDDEN_DIMS
image_features = layers.Flatten()(vgg19.output)
image_features = layers.Dense(HIDDEN_DIMS, activation='leaky_relu')(image_features)

# text feature extractor: one Conv1D per window size, each max-pooled over its
# whole output length (pool size = C.shape[1])
convs = [layers.Conv1D(NUM_FILTERS, k)(text_embeddings) for k in WINDOW_SIZE]
pools = [layers.MaxPooling1D(C.shape[1])(C) for C in convs]

text_cnn = layers.Concatenate()(pools)
text_cnn = layers.Dense(HIDDEN_DIMS, activation='leaky_relu')(text_cnn)

# combined features; `[:, 0, :]` drops the pooled time axis of the text branch
features = layers.Concatenate(name="multi_modal_feature")([text_cnn[:, 0, :], image_features])

# Fake news detector. `features` is rebound to the dropout output, so both
# heads and the model's third output use the post-dropout tensor.
features = layers.Dropout(0.2)(features)
predictor = layers.Dense(2, activation="softmax", name='prediction')(features)

# Event Discriminator: fed through the gradient reversal layer, ends in a
# 10-way softmax over event indices
grd_r = GradientReversal(λ=lmd)(features)
event_discriminator = layers.Dense(HIDDEN_DIMS, activation='leaky_relu')(grd_r)
event_discriminator = layers.Dropout(0.2)(event_discriminator)
event_discriminator = layers.Dense(HIDDEN_DIMS, activation='leaky_relu')(event_discriminator)
event_discriminator = layers.Dense(10, activation='softmax', name='event_discriminator')(event_discriminator)

# Outputs: fake-news prediction, event prediction, multimodal feature vector.
model = Model(inputs=[vgg19.input, text_input], outputs=[predictor, event_discriminator, features])
# compile() is called without optimizer or loss; the notebook trains with a custom loop.
model.compile()

### Training loop

In [None]:
EPOCHS = 10

# Inputs of lr_schedule (the schedule itself is not applied below).
p = np.linspace(0, 1, 10)
alpha = 10
beta = 0.75
lmd = 1

# Loss functions
categorical_ce = keras.losses.CategoricalCrossentropy()

# optimizer, using AdamW instead of Adam as it gives comparatively better results
optimizer = keras.optimizers.AdamW(learning_rate=0.001)


def _new_trackers():
    """Return fresh running metrics for one pass over a dataset, keyed by display name."""
    return {
        "detector loss": keras.metrics.Mean(name="detector_loss"),
        "event loss": keras.metrics.Mean(name="event_loss"),
        "final loss": keras.metrics.Mean(name="final_loss"),
        "fake news accuracy": keras.metrics.CategoricalAccuracy(name="fake_news_accuracy"),
        "event accuracy": keras.metrics.CategoricalAccuracy(name="event_accuracy"),
    }


def _run_pass(dataset, training, prefix=""):
    """Iterate once over ``dataset`` and report running metrics on a progress bar.

    Args:
        dataset: batched tf.data dataset yielding ``(inputs, event, label)``.
        training: when true, gradients are computed and applied and the model
            runs in training mode (dropout active); otherwise the pass only
            evaluates.
        prefix: string prepended to every metric name shown on the bar.
    """
    trackers = _new_trackers()
    shown = [prefix + name for name in trackers]
    if training:
        shown.insert(0, "lr")
    # The values passed to the bar are already running means/accuracies, so
    # they are declared stateful to stop Progbar from averaging them again.
    progbar = Progbar(len(dataset), stateful_metrics=shown)

    for step, (X, E, Y) in enumerate(dataset.as_numpy_iterator(), start=1):
        E = keras.utils.to_categorical(E, num_classes=10)
        Y = keras.utils.to_categorical(Y, num_classes=2)

        if training:
            with tf.GradientTape() as tape:
                pred, event, _ = model(X, training=True)

                Ld = categorical_ce(Y, pred)
                Le = categorical_ce(E, event)

                # The gradient reversal layer already flips the sign of the
                # event-loss gradient for the feature extractor, so the two
                # losses are summed: the discriminator minimises Le while the
                # features are pushed to maximise it. The sum is non-negative
                # and, unlike Ld / (lmd * Le), has no division by a loss that
                # may approach zero.
                final_loss = Ld + (lmd * Le)

            grads = tape.gradient(final_loss, model.trainable_variables)
            # lr_schedule is intentionally not applied: with this few epochs it
            # shrinks the learning rate too quickly.
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
        else:
            pred, event, _ = model(X, training=False)

            Ld = categorical_ce(Y, pred)
            Le = categorical_ce(E, event)
            final_loss = Ld + (lmd * Le)

        trackers["detector loss"].update_state(Ld)
        trackers["event loss"].update_state(Le)
        trackers["final loss"].update_state(final_loss)
        trackers["fake news accuracy"].update_state(Y, pred)
        trackers["event accuracy"].update_state(E, event)

        values = [(prefix + name, float(metric.result())) for name, metric in trackers.items()]
        if training:
            values.insert(0, ("lr", float(optimizer.learning_rate.numpy())))
        progbar.update(step, values)


for i in range(EPOCHS):
    print("EPOCH ", i+1)

    # training pass
    _run_pass(train_ds, training=True)
    print()

    # validation pass: same computations, but no weight updates
    _run_pass(validation_ds, training=False, prefix="val ")
    print()

In [None]:
# Make sure the target folder exists before writing the weights file.
os.makedirs("./models", exist_ok=True)
model.save_weights("./models/task1.h5")

In [None]:
# Load the weights file written by the save cell of this notebook
# ("task1.h5"; the previous name "test1.h5" is never created here).
model.load_weights("./models/task1.h5")

In [None]:
# Loss functions
# NOTE(review): binary_ce is not used in the visible notebook.
binary_ce = keras.losses.BinaryCrossentropy()
categorical_ce = keras.losses.CategoricalCrossentropy()

test_ds_iter = test_ds.as_numpy_iterator()

loss_D_metric = keras.metrics.Mean(name="detector_loss")
loss_E_metric = keras.metrics.Mean(name="event_loss")
loss_final_metric = keras.metrics.Mean(name="final_loss")

fake_news_accuracy = keras.metrics.CategoricalAccuracy(name="fake_news_accuracy")
event_accuracy = keras.metrics.CategoricalAccuracy(name="event_accuracy")

# Display name -> running metric, in the order shown on the progress bar.
_test_trackers = [
    ('test detector loss', loss_D_metric),
    ('test event loss', loss_E_metric),
    ('test final loss', loss_final_metric),
    ('test fake news accuracy', fake_news_accuracy),
    ('test event accuracy', event_accuracy),
]

# The reported values are already running means/accuracies, so they are
# declared stateful to stop Progbar from averaging them a second time.
test_progbar = Progbar(len(test_ds), stateful_metrics=[name for name, _ in _test_trackers])

for step, (X, E, Y) in enumerate(test_ds_iter, start=1):
    Y = keras.utils.to_categorical(Y, num_classes=2)
    E = keras.utils.to_categorical(E, num_classes=10)

    # Inference mode: dropout layers are inactive.
    pred, event, feat = model(X, training=False)

    Ld = categorical_ce(Y, pred)
    Le = categorical_ce(E, event)

    # Combined objective reported for reference: detector loss plus the
    # lmd-weighted event loss (non-negative, no division by a loss near zero).
    final_loss = Ld + (lmd * Le)

    # metrics
    fake_news_accuracy.update_state(Y, pred)
    event_accuracy.update_state(E, event)
    loss_D_metric.update_state(Ld)
    loss_E_metric.update_state(Le)
    loss_final_metric.update_state(final_loss)

    test_progbar.update(step, [(name, float(metric.result())) for name, metric in _test_trackers])