In [1]:
import transformers

import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Enable memory growth on the first GPU that TensorFlow reports, if there is one.
physical_devices = tf.config.list_physical_devices('GPU')
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
except (IndexError, ValueError, RuntimeError):
    # IndexError: the GPU list is empty.
    # ValueError / RuntimeError: the device is invalid or the runtime was already
    # initialized. Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
    print("Invalid device or cannot modify virtual devices once initialized.")

In [3]:
# Display the installed TensorFlow version so the run environment is recorded in the output.
tf.__version__

'2.3.0'

In [4]:
# Training split: one JSON record per line; image paths are made relative to the
# working directory by prefixing them with "data/".
train_df = pd.read_json('data/train.jsonl', lines = True).assign(
    img = lambda frame: "data/" + frame["img"]
)

In [5]:
# Validation data is the two dev splits stacked on top of each other.
# NOTE(review): the concat keeps each frame's own index, so index labels repeat in
# val_df; downstream code only uses positional lists, so this is harmless here.
val1_df = pd.read_json('data/dev_seen.jsonl', lines = True)
val2_df = pd.read_json('data/dev_unseen.jsonl', lines = True)
val_df = pd.concat([val1_df, val2_df]).assign(
    img = lambda frame: "data/" + frame["img"]
)

In [6]:
# Square target size every image is resized to before it is fed to the model.
img_width = img_height = 224

In [7]:
# Fast RoBERTa tokenizer looked up by model name.
# NOTE(review): build_model loads its encoder from the local path
# "models/tfroberta-nli-stsb" - presumably the same checkpoint; confirm the
# vocabularies match.
tokenizer = transformers.RobertaTokenizerFast.from_pretrained("sentence-transformers/roberta-base-nli-stsb-mean-tokens")

In [8]:
# Fixed token sequence length: sizes the input_ids / attention_mask model inputs
# and the per-batch NumPy arrays built by the data generators.
max_len = 128

### Train

In [9]:
def build_model():
    """Assemble the two-branch (text + image) binary classifier.

    Text branch: a frozen TFRobertaModel loaded from "models/tfroberta-nli-stsb",
    followed by a bidirectional LSTM and global max pooling over the sequence axis.
    Image branch: an EfficientNetB0 backbone without its classification head,
    created with ``weights = None`` (no pretrained weights) and left trainable,
    followed by global max pooling.
    The two pooled vectors are concatenated and passed through a 256-unit ReLU
    layer, dropout and a single sigmoid unit. Batch normalization is applied
    between stages.

    Reads the module-level ``img_height``, ``img_width`` and ``max_len``.

    Returns:
        An uncompiled ``tf.keras.models.Model`` whose inputs are, in order,
        "image", "input_ids" and "attention_mask", and whose output is one
        sigmoid value per example.
    """
    # Text encoder is loaded from a local directory and kept frozen.
    encoder = transformers.TFRobertaModel.from_pretrained("models/tfroberta-nli-stsb")
    encoder.trainable = False

    print(encoder.config)

    # The data generators build image batches as (height, width, channels), so the
    # input is declared in that order (the original had width and height swapped,
    # which only went unnoticed because both are 224).
    input_img = tf.keras.layers.Input(
        shape = (img_height, img_width, 3), name = "image"
    )

    input_ids = tf.keras.layers.Input(name = "input_ids", shape = (max_len,), dtype = tf.int32)

    attention_mask = tf.keras.layers.Input(name = "attention_mask", shape = (max_len,), dtype = tf.int32)

    # First element of the encoder output; it is consumed as a sequence by the
    # LSTM below (return_sequences keeps one vector per position).
    y = encoder(input_ids, attention_mask = attention_mask)[0]
    y = tf.keras.layers.BatchNormalization()(y)

    y = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences = True))(y)
    y = tf.keras.layers.BatchNormalization()(y)

    # Image backbone wired directly onto the image input; no pretrained weights.
    extractor = tf.keras.applications.EfficientNetB0(
        include_top = False, input_tensor = input_img, weights = None
    )

    extractor.trainable = True

    x = tf.keras.layers.BatchNormalization()(extractor.output)
    x = tf.keras.layers.GlobalMaxPooling2D()(x)
    x = tf.keras.layers.BatchNormalization()(x)

    y = tf.keras.layers.GlobalMaxPooling1D()(y)
    y = tf.keras.layers.BatchNormalization()(y)

    # Fuse the pooled image and text vectors.
    x = tf.keras.layers.concatenate([x, y])
    x = tf.keras.layers.BatchNormalization()(x)

    x = tf.keras.layers.Dense(256, activation = "relu")(x)

    x = tf.keras.layers.Dropout(0.25)(x)
    x = tf.keras.layers.BatchNormalization()(x)

    out = tf.keras.layers.Dense(1, activation = 'sigmoid')(x)

    model = tf.keras.models.Model([input_img, input_ids, attention_mask], out)

    return model

In [10]:
# Build the network, attach the binary-classification training setup and print
# the layer table.
model = build_model()
model.compile(
    loss = "binary_crossentropy",
    optimizer = "adam",
    metrics = ["accuracy"],
)
model.summary()

___________
block6l_add (Add)               (None, 7, 7, 384)    0           block6l_drop[0][0]               
                                                                 block6k_add[0][0]                
__________________________________________________________________________________________________
block6m_expand_conv (Conv2D)    (None, 7, 7, 2304)   884736      block6l_add[0][0]                
__________________________________________________________________________________________________
block6m_expand_bn (BatchNormali (None, 7, 7, 2304)   9216        block6m_expand_conv[0][0]        
__________________________________________________________________________________________________
block6m_expand_activation (Acti (None, 7, 7, 2304)   0           block6m_expand_bn[0][0]          
__________________________________________________________________________________________________
block6m_dwconv (DepthwiseConv2D (None, 7, 7, 2304)   57600       block6m_expand_activation[0][0] 

In [11]:
def encode_single_sample(img_path, label, text):
    """Load and preprocess one labelled example.

    Args:
        img_path: Path of the image file; it is read and decoded as a 3-channel PNG.
        label: Target value for the example; returned unchanged.
        text: Caption string to tokenize.

    Returns:
        dict with keys "image" (the image resized to ``img_height`` x ``img_width``),
        "label" (the ``label`` argument) and "text" (the tokenizer output, padded
        and truncated to ``max_len`` tokens).
    """
    img = tf.io.read_file(img_path)
    img = tf.io.decode_png(img, channels = 3)
    # NOTE(review): converting the decoded image to tf.int32 with
    # convert_image_dtype rescales the pixel values to the int32 range instead of
    # leaving them in 0..255 - confirm this is intended. Left unchanged on purpose:
    # training-time and prediction-time preprocessing must stay identical.
    img = tf.image.convert_image_dtype(img, tf.int32)
    img = tf.image.resize(img, [img_height, img_width])
    # truncation = True caps long captions at max_len tokens; without it a longer
    # caption yields a longer row that cannot be stored in the fixed-width batch
    # arrays. max_len replaces the hard-coded 128 so the two cannot drift apart.
    text = tokenizer(
        text,
        return_tensors = 'tf',
        max_length = max_len,
        padding = 'max_length',
        truncation = True,
    )
    return {"image": img, "label": label, "text": text}

In [12]:
class HatefulMemes(tf.keras.utils.Sequence):
    """Keras Sequence that serves labelled batches as NumPy arrays.

    Each batch is ``([images, input_ids, attention_mask], labels)``; every example
    is preprocessed on the fly with ``encode_single_sample``.

    Args:
        batch_size: Maximum number of examples per batch.
        input_img_paths: List of image file paths.
        label: List of target values, parallel to ``input_img_paths``.
        text: List of caption strings, parallel to ``input_img_paths``.
    """

    def __init__(self, batch_size, input_img_paths, label, text):
        self.batch_size = batch_size
        self.input_img_paths = input_img_paths
        self.text = text
        self.label = label

    def __len__(self):
        """Number of batches per epoch, including a final partial batch."""
        # Ceiling division: floor division silently dropped the trailing examples
        # whenever the dataset size is not a multiple of batch_size.
        return (len(self.input_img_paths) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        """Return ``(inputs, targets)`` for batch number ``idx``.

        ``inputs`` is the list ``[images, input_ids, attention_mask]`` and
        ``targets`` is a float32 vector of labels. The last batch may hold fewer
        than ``batch_size`` examples.
        """
        start = idx * self.batch_size
        stop = start + self.batch_size
        batch_input_img_paths = self.input_img_paths[start:stop]
        batch_label = self.label[start:stop]
        batch_text = self.text[start:stop]

        # Size the arrays by the actual slice so a short final batch carries no
        # zero-filled padding rows (which would look like real label-0 examples).
        n_samples = len(batch_input_img_paths)
        images = np.zeros((n_samples, img_height, img_width, 3), dtype = "float32")
        input_ids = np.zeros((n_samples, max_len), dtype = "int32")
        attention_mask = np.zeros((n_samples, max_len), dtype = "int32")
        labels = np.zeros((n_samples,), dtype = "float32")

        samples = zip(batch_input_img_paths, batch_label, batch_text)
        for row, (img_path, label, text) in enumerate(samples):
            sample = encode_single_sample(img_path, label, text)
            # Assign the arrays directly; going through nested Python lists first
            # only adds a slow round-trip.
            images[row] = sample["image"].numpy()
            # The tokenizer output has a leading axis of size 1; take its only row.
            input_ids[row] = sample["text"]['input_ids'].numpy()[0]
            attention_mask[row] = sample["text"]['attention_mask'].numpy()[0]
            labels[row] = sample["label"]

        return [images, input_ids, attention_mask], labels

In [13]:
# Batch generators for fitting and validation, both with 8 examples per batch.
train_gen = HatefulMemes(
    batch_size = 8,
    input_img_paths = train_df["img"].values.tolist(),
    label = train_df["label"].values.tolist(),
    text = train_df["text"].values.tolist(),
)
val_gen = HatefulMemes(
    batch_size = 8,
    input_img_paths = val_df["img"].values.tolist(),
    label = val_df["label"].values.tolist(),
    text = val_df["text"].values.tolist(),
)

In [14]:
# Train for at most 50 epochs; stop once validation accuracy has not improved for
# 10 consecutive epochs and roll back to the best weights seen.
epochs = 50
early_stopping_patience = 10

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor = "val_accuracy",
    restore_best_weights = True,
    patience = early_stopping_patience,
)

history = model.fit(
    train_gen,
    epochs = epochs,
    validation_data = val_gen,
    callbacks = [early_stopping],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50


### Predict

In [38]:
# Unseen test split, with image paths prefixed by "data/".
# NOTE(review): this rebinds the name train_df to test data, so the training
# frame loaded earlier is no longer reachable under that name after this cell.
train_df = pd.read_json('data/test_unseen.jsonl', lines = True).assign(
    img = lambda frame: "data/" + frame["img"]
)

In [39]:
def val_encode_single_sample(img_path, text):
    """Load and preprocess one unlabelled example for prediction.

    Args:
        img_path: Path of the image file; it is read and decoded as a 3-channel PNG.
        text: Caption string to tokenize.

    Returns:
        dict with keys "image" (the image resized to ``img_height`` x ``img_width``)
        and "text" (the tokenizer output, padded and truncated to ``max_len``
        tokens).
    """
    img = tf.io.read_file(img_path)
    img = tf.io.decode_png(img, channels = 3)
    # NOTE(review): converting the decoded image to tf.int32 with
    # convert_image_dtype rescales the pixel values to the int32 range instead of
    # leaving them in 0..255 - confirm this is intended. Left unchanged on purpose:
    # prediction-time preprocessing must match what the model was trained on.
    img = tf.image.convert_image_dtype(img, tf.int32)
    img = tf.image.resize(img, [img_height, img_width])
    # truncation = True caps long captions at max_len tokens; without it a longer
    # caption yields a longer row that cannot be stored in the fixed-width batch
    # arrays. max_len replaces the hard-coded 128 so the two cannot drift apart.
    text = tokenizer(
        text,
        return_tensors = 'tf',
        max_length = max_len,
        padding = 'max_length',
        truncation = True,
    )
    return {"image": img, "text": text}

In [40]:
class ValHatefulMemes(tf.keras.utils.Sequence):
    """Keras Sequence that serves unlabelled batches as NumPy arrays.

    Each batch is the list ``[images, input_ids, attention_mask]``; every example
    is preprocessed on the fly with ``val_encode_single_sample``.

    Args:
        batch_size: Maximum number of examples per batch.
        input_img_paths: List of image file paths.
        text: List of caption strings, parallel to ``input_img_paths``.
    """

    def __init__(self, batch_size, input_img_paths, text):
        self.batch_size = batch_size
        self.input_img_paths = input_img_paths
        self.text = text

    def __len__(self):
        """Number of batches, including a final partial batch."""
        # Ceiling division: with floor division, any batch_size that does not
        # divide the dataset size would leave the trailing examples unpredicted.
        return (len(self.input_img_paths) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        """Return the model inputs (no targets) for batch number ``idx``.

        The result is the list ``[images, input_ids, attention_mask]``. The last
        batch may hold fewer than ``batch_size`` examples.
        """
        start = idx * self.batch_size
        stop = start + self.batch_size
        batch_input_img_paths = self.input_img_paths[start:stop]
        batch_text = self.text[start:stop]

        # Size the arrays by the actual slice so a short final batch carries no
        # zero-filled padding rows.
        n_samples = len(batch_input_img_paths)
        images = np.zeros((n_samples, img_height, img_width, 3), dtype = "float32")
        input_ids = np.zeros((n_samples, max_len), dtype = "int32")
        attention_mask = np.zeros((n_samples, max_len), dtype = "int32")

        for row, (img_path, text) in enumerate(zip(batch_input_img_paths, batch_text)):
            sample = val_encode_single_sample(img_path, text)
            # Assign the arrays directly; going through nested Python lists first
            # only adds a slow round-trip.
            images[row] = sample["image"].numpy()
            # The tokenizer output has a leading axis of size 1; take its only row.
            input_ids[row] = sample["text"]['input_ids'].numpy()[0]
            attention_mask[row] = sample["text"]['attention_mask'].numpy()[0]

        # Kept as a list, exactly as before.
        # NOTE(review): Keras is understood to unpack a tuple as
        # (x, y, sample_weight), so do not turn this into a tuple - confirm
        # against the installed version.
        return [images, input_ids, attention_mask]

In [41]:
# Prediction generator over the frame loaded in the Predict section (held in
# train_df), with batch size 1: one example per prediction step.
pred_gen = ValHatefulMemes(1, train_df["img"].values.tolist(), train_df["text"].values.tolist())

In [42]:
# Run inference over every batch of pred_gen with a progress bar. The model ends
# in a single sigmoid unit, and later cells read each row of preds as a
# one-element list.
preds = model.predict(pred_gen, verbose = 1)



In [43]:
def prob2pred(x):
    """Threshold a probability: 1 when it is strictly above 0.5, otherwise 0."""
    return 1 if x > 0.5 else 0

In [44]:
# Attach the predicted probability (first value of each prediction row) and the
# thresholded 0/1 label derived from it.
train_df["proba"] = [row[0] for row in preds.tolist()]
train_df["label"] = train_df["proba"].map(prob2pred)

In [45]:
# Drop the two input columns so that only the submission columns remain, then
# display the frame.
# NOTE(review): not idempotent - re-running this cell raises because the columns
# are already gone.
train_df = train_df.drop(columns = ["img", "text"])
train_df

Unnamed: 0,id,proba,label
0,16395,0.938238,1
1,37405,0.611175,1
2,94180,0.905847,1
3,54321,0.231216,0
4,97015,0.691661,1
...,...,...,...
995,3869,0.100575,0
996,23817,0.430327,0
997,56280,0.221710,0
998,29384,0.056025,0


In [46]:
# Write the remaining columns to the submission file, leaving out the DataFrame index.
train_df.to_csv("submission_new.csv", index = False)