In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV and the image folder read
# later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import cv2
from keras.utils import to_categorical
import re
import spacy
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.applications.resnet_v2 import ResNet152V2
from keras.utils import plot_model
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
from keras.layers import Dense, Concatenate, Input, Flatten, Embedding, CuDNNLSTM, Bidirectional, Dropout, LSTM
from keras.models import Model
from sklearn.dummy import DummyClassifier
import pickle

In [None]:
# Read the annotation table from Drive. Column names are passed in explicitly through
# `names`, so every row of the file (including any header row) is read as data.
column_names = [
    'image_name',
    'Image_URL',
    'OCR_extracted_text',
    'Corrected_text',
    'Humour',
    'Sarcasm',
    'Offense',
    'Motivation',
    'Overall_sentiment',
]
df = pd.read_csv(
    "/content/drive/My Drive/Colab Notebooks/data_7000_new.csv",
    names=column_names,
)

In [None]:
# Discard rows whose label columns contain anything other than the expected class names.
# The three filters are applied one after another, each on the result of the previous one.
valid_sentiments = ['very_positive', 'positive', 'neutral', 'negative', 'very_negative']
valid_motivations = ['motivational', 'not_motivational']
valid_offenses = ['not_offensive', 'slight', 'very_offensive', 'hateful_offensive']

df = df.drop(df[~df.Overall_sentiment.isin(valid_sentiments)].index)
df = df.drop(df[~df.Motivation.isin(valid_motivations)].index)
df = df.drop(df[~df.Offense.isin(valid_offenses)].index)

In [None]:
#store all images in list after resizing them
# Every image named in df['image_name'] is read and resized to 256x256. The 0-based
# position (in df row order) of each image that cannot be read or resized is recorded in
# error_pics so that the matching rows can be removed from df afterwards.
X_train_pics = []
error_pics = set()
for pic_ind, img in enumerate(df['image_name']):
    im = cv2.imread("/content/drive/My Drive/data_7000/" + str(img))
    resized = None
    # cv2.imread reports a missing/undecodable file by returning None instead of raising,
    # so that case is checked explicitly rather than left to fail inside cv2.resize.
    if im is not None:
        try:
            resized = cv2.resize(im, (256, 256), interpolation = cv2.INTER_AREA)
        except cv2.error:
            # Only OpenCV failures mark the picture as bad; unrelated exceptions
            # (e.g. KeyboardInterrupt) are no longer swallowed by a bare except.
            resized = None
    if resized is None:
        print("Error loading pic no. " + str(pic_ind))
        error_pics.add(pic_ind)
        continue
    X_train_pics.append(resized)
    print("Loaded pic no. " + str(pic_ind))

In [None]:
# Remove the rows whose picture failed to load. error_pics holds row *positions*, so they
# are translated to index labels before dropping.
failed_positions = sorted(error_pics)
failed_labels = [df.index[position] for position in failed_positions]
df = df.drop(failed_labels)

In [None]:
#perform text preprocessing
# Step 1: cast to str and strip URL-like tokens (anything containing ".<tld>" for the listed TLDs).
url_pattern = r'[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil)[\S]*\s?'
X_train_text = df['Corrected_text'].apply(str).apply(lambda x: re.sub(url_pattern,'',x))
# Step 2: remove punctuation characters. The lookup set is built once instead of once per character.
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'
punctuation_chars = set(punctuation)
X_train_text = X_train_text.apply(lambda x: ''.join(ch for ch in x if ch not in punctuation_chars))
# Step 3: lower-case, blank out digits, collapse runs of whitespace.
X_train_text = X_train_text.str.lower()
# regex=True is spelled out because pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave every digit in place.
X_train_text = X_train_text.str.replace("[0-9]", " ", regex=True)
X_train_text = X_train_text.apply(lambda x:' '.join(x.split()))
# spaCy pipeline used for lemmatization; parser and NER are disabled.
nlp = spacy.load('en', disable=['parser', 'ner'])

def lemmatization(texts):
    """Lemmatize every text with the module-level spaCy pipeline `nlp`.

    texts: iterable of strings.
    Returns a list (same order as the input) in which each string is the
    space-joined sequence of `lemma_` values of the tokens `nlp` produced.
    """
    return [' '.join(tok.lemma_ for tok in nlp(text)) for text in texts]
# Replace the cleaned texts with their lemmatized form; X_train_text is a plain list of
# strings from here on (lemmatization returns a list).
X_train_text = lemmatization(X_train_text)

In [None]:
# Fit a Keras tokenizer on the lemmatized texts. num_words=5000 is the same value used
# later in the notebook for vocab_size and for the Embedding layers' input_dim.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train_text)
# From here on X_train_text holds sequences of integer word ids instead of strings.
X_train_text = tokenizer.texts_to_sequences(X_train_text)

In [None]:
# Pickle the fitted tokenizer to the file "tokenizer" in the working directory so it can
# be reloaded later. The context manager closes the file even if pickling fails.
with open("tokenizer", 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
#169 is the length of the longest sequence
# Bring every id sequence to 169 entries (padding='post'); this matches the
# Input(shape=(169,)) text input of the models defined later in the notebook.
X_train_text = pad_sequences(X_train_text, maxlen=169, padding='post')

In [None]:
#import pretrained ResNet
# ImageNet-pretrained ResNet152V2 without its classification head, for 256x256 RGB input.
# This single instance is reused as the image branch of all three task models.
resnet_base = ResNet152V2(include_top=False, weights='imagenet', input_shape=(256, 256, 3))
# Freeze the backbone so that only the layers stacked on top of it are trained.
resnet_base.trainable = False

In [None]:
#normalize images
# Scale pixel values to [0, 1]. The stack is created directly as float32 and divided in
# place: np.array(list_of_images) / 255 would materialise the whole data set as float64,
# doubling its memory footprint.
# NOTE: not idempotent - re-running this cell divides the already-normalized data again.
X_train_pics = np.asarray(X_train_pics, dtype=np.float32)
X_train_pics /= 255

In [None]:
#download Glove vectors
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip /content/glove.6B.zip

In [None]:
# Parse the 200-dimensional GloVe text file into {word: float32 vector}.
# Each line is "<word> <v1> <v2> ...". The file is opened with an explicit UTF-8 encoding
# (independent of the platform default) and closed by the context manager even when
# parsing fails.
embeddings_index = {}
with open("/content/glove.6B.200d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print("Found %s word vectors." %len(embeddings_index))

In [None]:
#generate embedding matrix for all tasks
# Row k of the matrix holds the GloVe vector of the word with tokenizer id k. Rows stay
# all-zero for ids >= vocab_size and for words that have no GloVe entry.
vocab_size = 5000
embedding_dim = 200
word_index = tokenizer.word_index
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, token_id in word_index.items():
    if token_id >= vocab_size:
        continue
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[token_id] = glove_vector

# **TASK 1**

In [None]:
# Task 1 model: two inputs (image, padded word ids) -> one sigmoid unit.
input1_task1 = Input(shape=(256, 256, 3))
input2_task1 = Input(shape=(169,))

# Image branch: shared ResNet backbone -> flatten -> 128-unit dense layer.
base_output_task1 = resnet_base(input1_task1)
out1_task1 = Flatten()(base_output_task1)
out1_task1 = Dense(128, activation='relu')(out1_task1)

# Text branch: 200-d embedding -> two stacked bidirectional CuDNN LSTMs.
out2_task1 = Embedding(input_dim=5000, output_dim=200, input_length=169)(input2_task1)
out2_task1 = Bidirectional(CuDNNLSTM(200, return_sequences=True))(out2_task1)
out2_task1 = Bidirectional(CuDNNLSTM(64))(out2_task1)

# Fusion head: concatenate both branches, then dense layers with dropout in between.
merged_task1 = Concatenate(axis=1)([out1_task1, out2_task1])
merged_task1 = Dropout(0.5)(merged_task1)
merged_task1 = Dense(128, activation='relu')(merged_task1)
merged_task1 = Dropout(0.5)(merged_task1)
merged_task1 = Dense(64, activation='relu')(merged_task1)
merged_task1 = Dropout(0.5)(merged_task1)

out_task1 = Dense(1, activation='sigmoid')(merged_task1)

# `outputs=` is the documented keyword; the singular `output=` is only a legacy alias
# that newer Keras releases do not accept.
model_task1 = Model(inputs=[input1_task1,input2_task1], outputs=out_task1)

In [None]:
# Load the GloVe matrix into the text branch. The Embedding layer is looked up by type
# instead of by a hard-coded position in model.layers, which depends on how Keras orders
# the layers and breaks silently-or-loudly as soon as the architecture changes.
embedding_layer_task1 = next(layer for layer in model_task1.layers if isinstance(layer, Embedding))
embedding_layer_task1.set_weights([embedding_matrix])

In [None]:
# Binary target for Task 1: 1 where the meme is labelled 'motivational', otherwise 0.
is_motivational = df['Motivation'] == 'motivational'
y_train_task1 = is_motivational.astype('int64')

In [None]:
# "balanced" class weights for the Task 1 labels.
classes_task1 = np.unique(y_train_task1)
# Keyword arguments: scikit-learn >= 1.0 no longer accepts `classes` / `y` positionally.
weights_task1 = compute_class_weight(class_weight="balanced", classes=classes_task1, y=y_train_task1)
# Keras' fit(class_weight=...) expects a {class index: weight} mapping, not a bare array.
class_weights = {int(label): float(weight) for label, weight in zip(classes_task1, weights_task1)}

In [None]:
#define function for F1 score
from keras import backend as K

def f1(y_true, y_pred):
    """Batch-wise F1 score built from Keras backend ops.

    Targets and predictions are clipped to [0, 1] and rounded, then counted:
      hits      - entries where round(clip(y_true * y_pred)) is 1
      actual    - entries where the rounded target is 1
      predicted - entries where the rounded prediction is 1
    Recall is hits / actual, precision is hits / predicted, and the result is
    their harmonic mean. K.epsilon() is added to every denominator so an empty
    batch component yields 0 instead of a division by zero.

    The value is computed over whatever batch the backend passes in; it is not a
    data-set level F1.
    """
    eps = K.epsilon()

    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision_value = hits / (predicted + eps)
    recall_value = hits / (actual + eps)
    return 2*((precision_value*recall_value)/(precision_value+recall_value+eps))


# Binary cross-entropy pairs with the single sigmoid output of the Task 1 model; the
# custom f1 metric and accuracy are reported during training.
model_task1.compile(loss='binary_crossentropy',
          optimizer= "adam",
          metrics=[f1, 'acc'])

In [None]:
# Train Task 1 on the image and text inputs together, weighting classes with class_weights.
# NOTE(review): validation_split=0.2 is expected to hold out the tail of the arrays (no
# shuffling before the split), so the validation set follows df row order - confirm that
# this is intended.
history_task1 = model_task1.fit([X_train_pics, X_train_text], y_train_task1, epochs=50, batch_size=256, validation_split=0.2, class_weight=class_weights)

In [None]:
# Task 1: training vs. validation F1 per epoch.
for history_key in ("f1", "val_f1"):
    plt.plot(history_task1.history[history_key])
plt.xlabel("Epochs")
plt.ylabel("F1 score")
plt.legend(("Train", "Val"))

In [None]:
# Task 1: training vs. validation accuracy per epoch.
for history_key in ("acc", "val_acc"):
    plt.plot(history_task1.history[history_key])
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend(("Train", "Val"))

In [None]:
# Save the trained Task 1 model as an HDF5 file in the current working directory.
model_task1.save('Task1_50_epochs.h5')

# **TASK 2**

In [None]:
# Task 2 model: two inputs (image, padded word ids) -> 5-way softmax.
input1_task2 = Input(shape=(256, 256, 3))
input2_task2 = Input(shape=(169,))

# Image branch: shared ResNet backbone -> flatten -> 128-unit dense layer.
base_output_task2 = resnet_base(input1_task2)
out1_task2 = Flatten()(base_output_task2)
out1_task2 = Dense(128, activation='relu')(out1_task2)

# Text branch: 200-d embedding -> two stacked bidirectional CuDNN LSTMs.
out2_task2 = Embedding(input_dim=5000, output_dim=200, input_length=169)(input2_task2)
out2_task2 = Bidirectional(CuDNNLSTM(200, return_sequences=True))(out2_task2)
out2_task2 = Bidirectional(CuDNNLSTM(64))(out2_task2)

# Fusion head: concatenate both branches, then dense layers with dropout in between.
merged_task2 = Concatenate(axis=1)([out1_task2, out2_task2])
merged_task2 = Dropout(0.5)(merged_task2)
merged_task2 = Dense(128, activation='relu')(merged_task2)
merged_task2 = Dropout(0.5)(merged_task2)
merged_task2 = Dense(64, activation='relu')(merged_task2)
merged_task2 = Dropout(0.5)(merged_task2)

out_task2 = Dense(5, activation='softmax')(merged_task2)

# `outputs=` is the documented keyword; the singular `output=` is only a legacy alias
# that newer Keras releases do not accept.
model_task2 = Model(inputs=[input1_task2,input2_task2], outputs=out_task2)

In [None]:
# Encode the five sentiment labels as the integers 0..4 (most negative to most positive),
# then one-hot encode them for the softmax output.
sentiment_to_id = {
    "very_negative": 0,
    "negative": 1,
    "neutral": 2,
    "positive": 3,
    "very_positive": 4,
}
y_train_task2 = df['Overall_sentiment'].replace(sentiment_to_id)
y_train_task2_cat = to_categorical(y_train_task2)

In [None]:
# Load the GloVe matrix into the text branch. The Embedding layer is looked up by type
# instead of by a hard-coded position in model.layers, which depends on how Keras orders
# the layers and breaks as soon as the architecture changes.
embedding_layer_task2 = next(layer for layer in model_task2.layers if isinstance(layer, Embedding))
embedding_layer_task2.set_weights([embedding_matrix])

In [None]:
# "balanced" class weights for the Task 2 integer labels (this rebinds class_weights,
# replacing the Task 1 weights).
classes_task2 = np.unique(y_train_task2)
# Keyword arguments: scikit-learn >= 1.0 no longer accepts `classes` / `y` positionally.
weights_task2 = compute_class_weight(class_weight="balanced", classes=classes_task2, y=y_train_task2)
# Keras' fit(class_weight=...) expects a {class index: weight} mapping, not a bare array.
class_weights = {int(label): float(weight) for label, weight in zip(classes_task2, weights_task2)}

In [None]:
from keras import backend as K

def f1(y_true, y_pred):
    """Batch-wise F1 score built from Keras backend ops.

    Targets and predictions are clipped to [0, 1] and rounded, then counted:
      hits      - entries where round(clip(y_true * y_pred)) is 1
      actual    - entries where the rounded target is 1
      predicted - entries where the rounded prediction is 1
    Recall is hits / actual, precision is hits / predicted, and the result is
    their harmonic mean. K.epsilon() is added to every denominator so an empty
    batch component yields 0 instead of a division by zero.

    The value is computed over whatever batch the backend passes in; it is not a
    data-set level F1.
    """
    eps = K.epsilon()

    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision_value = hits / (predicted + eps)
    recall_value = hits / (actual + eps)
    return 2*((precision_value*recall_value)/(precision_value+recall_value+eps))


# Task 2 is single-label 5-class classification (softmax output, one-hot targets), so the
# loss is categorical cross-entropy. With binary cross-entropy each of the five outputs is
# scored as an independent yes/no problem and 'acc' resolves to binary accuracy, which
# overstates how often the predicted class is actually right.
model_task2.compile(loss='categorical_crossentropy',
          optimizer= "adam",
          metrics=[f1, 'acc'])

In [None]:
# Train Task 2 on the one-hot sentiment targets, weighting classes with class_weights.
# NOTE(review): validation_split=0.2 is expected to hold out the tail of the arrays (no
# shuffling before the split) - confirm that this is intended.
history_task2 = model_task2.fit([X_train_pics, X_train_text], y_train_task2_cat, epochs=50, batch_size=256, validation_split=0.2, class_weight=class_weights)

In [None]:
# Task 2: training vs. validation F1 per epoch.
for history_key in ("f1", "val_f1"):
    plt.plot(history_task2.history[history_key])
plt.xlabel("Epochs")
plt.ylabel("F1 score")
plt.legend(("Train", "Val"))

In [None]:
# Task 2: training vs. validation accuracy per epoch.
for history_key in ("acc", "val_acc"):
    plt.plot(history_task2.history[history_key])
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend(("Train", "Val"))

In [None]:
# Save the trained Task 2 model as an HDF5 file in the current working directory.
model_task2.save('Task2_50_epochs.h5')

# **TASK 3**

In [None]:
# Task 3 model: two inputs (image, padded word ids) -> one linear unit (regression).
input1_task3 = Input(shape=(256, 256, 3))
input2_task3 = Input(shape=(169,))

# Image branch: shared ResNet backbone -> flatten -> 128-unit dense layer.
base_output_task3 = resnet_base(input1_task3)
out1_task3 = Flatten()(base_output_task3)
out1_task3 = Dense(128, activation='relu')(out1_task3)

# Text branch: 200-d embedding -> two stacked bidirectional CuDNN LSTMs.
out2_task3 = Embedding(input_dim=5000, output_dim=200, input_length=169)(input2_task3)
out2_task3 = Bidirectional(CuDNNLSTM(200, return_sequences=True))(out2_task3)
out2_task3 = Bidirectional(CuDNNLSTM(64))(out2_task3)

# Fusion head: concatenate both branches, then dense layers with dropout in between.
merged_task3 = Concatenate(axis=1)([out1_task3, out2_task3])
merged_task3 = Dropout(0.5)(merged_task3)
merged_task3 = Dense(128, activation='relu')(merged_task3)
merged_task3 = Dropout(0.5)(merged_task3)
merged_task3 = Dense(64, activation='relu')(merged_task3)
merged_task3 = Dropout(0.5)(merged_task3)

# No activation: the output is an unbounded real value.
out_task3 = Dense(1)(merged_task3)

# `outputs=` is the documented keyword; the singular `output=` is only a legacy alias
# that newer Keras releases do not accept.
model_task3 = Model(inputs=[input1_task3,input2_task3], outputs=out_task3)

In [None]:
# Show how many rows fall into each Offense class.
df['Offense'].value_counts()

In [None]:
# Load the GloVe matrix into the text branch. The Embedding layer is looked up by type
# instead of by a hard-coded position in model.layers, which depends on how Keras orders
# the layers and breaks as soon as the architecture changes.
embedding_layer_task3 = next(layer for layer in model_task3.layers if isinstance(layer, Embedding))
embedding_layer_task3.set_weights([embedding_matrix])

In [None]:
# Encode the four offense labels as the integers 0..3; used as the regression target.
offense_to_level = {
    "not_offensive": 0,
    "slight": 1,
    "very_offensive": 2,
    "hateful_offensive": 3,
}
y_train_task3 = df['Offense'].replace(offense_to_level)

In [None]:
# Task 3 is trained as regression on the 0-3 offense level: mean squared error is the
# loss, and MAE / MSE are reported as metrics.
model_task3.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae', 'mse'])

In [None]:
# Train Task 3 for 10 epochs; no class weights are passed here.
# NOTE(review): validation_split=0.2 is expected to hold out the tail of the arrays (no
# shuffling before the split) - confirm that this is intended.
history_task3 = model_task3.fit([X_train_pics, X_train_text], y_train_task3, epochs=10, batch_size=256, validation_split=0.2)

In [None]:
# Task 3: training vs. validation mean absolute error per epoch.
for history_key in ("mean_absolute_error", "val_mean_absolute_error"):
    plt.plot(history_task3.history[history_key])
plt.xlabel("Epochs")
plt.ylabel("Mean Absolute Error")
plt.legend(("Train", "Val"))

In [None]:
# Save the trained Task 3 model as an HDF5 file in the current working directory.
# NOTE(review): the file name says 15 epochs, but the fit call in this notebook uses
# epochs=10. The same name is used by the copy-to-Drive cell, so rename both together.
model_task3.save('Task3_15_epochs.h5')

In [None]:
# Copy the saved Task 2 model file into Google Drive.
# NOTE(review): no matching copy of Task1_50_epochs.h5 is visible in this notebook.
!cp /content/Task2_50_epochs.h5 /content/drive/My\ Drive/

In [None]:
# Copy the saved Task 3 model file into Google Drive.
!cp /content/Task3_15_epochs.h5 /content/drive/My\ Drive/