# Laboratory exercise 2

## Imports

In [1]:
import string
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.metrics import Precision, Recall
from tensorflow_addons.metrics import F1Score
from tensorflow.keras.optimizers import Adam
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag_sents, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

from scripts.word_embeddings import load_embedding_weights

## Read data

In [2]:
df = pd.read_csv('data/MAMI/TRAINING/training.csv', sep='\t')

In [3]:
df

Unnamed: 0,file_name,misogynous,shaming,stereotype,objectification,violence,Text Transcription
0,1.jpg,0,0,0,0,0,Milk Milk.zip
1,10.jpg,1,0,0,0,1,"ROSES ARE RED, VIOLETS ARE BLUE IF YOU DON'T S..."
2,1000.jpg,0,0,0,0,0,BREAKING NEWS: Russia releases photo of DONALD...
3,10000.jpg,0,0,0,0,0,MAN SEEKING WOMAN Ignad 18 O
4,10006.jpg,0,0,0,0,0,Me explaining the deep lore of. J.R.R. Tolkein...
...,...,...,...,...,...,...,...
9995,15002.jpg,0,0,0,0,0,WAITING FOR THE END OF THE COVID imgflip.com
9996,15003.jpg,0,0,0,0,0,SMART WOMEN ARE AROUND imgflip.com
9997,15004.jpg,0,0,0,0,0,GOOD GIRLS ARE BEHIND THE CORNER imgflip.com
9998,15005.jpg,0,0,0,0,0,COOKING FOR MY WIFE imgflip.com


## Preprocessing

### Get X and y dataframes

In [4]:
X = df[['Text Transcription']].copy()
Y = df[['misogynous', 'shaming', 'stereotype', 'objectification', 'violence']].copy()

In [5]:
X

Unnamed: 0,Text Transcription
0,Milk Milk.zip
1,"ROSES ARE RED, VIOLETS ARE BLUE IF YOU DON'T S..."
2,BREAKING NEWS: Russia releases photo of DONALD...
3,MAN SEEKING WOMAN Ignad 18 O
4,Me explaining the deep lore of. J.R.R. Tolkein...
...,...
9995,WAITING FOR THE END OF THE COVID imgflip.com
9996,SMART WOMEN ARE AROUND imgflip.com
9997,GOOD GIRLS ARE BEHIND THE CORNER imgflip.com
9998,COOKING FOR MY WIFE imgflip.com


In [6]:
Y

Unnamed: 0,misogynous,shaming,stereotype,objectification,violence
0,0,0,0,0,0
1,1,0,0,0,1
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0
...,...,...,...,...,...
9995,0,0,0,0,0
9996,0,0,0,0,0
9997,0,0,0,0,0
9998,0,0,0,0,0


### Tokenize and filter text

In [7]:
def lower_and_word_tokenize_text_series(text_series: pd.Series) -> pd.Series:
    return text_series.str.lower().apply(word_tokenize)

In [8]:
tokens_list = lower_and_word_tokenize_text_series(X['Text Transcription'])

In [9]:
tokens_list

0                                        [milk, milk.zip]
1       [roses, are, red, ,, violets, are, blue, if, y...
2       [breaking, news, :, russia, releases, photo, o...
3                     [man, seeking, woman, ignad, 18, o]
4       [me, explaining, the, deep, lore, of, ., j.r.r...
                              ...                        
9995    [waiting, for, the, end, of, the, covid, imgfl...
9996             [smart, women, are, around, imgflip.com]
9997    [good, girls, are, behind, the, corner, imgfli...
9998                [cooking, for, my, wife, imgflip.com]
9999    [listen, tomorrow, will, be, monday, imgflip.c...
Name: Text Transcription, Length: 10000, dtype: object

In [10]:
def filter_stopwords_and_punctuation(tokens_list: pd.Series) -> pd.Series:
    stopset = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    custom_set = {
        '...',
        '""',
        '``',
        "''"
    }
    filter_set = stopset | punctuation | custom_set
    
    def filter_token(token):
        return token not in filter_set and not re.match(r'\w+\.(com|net)', token)

    return tokens_list.apply(
        lambda tokens: list(filter(filter_token, tokens))
    )

In [11]:
tokens_list = filter_stopwords_and_punctuation(tokens_list)

In [12]:
tokens_list

0                                        [milk, milk.zip]
1       [roses, red, violets, blue, n't, say, yes, 'll...
2       [breaking, news, russia, releases, photo, dona...
3                        [man, seeking, woman, ignad, 18]
4       [explaining, deep, lore, j.r.r, tolkein, 's, w...
                              ...                        
9995                                [waiting, end, covid]
9996                               [smart, women, around]
9997                        [good, girls, behind, corner]
9998                                      [cooking, wife]
9999                           [listen, tomorrow, monday]
Name: Text Transcription, Length: 10000, dtype: object

In [13]:
def get_vocabulary(tokens_list: pd.Series) -> pd.Series:
    return tokens_list.explode().value_counts()

In [14]:
vocabulary = get_vocabulary(tokens_list)
print(f'{len(vocabulary)=}')

len(vocabulary)=21255


### Remove words with occurence less than k from vocabulary

In [15]:
def vocabulary_with_removed_less_than_k(vocabulary, k):
    return vocabulary[vocabulary > k]

In [16]:
smaller_voc = vocabulary_with_removed_less_than_k(vocabulary, 10)

In [17]:
smaller_voc

's          1820
n't         1416
women       1208
like         863
woman        682
            ... 
asses         11
metoo         11
buying        11
radio         11
prepared      11
Name: Text Transcription, Length: 1648, dtype: int64

In [18]:
print(f'{len(smaller_voc)=}')

len(smaller_voc)=1648


### Load glove weight

In [19]:
glove_weights = load_embedding_weights(
    vocabulary=smaller_voc,
    embedding_size=50,
    embedding_type='glove',
    path='data'
)

In [20]:
print(f'{len(glove_weights)=}')

len(glove_weights)=1648


### Generate Word2Vec

In [21]:
lemmatizer = WordNetLemmatizer()

In [22]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [23]:
def lemmatize_tokens_list(tokens_list):
    lemmatized_tokens_list = []
    for pos_tags in pos_tag_sents(tokens_list):
        lemmatized_tokens_list.append([
            lemmatizer.lemmatize(token, pos=get_wordnet_pos(pos))
            for token, pos in pos_tags
        ])
    return pd.Series(lemmatized_tokens_list)

In [24]:
lemmatized_tokens_list = lemmatize_tokens_list(tokens_list)

In [25]:
model_word2vec = Word2Vec(lemmatized_tokens_list, vector_size=50, min_count=1, window=5, sg=0)

In [26]:
word2vec_weights = []
for word in smaller_voc:
    if word in model_word2vec.wv:
        word2vec_weights.append(model_word2vec.wv[word])
    else:
        word2vec_weights.append(word, np.zeros(50))
word2vec_weights = np.array(word2vec_weights)

In [27]:
word2vec_weights

array([[-1.50008826e-02,  3.97121124e-02,  9.69190383e-04, ...,
        -1.16943054e-01,  1.02244392e-01,  8.00101161e-02],
       [-4.31099860e-03,  6.21587299e-02,  3.89502011e-03, ...,
        -1.21534474e-01,  1.27641901e-01,  8.40811729e-02],
       [-2.29673255e-02,  3.76213565e-02,  3.08051128e-02, ...,
        -1.61225051e-01,  1.38468012e-01,  7.68636689e-02],
       ...,
       [-9.72770974e-02,  4.30889875e-01,  2.32173368e-01, ...,
        -1.20196009e+00,  1.21108460e+00,  7.28609204e-01],
       [-9.72770974e-02,  4.30889875e-01,  2.32173368e-01, ...,
        -1.20196009e+00,  1.21108460e+00,  7.28609204e-01],
       [-9.72770974e-02,  4.30889875e-01,  2.32173368e-01, ...,
        -1.20196009e+00,  1.21108460e+00,  7.28609204e-01]], dtype=float32)

## Problem 1

In [28]:
input_dim = int(X['Text Transcription'].map(len).mean())
input_dim

100

In [29]:
tokenizer = Tokenizer(num_words=len(smaller_voc))

In [30]:
tokenizer.fit_on_texts(X['Text Transcription'])

In [31]:
sequences = tokenizer.texts_to_sequences(X['Text Transcription'])

In [32]:
padded_sequences = pad_sequences(sequences, input_dim)
padded_sequences

array([[   0,    0,    0, ...,    0, 1016, 1016],
       [   0,    0,    0, ...,    3,   87,    7],
       [   0,    0,    0, ...,  300,  112,  296],
       ...,
       [   0,    0,    0, ...,    1,   32,    7],
       [   0,    0,    0, ...,   44,   32,    7],
       [   0,    0,    0, ...,   61, 1191,    7]])

In [33]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, Y['misogynous'].astype(int), test_size=0.2, random_state=42)

In [34]:
metrics_list = []

### Glove

In [35]:
model = Sequential()
model.add(Embedding(input_dim=len(smaller_voc), output_dim=len(glove_weights[0]), weights=[glove_weights]))
model.add(LSTM(units=128))
model.add(Dense(1, activation='sigmoid'))

In [36]:
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='binary_crossentropy',
    metrics=['accuracy', Precision(), Recall(), F1Score(1)]
)


In [37]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 50)          82400     
                                                                 
 lstm (LSTM)                 (None, 128)               91648     
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 174,177
Trainable params: 174,177
Non-trainable params: 0
_________________________________________________________________


In [38]:
model.fit(X_train, y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x276d6b02310>

In [39]:
loss, accuracy, precision, recall, f1 = model.evaluate(X_test, y_test)
metrics_list.append(
    {
        'weights': 'glove',
        'loss': loss,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
)



### No weights

In [40]:
model = Sequential()
model.add(Embedding(input_dim=len(smaller_voc), output_dim=50))
model.add(LSTM(units=128))
model.add(Dense(1, activation='sigmoid'))

In [41]:
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='binary_crossentropy',
    metrics=['accuracy', Precision(), Recall(), F1Score(1)]
)


In [42]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 50)          82400     
                                                                 
 lstm_1 (LSTM)               (None, 128)               91648     
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 174,177
Trainable params: 174,177
Non-trainable params: 0
_________________________________________________________________


In [43]:
model.fit(X_train, y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x276dd39ae80>

In [44]:
loss, accuracy, precision, recall, f1 = model.evaluate(X_test, y_test)
metrics_list.append(
    {
        'weights': 'no weights',
        'loss': loss,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
)



### Word2Vec 

In [45]:
model = Sequential()
model.add(Embedding(input_dim=len(smaller_voc), output_dim=len(word2vec_weights[0]), weights=[word2vec_weights]))
model.add(LSTM(units=128))
model.add(Dense(1, activation='sigmoid'))

In [46]:
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='binary_crossentropy',
    metrics=['accuracy', Precision(), Recall(), F1Score(1)]
)

In [47]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 50)          82400     
                                                                 
 lstm_2 (LSTM)               (None, 128)               91648     
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 174,177
Trainable params: 174,177
Non-trainable params: 0
_________________________________________________________________


In [48]:
model.fit(X_train, y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x276e17fd250>

In [49]:
loss, accuracy, precision, recall, f1 = model.evaluate(X_test, y_test)
metrics_list.append(
    {
        'weights': 'word2vec',
        'loss': loss,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
)



### Conclusion

In [50]:
pd.DataFrame(metrics_list)

Unnamed: 0,weights,loss,accuracy,precision,recall,f1
0,glove,0.888117,0.7435,0.760339,0.714143,[0.6684421]
1,no weights,1.126034,0.7265,0.726462,0.73008,[0.6684421]
2,word2vec,0.790129,0.754,0.75498,0.75498,[0.6684421]


We got the highest accuracy with the word2vec weights.

## Problem 2

For problem 2 we need to change Y to have 4 classes. Change the last layer to be 4 sigmoids, 1 for each class.
And change the F1 score to be 4 classes. 

In [51]:
X_train, X_test, Y_train, Y_test = train_test_split(
    padded_sequences, Y[['shaming', 'stereotype', 'objectification', 'violence']].astype(int), test_size=0.2, random_state=42
)

In [52]:
metrics_list = []

### Glove

In [53]:
model = Sequential()
model.add(Embedding(input_dim=len(smaller_voc), output_dim=len(glove_weights[0]), weights=[glove_weights]))
model.add(LSTM(units=128))
model.add(Dense(4, activation='sigmoid'))

In [54]:
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='binary_crossentropy',
    metrics=['accuracy', Precision(), Recall(), F1Score(4)]
)


In [55]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 50)          82400     
                                                                 
 lstm_3 (LSTM)               (None, 128)               91648     
                                                                 
 dense_3 (Dense)             (None, 4)                 516       
                                                                 
Total params: 174,564
Trainable params: 174,564
Non-trainable params: 0
_________________________________________________________________


In [56]:
model.fit(X_train, Y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x276e72d6610>

In [57]:
loss, accuracy, precision, recall, f1 = model.evaluate(X_test, Y_test)
metrics_list.append(
    {
        'weights': 'glove',
        'loss': loss,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
)



### No weights

In [58]:
model = Sequential()
model.add(Embedding(input_dim=len(smaller_voc), output_dim=50))
model.add(LSTM(units=128))
model.add(Dense(4, activation='sigmoid'))

In [59]:
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='binary_crossentropy',
    metrics=['accuracy', Precision(), Recall(), F1Score(4)]
)


In [60]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 50)          82400     
                                                                 
 lstm_4 (LSTM)               (None, 128)               91648     
                                                                 
 dense_4 (Dense)             (None, 4)                 516       
                                                                 
Total params: 174,564
Trainable params: 174,564
Non-trainable params: 0
_________________________________________________________________


In [61]:
model.fit(X_train, Y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x276ea3124f0>

In [62]:
loss, accuracy, precision, recall, f1 = model.evaluate(X_test, Y_test)
metrics_list.append(
    {
        'weights': 'no weights',
        'loss': loss,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
)



### Word2Vec 

In [63]:
model = Sequential()
model.add(Embedding(input_dim=len(smaller_voc), output_dim=len(word2vec_weights[0]), weights=[word2vec_weights]))
model.add(LSTM(units=128))
model.add(Dense(4, activation='sigmoid'))

In [64]:
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='binary_crossentropy',
    metrics=['accuracy', Precision(), Recall(), F1Score(4)]
)

In [65]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, None, 50)          82400     
                                                                 
 lstm_5 (LSTM)               (None, 128)               91648     
                                                                 
 dense_5 (Dense)             (None, 4)                 516       
                                                                 
Total params: 174,564
Trainable params: 174,564
Non-trainable params: 0
_________________________________________________________________


In [66]:
model.fit(X_train, Y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x276efab57f0>

In [67]:
loss, accuracy, precision, recall, f1 = model.evaluate(X_test, Y_test)
metrics_list.append(
    {
        'weights': 'word2vec',
        'loss': loss,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
)



### Conclusion

In [68]:
pd.DataFrame(metrics_list)

Unnamed: 0,weights,loss,accuracy,precision,recall,f1
0,glove,0.714575,0.297,0.460506,0.404861,"[0.24568139, 0.4531365, 0.3494983, 0.2173913]"
1,no weights,0.70665,0.3,0.4816,0.418056,"[0.29038858, 0.4411567, 0.38468552, 0.3160763]"
2,word2vec,0.519872,0.337,0.47593,0.302083,"[0.17335474, 0.46614397, 0.32258064, 0.22492401]"


Accuracy wise we got the best result with word2vec. But it seems to have a bad f1 score for shaming.