In [None]:
# Mount the Colab user's Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import MultiLabelBinarizer

# Load the three dataset splits.
#
# The rows are deliberately NOT filtered on `hate speech == 1`: that column is
# the binary target used below, so keeping only the positive rows would leave
# a single class in train_y / test_y and make the classifier and its
# NH-vs-HS report meaningless.
url = '/dataset/train.csv'
df_train = pd.read_csv(url)

url = '/dataset/val.csv'
df_val = pd.read_csv(url)

url = '/dataset/test.csv'
df_test = pd.read_csv(url)

# The validation split is folded into the training data.
df_train = pd.concat([df_train, df_val], ignore_index=True)

# Both classes must be present for binary training/evaluation to make sense.
assert df_train['hate speech'].nunique() == 2, "training labels must contain both classes"
assert df_test['hate speech'].nunique() == 2, "test labels must contain both classes"

# Model inputs (raw sentences) and binary targets.
train_x = df_train['sentence'].to_list()
train_y = df_train['hate speech'].values

test_x = df_test['sentence'].to_list()
test_y = df_test['hate speech'].values

In [None]:
#importing libraries + pip installing fasttext

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPool1D, Input, Flatten, MaxPooling1D, SpatialDropout1D, Activation

from keras.callbacks import EarlyStopping

import numpy as np
from sklearn.metrics import classification_report

import gensim
from gensim import models
from gensim.models import Word2Vec

!pip install fasttext
import fasttext.util

In [None]:
# Fit the vocabulary on the training sentences only; the tokenizer is
# configured with "<OOV>" as its out-of-vocabulary token.
tokenizer = Tokenizer(split=' ', oov_token="<OOV>")
tokenizer.fit_on_texts(train_x)

# Convert both splits to integer sequences with the same fitted tokenizer.
train_encoded = tokenizer.texts_to_sequences(train_x)
test_encoded = tokenizer.texts_to_sequences(test_x)

# Post-pad the training sequences, then pad the test sequences to the width
# of the padded training matrix so both share one input shape.
train_padded = pad_sequences(train_encoded, padding='post')
test_padded = pad_sequences(
    test_encoded,
    maxlen=train_padded.shape[1],
    padding='post',
)

In [None]:
# Shared dimensions for the embedding layers built below.
EMBEDDING_DIM = 300  # width of every word vector
# One row more than the number of word_index entries
# (presumably because Keras word ids start at 1 and 0 is the padding id).
vocabulary_size = 1 + len(tokenizer.word_index)
# Width of the padded training matrix.
max_length = train_padded.shape[1]

# function that takes word vectors as input and returns an embedding layer
def embedding_creation(EMBEDDING_DIM, word_vectors):
  """Build a frozen Keras Embedding layer from pre-trained word vectors.

  Uses the module-level `tokenizer`: for every (word, index) pair in its
  `word_index`, row `index` of the weight matrix is set to
  `word_vectors[word]`. If that lookup raises KeyError, the row is drawn from
  a normal distribution with mean 0 and standard deviation sqrt(0.25).
  Rows never assigned in the loop stay all-zero.

  Args:
    EMBEDDING_DIM: width of each word vector (number of matrix columns).
    word_vectors: object supporting `word_vectors[word]` lookups.

  Returns:
    An `Embedding` layer initialised with the matrix and `trainable=False`.
  """
  token_to_row = tokenizer.word_index
  n_rows = len(token_to_row) + 1
  weights = np.zeros((n_rows, EMBEDDING_DIM))

  for token, row in token_to_row.items():
    try:
      vector = word_vectors[token]
    except KeyError:
      # No pre-trained vector for this token: fall back to random values.
      vector = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)
    weights[row] = vector

  return Embedding(n_rows, EMBEDDING_DIM, weights=[weights], trainable=False)

# creating informal_FastText embedding layer (IFT)
url = ''
wv = fasttext.load_model(url)
IFT = embedding_creation(EMBEDDING_DIM, wv)
del wv  # release the model before the next one is loaded

# creating multilingual_FastText embedding layer (MFT)
url = ''
wv = fasttext.load_model(url)
MFT = embedding_creation(EMBEDDING_DIM, wv)
del wv

# creating BengFastText embedding layer (BFT)
# Word2Vec.load returns the full model object; per-word vectors are looked up
# on its `.wv` KeyedVectors. Indexing the model itself was deprecated in
# gensim 3.x and fails on gensim >= 4 with an error that the KeyError handler
# in embedding_creation does not catch.
url = ''
wv = Word2Vec.load(url).wv
BFT = embedding_creation(EMBEDDING_DIM, wv)
del wv

# creating a randomly initialized embedding layer (RE)
RE = Embedding(vocabulary_size, EMBEDDING_DIM,input_length = max_length, trainable=True)

  


In [None]:
# Display names of the two classes (NH = not hate speech, HS = hate speech).
target_names = ['NH', 'HS']

# Per-embedding result columns; the training loop appends one value to each.
# Suffixes: _p = precision, _r = recall, _f = f1; w_ = weighted average.
emb_name = []
nh_p, nh_r, nh_f = [], [], []
hs_p, hs_r, hs_f = [], [], []
w_p, w_r, w_f = [], [], []

# Early stopping: halt training once val_loss has not improved for 3
# consecutive epochs, and restore the weights of the epoch with the lowest
# val_loss.
earlystop_callback = EarlyStopping(
    monitor="val_loss",
    mode="min",
    min_delta=0,
    patience=3,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
emb_X_name_collection = [ [MFT,'MFT'],[IFT, 'IFT'], [BFT,'BFT'], [RE,'RE'] ]

# Train one BiLSTM classifier per embedding layer and append its test-set
# metrics (as percentages rounded to 2 decimals) to the result lists.
for emb_X_name in emb_X_name_collection:
  embedding_layer, embedding_label = emb_X_name

  # generating model
  model = Sequential([
    embedding_layer,
    Bidirectional(LSTM(100, dropout=0.3, return_sequences=True)),
    GlobalMaxPool1D(),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
  ],
  name="Sentiment_Model")
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  # NOTE(review): the test split doubles as the early-stopping validation set,
  # so the scores reported below are selected on the same data they are
  # measured on; consider holding out a separate validation split.
  model.fit(train_padded, train_y, epochs=100, batch_size=32, validation_data=(test_padded,test_y), callbacks=[earlystop_callback])

  # The network ends in a single sigmoid unit, so predict() yields one
  # probability per sample. argmax over that length-1 axis is always 0 (every
  # sample would be labelled 'NH'); threshold the probability instead.
  prediction = model.predict(test_padded)
  prediction = (prediction.ravel() > 0.5).astype(int)

  # labels=[0, 1] keeps both classes in the report even if one of them is
  # absent from the predictions.
  r = classification_report(test_y, prediction, labels=[0, 1], target_names=target_names, output_dict=True)

  emb_name.append(embedding_label)

  nh_p.append(round(r['NH']['precision']*100, 2))
  nh_r.append(round(r['NH']['recall']*100, 2))
  nh_f.append(round(r['NH']['f1-score']*100, 2))
  hs_p.append(round(r['HS']['precision']*100, 2))
  hs_r.append(round(r['HS']['recall']*100, 2))
  hs_f.append(round(r['HS']['f1-score']*100, 2))
  w_p.append(round(r['weighted avg']['precision']*100, 2))
  w_r.append(round(r['weighted avg']['recall']*100, 2))
  w_f.append(round(r['weighted avg']['f1-score']*100, 2))

  # Drop the reference to the trained model before building the next one.
  del model

nh = not hate speech

hs = hate speech

p = precision

r = recall

f = f1

w = weighted average

In [None]:
# Assemble the collected scores into one table, one row per embedding.
# Column prefixes: nh = not hate speech, h = hate speech, w = weighted average.
column_names = [
    'emb name',
    'nh_p', 'nh_r', 'nh_f1',
    'h_p', 'h_r', 'h_f1',
    'w_p', 'w_r', 'w_f1',
]
column_values = [
    emb_name,
    nh_p, nh_r, nh_f,
    hs_p, hs_r, hs_f,
    w_p, w_r, w_f,
]
result = dict(zip(column_names, column_values))
dd = pd.DataFrame(result)
dd

Unnamed: 0,emb name,nh_p,nh_r,nh_f1,h_p,h_r,h_f1,w_p,w_r,w_f1
0,MFT,52.6,100.0,68.94,0.0,0.0,0.0,27.67,52.6,36.26
1,IFT,52.6,100.0,68.94,0.0,0.0,0.0,27.67,52.6,36.26
2,BFT,52.6,100.0,68.94,0.0,0.0,0.0,27.67,52.6,36.26
3,RE,52.6,100.0,68.94,0.0,0.0,0.0,27.67,52.6,36.26
