In [1]:
import pandas as pd
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import MultiLabelBinarizer

# Directory that holds train.csv / val.csv / test.csv.
DATA_DIR = '/dataset'

def _load_abusive(path):
  """Read one CSV split and keep only the rows flagged as abusive.

  Rows are kept when their 'final abusive' column equals 1. The result is an
  explicit copy, so columns can be added to it later without pandas warning
  about assignment on a filtered slice (SettingWithCopyWarning).
  """
  frame = pd.read_csv(path)
  return frame[frame['final abusive'] == 1].copy()

df_train = _load_abusive(f'{DATA_DIR}/train.csv')
df_val = _load_abusive(f'{DATA_DIR}/val.csv')
df_test = _load_abusive(f'{DATA_DIR}/test.csv')

# The validation rows are appended to the training rows; ignore_index=True
# renumbers the combined frame from 0.
df_train = pd.concat([df_train, df_val], ignore_index=True)

In [3]:
def cnv(df):
  """Add a 'type2' column holding each row's 'type' string split on '_'.

  A 'type' value without an underscore becomes a one-element list, so every
  entry of 'type2' is a list of label strings.

  The column values are iterated directly instead of being looked up by index
  label, so the result is also correct when the index contains duplicate
  labels (a label lookup would return a Series rather than a single string).

  Args:
    df: DataFrame with a string column named 'type'.

  Returns:
    The same DataFrame object, modified in place with the new 'type2' column.
  """
  # str.split already returns [value] when no '_' is present, so no branch is needed.
  df['type2'] = [label.split('_') for label in df['type']]
  return df

# Attach the list-valued 'type2' label column to both splits.
df_train = cnv(df_train)
df_test = cnv(df_test)

# Multi-hot encode the label lists. The binarizer is fitted on the training
# split and then applied unchanged to the test split.
mlb = MultiLabelBinarizer()
train_y = mlb.fit_transform(df_train['type2'])
test_y = mlb.transform(df_test['type2'])
target_names = list(mlb.classes_)

# Raw sentences as plain Python lists, ready for the tokenizer.
train_x = df_train['sentence'].tolist()
test_x = df_test['sentence'].tolist()

In [None]:
#importing libraries + pip installing fasttext

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPool1D, Input, Flatten, MaxPooling1D, SpatialDropout1D, Activation

from keras.callbacks import EarlyStopping

import numpy as np
from sklearn.metrics import classification_report

import gensim
from gensim import models
from gensim.models import Word2Vec

!pip install fasttext
import fasttext.util

In [5]:
# Fit the vocabulary on the training sentences only.
tokenizer = Tokenizer(split=' ', oov_token="<OOV>")
tokenizer.fit_on_texts(train_x)

# Turn both splits into integer sequences using that vocabulary.
train_encoded = tokenizer.texts_to_sequences(train_x)
test_encoded = tokenizer.texts_to_sequences(test_x)

# Post-pad the training sequences, then pad the test sequences to the same
# width (the second dimension of train_padded).
train_padded = pad_sequences(train_encoded, padding='post')
test_padded = pad_sequences(test_encoded, maxlen=train_padded.shape[1], padding='post')

In [None]:
# Width of every word vector used below.
EMBEDDING_DIM = 300
# One more row than there are words in the tokenizer's word_index.
vocabulary_size = 1 + len(tokenizer.word_index)
# Padded sequence length, taken from the training matrix.
max_length = train_padded.shape[1]

def embedding_creation(EMBEDDING_DIM, word_vectors):
  """Build a frozen Keras Embedding layer initialised from pretrained vectors.

  The vocabulary comes from the module-level `tokenizer`. Each word's row in
  the weight matrix is filled with its pretrained vector; a word whose lookup
  raises KeyError gets a random normal vector (mean 0, std sqrt(0.25)).
  Row 0 is never assigned and stays all zeros.

  Args:
    EMBEDDING_DIM: length of each word vector.
    word_vectors: object that supports `word_vectors[word]`, or a model that
      exposes such an object as its `.wv` attribute (e.g. a gensim Word2Vec
      model, which is no longer subscriptable itself in gensim >= 4).

  Returns:
    An Embedding layer of shape (len(word_index) + 1, EMBEDDING_DIM) with
    trainable=False.
  """
  # Unwrap model objects that keep their vectors on `.wv`; otherwise indexing
  # the model would raise TypeError, which the KeyError handler below misses.
  lookup = getattr(word_vectors, 'wv', word_vectors)

  word_index = tokenizer.word_index
  vocabulary_size = len(word_index) + 1
  embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))

  for word, i in word_index.items():
    try:
      embedding_matrix[i] = lookup[word]
    except KeyError:
      # Word missing from the pretrained vocabulary: random initialisation.
      # NOTE(review): unseeded, so these rows differ between runs.
      embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

  embedding_layer = Embedding(vocabulary_size, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False)

  return embedding_layer

# Build one embedding layer per pretrained source. Each model is loaded into
# `wv`, converted to an Embedding layer, and then released before the next
# one is loaded.
# NOTE(review): every `url` below is an empty placeholder; set it to the
# model file path before running this cell.

# creating informal_FastText embedding layer (IFT)
url = ''
wv = fasttext.load_model(url)
IFT = embedding_creation(EMBEDDING_DIM, wv)

# creating multilingual_FastText embedding layer (MFT)
# (rebinding `wv` here drops the reference to the IFT model)
url = ''
wv = fasttext.load_model(url)
MFT = embedding_creation(EMBEDDING_DIM, wv)
del wv

# creating BengFastText embedding layer (BFT)
# loaded through gensim's Word2Vec.load; the model object itself is passed on
url = ''
wv = Word2Vec.load(url)
BFT = embedding_creation(EMBEDDING_DIM, wv)
del wv

# creating a randomly initialized embedding layer (RE)
# unlike the three layers above, this one is trainable and gets no pretrained weights
RE = Embedding(vocabulary_size, EMBEDDING_DIM,input_length = max_length, trainable=True)

In [15]:
# Accumulators filled once per trained model: the embedding's name, then
# precision / recall / f1 (suffixes _p / _r / _f) for each class and for the
# weighted average (prefix w). Every name gets its own independent list.
emb_name = []
slander_p, slander_r, slander_f = [], [], []
religion_p, religion_r, religion_f = [], [], []
gender_p, gender_r, gender_f = [], [], []
cv_p, cv_r, cv_f = [], [], []
w_p, w_r, w_f = [], [], []

# Early stopping: halt training once val_loss has not improved for 3
# consecutive epochs, then restore the weights from the epoch with the
# lowest val_loss.
earlystop_callback = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=3,
    min_delta=0,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
# (embedding layer, short name) pairs; one model is trained per pair.
emb_X_name_collection = [ [MFT,'MFT'],[IFT, 'IFT'], [BFT,'BFT'], [RE,'RE'] ]

# classification_report key -> (precision, recall, f1) accumulator lists.
report_targets = {
    'slander': (slander_p, slander_r, slander_f),
    'religion': (religion_p, religion_r, religion_f),
    'callToViolence': (cv_p, cv_r, cv_f),
    'gender': (gender_p, gender_r, gender_f),
    'weighted avg': (w_p, w_r, w_f),
}

# Output width follows the binarized label matrix instead of a hard-coded 4.
n_classes = train_y.shape[1]

for embedding_layer, embedding_label in emb_X_name_collection:
  # BiLSTM over the embedded sequence, max-pooled over time, then one
  # independent sigmoid per label (multi-label setup).
  model = Sequential([
      embedding_layer,
      Bidirectional(LSTM(100, dropout=0.3, return_sequences=True)),
      GlobalMaxPool1D(),
      Dense(16, activation='relu'),
      Dense(n_classes, activation='sigmoid'),
  ],
  name="Sentiment_Model")
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  # NOTE(review): the test split is used as validation_data, so early stopping
  # (and the restored best weights) are selected on the same data that is
  # scored below. The reported metrics are therefore optimistic; consider
  # holding out a separate validation split.
  model.fit(train_padded, train_y, epochs=100, batch_size=32, validation_data=(test_padded,test_y), callbacks=[earlystop_callback])
  prediction = model.predict(test_padded)

  # Threshold the sigmoid outputs to 0/1 in one vectorised step. np.round
  # rounds halves to even, the same rule as the built-in round().
  p = np.round(prediction).astype(int)

  r = classification_report(test_y, p, target_names=target_names, output_dict=True)

  emb_name.append(embedding_label)
  # Store each metric as a percentage rounded to two decimals.
  for label, (precisions, recalls, f1_scores) in report_targets.items():
    precisions.append(round(r[label]['precision']*100, 2))
    recalls.append(round(r[label]['recall']*100, 2))
    f1_scores.append(round(r[label]['f1-score']*100, 2))

  del model

nh = not hate speech

hs = hate speech

p = precision

r = recall

f = f1

w = weighted average

cv = call to violence

In [None]:
# Collect the accumulators into one table: a row per embedding, a column per metric.
result = {'emb name': emb_name}
result.update({
    'slander_p': slander_p,
    'slander_r': slander_r,
    'slander_f': slander_f,
    'religion_p': religion_p,
    'religion_r': religion_r,
    'religion_f': religion_f,
    'gender_p': gender_p,
    'gender_r': gender_r,
    'gender_f': gender_f,
    'cv_p': cv_p,
    'cv_r': cv_r,
    'cv_f': cv_f,
    'w_p': w_p,
    'w_r': w_r,
    'w_f': w_f,
})
dd = pd.DataFrame(result)
dd