In [None]:
import pandas as pd
#import numpy as np
import pickle
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import os
from numpy import array
import numpy as np
import torch
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from numpy import argmax
from sklearn.model_selection import train_test_split
import torch.nn as nn


In [None]:
# Select the compute device: the first CUDA GPU when one is visible, otherwise the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
# Read the Bengali hate-speech dataset from a CSV file located in the
# notebook's working directory (relative path).
bengali_data = pd.read_csv(r'bengali_hatespeech.csv')


In [None]:
# Work on a small subset only: the same number of rows is taken from the
# top and from the bottom of the dataset.
rows_per_slice = 100
first_set_data = bengali_data.head(n=rows_per_slice)
last_set_data = bengali_data.tail(n=rows_per_slice)

In [None]:
# Stack the head and tail slices row-wise; from here on `bengali_data`
# refers to this reduced frame (original row labels are kept).
frames = [first_set_data, last_set_data]
bengali_data = pd.concat(objs=frames, axis=0)
bengali_data


In [None]:
# Preprocess the sentence column before tokenisation:
#   1. lower-case the text,
#   2. strip ASCII punctuation plus the curly quotes ” and ’,
#   3. strip every remaining character that is neither a word character nor
#      whitespace (this is the step that removes emoji / emoticons).
#
# `regex=True` is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which silently turns both replacements into
# no-ops. The patterns are raw strings so that `\]`, `\w` and `\s` reach the
# regex engine without relying on Python's handling of unknown string escapes;
# `-` and `[` are escaped inside the character class so they are matched
# literally (the set of removed characters is unchanged).
#
# NOTE(review): Python's `\w` does not match combining marks, so Bengali
# vowel signs are stripped by step 3 as well. Kept as-is on the assumption
# that the pretrained vocabulary was built with the same cleaning - TODO confirm.

bengali_data['sentence'] = bengali_data['sentence'].str.lower()
bengali_data['sentence'] = bengali_data['sentence'].str.replace(
    r'[!”#$%&’()*+,\-./:;<=>?@\[\]^_`{|}~]', '', regex=True
)
bengali_data['sentence'] = bengali_data['sentence'].str.replace(
    r'[^\w\s#@/:%.,_-]', '', regex=True, flags=re.UNICODE
)

# Inputs come from the first column and targets from the second column
# (presumably the sentence text and its label - verify against the CSV header).
X = bengali_data.iloc[:, 0].values
Y = bengali_data.iloc[:, 1].values

# The model is trained with binary cross-entropy, so the targets are cast to float32.
Y = np.asarray(Y).astype('float32')

In [None]:
# Load the embedding stored for Bengali.
#
# map_location="cpu" makes the checkpoint loadable on machines without a GPU
# even if it was saved from a CUDA device, and guarantees that the weight
# tensor lives on the CPU, which the later conversion to a NumPy array requires.

model_path = "bengali.pth"
state_dict = torch.load(model_path, map_location="cpu")

# The embedding vectors are the tensor stored under the 'l2.weight' key.
embedding_weights = state_dict['l2.weight']
embedding_weights.shape

In [None]:
# Flatten all sentences into one list of whitespace-separated tokens.
entire_text = []
for line in bengali_data['sentence'].values.tolist():
    entire_text.extend(line.split())

# Count how often each distinct token occurs (keys keep first-seen order).
unique_words_and_corresponding_freq = {}
for token in entire_text:
    occurrences_so_far = unique_words_and_corresponding_freq.get(token, 0)
    unique_words_and_corresponding_freq[token] = occurrences_so_far + 1

# Vocabulary size = number of distinct tokens.
no_of_unique_words_vocabulary = len(unique_words_and_corresponding_freq)
no_of_unique_words_vocabulary

In [None]:
# Split into train and test sets (80% / 20%, fixed seed so the split is reproducible)
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=1, test_size=0.20)

# Tokenizer that is created for the entire vocabulary.
# Keras only emits word indices strictly below `num_words`, and word indices
# start at 1, so keeping N words requires num_words = N + 1. Passing N itself
# silently drops the least frequent word from every generated sequence.
tokenizer = Tokenizer(num_words=no_of_unique_words_vocabulary + 1)
tokenizer.fit_on_texts(X)
word_index = tokenizer.word_index

# Convert the train and test sentences into sequences of word indices
x_train_tokens = tokenizer.texts_to_sequences(x_train)
x_test_tokens = tokenizer.texts_to_sequences(x_test)


len(word_index)

In [None]:
# Pad every sequence to the length of the longest sentence.
# The length is measured in whitespace-separated words over all of X
# (0 when X is empty).
longest_sentence = max((len(sentence.split()) for sentence in X), default=0)

# Zero-pad at the end of each sequence ('post') up to that common length.
X_train_with_padding = pad_sequences(
    x_train_tokens,
    maxlen=longest_sentence,
    padding='post',
)
X_test_with_padding = pad_sequences(
    x_test_tokens,
    maxlen=longest_sentence,
    padding='post',
)

In [None]:
# Embedding matrix used in task 2: one 300-dimensional row per tokenizer
# index, initialised to zeros. One extra row is allocated because the
# matrix is later indexed directly with the values stored in `word_index`.
embedding_rows = len(word_index) + 1
embedding_matrix_to_use = np.zeros(shape=(embedding_rows, 300))

In [None]:
# Restore the pickled object stored in 'bengali_freq_dictionary.txt'.
# Unpickling can execute arbitrary code, so only load this file if it
# comes from a trusted source.
with open('bengali_freq_dictionary.txt', 'rb') as handle:
    bengali_freq = pickle.load(handle)
#len(bengali_freq)

In [None]:
# Cache of (source dict, its length, word -> position map). It is rebuilt
# whenever `bengali_freq` is rebound or changes size, e.g. after re-running
# the loading cell.
_task1_index_cache = None


def return_index_in_the_embedding_from_task1(word):
    """Return the position of ``word`` among the keys of ``bengali_freq``.

    The position follows the key order of ``bengali_freq`` and is used by
    the notebook as the row index into the pretrained weight tensor.

    The lookup table is built once and reused, so a call is a dictionary
    lookup instead of materialising and scanning the full key list each time.

    Args:
        word: the word to look up.

    Returns:
        int: zero-based position of ``word`` in ``bengali_freq``.

    Raises:
        ValueError: if ``word`` is not a key of ``bengali_freq``.
    """
    global _task1_index_cache
    cache = _task1_index_cache
    if cache is None or cache[0] is not bengali_freq or cache[1] != len(bengali_freq):
        positions = {key: pos for pos, key in enumerate(bengali_freq.keys())}
        cache = (bengali_freq, len(bengali_freq), positions)
        _task1_index_cache = cache
    try:
        return cache[2][word]
    except (KeyError, TypeError):
        # Same exception type as list.index(), which callers may rely on.
        raise ValueError(f"{word!r} is not in list") from None
   
#return_index_in_the_embedding_from_task1('ফজলম')

In [None]:
# Retrieve the pretrained vector of every tokenizer word from the weight
# matrix and copy it into that word's row of `embedding_matrix_to_use`.
# Words without a pretrained vector are counted in `skipped_words` and keep
# their all-zero row.
skipped_words = 0
for word, index in word_index.items():
    try:
        corresponding_index_in_embedding = return_index_in_the_embedding_from_task1(word)
        # detach().cpu() makes the conversion work regardless of the device
        # the tensor lives on or whether it tracks gradients.
        numpy_corresponding = embedding_weights[corresponding_index_in_embedding].detach().cpu().numpy()
    except (ValueError, KeyError, IndexError):
        # Word is not in the pretrained vocabulary (or its position lies
        # outside the weight matrix). Skip it explicitly: falling through
        # would write the vector left over from the previous word into
        # this word's row.
        skipped_words += 1
        continue

    embedding_matrix_to_use[index] = numpy_corresponding
        

In [None]:
# reference link- https://towardsdatascience.com/machine-learning-word-embedding-sentiment-classification-using-keras-b83c28087456

# Layers are imported last so that, on any name clash, the layer names win.
from tensorflow.keras.models import *
from tensorflow.keras.layers import *

# Frozen embedding layer initialised with the prepared embedding matrix.
embedding_layer = Embedding(
    len(word_index) + 1,
    300,
    weights=[embedding_matrix_to_use],
    input_length=longest_sentence,
    trainable=False,
)

# Embedding -> LSTM(32) -> single sigmoid unit for binary classification.
model_keras = Sequential(
    [
        embedding_layer,
        LSTM(units=32, dropout=0.1),
        Dense(1, activation="sigmoid"),
    ]
)

model_keras.compile(
    loss="binary_crossentropy",
    optimizer='adam',
    metrics=["accuracy"],
)

In [None]:
# Print the layer-by-layer overview of the compiled model.
model_keras.summary()

In [None]:
# Train for 10 epochs on the padded training sequences; the padded test
# split is passed as validation data, so it is evaluated after every epoch.
model_keras.fit(
    x=X_train_with_padding,
    y=y_train,
    epochs=10,
    validation_data=(X_test_with_padding, y_test),
)