In [None]:
import numpy as np 
import re
import pandas as pd 
from pprint import pprint
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, save_model
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
import os
from gensim.models import FastText,KeyedVectors
from keras.utils import to_categorical
import time
from sklearn.metrics import f1_score,confusion_matrix,classification_report
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import ModelCheckpoint
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input/')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

In [None]:
# Load the space-separated "<word> <label>" file. Blank lines are kept
# (skip_blank_lines=False) so that they come through as NaN rows.
data=pd.read_csv("/kaggle/input/banglaner/label_data.txt",encoding="utf-8",sep=" ",names=['word','label'],skip_blank_lines=False)
df=pd.DataFrame(data)
# Every NaN cell becomes the "Break" sentinel that making_list() splits on.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df.replace(np.nan,"Break",inplace=True)

In [None]:
def remove_empty_list(data):
    """Return a new list with every element of ``data`` that is not equal to ``[]``.

    ``data`` itself is left untouched.
    """
    return [item for item in data if [] != item]

def making_list(data):
    """Split a flat stream of items into sentences at every "Break" marker.

    Parameters:
        data: iterable of items in which the string "Break" separates
            consecutive sentences.

    Returns:
        A list of non-empty lists, one per sentence, in input order.
        Consecutive or leading/trailing markers never produce empty sentences.
    """
    sentences=[]
    current=[]
    for item in data:
        if item=="Break":
            # Only keep a sentence that actually collected some items.
            if current:
                sentences.append(current)
            current=[]
        else:
            current.append(item)
    # The stream may end without a final marker (e.g. no blank line at the
    # end of the file); without this flush the last sentence would be lost.
    if current:
        sentences.append(current)
    return sentences

def data_preprocessing():
    """Split the global ``df`` into per-sentence token and label lists.

    Returns:
        (sen, label, max_len) where ``sen`` and ``label`` are lists of lists
        produced by ``making_list`` from ``df['word']`` and ``df['label']``,
        and ``max_len`` is the length of the longest sentence (0 if there
        are no sentences).

    The tokens are returned exactly as they appear in ``df['word']``. The
    previous version also ran ``preprocessing`` over every sentence but then
    discarded the result, so that pass has been removed. Returning cleaned
    tokens instead would change or drop tokens, so they would no longer line
    up with ``label`` or with a vocabulary built from ``df['word']``.
    """
    label=making_list(df['label'])
    sen=making_list(df['word'])
    # default=0 keeps an empty corpus from raising ValueError in max().
    max_len=max((len(tokens) for tokens in sen), default=0)
    return sen,label,max_len

def preprocessing(sentence):
    """Normalise whitespace and strip URLs and punctuation from ``sentence``.

    Steps, in order:
      1. delete (not replace) the characters in the ``whitespace`` class:
         BOM, zero-width characters, no-break space and other Unicode spaces;
      2. collapse every remaining run of whitespace into one ASCII space;
      3. remove everything from a line-initial ``http://`` or ``https://``
         to the end of that line;
      4. remove quote marks, ``.?!,…`` runs, colons and semicolons;
      5. remove the Bangla full stop (U+0964);
      6. remove the remaining symbols listed in ``punc``.

    Parameters:
        sentence: the text to clean (str).

    Returns:
        The cleaned string. It is not stripped, so a leading or trailing
        space may remain.
    """
    whitespace = re.compile(u"[\ufeff\u200d\u200b\u200c\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)
    # Raw strings: "\s", "\[" and "\]" are regex escapes, not Python string
    # escapes. In a normal string literal they are invalid escape sequences,
    # which Python 3.12+ reports with a SyntaxWarning.
    fullspace = re.compile(r"[\s\u0020]+", re.UNICODE)
    bangla_fullstop = u"\u0964"
    punctSeq   = u"['\"“”‘’]+|[.?!,…]+|[:;]+"
    punc = r"""[(),$%^&*+={}\[\]:"|'~`<>/,¦!?½£¶¼©⅐⅑⅒⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞⅟↉¤¿º;-]+"""
    sentence= whitespace.sub("",sentence)
    sentence= fullspace.sub(" ",sentence)
    sentence = re.sub(r'^https?:\/\/.*[\r\n]*', "", sentence, flags=re.MULTILINE)
    sentence = re.sub(punctSeq, "", sentence)
    sentence = re.sub(bangla_fullstop, "",sentence)
    sentence = re.sub(punc, "", sentence)
    return sentence

In [None]:
# sen: list of token lists, label: matching list of tag lists,
# max_len: length of the longest sentence (used later as the padding length).
sen,label,max_len=data_preprocessing()

In [None]:
#creating unique words/labels
# The sets are sorted so that every word/label gets the same integer id on
# every run. A bare list(set(...)) is ordered by string hash, which changes
# between interpreter sessions, so a saved model could not be paired with a
# rebuilt vocabulary. key=str keeps the sort safe if a column ever holds
# non-string values.
words=sorted(set(df["word"].values), key=str)
# "Break" is the sentence-separator sentinel, not a real tag.
tags = sorted((tag for tag in set(df["label"].values) if tag != 'Break'), key=str)

#creating dicts that map each word/label to its integer id (the corpus dict)
word_to_int = {word: index for index, word in enumerate(words)}
n_words=len(word_to_int)

label_to_int = {tag: index for index, tag in enumerate(tags)}
n_tags=len(label_to_int)

In [None]:
#encode every sentence as a list of word ids
dataX=[[word_to_int[token] for token in sentence] for sentence in sen]

#padding sequences
# Pad with the id of the "Break" sentinel. It is part of the vocabulary
# (the vocabulary is built from df["word"], which contains the sentinel) but
# never occurs inside a sentence, because sentences are split on it. The old
# pad value n_words - 1 was the id of an arbitrary real word, so padding was
# indistinguishable from that word. It is kept only as a fallback for data
# that contains no sentinel at all.
pad_word_id=word_to_int.get("Break", n_words - 1)
X = pad_sequences(maxlen=max_len, sequences=dataX, padding="post",value=pad_word_id)

In [None]:
#encode every sentence's tags as a list of label ids
dataY=[[label_to_int[tag] for tag in sentence_tags] for sentence_tags in label]

#pad the tag sequences with the id of the "0" tag, then one-hot encode
#every padded sequence (the model is trained with categorical cross-entropy)
Y = pad_sequences(
    sequences=dataY,
    maxlen=max_len,
    padding="post",
    value=label_to_int["0"],
)
y = list(map(lambda tag_ids: to_categorical(tag_ids, num_classes=n_tags), Y))

In [None]:
# --- Pre-trained word embeddings: start ---
# Load the pre-trained vectors in word2vec text format and report how many
# seconds the load took.
st = time.time()
pre_em_model = KeyedVectors.load_word2vec_format(
    '../input/fastextbn/cc.bn.300.vec'
)
et = time.time()
dt = et - st
print('dt=', dt)

In [None]:
#dictionary mapping every word of the pre-trained model to its float32 vector
embeddings_index={
    word: np.asarray(vector, dtype='float32')
    for word, vector in zip(pre_em_model.key_to_index, pre_em_model.vectors)
}

#embedding_matrix row k holds the 300-dim pre-trained vector of words[k];
#rows of words the pre-trained model does not contain stay all-zero
embedding_matrix = np.zeros((len(words),300),dtype='float32')
for row, token in enumerate(words):
    if token in pre_em_model:
        embedding_matrix[row]=pre_em_model[token]

# --- Pre-trained word embeddings: end ---

**Skip the next two cells when using the pre-trained word embeddings.**

In [None]:
# --- Custom-trained word embeddings: start ---
# Train 300-dim FastText vectors on the tokenised sentences of this corpus.
em_model = FastText(
    vector_size=300,
    window=3,
    min_count=1,
    workers=16,
)
em_model.build_vocab(sen)
em_model.train(sen, total_examples=len(sen), epochs=10)

In [None]:
#embedding_matrix row k holds the 300-dim vector that the custom-trained
#FastText model gives for words[k]; rows of words it does not contain stay zero
embedding_matrix = np.zeros((len(words),300),dtype='float32')
for row, token in enumerate(words):
    if token in em_model.wv:
        embedding_matrix[row]=em_model.wv[token]

# --- Custom-trained word embeddings: end ---

In [None]:
# Sanity check: shapes of the padded word-id and label-id arrays.
print(X.shape,Y.shape)

In [None]:
#Hold out 20% of the sentences as the test set for the final model evaluation;
#random_state is fixed so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Convert y_train to a NumPy array so the K-fold cell can index it with the
# index arrays returned by KFold.split.
y_train=np.array(y_train)
print(type(X_train),type(y_train))

In [None]:
# Imported once here rather than on every loop iteration.
from matplotlib import pyplot

# Define per-fold score containers: one list of per-epoch validation losses
# is appended for every fold.
loss_per_fold = []

num_folds=5
# Hyper-parameters are identical for every fold, so they are set once.
epochs=20
units=70
batch_size=80

# Define the K-fold Cross Validator. random_state pins the shuffle so the
# same folds are produced on every run (matching the fixed seed used for
# the train/test split).
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=0)


# K-fold Cross Validation model evaluation
fold_no = 1
for train, test in kfold.split(X_train, y_train):
    # A fresh model is built for every fold.
    model = Sequential()
    #in embeddings weights pushed the embeddings matrix that build
    #from corpus dict and it's non trainable

    #while use 'UNK' approach run below layer instead of another embedding layer
    # model.add(Embedding(len(word_to_int),300,weights=[embedding_matrix],input_length=max_len,trainable=False))

    model.add(Embedding(len(words),300,weights=[embedding_matrix],input_length=max_len,trainable=False))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(units=units, return_sequences=True,recurrent_dropout=0.1)))
    #softmax output layer
    model.add(TimeDistributed(Dense(n_tags, activation="softmax")))

    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    # Train on this fold's training part and validate on its held-out part.
    history = model.fit(X_train[train], y_train[train], batch_size=batch_size, epochs=epochs,
                        validation_data=(X_train[test],y_train[test]),
                        verbose=0)

    # Loss curves for this fold.
    print("For fold"+str(fold_no)+"loss Graph")
    pyplot.plot(history.history['loss'])
    pyplot.plot(history.history['val_loss'])
    pyplot.title('model train vs validation loss')
    pyplot.ylabel('loss')
    pyplot.xlabel('epoch')
    pyplot.legend(['train', 'validation'], loc='upper right')
    pyplot.show()
    loss_per_fold.append(history.history['val_loss'])

    #Evaluation of the model on this fold's held-out part
    # NOTE(review): the arrays are flattened as-is, so padded time steps are
    # included in every score below.
    p = model.predict(X_train[test])
    y_pred = np.argmax(p, axis=-1)
    y_real=np.argmax(y_train[test],axis=-1)
    print("Average: "+str(f1_score(y_real.flatten(),y_pred.flatten(),average='weighted')))
    print("Macro: "+str(f1_score(y_real.flatten(),y_pred.flatten(),average='macro')))
    print("Micro: "+str(f1_score(y_real.flatten(),y_pred.flatten(),average='micro')))

    print(classification_report(y_pred=y_pred.flatten(), y_true=y_real.flatten(),zero_division=0))

    # Increase fold no
    fold_no += 1


In [None]:
#train the final model on the full training split
epochs = 20
units = 75
batch_size = 70

#spiliting train/test data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#the embedding layer is initialised with the embedding matrix built from the
#corpus dict and is kept non-trainable; the last layer is a per-time-step softmax
model = Sequential([
    Embedding(len(words),300,weights=[embedding_matrix],input_length=max_len,trainable=False),
    Dropout(0.2),
    Bidirectional(LSTM(units=units, return_sequences=True,recurrent_dropout=0.1)),
    TimeDistributed(Dense(n_tags, activation="softmax")),
])

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# NOTE(review): the held-out test split is used as validation data here and
# is also the data the evaluation cell later predicts on.
history = model.fit(
    X_train,
    np.array(y_train),
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, np.array(y_test)),
    verbose=1,
)


In [None]:
import matplotlib.pyplot as plt
# Training vs. validation loss per epoch for the final model.
for curve_name in ('loss', 'val_loss'):
    plt.plot(history.history[curve_name])
plt.title('model train vs validation loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
#This cell contains helper functions that summarise the dataset
def counting_label(string):
    """Return how many rows of the global ``df['label']`` equal ``string``.

    Parameters:
        string: the label value to count.

    Returns:
        The number of matching rows as a plain ``int``.
    """
    # One vectorised comparison replaces a per-row Python loop that looked
    # each row up with df['label'][i]. That was slow and only worked while
    # the frame had the default 0..n-1 index.
    return int((df['label'] == string).sum())

def count(string):
    """Derive three counts for entity type ``string`` from its tag totals.

    With B, I and O the number of rows labelled ``"B-"+string``,
    ``"I-"+string`` and ``"O-"+string`` respectively, returns the tuple
    ``(B - O, B - (B - O) - I, I)``, which the caller presents as
    single-word, double-word and multiple-word counts.

    NOTE(review): what an ``"O-<type>"`` tag means in this dataset is not
    evident from this file -- confirm that the arithmetic matches the
    tagging scheme.
    """
    b_total=counting_label("B-"+string)
    i_total=counting_label("I-"+string)
    o_total=counting_label("O-"+string)
    single=b_total-o_total
    double=b_total-single-i_total
    return single,double,i_total

def data_details():
    """Summarise the dataset held in the global ``df``.

    Returns:
        (n_of_sen, n_of_words, n_of_misc_words, datas) where
        ``n_of_sen`` is the number of "Break" labels,
        ``n_of_words`` is the row count of ``df`` minus ``n_of_sen``,
        ``n_of_misc_words`` is the number of "0" labels, and
        ``datas`` is a DataFrame with one column per entity type and the
        rows 'Single words', 'Double words', 'Multiple words', filled from
        ``count``.
    """
    n_of_sen=counting_label("Break")
    n_of_words=len(df["word"])-n_of_sen
    n_of_misc_words=counting_label("0")

    # (column title, tag suffix) for every entity type, in column order.
    entity_types=(
        ("Person","PER"),
        ("Location","LOC"),
        ("Organization","ORG"),
        ("Time","TIME"),
        ("Unit","UNIT"),
    )
    d_dict={title:list(count(suffix)) for title,suffix in entity_types}
    datas=pd.DataFrame(
        d_dict,
        columns=[title for title,_suffix in entity_types],
        index=['Single words','Double words','Multiple words'],
    )

    return n_of_sen,n_of_words,n_of_misc_words,datas

def tag_details():
    """Print how many rows of the global ``data`` frame carry each label.

    The table has one row per distinct ``label`` value and a ``counts`` column.
    NOTE(review): this reads ``data`` while the other helpers read ``df`` --
    confirm that is intended.
    """
    per_label=data.groupby("label").size()
    print(per_label.reset_index(name='counts'))

In [None]:
# Reverse lookup: integer id -> label string.
int_to_label = dict((index, tag) for tag, index in label_to_int.items())

In [None]:
#Evaluation of the final model on the held-out test split
p = model.predict(X_test)
# Most likely class id per time step, flattened over all sentences and
# mapped back to label strings.
y_pred = [int_to_label[class_id] for class_id in np.argmax(p, axis=-1).flatten()]
y_real = [int_to_label[class_id] for class_id in np.argmax(y_test, axis=-1).flatten()]
for heading, averaging in (("Average: ", 'weighted'), ("Macro: ", 'macro'), ("Micro: ", 'micro')):
    print(heading+str(f1_score(y_real,y_pred,average=averaging)))

c_re = classification_report(y_true=y_real, y_pred=y_pred, zero_division=0)
print(c_re)

In [None]:
import seaborn as sns

from pylab import rcParams

In [None]:
# Confusion matrix over the flattened test predictions; rows and columns
# follow the order of `tags`.
cm=confusion_matrix(y_real, y_pred, labels=tags)

In [None]:
# Draw the confusion matrix as an annotated heatmap (integer cell counts).
sns.set(style='whitegrid', palette='bright', font_scale=1)
rcParams['figure.figsize'] = (14, 8)
RANDOM_SEED = 42
plt.figure(figsize=(12, 8))
sns_plot = sns.heatmap(
    cm,
    xticklabels=tags,
    yticklabels=tags,
    annot=True,
    fmt="d",
)
plt.title("Bangla NER Classification")
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.show()

In [None]:
# Save the trained model to "72.hdf5" in the current working directory.
model_name="72"+".hdf5"
save_model(model, model_name)