In [29]:
import numpy as np
from numpy import dstack
import tensorflow as tf
import operator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
import multiprocessing as mp
import string
import en_core_web_sm
import spacy
from random import randrange
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Input,Embedding,Dense,LSTM,GRU,Bidirectional,Dropout,SimpleRNN,GlobalAvgPool1D,GlobalMaxPool1D
from tensorflow.keras.layers import Conv1D,SpatialDropout1D,BatchNormalization,Lambda,Concatenate,concatenate,GlobalMaxPooling1D
from tensorflow.keras.callbacks import  EarlyStopping
from keras.utils import to_categorical

%matplotlib inline

In [30]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
warnings.filterwarnings('ignore')
nlp = en_core_web_sm.load()

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading wordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [Errno 11001] getaddrinfo failed>


In [31]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19579 entries, 0 to 19578
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      19579 non-null  object
 1   text    19579 non-null  object
 2   author  19579 non-null  object
dtypes: object(3)
memory usage: 459.0+ KB


In [33]:

# Removing id column
df.drop('id',axis=1,inplace=True)

In [34]:
#remove outliers
df = df[df['text'].str.split().map(lambda x:len(x))<100]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19488 entries, 0 to 19578
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    19488 non-null  object
 1   author  19488 non-null  object
dtypes: object(2)
memory usage: 456.8+ KB


## Data Cleaning


In [35]:
# a function to preprocess removing punctuations, normalize, stopwords and lemmatization


class TextPreprocessing(BaseEstimator,TransformerMixin):
    def __init__(self,
                 n_jobs=1):    
      
     self.n_jobs = n_jobs
    """
        Text preprocessing transformer includes steps:
            1. Text normalization
            2. Punctuation removal
            3. Stop words removal
            4. Lemmatization
        
        n_jobs - parallel jobs to run
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X, *_):
        X_copy = X.copy()
        partitions = 2
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
          partitions = cores
        elif self.n_jobs <= 0:
          return X_copy.apply(self._preprocess_text)
        else:
          partitions = min(self.n_jobs, cores)
        cores = mp.cpu_count()
        data_split = np.array_split(X_copy, partitions)
        pool = mp.Pool(cores)
        data = pd.concat(pool.map(self._preprocess_part, data_split))
        pool.close()
        pool.join()
        return data

    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        normalized_text = self._normalize(text)
        doc = nlp(normalized_text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _normalize(self, text):
        # some issues in normalise package
        try:
            return ' '.join(normalise(text, verbose=False))
        except:
            return text
    def _remove_punct(self, doc):
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
        return [t for t in doc if not t.is_stop]

    def _lemmatize(self, doc):
        return ' '.join([t.lemma_ for t in doc])

In [36]:
#Converting the categorical column to variable for easier processing 

df['author'] = df['author'].map({'EAP':0,'HPL':1,'MWS':2})
df.head()

Unnamed: 0,text,author
0,"This process, however, afforded me no means of...",0
1,It never once occurred to me that the fumbling...,1
2,"In his left hand was a gold snuff box, from wh...",0
3,How lovely is spring As we looked from Windsor...,2
4,"Finding nothing else, not even gold, the Super...",1


#### Using countvectorizer to convert the sentence into column of words

In [37]:
cv = CountVectorizer()
cv_df = cv.fit_transform(df['text'])

tfidf = TfidfTransformer()
tfidf.fit(cv_df)
tfidf_trans = tfidf.transform(cv_df)

print('Shape of Sparse Matrix: ', cv_df.shape)
print('Amount of Non-Zero occurences: ', cv_df.nnz)
print('Shape of Tfidf Transformed matrix',tfidf_trans.shape)

Shape of Sparse Matrix:  (19488, 24796)
Amount of Non-Zero occurences:  421231
Shape of Tfidf Transformed matrix (19488, 24796)


# Neural Networks Model



In [38]:
df_neural = df
df_neural.head()

Unnamed: 0,text,author
0,"This process, however, afforded me no means of...",0
1,It never once occurred to me that the fumbling...,1
2,"In his left hand was a gold snuff box, from wh...",0
3,How lovely is spring As we looked from Windsor...,2
4,"Finding nothing else, not even gold, the Super...",1


In [39]:
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def convert_nltk_to_wordnet(text):
#To check if the given word is noun,or a verb or an adjective
  if text.startswith('J'):
    return wordnet.ADJ
  
  elif text.startswith('N'):
    return wordnet.NOUN

  elif text.startswith('V'):
    return wordnet.VERB
  
  elif text.startswith('R'):
    return wordnet.ADV
  
  else:
    return None 
 
def lemmatizes(sentence):
  tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
  wordnet_tagged = map(lambda x : (x[0] , convert_nltk_to_wordnet(x[1])) , tagged)
  lemmatized_sentence = []
  for word , tag in wordnet_tagged:
    if tag is None:
      lemmatized_sentence.append(word)
    else:
      lemmatized_sentence.append(lemmatizer.lemmatize(word,tag))
  return ' '.join(lemmatized_sentence)

def clean(text):

  text = re.sub('/.',' ',text)
  text = text.lower()
  text = re.sub("aren't", "are not",text)
  text = re.sub("can't","cannot",text)
  text = re.sub("don't","do not",text)
  text = re.sub("couldn't","could not",text)
  text = re.sub("doesn't","does not",text)
  text = re.sub("hadn't","had not",text)
  text = re.sub("wouldn't","would not",text)
  text = re.sub("he'll","he will",text)
  text = re.sub("what've","what have",text)
  text = re.sub("who'd","who would",text)
  text = re.sub("who'll","who will",text)
  text = re.sub("I'll","I will",text)
  text = re.sub("you'd","you would",text)
  text = re.sub("you'll","you will",text)
  text = re.sub("you're","you are",text)
  text = re.sub("you've","you have",text)
  text = re.sub("wasn't","was not",text)
  text = re.sub("that's","that is",text)
  text = re.sub("they've","they have",text)
  text = re.sub("they're","they are",text)
  text = re.sub("what's","what is",text)
  text = re.sub("what're","what are",text)
  text = re.sub("what'll","what will",text)
  text = re.sub("that's","that is",text)
  text = re.sub("there's","there is",text)
  text = re.sub("it's","it is",text)
  text = re.sub("it'll","it will",text)
  text = re.sub("could've","could have",text)
  text = re.sub("it'll","it will",text)
  text = re.sub("shouldn't","should not",text)
  text = re.sub("should've","should have",text)
  text = re.sub("shan't","shall not",text)
  text = re.sub("won't","will not",text)
  text = re.sub("we'd","we would",text)
  text = re.sub("weren't","were not",text)
  text = re.sub('[^A-Za-z/.\s]','',text)
  text = text.lower().split()
  text = [word for word in text if word not in stop]
  text = ' '.join(text)
  final_text = lemmatizes(text)
  return final_text

df_neural['text'] = df_neural['text'].apply(lambda x : clean(x))
y = to_categorical(df['author'])
df_neural.head()

Unnamed: 0,text,author
0,process however afford mean ascertain dimensio...,0
1,never occur fumble might mere mistake .,1
2,left hand gold snuff box caper hill cut manner...,0
3,lovely spring look windsor terrace sixteen fer...,2
4,find nothing else even gold superintendent aba...,1


In [40]:
def get_embedding(name,word_index,vocab_len,dim):
  embedding_index = {}
  f = open(name,encoding='utf-8')
  for line in f:
    values = line.split()
    word = values[0]
    coeffs = np.asarray(values[1:],dtype='float32')
    embedding_index[word] = coeffs
  f.close()
  embedding_matrix = np.zeros((vocab_len+1,dim))
  for word,index in word_index.items():
    if index > vocab_len:
      break
    else:
      embedding_vector = embedding_index.get(word)
      if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
  return embedding_matrix,embedding_index

In [41]:
corpus = np.asarray(df_neural['text'])
tokenizer = Tokenizer(num_words=21000)
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)
data = pad_sequences(sequences=sequences,padding='pre')
vocab_len = len(tokenizer.word_index)+1
max_len = len(data[0])

In [42]:
word_index = tokenizer.word_index
embedding_matrix1,embedding_index1 = get_embedding('glove.6B.300d.txt',word_index,vocab_len,300)

In [43]:
# Checking if we have word embeddings for the words in our vocab
def check_coverage(vocab, embeddings_index):

  known_words = {}
  unknown_words = {}
  nb_known_words = 0
  nb_unknown_words = 0
  for word in vocab.keys():
    try:
        known_words[word] = embeddings_index[word]
        nb_known_words += vocab[word]
    except:
        unknown_words[word] = vocab[word]
        nb_unknown_words += vocab[word]
        pass
  print('Found embeddings for {:.3%} of vocab'.format(len(known_words) / len(vocab)))
  print('Found embeddings for  {:.3%} of all text'.format(nb_known_words / (nb_known_words + nb_unknown_words)))
  unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

  return unknown_words

In [44]:
print('Glove embeddings:\n')
Glove_embedding = check_coverage(word_index,embedding_index1)
print('\n')

Glove embeddings:

Found embeddings for 87.348% of vocab
Found embeddings for  82.289% of all text




In [45]:
Glove_embedding[:30]

[('brusquerie', 20245),
 ('tremulousness', 20238),
 ('aegidus', 20232),
 ('valentinianus', 20231),
 ('btenoir', 20227),
 ('junianus', 20226),
 ('littlewit', 20224),
 ('schweinkopf', 20219),
 ('apothegm', 20216),
 ('flatzplatz', 20215),
 ('literatim', 20211),
 ('odigies', 20209),
 ('despera', 20208),
 ('chinless', 20207),
 ('herbless', 20202),
 ('trink', 20201),
 ('deathful', 20199),
 ('contemns', 20181),
 ('servox', 20171),
 ('unpossessed', 20170),
 ('signalize', 20153),
 ('carvins', 20149),
 ('otaheit', 20146),
 ('miltonic', 20136),
 ('rayless', 20135),
 ('siroc', 20129),
 ('lascia', 20125),
 ('raggiar', 20124),
 ('lombra', 20121),
 ('othair', 20115)]

In [46]:

embedding_matrix_weights = embedding_matrix1
np.shape(embedding_matrix_weights)

(20252, 300)

In [47]:

# Split the data into training ,test and validation set
X_train,X_test,y_train,y_test=train_test_split(data,y,test_size=0.2)

In [48]:
# using BiDirectional LSTM model

def deep_1st():
  model_deep = Sequential()
  model_deep.add(Embedding(vocab_len+1,300,weights=[embedding_matrix_weights],trainable=True,input_length=max_len))
  model_deep.add(SpatialDropout1D(0.2))
  model_deep.add(Bidirectional(LSTM(128,input_shape=(64,1),return_sequences = True)))
  model_deep.add(Bidirectional(LSTM(64,return_sequences=True)))
  model_deep.add(GlobalMaxPool1D())
  model_deep.add(Dense(128,activation='relu'))
  model_deep.add(Dropout(0.5))
  model_deep.add(BatchNormalization())
  model_deep.add(Dense(3,activation='softmax'))

  callbacks = EarlyStopping(monitor='val_loss',patience=5)

  model_deep.compile(optimizer = 'adam',loss='categorical_crossentropy',metrics=['accuracy'])
  return model_deep

In [49]:
kfold = StratifiedKFold(n_splits=5,shuffle=True)
i=1
score=[]
for train_index , test_index in kfold.split(X_train,y_train.argmax(1)):
    print(f'{i} of KFold {kfold.n_splits}')
    X_train1,X_test1 = X_train[train_index],X_train[test_index]
    y_train1,y_test1 = y_train[train_index],y_train[test_index]
    model_1 = deep_1st()
    history = model_1.fit(X_train1,y_train1,batch_size=512,epochs=8,validation_split=0.2)
    print('\n')
    acc = model_1.evaluate(X_test1,y_test1)
    print('Accuracy :  ',acc[1])
    score.append(acc[1])
    print('\n')
    i+=1

1 of KFold 5
Epoch 1/8
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 2s/step - accuracy: 0.4449 - loss: 1.1810 - val_accuracy: 0.4088 - val_loss: 1.0403
Epoch 2/8
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 1s/step - accuracy: 0.6250 - loss: 0.8416 - val_accuracy: 0.4569 - val_loss: 0.9966
Epoch 3/8
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 1s/step - accuracy: 0.7142 - loss: 0.6887 - val_accuracy: 0.5094 - val_loss: 0.9421
Epoch 4/8
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 1s/step - accuracy: 0.7652 - loss: 0.5751 - val_accuracy: 0.6405 - val_loss: 0.8785
Epoch 5/8
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 1s/step - accuracy: 0.8136 - loss: 0.4731 - val_accuracy: 0.7387 - val_loss: 0.7930
Epoch 6/8
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 1s/step - accuracy: 0.8590 - loss: 0.3780 - val_accuracy: 0.7487 - val_loss: 0.7264
Epoch 7/8
[1m20/20[0m [32m━━━━

In [50]:
pred = model_1.predict(X_test)

print(classification_report(y_test.argmax(1),pred.argmax(1)))

[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 101ms/step
              precision    recall  f1-score   support

           0       0.73      0.87      0.79      1580
           1       0.80      0.75      0.77      1113
           2       0.85      0.68      0.76      1205

    accuracy                           0.78      3898
   macro avg       0.79      0.77      0.78      3898
weighted avg       0.79      0.78      0.78      3898



Since the training data set is small , it is preferred to put trainable = False, Let's see what happens if trainable=True

In [51]:
def model_true():

  inp = Input(shape=(max_len,))
  x = Embedding(vocab_len+1, 300, weights=[embedding_matrix_weights], trainable=True)(inp)
  x = SpatialDropout1D(0.3)(x)
  x1 = Bidirectional(LSTM(256, return_sequences=True))(x)
  x2 = Bidirectional(GRU(128, return_sequences=True))(x1)
  max_pool1 = GlobalMaxPool1D()(x1)
  max_pool2 = GlobalMaxPool1D()(x2)
  conc = concatenate([max_pool1, max_pool2])
  x = Dense(128,activation='relu')(conc)
  x = Dropout(0.5)(x)
  predictions = Dense(3, activation='softmax')(x)

  model = Model(inputs=inp, outputs=predictions)
  callbacks = EarlyStopping(monitor='val_loss',patience=3)

  from tensorflow.keras.optimizers import Adam,RMSprop
  adam = Adam()
  model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

In [52]:
kfold=StratifiedKFold(n_splits=5,shuffle=True)
score=[]
i=1
for train_index , test_index in kfold.split(X_train,y_train.argmax(1)):
    print(f'{i} of KFold {kfold.n_splits}')
    X_train_main,X_val = X_train[train_index],X_train[test_index]
    y_train_main,y_val = y_train[train_index],y_train[test_index]
    model_2nd = model_true()
    history = model_2nd.fit(X_train_main,y_train_main,epochs=5,batch_size=128,validation_split=0.2)
    print('\n')
    acc = model_2nd.evaluate(X_val,y_val)
    score.append(acc[1])
    print('Accuracy:  ',acc[1])
    i+=1

1 of KFold 5
Epoch 1/5
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 1s/step - accuracy: 0.4965 - loss: 0.9953 - val_accuracy: 0.6922 - val_loss: 0.7169
Epoch 2/5
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 1s/step - accuracy: 0.7339 - loss: 0.6533 - val_accuracy: 0.7475 - val_loss: 0.6100
Epoch 3/5
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 1s/step - accuracy: 0.8253 - loss: 0.4502 - val_accuracy: 0.7828 - val_loss: 0.5361
Epoch 4/5
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 1s/step - accuracy: 0.8756 - loss: 0.3167 - val_accuracy: 0.7904 - val_loss: 0.5446
Epoch 5/5
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 1s/step - accuracy: 0.9133 - loss: 0.2321 - val_accuracy: 0.7876 - val_loss: 0.5796


[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 131ms/step - accuracy: 0.7692 - loss: 0.6181
Accuracy:   0.7873637080192566
2 of KFold 5
Epoch 1/5
[1m78/78[0m [32m━━

In [53]:
print('Accuracy :  ',np.mean(score))

Accuracy :   0.7875561237335205


In [54]:
pred_2 = model_2nd.predict(X_test)

print(classification_report(y_test.argmax(1),pred_2.argmax(1)))

[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 254ms/step
              precision    recall  f1-score   support

           0       0.81      0.77      0.79      1580
           1       0.79      0.78      0.79      1113
           2       0.76      0.82      0.79      1205

    accuracy                           0.79      3898
   macro avg       0.79      0.79      0.79      3898
weighted avg       0.79      0.79      0.79      3898



The new complex performs the same as our previous model with an accuracy of 85%, and it seems that the recall of some of the classes are better

In [58]:
def model_3():

  sequence_input = Input(shape=(max_len,))
  embedding_layer = Embedding(vocab_len+1,300,weights=[embedding_matrix_weights],trainable = True,input_length=max_len)
  x = embedding_layer(sequence_input)
  x = SpatialDropout1D(0.2)(x)
  x = Bidirectional(LSTM(128,return_sequences=True))(x)
  x = Conv1D(64,kernel_size=2,padding='valid',kernel_initializer="he_uniform")(x)
  avg_pool = GlobalAvgPool1D()(x)
  max_pool = GlobalMaxPool1D()(x)
  x = concatenate([avg_pool,max_pool])
  x = Dense(128,activation='relu')(x)
  x = Dropout(0.5)(x)
  pred = Dense(3,activation='softmax')(x)

  model_3 = Model(sequence_input, pred)
  model_3.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])
  return model_3

In [59]:
kfold=StratifiedKFold(n_splits=5,shuffle=True)
i=1
score=[]
for train_index , test_index in kfold.split(X_train,y_train.argmax(1)):
    print(f'{i} of KFold {kfold.n_splits}')
    X_train_main,X_val = X_train[train_index],X_train[test_index]
    y_train_main,y_val = y_train[train_index],y_train[test_index]
    model_3rd = model_3()
    history = model_3rd.fit(X_train_main,y_train_main,epochs=5,batch_size=512,validation_split=0.2)
    print('\n')
    acc = model_3rd.evaluate(X_val,y_val)
    score.append(acc[1])
    print('\n')
    print('Accuracy:  ',acc[1])
    i+=1

1 of KFold 5
Epoch 1/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 1s/step - accuracy: 0.4158 - loss: 1.0717 - val_accuracy: 0.4557 - val_loss: 1.0372
Epoch 2/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 2s/step - accuracy: 0.5664 - loss: 0.9310 - val_accuracy: 0.5523 - val_loss: 0.9789
Epoch 3/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 2s/step - accuracy: 0.5953 - loss: 0.8875 - val_accuracy: 0.6553 - val_loss: 0.7914
Epoch 4/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 1s/step - accuracy: 0.6490 - loss: 0.7997 - val_accuracy: 0.6806 - val_loss: 0.7573
Epoch 5/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 1s/step - accuracy: 0.6822 - loss: 0.7412 - val_accuracy: 0.6745 - val_loss: 0.7535


[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 117ms/step - accuracy: 0.6911 - loss: 0.7368


Accuracy:   0.679602324962616
2 of KFold 5
Epoch 1/5
[1m20/20[0m [32m━━━━━━

In [63]:
pred = model_3rd.predict(X_test)

print(classification_report(y_test.argmax(1),pred.argmax(1)))

[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 70ms/step
              precision    recall  f1-score   support

           0       0.70      0.76      0.72      1580
           1       0.74      0.61      0.67      1113
           2       0.68      0.71      0.69      1205

    accuracy                           0.70      3898
   macro avg       0.70      0.69      0.69      3898
weighted avg       0.70      0.70      0.70      3898



In [64]:
def model_4():

  inp = Input(shape=(max_len,))
  embedding_layer = Embedding(vocab_len+1,300,weights=[embedding_matrix_weights],trainable=True)
  x = embedding_layer(inp)
  x = SpatialDropout1D(0.2)(x)
  x1 = Bidirectional(LSTM(64,return_sequences=True))(x)
  x1 = Conv1D(64 ,kernel_size=3,padding='same',activation='linear')(x1)
  x1 = BatchNormalization()(x1)
  x1 = Conv1D(64,kernel_size=3,padding='same',activation='linear')(x1)
  x1 = BatchNormalization()(x1)
  x2 = Conv1D(64,kernel_size=1,padding='same',activation='linear')(x)
  xmain = concatenate([x1,x2])
  xmain1 = Conv1D(64,kernel_size=3,padding='same',activation='linear')(xmain)
  xmain1 = BatchNormalization()(xmain1)
  xmain1 = Conv1D(64,kernel_size=3,padding='same',activation='linear')(xmain1)
  xmain1 = BatchNormalization()(xmain1)
  x = concatenate([xmain,xmain1])
  x = GlobalMaxPool1D()(x)
  x = Dense(182,activation='relu')(x)
  x = BatchNormalization()(x)
  x = Dropout(0.5)(x)
  x = Dense(3,activation = 'softmax')(x)

  model_last = Model(inp,x)
  model_last.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=0.001),loss='categorical_crossentropy',metrics=['accuracy'])
  return model_last

In [65]:
kfold=StratifiedKFold(n_splits=5,shuffle=True)
i=1
score=[]
for train_index , test_index in kfold.split(X_train,y_train.argmax(1)):
    print(f'{i} of KFold {kfold.n_splits}')
    X_train_main,X_val = X_train[train_index],X_train[test_index]
    y_train_main,y_val = y_train[train_index],y_train[test_index]
    model_4th = model_4()
    history = model_4th.fit(X_train_main,y_train_main,epochs=5,batch_size=128,validation_split=0.2)
    print('\n')
    acc = model_4th.evaluate(X_val,y_val)
    score.append(acc[1])
    print('\n')
    print('Accuracy:  ',acc[1])
    i+=1
     

1 of KFold 5
Epoch 1/5
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 521ms/step - accuracy: 0.4613 - loss: 1.3522 - val_accuracy: 0.4665 - val_loss: 0.9778
Epoch 2/5
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 546ms/step - accuracy: 0.6568 - loss: 0.8233 - val_accuracy: 0.5651 - val_loss: 0.9025
Epoch 3/5
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 558ms/step - accuracy: 0.7402 - loss: 0.6369 - val_accuracy: 0.6449 - val_loss: 0.8153
Epoch 4/5
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 582ms/step - accuracy: 0.8175 - loss: 0.4565 - val_accuracy: 0.7150 - val_loss: 0.6825
Epoch 5/5
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 542ms/step - accuracy: 0.8617 - loss: 0.3552 - val_accuracy: 0.7403 - val_loss: 0.6520


[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 49ms/step - accuracy: 0.7467 - loss: 0.6205


Accuracy:   0.7581782937049866
2 of KFold 5
Epoch 1/5
[1m78/78[

In [66]:
print('Accuracy :  ' , np.mean(score))

Accuracy :   0.7500962257385254


In [67]:
pred = model_4th.predict(X_test)

print(classification_report(y_test.argmax(1),pred.argmax(1)))

[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 47ms/step
              precision    recall  f1-score   support

           0       0.62      0.94      0.75      1580
           1       0.91      0.49      0.64      1113
           2       0.85      0.63      0.72      1205

    accuracy                           0.72      3898
   macro avg       0.79      0.69      0.70      3898
weighted avg       0.77      0.72      0.71      3898



# Model Ensembling  (Stacking Method)

In [68]:
def stacked_dataset(members, inputX):
	stackX = None
	for model in members:
		yhat = model.predict(inputX, verbose=0)
		# stack predictions into [rows, members, probabilities]
		if stackX is None:
			stackX = yhat
		else:
			stackX = dstack((stackX, yhat))
	# flatten predictions to [rows, members x probabilities]
	stackX = stackX.reshape((stackX.shape[0], stackX.shape[1]*stackX.shape[2]))
	return stackX
 
def fit_stacked_model(members, inputX, inputy):
	# create dataset using ensemble
	stackedX = stacked_dataset(members, inputX)
	model = LogisticRegression()
	model.fit(stackedX, inputy)
	return model
 
# make a prediction with the stacked model
def stacked_prediction(members, model, inputX):
	stackedX = stacked_dataset(members, inputX)
	yhat = model.predict(stackedX)
	return yhat

In [69]:
members = [model_1,model_2nd,model_3rd,model_4th]
for models in members:
  _,acc = models.evaluate(X_test,y_test)
  print('Model Accuracy: ', acc)
 
model = fit_stacked_model(members, X_test, y_test.argmax(1))
yhat = stacked_prediction(members, model, X_test)

print(classification_report(y_test.argmax(1),yhat))

[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 110ms/step - accuracy: 0.7860 - loss: 0.5942
Model Accuracy:  0.778091311454773
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 213ms/step - accuracy: 0.7839 - loss: 0.5870
Model Accuracy:  0.7888661026954651
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 88ms/step - accuracy: 0.7123 - loss: 0.6985
Model Accuracy:  0.6993330121040344
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - accuracy: 0.7257 - loss: 0.6737
Model Accuracy:  0.7165213227272034
              precision    recall  f1-score   support

           0       0.79      0.84      0.82      1580
           1       0.83      0.77      0.80      1113
           2       0.81      0.79      0.80      1205

    accuracy                           0.81      3898
   macro avg       0.81      0.80      0.81      3898
weighted avg       0.81      0.81      0.81      3898

