In [1]:
import numpy as np
from numpy import dstack
import tensorflow as tf
import operator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
import multiprocessing as mp
import string
import en_core_web_sm
import spacy
from random import randrange
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Input,Embedding,Dense,LSTM,GRU,Bidirectional,Dropout,SimpleRNN,GlobalAvgPool1D,GlobalMaxPool1D
from tensorflow.keras.layers import Conv1D,SpatialDropout1D,BatchNormalization,Lambda,Concatenate,concatenate,GlobalMaxPooling1D
from tensorflow.keras.callbacks import  EarlyStopping
from keras.utils import to_categorical

%matplotlib inline

In [2]:
# Fetch the NLTK resources this notebook relies on (tokenizer models,
# stop-word list, WordNet data and the POS tagger).
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Silence every warning raised from this point on.
warnings.filterwarnings('ignore')

# spaCy pipeline loaded from the en_core_web_sm package.
nlp = en_core_web_sm.load()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Marc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Marc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Marc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Marc\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# Load the training data from train.csv (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19579 entries, 0 to 19578
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      19579 non-null  object
 1   text    19579 non-null  object
 2   author  19579 non-null  object
dtypes: object(3)
memory usage: 459.0+ KB


In [5]:

# Removing id column.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: executing it a second time no longer raises KeyError.
df = df.drop(columns='id', errors='ignore')

In [6]:
# Remove outliers: keep only rows whose text has fewer than 100
# whitespace-separated words.
word_counts = df['text'].str.split().map(len)
# .copy() makes the filtered frame independent of the original, so the column
# assignments done in later cells do not operate on a slice
# (the source of pandas' SettingWithCopyWarning).
df = df[word_counts < 100].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19488 entries, 0 to 19578
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    19488 non-null  object
 1   author  19488 non-null  object
dtypes: object(2)
memory usage: 456.8+ KB


## Data Cleaning


In [7]:
# Transformer that preprocesses text: normalization, punctuation removal, stop-word removal and lemmatization


class TextPreprocessing(BaseEstimator,TransformerMixin):
    """
    Text preprocessing transformer includes steps:
        1. Text normalization
        2. Punctuation removal
        3. Stop words removal
        4. Lemmatization

    Parameters
    ----------
    n_jobs : int, default=1
        Parallel jobs to run. ``-1`` (or lower) uses every core reported by
        ``mp.cpu_count()``; ``0`` and ``1`` process the data in the current
        process; larger values are capped at the core count.
    """

    # Characters treated as punctuation by _remove_punct.
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self, n_jobs=1):
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """No state is learned; returns the transformer itself."""
        return self

    def transform(self, X, *_):
        """
        Preprocess every text in ``X`` and return the results.

        ``X`` must support ``.copy()`` and ``.apply()`` (e.g. a pandas Series
        of strings). The input is not modified.
        """
        X_copy = X.copy()
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            workers = cores
        elif self.n_jobs == 0:
            workers = 1
        else:
            workers = min(self.n_jobs, cores)
        # Never create more partitions than there are rows, and always at least one.
        workers = max(1, min(workers, len(X_copy)))

        if workers == 1:
            # A single job gains nothing from a pool; run in-process.
            return X_copy.apply(self._preprocess_text)

        data_split = np.array_split(X_copy, workers)
        # Size the pool to the requested number of jobs and let the context
        # manager release the worker processes even if a worker raises.
        with mp.Pool(workers) as pool:
            return pd.concat(pool.map(self._preprocess_part, data_split))

    def _preprocess_part(self, part):
        """Preprocess one partition of the input (runs inside a worker)."""
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        """Run the full pipeline on one string using the module-level ``nlp``."""
        normalized_text = self._normalize(text)
        doc = nlp(normalized_text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _normalize(self, text):
        """
        Best-effort normalization with the optional ``normalise`` callable.

        The notebook does not import ``normalise``; when no such name exists in
        the module namespace, or when the call fails, the text is returned
        unchanged.
        """
        normaliser = globals().get('normalise')
        if normaliser is None:
            return text
        # some issues in normalise package
        try:
            return ' '.join(normaliser(text, verbose=False))
        except Exception:
            return text

    def _remove_punct(self, doc):
        """Drop tokens whose text consists solely of punctuation characters."""
        # A per-character check also removes multi-character runs such as
        # "..." or "--", which a substring test against string.punctuation misses.
        return [t for t in doc
                if not all(ch in self._PUNCTUATION for ch in t.text)]

    def _remove_stop_words(self, doc):
        """Drop tokens flagged as stop words (``token.is_stop``)."""
        return [t for t in doc if not t.is_stop]

    def _lemmatize(self, doc):
        """Join the tokens' lemmas (``token.lemma_``) with single spaces."""
        return ' '.join([t.lemma_ for t in doc])

In [8]:
# Encode the author labels as integers for easier processing
author_codes = {'EAP': 0, 'HPL': 1, 'MWS': 2}
df['author'] = df['author'].map(author_codes)
df.head()

Unnamed: 0,text,author
0,"This process, however, afforded me no means of...",0
1,It never once occurred to me that the fumbling...,1
2,"In his left hand was a gold snuff box, from wh...",0
3,How lovely is spring As we looked from Windsor...,2
4,"Finding nothing else, not even gold, the Super...",1


#### Using CountVectorizer to convert each sentence into a vector of word counts

In [9]:
# Word-count matrix: one row per sentence, one column per vocabulary word
cv = CountVectorizer()
cv_df = cv.fit_transform(df['text'])

# Fit the TF-IDF weighting on the counts and apply it in one step
tfidf = TfidfTransformer()
tfidf_trans = tfidf.fit_transform(cv_df)

print('Shape of Sparse Matrix: ', cv_df.shape)
print('Amount of Non-Zero occurences: ', cv_df.nnz)
print('Shape of Tfidf Transformed matrix', tfidf_trans.shape)

Shape of Sparse Matrix:  (19488, 24796)
Amount of Non-Zero occurences:  421231
Shape of Tfidf Transformed matrix (19488, 24796)


In [10]:
# Splitting the data into train and test sets (70% / 30%).
# random_state pins the shuffle so the split, and every score computed from it,
# is reproducible after a kernel restart.
X_train,X_test,y_train,y_test = train_test_split(df['text'],df['author'],test_size = 0.3,random_state = 42)

# Neural Networks Model



In [11]:
# NOTE(review): this binds a second name to the same DataFrame object, not a copy,
# so column assignments made through df_neural are also visible through df.
# Use df.copy() here if an independent frame is intended — TODO confirm.
df_neural = df
df_neural.head()

Unnamed: 0,text,author
0,"This process, however, afforded me no means of...",0
1,It never once occurred to me that the fumbling...,1
2,"In his left hand was a gold snuff box, from wh...",0
3,How lovely is spring As we looked from Windsor...,2
4,"Finding nothing else, not even gold, the Super...",1


In [12]:
# English stop words as a set; clean() tests each word for membership in it
stop = set(stopwords.words('english'))
# WordNet lemmatizer instance shared by lemmatizes()
lemmatizer = WordNetLemmatizer()
def convert_nltk_to_wordnet(text):
  """
  Map a POS tag string to the matching WordNet part-of-speech constant.

  Tags starting with 'J', 'N', 'V' or 'R' map to wordnet.ADJ, wordnet.NOUN,
  wordnet.VERB and wordnet.ADV respectively; any other tag yields None.
  """
  prefix_to_pos = (
    ('J', wordnet.ADJ),
    ('N', wordnet.NOUN),
    ('V', wordnet.VERB),
    ('R', wordnet.ADV),
  )
  for prefix, wordnet_pos in prefix_to_pos:
    if text.startswith(prefix):
      return wordnet_pos
  return None
 
def lemmatizes(sentence):
  tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
  wordnet_tagged = map(lambda x : (x[0] , convert_nltk_to_wordnet(x[1])) , tagged)
  lemmatized_sentence = []
  for word , tag in wordnet_tagged:
    if tag is None:
      lemmatized_sentence.append(word)
    else:
      lemmatized_sentence.append(lemmatizer.lemmatize(word,tag))
  return ' '.join(lemmatized_sentence)

# Contraction expansions applied by clean(), in this order, to lower-cased text.
# Keys are plain substrings (no regex syntax), so "he'll" also expands "she'll".
_CONTRACTIONS = (
  ("aren't", "are not"),
  ("can't", "cannot"),
  ("don't", "do not"),
  ("couldn't", "could not"),
  ("doesn't", "does not"),
  ("hadn't", "had not"),
  ("wouldn't", "would not"),
  ("he'll", "he will"),
  ("what've", "what have"),
  ("who'd", "who would"),
  ("who'll", "who will"),
  # Lower-case key: the text is lower-cased before expansion, so the
  # original "I'll" pattern could never match.
  ("i'll", "i will"),
  ("you'd", "you would"),
  ("you'll", "you will"),
  ("you're", "you are"),
  ("you've", "you have"),
  ("wasn't", "was not"),
  ("that's", "that is"),
  ("they've", "they have"),
  ("they're", "they are"),
  ("what's", "what is"),
  ("what're", "what are"),
  ("what'll", "what will"),
  ("there's", "there is"),
  ("it's", "it is"),
  ("it'll", "it will"),
  ("could've", "could have"),
  ("shouldn't", "should not"),
  ("should've", "should have"),
  ("shan't", "shall not"),
  ("won't", "will not"),
  ("we'd", "we would"),
  ("weren't", "were not"),
)

def clean(text):
  """
  Normalise one sentence for the neural-network models.

  Steps: lower-case, expand the contractions listed in _CONTRACTIONS, drop
  every character that is not a letter, '/', '.' or whitespace, remove the
  words found in the module-level `stop` set, then lemmatize with
  lemmatizes().

  text - the raw sentence (str)
  Returns the cleaned sentence as a single space-joined string.
  """
  # NOTE(review): this pattern removes each '/' together with the character
  # that follows it; kept as-is because the intent is unclear — TODO confirm.
  text = re.sub('/.',' ',text)
  text = text.lower()
  for contraction, expansion in _CONTRACTIONS:
    text = text.replace(contraction, expansion)
  # Raw string avoids the invalid '\s' escape of a normal string literal.
  text = re.sub(r'[^A-Za-z/.\s]','',text)
  words = [word for word in text.split() if word not in stop]
  return lemmatizes(' '.join(words))

# Clean every sentence, then build the categorical target matrix from the integer author labels
df_neural['text'] = df_neural['text'].apply(clean)
y = to_categorical(df['author'])
df_neural.head()

Unnamed: 0,text,author
0,process however afford mean ascertain dimensio...,0
1,never occur fumble might mere mistake .,1
2,left hand gold snuff box caper hill cut manner...,0
3,lovely spring look windsor terrace sixteen fer...,2
4,find nothing else even gold superintendent aba...,1


In [13]:
def get_embedding(name,word_index,vocab_len,dim):
  """
  Load word vectors from a text file and build an embedding matrix.

  name       - path of a UTF-8 text file with one entry per line:
               the word followed by its whitespace-separated coefficients
  word_index - mapping of word -> integer row index
  vocab_len  - largest row index to fill; the matrix has vocab_len+1 rows
  dim        - number of coefficients per word (matrix columns)

  Returns (embedding_matrix, embedding_index): the (vocab_len+1, dim) matrix,
  whose rows stay zero for words missing from the file, and the dict of every
  word -> float32 vector read from the file.
  """
  embedding_index = {}
  # The context manager closes the file even if a line fails to parse.
  with open(name,encoding='utf-8') as f:
    for line in f:
      values = line.split()
      if not values:
        # Blank line (e.g. at the end of the file): nothing to read.
        continue
      embedding_index[values[0]] = np.asarray(values[1:],dtype='float32')

  embedding_matrix = np.zeros((vocab_len+1,dim))
  for word,index in word_index.items():
    if index > vocab_len:
      # Skip only this word; stopping the loop here would drop every later
      # in-range word whenever word_index is not sorted by index.
      continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
      embedding_matrix[index] = embedding_vector
  return embedding_matrix,embedding_index

In [14]:
# Cleaned sentences as a NumPy array
corpus = np.asarray(df_neural['text'])
# num_words=21000 limits how many distinct words the tokenizer uses when
# converting texts to sequences
tokenizer = Tokenizer(num_words=21000)
tokenizer.fit_on_texts(corpus)
# One list of integer word ids per sentence
sequences = tokenizer.texts_to_sequences(corpus)
# padding='pre': padding is added at the start of each sequence
data = pad_sequences(sequences=sequences,padding='pre')
# Number of entries in the tokenizer's word_index, plus one
# NOTE(review): presumably the +1 accounts for the padding index 0 — confirm
vocab_len = len(tokenizer.word_index)+1
# Length of the first padded row, used as the sequence length
max_len = len(data[0])