## DNN-based all-class classifier

In [None]:
!pip install -U keras
!pip install -U tensorflow
!pip install -U pandas
!pip install -U scikit-learn

#### Load the required libraries

In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler,LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input,Embedding,Dense,Flatten,concatenate
from keras.models import Model

from IPython.display import Image

### Flowchart of the problem approach

In [None]:
# Display the flowchart picture loaded from the relative path below as the
# output of this cell.
Image(filename='keras_func_api.png')

## Prep Data

In [2]:
# All three files are parsed with the same settings: comma separated, UTF-8
# encoded, and without using any column as the index.
csv_options = dict(delimiter=",", encoding='utf-8', index_col=False)

data = pd.read_csv("train.csv", **csv_options)        # training split
test_csv = pd.read_csv("test.csv", **csv_options)     # test split
valid_csv = pd.read_csv("valid.csv", **csv_options)   # validation split

#### add length feature

In [4]:
# Add a 'Text_len' column holding len() of every 'Text' entry to each split.
# Like the per-row loop it replaces, this raises if an entry has no len()
# (e.g. a missing value parsed as NaN).
for frame in (data, test_csv, valid_csv):
    frame['Text_len'] = frame["Text"].map(len)

In [5]:
# Name of the label column. NOTE(review): the cells visible in this notebook
# spell the column name out literally instead of reading this variable.
target_attr = 'ParagraphType'

#### Split each dataset (train, validation, test) into numerical features, text and target

In [6]:
# Each split is separated into three single-column DataFrames (double
# brackets keep them two-dimensional): numeric feature, raw text, label.

# Training split
data_numerical_train = data[['Text_len']]
data_string_train = data[["Text"]]
Y_train = data[["ParagraphType"]]

# Validation split
data_numerical_valid = valid_csv[['Text_len']]
data_string_valid = valid_csv[["Text"]]
Y_valid = valid_csv[["ParagraphType"]]

# Test split
data_numerical_test = test_csv[['Text_len']]
data_string_test = test_csv[["Text"]]
Y_test = test_csv[["ParagraphType"]]

#### Target variable

In [7]:
# Show the distinct label values that occur in the training data.
data['ParagraphType'].unique()

array(['TI', 'AB', 'H1', 'PA', 'H2', 'LI', 'BY', 'H3', 'HA', 'CO'],
      dtype=object)

In [8]:
# Number of distinct training labels; this is the width of the softmax output
# layer. dropna=False keeps a missing label counted as its own level, exactly
# as taking len() of .unique() would.
no_of_levels = data['ParagraphType'].nunique(dropna=False)

There are 10 different classes, so the target is one-hot encoded into 10 columns.

In [9]:
from sklearn.preprocessing import OneHotEncoder

In [10]:
# Encoder used to one-hot encode the labels. With handle_unknown='ignore',
# scikit-learn documents that categories unseen during fit are encoded as an
# all-zero row at transform time instead of raising an error.
onehotencoder = OneHotEncoder(handle_unknown='ignore')

In [11]:
# The Y_* objects were selected with double brackets and are therefore already
# DataFrames; these calls only re-wrap them so the encoder receives 2-D input.
Y_train = pd.DataFrame(Y_train)
Y_test = pd.DataFrame(Y_test)
Y_valid = pd.DataFrame(Y_valid)

In [12]:
OneHotEncoder = onehotencoder.fit(Y_train)

In [13]:
OneHotEncoder_target_train = OneHotEncoder.transform(Y_train).toarray()
OneHotEncoder_target_test = OneHotEncoder.transform(Y_test).toarray()
OneHotEncoder_target_valid = OneHotEncoder.transform(Y_valid).toarray()

In [14]:
# Sanity check: (number of test rows, number of one-hot columns).
OneHotEncoder_target_test.shape

(4985, 10)

In [15]:
# Inspect the one-hot vector of the first training label.
OneHotEncoder_target_train[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

### Pre-Processing of Text

#### I used 1000 tokens as the maximum length of a paragraph

#### Tokenize the words

In [42]:
# Number of tokens kept per paragraph. Defined once here instead of repeating
# the literal in every pad_sequences call; it has to equal the `shape` given
# to the text Input layer of the model.
MAX_PARAGRAPH_TOKENS = 1000

# The vocabulary is learned from the training paragraphs only; the same
# tokenizer then converts every split into lists of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_string_train['Text'])
review_text_train = tokenizer.texts_to_sequences(data_string_train['Text'])
review_text_test = tokenizer.texts_to_sequences(data_string_test['Text'])
review_text_valid = tokenizer.texts_to_sequences(data_string_valid['Text'])

word_index_review_text = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index_review_text))
# One extra row on top of the vocabulary size: the embedding matrix built
# later is indexed directly by the tokenizer's word indices.
# NOTE(review): presumably index 0 is reserved for padding -- confirm against
# the Keras Tokenizer documentation.
NUM_WORDS_REVIEW_TEXT = len(word_index_review_text)+1

# Bring every sequence to exactly MAX_PARAGRAPH_TOKENS entries (pad_sequences
# defaults are used for the padding/truncating side).
review_text_seq_train = pad_sequences(review_text_train, maxlen=MAX_PARAGRAPH_TOKENS)
review_text_seq_test = pad_sequences(review_text_test, maxlen=MAX_PARAGRAPH_TOKENS)
review_text_seq_valid = pad_sequences(review_text_valid, maxlen=MAX_PARAGRAPH_TOKENS)


Found 232531 unique tokens.


###### Load the GloVe word embedding file into memory as a dictionary of word to embedding array.

__Note__: Filter the embedding for the unique words in the training data.

In [17]:
# Load the whole GloVe embedding file into memory as {word: float32 vector}.
embeddings_index = {}
# The encoding is given explicitly because open() otherwise falls back to the
# platform's locale encoding, which can fail on non-ASCII tokens; the `with`
# block guarantees the file is closed even if parsing a line raises.
with open('glove.6B.200d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ...", separated by whitespace.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


#### Next, create a matrix of one embedding for each word in the training dataset. We can do that by enumerating all unique words in the Tokenizer.word_index and locating the embedding weight vector from the loaded GloVe embedding.

#### The result is a matrix of weights only for words we will see during training.

#### Also count the words that are not present in GloVe, to decide whether the embedding layer needs to be trained


In [18]:
# Build the weight matrix for the embedding layer: row k holds the GloVe
# vector of the word whose tokenizer index is k. Rows of words without a
# GloVe vector stay all-zero, and those words are collected for inspection.
review_embedding_matrix = np.zeros((NUM_WORDS_REVIEW_TEXT,200))
review_word_not_in_glove = []
for token, row_idx in tokenizer.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        review_word_not_in_glove.append(token)
        continue
    review_embedding_matrix[row_idx] = glove_vector
# Number of vocabulary words that have no GloVe vector.
review_word_not_in_glove_count = len(review_word_not_in_glove)

In [None]:
# Print the embedding weight matrix built above for a visual check.
print(review_embedding_matrix)

In [None]:
# Show only a sample of the words without a GloVe vector: the full list can
# be very long and would flood the notebook output. The total is available in
# review_word_not_in_glove_count.
print(review_word_not_in_glove[:50])

In [None]:
# Number of vocabulary words without a GloVe vector; the model cell uses this
# count to decide whether the embedding layer is trainable.
print(review_word_not_in_glove_count)

## 3.a 10-class classifier

#### Embedding layer for Paragraph Text

#### If more than one word in the training data is not present in GloVe, then train the embedding layer

In [20]:
# Numeric branch of the model: one input per column of data_numerical_train
# (only 'Text_len' here), followed by a 64-unit ReLU layer.
# NOTE(review): Text_len is passed in unscaled; MinMaxScaler is imported but
# not used in the visible cells -- consider scaling this feature.
num_cat_inputs = Input(shape=(data_numerical_train.shape[1],),name='num_cat_inputs')
out_num_cat = Dense(64, activation='relu')(num_cat_inputs)

In [23]:
# Text branch: padded sequences of 1000 word indices.
text_input= Input(shape=(1000,),name='text_input')

# The embedding layer starts from the GloVe-based weight matrix. It is kept
# frozen when at most one vocabulary word is missing from GloVe and is trained
# together with the rest of the network otherwise.
text_embed = Embedding(
    input_dim=NUM_WORDS_REVIEW_TEXT,
    output_dim=200,
    weights=[review_embedding_matrix],
    trainable=(review_word_not_in_glove_count > 1),
)(text_input)

# Flatten the (tokens, embedding dim) output into one vector per paragraph.
out_text = Flatten()(text_embed)

#### Concatenate the output of above layers.

In [24]:
# Join the numeric and the text branch, pass them through a small 8-unit ReLU
# layer and finish with a softmax over the no_of_levels classes.
concatenated = concatenate([out_num_cat,out_text])
X = Dense(8, activation='relu')(concatenated)
final_out = Dense(no_of_levels, activation='softmax')(X)

In [25]:
# Two-input model. The order of `inputs` (numeric first, text second) is the
# order in which arrays are passed to fit() and predict() below.
model = Model(inputs=[num_cat_inputs,text_input], outputs=final_out)

In [26]:
# Print the layer overview (output shapes, parameter counts, connections).
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text_input (InputLayer)         [(None, 1000)]       0                                            
__________________________________________________________________________________________________
num_cat_inputs (InputLayer)     [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1000, 200)    46506400    text_input[0][0]                 
__________________________________________________________________________________________________
dense (Dense)                   (None, 64)           128         num_cat_inputs[0][0]             
______________________________________________________________________________________________

In [27]:
# Categorical cross-entropy matches the one-hot encoded targets and the
# softmax output; accuracy is tracked as the reporting metric.
model.compile(loss='categorical_crossentropy', optimizer='adagrad', metrics=['accuracy'])

In [40]:
# Train for 10 epochs on the training split; inputs are passed in the same
# order as the model's `inputs` list (numeric feature, padded text).
# NOTE(review): validation_split holds out 20% of the training data, while
# the separate validation frame loaded from valid.csv is not used here --
# consider passing it via validation_data instead.
model.fit([data_numerical_train,review_text_seq_train], 
          y=OneHotEncoder_target_train, 
          epochs=10,validation_split=0.20)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f66c3ac0>

## 3.b accuracy and confusion matrix on validation dataset?

In [60]:
# Class probabilities for every validation paragraph (one row per paragraph,
# one column per class of the softmax output).
pred_valid=model.predict([data_numerical_valid,review_text_seq_valid])
# Alternative left disabled by the author: let Keras compute loss/accuracy
# on the validation split directly.
# model.evaluate([data_numerical_valid,review_text_seq_valid], 
#                y=OneHotEncoder_target_valid)

In [67]:
from sklearn.metrics import accuracy_score

# Reduce the one-hot targets and the predicted probabilities to class indices
# once, then reuse them for both the accuracy and the confusion table.
# The indices are column positions of the one-hot encoding, not label names.
actual_valid_idx = OneHotEncoder_target_valid.argmax(axis=1)
predicted_valid_idx = pred_valid.argmax(axis=1)

print('Accuracy: ', accuracy_score(actual_valid_idx, predicted_valid_idx))
# Confusion table (rows: actual class, columns: predicted class) with totals.
pd.crosstab(actual_valid_idx, predicted_valid_idx, rownames=['Actual'], colnames=['Predicted'], margins=True)

Accuracy:  0.742952603885321


Predicted,0,1,3,4,7,8,9,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,0,2,0,8,599,0,609
1,0,411,18,40,317,0,120,906
2,0,0,16,7,81,15,3,122
3,0,2,7610,96,654,15,30,8407
4,0,7,2089,284,1081,30,198,3689
5,0,1,248,99,106,1,23,478
6,0,0,79,21,0,0,21,121
7,0,12,919,197,5802,1356,163,8449
8,3,30,525,249,1778,22625,216,25426
9,0,47,277,184,862,3,403,1776


## 3.c Explain

I chose a feed-forward neural network with two inputs: (1) the length of the paragraph and (2) a word-embedding representation of the paragraph, limited to a maximum of 1000 words. Shorter paragraphs are padded and longer ones are truncated; I chose 1000 because almost all paragraph lengths are covered by this maximum. Using word embeddings also addresses a major problem: similar words lie close together in the embedding space, which improves the results.

#### Improvements
We can do hyperparameter optimization; we can also resample the training data to reduce the class imbalance in the input samples.
I do not think that more complex networks such as LSTMs or transformers would improve performance much, because the most important features are the length of the paragraph and a few important keywords such as "references" or "bibliography" and their occurrences in each text.
#### That is the reason the bag-of-words classification model for the binary classifier in Question 2 worked very well.
#### Along the same lines, if I had had more time I would have built a bag-of-words model using GloVe, which would likely have improved the results further.

## 3.d new test.csv

In [44]:
# Class probabilities for every paragraph of the test split; inputs are given
# in the same order as during training (numeric feature, padded text).
pred=model.predict([data_numerical_test,review_text_seq_test])

In [50]:
# Index of the most probable class for every test paragraph.
y_classes=pred.argmax(axis=1)
# Translate the indices back to label strings through the categories learned
# by the fitted encoder (categories_[0] belongs to the single label column).
# This replaces feeding raw probabilities to inverse_transform, which is
# documented for one-hot encoded input. The (n, 1) column shape that
# inverse_transform returned is kept so downstream flattening still works.
test_preds = onehotencoder.categories_[0][y_classes].reshape(-1, 1)

In [59]:
# Flatten the (n, 1) array of decoded labels into a plain list, one label per
# test row, and store it as the predicted paragraph type.
new_results_for_test = test_preds.ravel().tolist()
test_csv['ParagraphType'] = new_results_for_test
# Remove the helper feature before saving. The keyword form is required:
# passing the axis positionally (drop('Text_len', 1)) was deprecated in
# pandas 1.3 and raises a TypeError from pandas 2.0 on.
test_csv = test_csv.drop(columns='Text_len')
# NOTE(review): this overwrites the input file "test.csv" with the predicted
# labels, so a later Restart & Run All reads the predictions back as input --
# keep a copy of the original file or confirm that overwriting is intended.
test_csv.to_csv("test.csv", index=False)
