In [49]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# lets import some stuff
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import Constant
from keras.models import Model
from keras.layers import *
from keras.utils.np_utils import to_categorical
import re
from keras import regularizers

import matplotlib.pyplot as plt
import os
%matplotlib inline

In [50]:
# confirm  the GPU
from torch import cuda
assert cuda.is_available()
assert cuda.device_count() > 0
print(cuda.get_device_name(cuda.current_device()))

NVIDIA GeForce RTX 2060


In [51]:
max_features = 10000 # this is the number of words we care about

In [52]:
# read training dataset without rule based feature

df = pd.read_csv(r'C:\Users\nitis\Sentence classification\theis_final\preprocessed_train.csv', index_col=0)
df = df.drop_duplicates(subset='sentence', keep="first")
#df = df[1:100]
df

Unnamed: 0,sentence,section_nr,has_citation,last_section_title,Labels
0,This live defined as 1 00 Defination,1,0.0,Defination,Defination
1,A sunshine be a maven that shine 1 00 Defination,1,0.0,Defination,Defination
2,e set galaxy as the group of star 1 00 Defination,1,0.0,Defination,Defination
3,e ask that these were in all likeliness stimul...,1,0.0,Hypothesis,Hypothesis
4,Information technology could trace theorise th...,4,0.0,Hypothesis,Hypothesis
...,...,...,...,...,...
18147,we therefore conclude that the issue of experi...,70,0.0,functional magnetic resonance imaging data ana...,Emperical Result
18149,these new insight we discovered open possiblen...,80,1.0,contributions to research,Future work
18153,as users tend to behave impulsively with mobil...,87,1.0,limitations and future topics,limitation
18154,tertiary using yes no resolution choice for e...,87,0.0,limitations and future topics,limitation


In [53]:
df['l'] = df['sentence'].apply(lambda x: len(str(x).split(' ')))
print("mean length of sentence: " + str(df.l.mean()))
print("max length of sentence: " + str(df.l.max()))
print("std dev length of sentence: " + str(df.l.std()))

mean length of sentence: 27.707003175664383
max length of sentence: 141
std dev length of sentence: 16.051585128406025


In [54]:
sequence_length = 141

In [55]:
#get test data without rule based features
cols = ['sentence', 'section_nr','has_citation','last_section_title','Labels']
cols1 = ['sentence', 'section_nr','has_citation','last_section_title']
test = pd.read_csv(r'C:\Users\nitis\Sentence classification\theis_final\preprocessed_test.csv')
test = test[cols]
test

Unnamed: 0,sentence,section_nr,has_citation,last_section_title,Labels
0,The aim of this thesis was to gain an understa...,1,0,Research Aim,Aim
1,The aim is to develop finite element models us...,1,0,Research Objective,Aim
2,Determine the relationship between the size of...,1,0,Research Objective,Aim
3,Investigate the influence of nonuniform cup su...,1,0,Research Objective,Aim
4,Examine the influence of errors during reaming...,1,0,Research Objective,Aim
...,...,...,...,...,...
577,the moment of enjoyment on the pattern to rais...,39,0,hypothesis outcome,Hypothesis Result
578,Information technology substantiate the hypoth...,10,0,hypothesis outcome,Hypothesis Result
579,information engineering science rejects the su...,10,0,hypothesis outcome,Hypothesis Result
580,the supposition stern follow agreed on the cor...,10,0,hypothesis outcome,Hypothesis Result


In [56]:
sentences_train = df["sentence"].values
sentences_test = test["sentence"].values

y_train =  pd.get_dummies(df['Labels']).values
y_test  = pd.get_dummies(test['Labels']).values

In [57]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

print(sentences_train[2])
print(X_train[2])

e set galaxy as the group of star 1 00 Defination
[168, 208, 4201, 15, 1, 247, 3, 1230, 9, 2, 1945]


In [58]:
X_train = pad_sequences(X_train, padding='post', maxlen=sequence_length)
X_test = pad_sequences(X_test, padding='post', maxlen=sequence_length)

print(X_train[0, :])

[  16  345  533   15    9    2 1945    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0]


#  Model 1: Random embeddings

Lets build our model. In general I'm going to just use the same hyperparameters as Kim does apart from the embedding dimension

Keras has an Embedding layer we can use here. If you don't specify a custom way to embed text (something we will do later with w2v) Keras will do it randomly with a normal (Gaussian) distribution for you


In [59]:
embedding_dim = 200 # Kim uses 300 here
num_filters = 100

inputs = Input(shape=(sequence_length,), dtype='int32')

# use a random embedding for the text
embedding_layer = Embedding(input_dim=max_features, output_dim=embedding_dim, input_length=sequence_length)(inputs)

reshape = Reshape((sequence_length, embedding_dim, 1))(embedding_layer)

# Note the relu activation which Kim specifically mentions
# He also uses an l2 constraint of 3
# Also, note that the convolution window acts on the whole 200 dimensions - that's important
conv_0 = Conv2D(num_filters, kernel_size=(3, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(4, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(5, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape)

# perform max pooling on each of the convoluations
maxpool_0 = MaxPool2D(pool_size=(sequence_length - 3 + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(sequence_length - 4 + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(sequence_length - 5 + 1, 1), strides=(1,1), padding='valid')(conv_2)

# concat and flatten
concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)

# do dropout and predict
dropout = Dropout(0.5)(flatten)
output = Dense(units=12, activation='softmax')(dropout)

In [60]:
model = Model(inputs=inputs, outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 141)]        0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 141, 200)     2000000     ['input_3[0][0]']                
                                                                                                  
 reshape_2 (Reshape)            (None, 141, 200, 1)  0           ['embedding_2[0][0]']            
                                                                                                  
 conv2d_6 (Conv2D)              (None, 139, 1, 100)  60100       ['reshape_2[0][0]']              
                                                                                            

In [61]:
batch_size = 50 # Kim uses 50 here, I have a slightly smaller sample size than num
history = model.fit(X_train, y_train, epochs=5, batch_size=batch_size, verbose=1, validation_split=0.1, shuffle=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [62]:
y_hat = model.predict(X_test)

In [63]:
accuracy_score(list(map(lambda x: np.argmax(x), y_test)), list(map(lambda x: np.argmax(x), y_hat)))

0.4725085910652921

In [64]:
confusion_matrix(list(map(lambda x: np.argmax(x), y_test)), list(map(lambda x: np.argmax(x), y_hat)))

array([[ 0,  0, 12,  0,  0,  0,  0,  0,  0,  0,  6,  0],
       [ 0, 56,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0],
       [ 0,  3,  3,  0,  0,  0,  0,  0,  0,  0, 19,  0],
       [ 0,  0,  5, 30,  5,  0,  0,  3,  0,  1,  0,  2],
       [ 0,  5, 28,  3, 42,  1,  0,  2,  0,  0,  1, 14],
       [ 0,  0,  0,  2,  5, 38,  0,  1,  0,  0, 24,  0],
       [ 0,  0,  0,  2, 10, 31,  0,  0,  0,  0,  0,  0],
       [ 0,  1,  4,  0,  0,  0,  0, 37,  0,  0,  1,  0],
       [ 0,  0,  2,  3,  0,  0,  0,  0,  0,  0, 19,  0],
       [ 0,  0, 34,  2,  0,  0,  0,  6,  0,  0, 12,  0],
       [ 0,  0,  0,  0,  2,  0,  0,  0,  0,  0, 65,  0],
       [ 0,  1,  5,  0, 22,  3,  0,  3,  0,  0,  1,  4]], dtype=int64)

#  Model 2: Static word2vec
Now rather than randomly assign vectors we're going use w2v embeddings.

In [74]:
embeddings_index = {}
f = open(os.path.join('C:\\Users\\nitis\\OneDrive\\Desktop\\thesis', 'glove.6B.200d.txt'), encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [75]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 10958 unique tokens.


In [76]:
num_words = min(max_features, len(word_index)) + 1
print(num_words)

# first create a matrix of zeros, this is our embedding matrix
embedding_matrix = np.zeros((num_words, embedding_dim))

# for each word in out tokenizer lets try to find that work in our w2v model
for word, i in word_index.items():
    if i > max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # we found the word - add that words vector to the matrix
        embedding_matrix[i] = embedding_vector
    else:
        # doesn't exist, assign a random vector
        embedding_matrix[i] = np.random.randn(embedding_dim)

10001


In [77]:
inputs_2 = Input(shape=(sequence_length,), dtype='int32')

# note the `trainable=False`, later we will make this layer trainable
embedding_layer_2 = Embedding(num_words,
                            embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=sequence_length,
                            trainable=False)(inputs_2)

reshape_2 = Reshape((sequence_length, embedding_dim, 1))(embedding_layer_2)

conv_0_2 = Conv2D(num_filters, kernel_size=(3, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape_2)
conv_1_2 = Conv2D(num_filters, kernel_size=(4, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape_2)
conv_2_2 = Conv2D(num_filters, kernel_size=(5, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape_2)

maxpool_0_2 = MaxPool2D(pool_size=(sequence_length - 3 + 1, 1), strides=(1,1), padding='valid')(conv_0_2)
maxpool_1_2 = MaxPool2D(pool_size=(sequence_length - 4 + 1, 1), strides=(1,1), padding='valid')(conv_1_2)
maxpool_2_2 = MaxPool2D(pool_size=(sequence_length - 5 + 1, 1), strides=(1,1), padding='valid')(conv_2_2)

concatenated_tensor_2 = Concatenate(axis=1)([maxpool_0_2, maxpool_1_2, maxpool_2_2])
flatten_2 = Flatten()(concatenated_tensor_2)

dropout_2 = Dropout(0.5)(flatten_2)
output_2 = Dense(units=12, activation='softmax')(dropout_2)

In [78]:
model_2 = Model(inputs=inputs_2, outputs=output_2)
model_2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_2.summary())

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 141)]        0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, 141, 200)     2000200     ['input_5[0][0]']                
                                                                                                  
 reshape_4 (Reshape)            (None, 141, 200, 1)  0           ['embedding_4[0][0]']            
                                                                                                  
 conv2d_12 (Conv2D)             (None, 139, 1, 100)  60100       ['reshape_4[0][0]']              
                                                                                            

In [79]:
batch_size = 50
history_2 = model_2.fit(X_train, y_train, epochs=5, batch_size=batch_size, verbose=1, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [80]:
y_hat_2 = model_2.predict(X_test)

In [81]:
accuracy_score(list(map(lambda x: np.argmax(x), y_test)), list(map(lambda x: np.argmax(x), y_hat_2)))

0.13745704467353953

In [82]:
confusion_matrix(list(map(lambda x: np.argmax(x), y_test)), list(map(lambda x: np.argmax(x), y_hat_2)))

array([[ 6,  0,  0,  0,  0,  0,  0,  0,  0, 12,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 57,  0,  0],
       [ 9,  0,  0,  0,  0,  0,  0,  0,  0, 16,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 46,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 96,  0,  0],
       [ 2,  0,  0,  0,  0,  4,  0,  0,  0, 64,  0,  0],
       [ 0,  0,  0,  0,  0,  2,  0,  0,  0, 41,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 19,  0, 24,  0,  0],
       [ 8,  0,  0,  0,  0,  0,  0,  0,  0, 16,  0,  0],
       [ 3,  0,  0,  0,  0,  0,  0,  0,  0, 51,  0,  0],
       [ 5,  0,  0,  0,  0,  0,  0,  0,  0, 62,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 39,  0,  0]], dtype=int64)

# Model 3: w2v with trainable embeddings

For this model we're going to try the same model again, but this time make the embeddings trainable. That means if during training the model decides on a better embedding for a word then it'll update it



In [83]:
inputs_3 = Input(shape=(sequence_length,), dtype='int32')
embedding_layer_3 = Embedding(num_words,
                            embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=sequence_length,
                            trainable=True)(inputs_3)

reshape_3 = Reshape((sequence_length, embedding_dim, 1))(embedding_layer_3)

# note the relu activation
conv_0_3 = Conv2D(num_filters, kernel_size=(3, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape_3)
conv_1_3 = Conv2D(num_filters, kernel_size=(4, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape_3)
conv_2_3 = Conv2D(num_filters, kernel_size=(5, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape_3)

maxpool_0_3 = MaxPool2D(pool_size=(sequence_length - 3 + 1, 1), strides=(1,1), padding='valid')(conv_0_3)
maxpool_1_3 = MaxPool2D(pool_size=(sequence_length - 4 + 1, 1), strides=(1,1), padding='valid')(conv_1_3)
maxpool_2_3 = MaxPool2D(pool_size=(sequence_length - 5 + 1, 1), strides=(1,1), padding='valid')(conv_2_3)

concatenated_tensor_3 = Concatenate(axis=1)([maxpool_0_3, maxpool_1_3, maxpool_2_3])
flatten_3 = Flatten()(concatenated_tensor_3)

dropout_3 = Dropout(0.5)(flatten_3)
output_3 = Dense(units=12, activation='softmax')(dropout_3)


In [84]:
model_3 = Model(inputs=inputs_3, outputs=output_3)
model_3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_3.summary())

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, 141)]        0           []                               
                                                                                                  
 embedding_5 (Embedding)        (None, 141, 200)     2000200     ['input_6[0][0]']                
                                                                                                  
 reshape_5 (Reshape)            (None, 141, 200, 1)  0           ['embedding_5[0][0]']            
                                                                                                  
 conv2d_15 (Conv2D)             (None, 139, 1, 100)  60100       ['reshape_5[0][0]']              
                                                                                            

In [85]:

batch_size = 50
history_3 = model_3.fit(X_train, y_train, epochs=5, batch_size=batch_size, verbose=1, validation_split=0.2)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [86]:
y_hat_3 = model_3.predict(X_test)

In [87]:
accuracy_score(list(map(lambda x: np.argmax(x), y_test)), list(map(lambda x: np.argmax(x), y_hat_3)))

0.48625429553264604

In [88]:
confusion_matrix(list(map(lambda x: np.argmax(x), y_test)), list(map(lambda x: np.argmax(x), y_hat_3)))

array([[ 6,  0,  0,  0,  0,  0,  0,  0,  0,  2, 10,  0],
       [ 0, 56,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0],
       [11,  3,  3,  0,  0,  0,  0,  0,  0,  4,  4,  0],
       [ 0,  0,  0,  0,  2,  0,  0,  3,  0, 41,  0,  0],
       [ 0,  3,  0,  0, 44,  0,  0,  0,  0,  3, 23, 23],
       [19,  0,  0,  0,  0, 35,  0,  2,  0,  9,  5,  0],
       [ 0,  0,  0,  0,  1, 30,  0,  0,  0, 12,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 39,  0,  3,  1,  0],
       [18,  0,  0,  0,  0,  0,  0,  0,  0,  0,  6,  0],
       [ 3,  0,  0,  0,  0,  0,  0,  6,  0,  2,  9, 34],
       [ 6,  0,  0,  0,  0,  0,  0,  0,  0,  2, 59,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 39]], dtype=int64)

In [89]:
print("CNN random       : " + str(accuracy_score(list(map(lambda x: np.argmax(x), y_test)), list(map(lambda x: np.argmax(x), y_hat)))))
print("CNN static       : " + str(accuracy_score(list(map(lambda x: np.argmax(x), y_test)), list(map(lambda x: np.argmax(x), y_hat_2)))))
print("CNN trainable    : " + str(accuracy_score(list(map(lambda x: np.argmax(x), y_test)), list(map(lambda x: np.argmax(x), y_hat_3)))))

CNN random       : 0.4725085910652921
CNN static       : 0.13745704467353953
CNN trainable    : 0.48625429553264604
