In [1]:
import keras 
import xlrd
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, Dropout, Embedding, LSTM, Flatten
from keras.models import Model
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the claims spreadsheet into a DataFrame; later cells read its
# `Claims` (text) and `Scope` (label) columns.
fgs = pd.read_excel('Claim-5.xlsx')

In [3]:
# Preview the first rows to sanity-check the loaded columns.
fgs.head()

Unnamed: 0,Claims,Scope
0,1. A system for synchronizing time of arrival ...,UE
1,4. A system for synchronizing a wireless comm...,UE
2,10. A system for synchronizing time of arriva...,UE
3,1. A method of predicting data comprising: re...,UE
4,"11. A method of providing predicted data, sai...",UE


In [4]:
# Class distribution of the label column (how many claims per Scope value).
fgs.Scope.value_counts()

UE               2316
Network          1587
UE OR Network     937
UE & Network      144
Other               4
Name: Scope, dtype: int64

In [6]:
# Encode each Scope label as an integer category code and store it in a
# new `target` column, which is used as the training label below.
fgs['target'] = fgs.Scope.astype('category').cat.codes

In [7]:
# Integer label vector for training, plus the number of distinct Scope
# labels (used later as the width of the softmax output layer).
y = fgs['target'].values
num_class = np.unique(fgs['Scope'].values).size

In [8]:
# Turn each claim text into a fixed-length sequence of integer word ids.
MAX_LENGTH = 500  # every sequence is padded/truncated to this many tokens

claim_texts = fgs['Claims'].values

# Build the word index from the claim texts, then map each text to ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(claim_texts)
post_seq = tokenizer.texts_to_sequences(claim_texts)

# Uniform length so the sequences can be fed to a fixed-shape Input layer.
post_seq_padded = pad_sequences(post_seq, maxlen=MAX_LENGTH)

In [9]:
# Hold out 25% of the padded sequences as a test set.
# The split is seeded so that the train/test partition (and therefore the
# accuracies reported further down) is the same on every run of the notebook.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    post_seq_padded, y, test_size=0.25, random_state=RANDOM_STATE)

In [10]:
# Size of the Embedding layer's input dimension: one row per word in the
# tokenizer's index, plus one extra slot.
# NOTE(review): the +1 presumably accounts for index 0, which the Keras
# tokenizer does not assign to any word and padding uses -- confirm.
vocab_size = len(tokenizer.word_index) + 1

In [11]:

# Baseline classifier: embedding -> flatten -> dense(32) -> softmax.
inputs = Input(shape=(MAX_LENGTH, ))
embedding_layer = Embedding(vocab_size,
                            128,
                            input_length=MAX_LENGTH)(inputs)
x = Flatten()(embedding_layer)
x = Dense(32, activation='relu')(x)

# One output unit per Scope class.
predictions = Dense(num_class, activation='softmax')(x)
model = Model(inputs=[inputs], outputs=predictions)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

model.summary()

# One-hot encode the labels with an explicit width. Without `num_classes`
# the width is inferred from the largest label present in y_train, which
# would not match the num_class-wide output layer if the highest class
# happened to be absent from the training split.
y_train_onehot = to_categorical(y_train, num_classes=num_class)

# Keep only the weights of the epoch with the best validation accuracy.
filepath = "weights-simple.hdf5"
checkpointer = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                               save_best_only=True, mode='max')

# 25% of the training data is held back by Keras for validation.
history = model.fit([X_train], y=y_train_onehot, batch_size=64, epochs=5,
                    verbose=1, validation_split=0.25, shuffle=True,
                    callbacks=[checkpointer])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 500)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 500, 128)          509696    
_________________________________________________________________
flatten_1 (Flatten)          (None, 64000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2048032   
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 165       
Total params: 2,557,893
Trainable params: 2,557,893
Non-trainable params: 0
_________________________________________________________________
Train on 2805 samples, validate on 936 samples
Epoch 1/5
Epoch 00001: val_acc improved from -inf to 0.47115, saving model to weights

In [13]:
# Score the current in-memory model on the held-out test set.
# NOTE(review): this uses the weights as they are after the last epoch, not
# the best-epoch weights written to disk by ModelCheckpoint.
class_probs = model.predict(X_test)
# Predicted class = index of the highest probability for each sample.
predicted = class_probs.argmax(axis=1)
accuracy_score(y_test, predicted)

0.7193263833199679

In [14]:

# LSTM classifier: embedding -> LSTM(64) -> dense(32) -> softmax.
# A fresh Input tensor is created for this model so the cell does not depend
# on the tensor that was created for the earlier baseline model.
inputs = Input(shape=(MAX_LENGTH, ))
embedding_layer = Embedding(vocab_size,
                            128,
                            input_length=MAX_LENGTH)(inputs)

# The LSTM reads the embedded sequence and returns a single 64-d vector.
x = LSTM(64)(embedding_layer)
x = Dense(32, activation='relu')(x)
# One output unit per Scope class.
predictions = Dense(num_class, activation='softmax')(x)
model = Model(inputs=[inputs], outputs=predictions)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 500)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 500, 128)          509696    
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 165       
Total params: 561,349
Trainable params: 561,349
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Train the LSTM model, checkpointing the epoch with the best validation
# accuracy to `filepath`.
filepath = "weights.hdf5"
checkpointer = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                               save_best_only=True, mode='max')

# One-hot encode the labels with an explicit width so the target shape always
# matches the num_class-wide softmax layer, even if the highest class label
# is missing from y_train.
y_train_onehot = to_categorical(y_train, num_classes=num_class)

# 25% of the training data is held back by Keras for validation.
history = model.fit([X_train], y=y_train_onehot, batch_size=64, epochs=10,
                    verbose=1, validation_split=0.25, shuffle=True,
                    callbacks=[checkpointer])

Train on 2805 samples, validate on 936 samples
Epoch 1/10
Epoch 00001: val_acc improved from -inf to 0.47009, saving model to weights.hdf5
Epoch 2/10
Epoch 00002: val_acc improved from 0.47009 to 0.56731, saving model to weights.hdf5
Epoch 3/10
Epoch 00003: val_acc improved from 0.56731 to 0.61966, saving model to weights.hdf5
Epoch 4/10
Epoch 00004: val_acc improved from 0.61966 to 0.66239, saving model to weights.hdf5
Epoch 5/10
Epoch 00005: val_acc improved from 0.66239 to 0.67735, saving model to weights.hdf5
Epoch 6/10
Epoch 00006: val_acc improved from 0.67735 to 0.71474, saving model to weights.hdf5
Epoch 7/10
Epoch 00007: val_acc improved from 0.71474 to 0.76603, saving model to weights.hdf5
Epoch 8/10
Epoch 00008: val_acc improved from 0.76603 to 0.77778, saving model to weights.hdf5
Epoch 9/10
Epoch 00009: val_acc did not improve
Epoch 10/10
Epoch 00010: val_acc did not improve


In [62]:
# Score the current in-memory model on the held-out test set.
# NOTE(review): this uses the weights as they are after the last epoch, not
# the best-epoch weights written to disk by ModelCheckpoint.
class_probs = model.predict(X_test)
# Predicted class = index of the highest probability for each sample.
predicted = class_probs.argmax(axis=1)
accuracy_score(y_test, predicted)

0.7746591820368885