# LSTM

In [1]:
# Evaluation option A: single hold-out split (90% train / 10% test).
# load data
import codecs
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# os.path.join keeps the path valid on non-Windows systems too; on Windows it
# produces the same backslash-separated path the notebook used before.
dt = pd.read_csv(
    os.path.join("data", "samsung_final_14may.csv"),
    encoding='utf-8',
    usecols=['id', 'text_original', 'text_preprocessed', 'polarityNum'],
    skip_blank_lines=True,
)
X = np.array(dt['text_preprocessed'])
y = np.array(dt['polarityNum']).astype('int')
print(X.shape)
print(y.shape)

# Split texts and labels in ONE call so every text stays paired with its label.
# Two separate calls only stayed aligned because they shared the same
# random_state and array length; size and seed are unchanged here.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=4)

(32466,)
(32466,)


In [10]:
# Evaluation option B: train on the whole training file and test on the
# separate gold-standard file.
# NOTE: this cell assigns x_train / y_train / x_test / y_test, i.e. the same
# names the hold-out cell assigns -- run only one of the two loading cells.
# load data
import codecs
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Columns read from both CSV files.
wanted_columns = ['id', 'text_original', 'text_preprocessed', 'polarityNum']

# os.path.join keeps the paths valid on non-Windows systems too; on Windows it
# produces the same backslash-separated paths the notebook used before.
dt = pd.read_csv(
    os.path.join("data", "samsung_final_14may.csv"),
    encoding='utf-8',
    usecols=wanted_columns,
    skip_blank_lines=True,
)
x_train = np.array(dt['text_preprocessed'])
y_train = np.array(dt['polarityNum']).astype('int')
print(x_train.shape)
print(y_train.shape)

dt1 = pd.read_csv(
    os.path.join("data", "samsung_test_final_14may.csv"),
    encoding='utf-8',
    usecols=wanted_columns,
    skip_blank_lines=True,
)
x_test = np.array(dt1['text_preprocessed'])
y_test = np.array(dt1['polarityNum']).astype('int')

(32466,)
(32466,)


In [2]:
# Quick overview of the training data plus two basic problem constants.
import numpy as np

print('Loading data...')
print(len(x_train), 'number of train samples')

# Longest training item, measured with len() on the raw (not yet tokenised)
# entries. The next cell replaces this value with a fixed length.
max_len = max(map(len, x_train))
# Number of target classes (size of the softmax output / one-hot vectors).
num_classes = 3

Loading data...
29219 number of train samples


In [3]:
# Fixed sequence length handed to pad_sequences; values tried: [50, 100, na]
max_len = 100
#max_word=10000    # original vocab_size=11315

In [4]:
## Build the token index (uni-gram model) and turn the texts into fixed-width
## integer matrices. A token is whatever is separated by a single space in the
## preprocessed text (split=" ").
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(split=" ")
#tokenizer=Tokenizer(num_words=max_word,split=" ")
tokenizer.fit_on_texts(x_train)  # vocabulary is learned from the training texts only
# Vocabulary size passed to the Embedding layer. The +1 presumably accounts
# for index 0 being reserved (Tokenizer ids start at 1) -- confirm in Keras docs.
max_word = len(tokenizer.word_index) + 1

# Texts -> lists of token ids.
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

## Pad/truncate every sequence to max_len so both sets become 2-D numpy arrays.
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)

print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Pad sequences (samples x time)
x_train shape: (29219, 100)
x_test shape: (3247, 100)


In [5]:
import keras
import numpy as np

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
# The labels include -1 (see the classification report further down). Map them
# explicitly onto 0..num_classes-1 (-1 -> 2, 0 -> 0, 1 -> 1) instead of relying
# on negative indexing inside to_categorical; the resulting matrix is the same.
# The ndim guard makes the cell safe to re-run: without it a second run would
# one-hot encode the already encoded y_train again.
if np.ndim(y_train) == 1:
    y_train = keras.utils.to_categorical(np.mod(y_train, num_classes), num_classes)
# y_test itself is kept as integer labels for the sklearn metrics later on.
y_test_bi = keras.utils.to_categorical(np.mod(y_test, num_classes), num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test_bi.shape)

Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (29219, 3)
y_test shape: (3247, 3)


## Build Model

### LSTM

In [6]:
# Training hyper-parameters.
num_epochs = 5    # upper bound on epochs; training also uses early stopping (patience 1)
batch_size = 64   # values tried: [32, 64, 128]

In [7]:
from keras.models import Sequential  
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout,Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import LSTM, Bidirectional
from keras.layers.merge import Concatenate

In [12]:
# define the model
def define_model(length, vocab_size, embedding_dim=300, lstm_units=150,
                 dropout_rate=0.5, n_classes=None):
    """Build and compile the bidirectional-LSTM text classifier.

    Architecture: Input -> Embedding -> Bidirectional(LSTM) -> Dropout ->
    Dense(softmax).

    Args:
        length: number of token ids per input sequence.
        vocab_size: number of rows of the embedding matrix.
        embedding_dim: size of each embedding vector (values tried: 100, 150, 300).
        lstm_units: units per LSTM direction (values tried: 100, 150, 300).
        dropout_rate: dropout applied to the LSTM output (values tried: 0.2, 0.5, 0.8).
        n_classes: size of the softmax output; when None the notebook-level
            ``num_classes`` is used, as before.

    Returns:
        The compiled Keras ``Model`` (adam optimizer, categorical cross-entropy
        loss, accuracy metric).
    """
    if n_classes is None:
        n_classes = num_classes

    inputs = Input(shape=(length,))
    embedded = Embedding(vocab_size, embedding_dim, input_length=length)(inputs)
    encoded = Bidirectional(LSTM(lstm_units))(embedded)
    encoded = Dropout(dropout_rate)(encoded)
    outputs = Dense(n_classes, activation='softmax')(encoded)
    model = Model(inputs=inputs, outputs=outputs)
    # compile
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])  # [adam,]
    # summarize -- summary() prints by itself; wrapping it in print() added a
    # stray "None" line to the cell output.
    model.summary()
    #plot_model(model, show_shapes=True, to_file='multichannel.png')
    return model

In [13]:

import os

from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

# Checkpoints and the final model are written below this directory. Create it
# up front so saving does not fail on a fresh checkout. os.path.join keeps the
# paths valid on non-Windows systems; on Windows they are unchanged.
model_dir = 'model'
os.makedirs(model_dir, exist_ok=True)

model = define_model(max_len, max_word)
# fit model
print('Train...')

# Stop as soon as val_loss fails to improve for one epoch.
es = EarlyStopping(monitor='val_loss', min_delta=0, patience=1, verbose=0, mode='auto')
# Keep a checkpoint whenever val_loss reaches a new minimum; the file name
# records the epoch number and the val_loss value.
mc = ModelCheckpoint(
    os.path.join(model_dir, 'weights2.{epoch:02d}-{val_loss:.2f}.hdf5'),
    monitor='val_loss', verbose=0, save_best_only=True, mode='min', period=1)
# The last 10% of the training data is held out for validation.
history = model.fit(x_train, y_train, epochs=num_epochs, batch_size=batch_size,
                    validation_split=0.1, callbacks=[es, mc])
# Saves the model as it is after the last epoch that ran.
model.save(os.path.join(model_dir, 'wm_lstm2.h5'))


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 300)          8664600   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 300)               541200    
_________________________________________________________________
dropout_2 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 903       
Total params: 9,206,703
Trainable params: 9,206,703
Non-trainable params: 0
_________________________________________________________________
None
Train...
Train on 26297 samples, validate on 2922 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5


In [17]:
# Load a previously saved model and evaluate it on the test set.
# NOTE: the file loaded here is the one written by the "Save Model" cell at the
# end of the notebook, so it has to exist already when this cell runs.
import os

from keras.models import load_model
from sklearn.metrics import f1_score, classification_report

# os.path.join keeps the path valid on non-Windows systems; on Windows it is
# the same path as before.
model = load_model(os.path.join('model', 'wm_lstm_d5_allwd_adam_e2.h5'))
#model.load_weights("model\\weights2.02-0.62.hdf5")

#evaluate
score = model.evaluate(x_test, y_test_bi,
                       batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

# Class probabilities -> predicted class index.
y_pred = model.predict(x_test, batch_size=batch_size)
y_pred = np.argmax(y_pred, axis=1)
# Index 2 stands for the original label -1 (presumably because the one-hot
# encoding placed -1 in the last column); map it back so the predictions can
# be compared with the integer labels in y_test.
np.place(y_pred, y_pred == 2, (-1))

print('F1-macro score for 3 classes:', f1_score(y_test, y_pred, average='macro'))
print(classification_report(y_test, y_pred))


Test score: 0.5914144688912601
Test accuracy: 0.765321835191719
F1-macro score for 3 classes: 0.7644080622380697
             precision    recall  f1-score   support

         -1       0.74      0.85      0.79      1092
          0       0.82      0.68      0.74      1162
          1       0.74      0.78      0.76       993

avg / total       0.77      0.77      0.76      3247



## Save Model

In [16]:
from keras.models import Sequential             # save model
from keras.layers import Dense
from keras.models import model_from_json
import numpy
import os

# Make sure the target directory exists, then write the current `model` to it.
# os.path.join keeps the path valid on non-Windows systems; on Windows it is
# the same path as before.
model_dir = 'model'
os.makedirs(model_dir, exist_ok=True)
model.save(os.path.join(model_dir, 'wm_lstm_d5_allwd_adam_e2.h5'))