In [1]:
# load data
import codecs
import pandas as pd
import numpy as np

# Read only the two columns used downstream: the preprocessed text and its
# polarity label. The path uses a forward slash so it resolves on Linux/macOS
# as well as on Windows (a backslash is not a path separator outside Windows).
dt=pd.read_csv("data/Merged_final.csv", encoding='utf-8',
               usecols=['text_preprocessed','polarityNum'],skip_blank_lines=True) # 33295 lines
# Rows with a missing text or label cannot be used: a NaN label has no valid
# integer value for the cast below, and a NaN text is a float, not a string.
dt=dt.dropna(subset=['text_preprocessed','polarityNum'])
X=np.array(dt['text_preprocessed'])
y=np.array(dt['polarityNum']).astype('int')
# Both arrays must report the same number of samples.
print(X.shape)
print(y.shape)


(33294,)
(33294,)


In [2]:
from sklearn.model_selection import train_test_split
# Split texts and labels in ONE call so they are shuffled with the same
# permutation by construction (and so mismatched lengths raise an error),
# instead of relying on two independent calls sharing a random_state.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42)

In [3]:
# Quick overview of the training split, plus the number of target classes.
import numpy as np
print('Loading data...')
print(len(x_train), 'number of train samples')

# Length of the longest training item. x_train still holds the raw items from
# the 'text_preprocessed' column here, so this is len() of those items, not a
# token count. NOTE: max_len is reassigned to a fixed value in the next cell.
max_len=max(len(sample) for sample in x_train)
num_classes=3

Loading data...
29964 number of train samples


In [4]:
# Fixed sizes shared by the tokenizer, the padding step and the model input.
max_word = 10000    # vocabulary cap; original vocab_size=11315
max_len = 100       # every sequence is padded/truncated to this many tokens

In [5]:
## Build the unigram token index and turn every text into a fixed-length
## integer sequence.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

# The index is fitted on the training texts only; tokens are split on spaces
# and the vocabulary is capped through num_words=max_word.
tokenizer = Tokenizer(num_words=max_word, split=" ")
tokenizer.fit_on_texts(x_train)

# texts_to_sequences yields lists of token ids; pad_sequences then fixes the
# second dimension to max_len so the result is a 2-D array.
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=max_len)
x_test = sequence.pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=max_len)

print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Pad sequences (samples x time)
x_train shape: (29964, 100)
x_test shape: (3330, 100)


In [6]:
import keras
print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
# The polarity labels are -1 / 0 / 1. Map them to explicit non-negative class
# ids (0 -> 0, 1 -> 1, -1 -> 2) before one-hot encoding, rather than depending
# on how to_categorical happens to treat a negative id. The evaluation cell
# maps predicted class 2 back to -1.
y_train = keras.utils.to_categorical(np.mod(y_train, num_classes), num_classes)
# The integer test labels are kept in y_test for the sklearn metrics; only the
# one-hot copy gets a new name.
y_test_bi = keras.utils.to_categorical(np.mod(y_test, num_classes), num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test_bi.shape)

Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (29964, 3)
y_test shape: (3330, 3)


## build model

### LSTM

In [7]:
# Training hyper-parameters consumed by model.fit / evaluate / predict below.
num_epochs = 10   # upper bound on epochs; early stopping may end training sooner
batch_size = 64

In [8]:
from keras.models import Sequential  
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout,Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import LSTM, Bidirectional
from keras.layers.merge import Concatenate

In [9]:
# define the model
def define_model(length, vocab_size, embedding_dim=300, lstm_units=150,
                 dropout_rate=0.5, n_classes=None):
    """Build and compile a bidirectional-LSTM text classifier.

    Args:
        length: number of token ids per input sequence.
        vocab_size: input dimension of the embedding layer.
        embedding_dim: size of each embedding vector (default 300).
        lstm_units: units per LSTM direction (default 150).
        dropout_rate: dropout applied to the LSTM output (default 0.5).
        n_classes: width of the softmax output; when None, the notebook-level
            ``num_classes`` is used, as in the original implementation.

    Returns:
        The compiled Keras ``Model`` (adam optimizer, categorical
        cross-entropy loss, accuracy metric). A summary table is printed as a
        side effect.
    """
    if n_classes is None:
        n_classes = num_classes
    inputs = Input(shape=(length,))
    lstm = Embedding(vocab_size, embedding_dim, input_length=length)(inputs)
    lstm = Bidirectional(LSTM(lstm_units))(lstm)
    lstm = Dropout(dropout_rate)(lstm)
    outputs = Dense(n_classes, activation='softmax')(lstm)
    model = Model(inputs=inputs, outputs=outputs)
    # compile
    model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])
    # summarize: summary() prints the table itself and returns None, so
    # wrapping it in print() only added a stray "None" line to the output.
    model.summary()
    #plot_model(model, show_shapes=True, to_file='multichannel.png')
    return model

In [10]:

from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

# Build the network for the configured sequence length and vocabulary size.
model = define_model(max_len, max_word)

# fit model
print('Train...')

# Both callbacks watch the validation loss: `es` is configured with
# patience=2, `mc` writes a checkpoint file only when the monitored value
# improves (save_best_only=True).
es = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
mc = ModelCheckpoint(
    'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    monitor='val_loss', verbose=0, save_best_only=True, mode='min', period=1,
)

# validation_split=0.1 holds out part of the training arrays for validation.
# NOTE(review): no checkpoint is reloaded after fit, so later cells appear to
# use the weights from the last epoch run -- confirm that is intended.
history = model.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.1,
    callbacks=[es, mc],
)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          3000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 300)               541200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 903       
Total params: 3,542,103
Trainable params: 3,542,103
Non-trainable params: 0
_________________________________________________________________
None
Train...
Train on 26967 samples, validate on 2997 samples
Epoch 1/2
Epoch 2/2


In [11]:
# Evaluate on the held-out test split.
from sklearn.metrics import f1_score,classification_report

# score[0] is the loss and score[1] the accuracy metric set at compile time.
score = model.evaluate(x_test, y_test_bi,
                       batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

# Take the most probable class per sample, then rename class id 2 to the
# label -1 so predictions are comparable with the integer labels in y_test.
y_pred = np.argmax(model.predict(x_test, batch_size=batch_size), axis=1)
y_pred[y_pred == 2] = -1


# with codecs.open("output_multichlCnn.txt",'a',encoding='utf-8') as file:
#     for output in outputs:
#         file.writeline(output)

print('F1-macro score for 3 classes:', f1_score(y_test,y_pred,average='macro'))
print(classification_report(y_test, y_pred))


Test score: 0.5727925745216575
Test accuracy: 0.7711711711711712
F1-macro score for 3 classes: 0.7695091314745787
             precision    recall  f1-score   support

         -1       0.75      0.87      0.81      1124
          0       0.81      0.70      0.75      1170
          1       0.76      0.75      0.75      1036

avg / total       0.77      0.77      0.77      3330



In [None]:
# from keras.models import Sequential             # save model
# from keras.layers import Dense
# from keras.models import model_from_json
# import numpy
# import os
# model_json = model.to_json()
# with open("model_multichl_cnn.json", "w") as json_file:
#     json_file.write(model_json)
# # serialize weights to HDF5
# model.save_weights("model_multichl_cnn.h5")
