In [1]:
# load data
import codecs
import pandas as pd
import numpy as np

# Read only the two columns the model needs: the preprocessed text and its
# numeric polarity label.  A forward slash is used in the path because it is
# accepted on Windows as well as on Linux/macOS, whereas a hard-coded
# backslash separator only resolves on Windows.
# NOTE(review): the CSV was previously annotated as having 33295 lines, while
# the recorded shapes show 33290 rows -- confirm which lines are not data rows.
dt = pd.read_csv(
    "data/Merged_final.csv",
    encoding='utf-8',
    usecols=['text_preprocessed', 'polarityNum'],
    skip_blank_lines=True,
)
X = np.array(dt['text_preprocessed'])          # raw text, one document per entry
y = np.array(dt['polarityNum']).astype('int')  # integer polarity label per document
print(X.shape)
print(y.shape)


(33290,)
(33290,)


In [10]:

from  sklearn.model_selection import train_test_split
# Split features and labels in ONE call so both are partitioned with the same
# shuffled indices by construction, instead of relying on two separate calls
# happening to use the same seed.  Split ratio and seed are unchanged.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42)

In [11]:
# Quick overview of the training split, plus the number of target classes
# used by the model's output layer and the one-hot label encoding.
import numpy as np

num_classes = 3

print('Loading data...')
n_train_samples = len(x_train)
print(n_train_samples, 'number of train samples')

# Alternative that was considered: derive the sequence length from the data
# instead of fixing it by hand.
#max_len=len(max(x_train, key=len))

Loading data...
29961 number of train samples


In [12]:
# Fixed sizes shared by the tokenizer, the padding step and the model.
max_len = 100      # number of tokens each sequence is padded/truncated to
max_word = 10000   # vocabulary cap; original vocab_size=11315

In [13]:
## Build the word index (uni-gram model) from the training texts only.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

# `num_words` is the current name of the vocabulary-size argument; the legacy
# `nb_words` spelling is deprecated.  Tokens are split on single spaces.
tokenizer = Tokenizer(num_words=max_word, split=" ")
tokenizer.fit_on_texts(x_train)

# Replace every text by its list of integer word indices.
# NOTE: x_train / x_test are rebound here, so this cell is not idempotent --
# re-run the train/test split cell before re-running this one.
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

## Pad/truncate each sequence to max_len so the result is a fixed-width matrix.
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)

print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)



Pad sequences (samples x time)
x_train shape: (29961, 100)
x_test shape: (3329, 100)


In [14]:
import keras
print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
# to_categorical expects class ids in [0, num_classes).  The polarity labels
# appear to be -1/0/1 (the evaluation cell maps predicted class 2 back to -1),
# so the modulo makes the -1 -> 2 mapping explicit instead of relying on
# NumPy's negative-index wraparound inside to_categorical.  Labels that are
# already in range are left unchanged by the modulo.
# NOTE: y_train is rebound to its one-hot form; y_test keeps the raw labels
# (needed for the sklearn metrics) and its one-hot form goes to y_test_bi.
y_train = keras.utils.to_categorical(np.mod(y_train, num_classes), num_classes)
y_test_bi = keras.utils.to_categorical(np.mod(y_test, num_classes), num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test_bi.shape)

Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (29961, 3)
y_test shape: (3329, 3)


## build model

### multi-channel CNN

In [6]:
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers import MaxPooling1D,GlobalMaxPooling1D
from keras.layers.merge import Concatenate

In [7]:
# Hyper-parameters of the CNN model.  The four lists are index-aligned:
# entry i configures convolution channel i.
filter_length = [5, 3]      # Conv1D kernel size per channel
nb_filters = [128, 128]     # Conv1D filter count per channel
dropout_prob = [0.5, 0.5]   # dropout rate applied after each convolution
pool_length = [2, 2]        # MaxPooling1D pool size per channel

# Mini-batch size shared by training, evaluation and prediction.
batch_size = 64


In [8]:
# define the model
def define_model(length, vocab_size, embedding_dim=100):
    """Build and compile the multi-channel 1-D CNN text classifier.

    One channel is created per entry of the module-level ``filter_length``
    list.  Each channel has its own input and its own embedding, followed by
    Conv1D -> Dropout -> MaxPooling1D, configured by the matching entries of
    ``nb_filters``, ``dropout_prob`` and ``pool_length``.  The channel outputs
    are concatenated along axis 1, reduced with global max pooling and fed to
    a softmax layer with ``num_classes`` units (read from module scope).

    Args:
        length: number of tokens in each (padded) input sequence.
        vocab_size: vocabulary size passed to each ``Embedding`` layer.
        embedding_dim: width of each embedding vector.  Defaults to 100, the
            value that used to be hard-coded.

    Returns:
        The compiled Keras ``Model``.  It has one input per channel, so
        callers must pass a list of ``len(filter_length)`` arrays.

    Raises:
        ValueError: if ``filter_length`` is empty (no channel to build).
    """
    if not filter_length:
        raise ValueError('filter_length must configure at least one channel')

    # multi-channel cnn
    channel = []
    inputs = []
    for i, kernel_size in enumerate(filter_length):
        input_channel = Input(shape=(length,))
        embedding = Embedding(vocab_size, embedding_dim)(input_channel)
        conv = Conv1D(filters=nb_filters[i], kernel_size=kernel_size, activation='relu')(embedding)
        drop = Dropout(dropout_prob[i])(conv)
        pool = MaxPooling1D(pool_size=pool_length[i])(drop)

        inputs.append(input_channel)
        channel.append(pool)

    # Merge ALL channels (not just the first two) along axis 1.  Concatenate
    # needs at least two tensors, so a single channel is passed through as-is.
    # Concatenating on axis 1 requires every channel to have the same number
    # of filters.
    merged = Concatenate(axis=1)(channel) if len(channel) > 1 else channel[0]
    pool = GlobalMaxPooling1D()(merged)

    # interpretation
    outputs = Dense(num_classes, activation='softmax')(pool)
    model = Model(inputs=inputs, outputs=outputs)
    # compile
    model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])
    # summarize -- summary() prints by itself and returns None, so wrapping it
    # in print() only added a stray "None" line.
    model.summary()
    #plot_model(model, show_shapes=True, to_file='multichannel.png')
    return model

In [15]:
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

num_epochs = 5
# define model
model = define_model(max_len, max_word)

# fit model
print('Train...')
# Stop once val_loss has gone one epoch without improving.
es = EarlyStopping(monitor='val_loss', min_delta=0, patience=1, verbose=0, mode='auto')
# Write a checkpoint file each time val_loss reaches a new minimum.  The
# explicit `period=1` argument was dropped: checking every epoch is the
# documented default and the argument is deprecated in later Keras releases.
mc = ModelCheckpoint('weights.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_loss',
                     verbose=0, save_best_only=True, mode='min')
# The same padded matrix is fed to every input channel of the model; the
# validation data is carved out of the training data via validation_split.
# NOTE(review): nothing in this notebook reloads the saved checkpoint, so the
# later evaluation uses the weights from the last epoch that ran, which may
# not be the best one -- confirm this is intended.
history = model.fit([x_train, x_train], y_train,
                    epochs=num_epochs, batch_size=batch_size,
                    validation_split=0.1, callbacks=[es, mc])



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 100, 100)     1000000     input_3[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 100, 100)     1000000     input_4[0][0]                    
__________________________________________________________________________________________________
conv1d_3 (

In [16]:
# Evaluate on the held-out split: Keras loss/accuracy first, then sklearn
# metrics computed on the original (non one-hot) labels.
from sklearn.metrics import f1_score,classification_report

test_inputs = [x_test, x_test]   # one copy of the padded matrix per input channel

score = model.evaluate(test_inputs, y_test_bi,
                       batch_size=batch_size, verbose=1)
test_loss, test_accuracy = score[0], score[1]
print('Test score:', test_loss)
print('Test accuracy:', test_accuracy)

# Most probable class index per sample; index 2 is mapped back to the label -1
# so the predictions are comparable with the raw labels in y_test.
class_probabilities = model.predict(test_inputs, batch_size=batch_size)
y_pred = np.argmax(class_probabilities, axis=1)
y_pred[y_pred == 2] = -1

# TODO: an earlier draft appended the predictions to "output_multichlCnn.txt";
# it was left disabled (it iterated an undefined `outputs` and called a
# non-existent `file.writeline`).  Re-implement if the dump is still needed.

macro_f1 = f1_score(y_test, y_pred, average='macro')
print('F1-macro score for 3 classes:', macro_f1)
print(classification_report(y_test, y_pred))


Test score: 0.700002763195416
Test accuracy: 0.7164313607689997
F1-macro score for 3 classes: 0.7162152058805763
             precision    recall  f1-score   support

         -1       0.72      0.71      0.72      1089
          0       0.75      0.67      0.71      1132
          1       0.69      0.77      0.72      1108

avg / total       0.72      0.72      0.72      3329

