In [13]:
# load data
import codecs
import pandas as pd
import numpy as np

# Read only the two columns the model needs.
# The path uses "/" so it resolves on both Windows and POSIX systems; the
# previous "data\\Merged_final.csv" form only worked on Windows.
# Author's note: the CSV has 33295 lines (presumably including the header row).
dt = pd.read_csv("data/Merged_final.csv", encoding='utf-8',
                 usecols=['text_preprocessed', 'polarityNum'],
                 skip_blank_lines=True)

# X: the preprocessed text of each sample; y: its polarity label cast to int.
X = np.array(dt['text_preprocessed'])
y = np.array(dt['polarityNum']).astype('int')
print(X.shape)
print(y.shape)


(33294,)
(33294,)


In [15]:
from sklearn.model_selection import train_test_split

# Split texts and labels in ONE call: a single shuffle is applied to both
# arrays, so every text stays paired with its own label by construction
# (rather than relying on two separate calls sharing the same seed).
# 10% of the samples are held out for testing; the seed keeps the split fixed.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42)

In [16]:
# Quick overview of the training split, plus two settings used further down.
import numpy as np

print('Loading data...')
print(len(x_train), 'number of train samples')

# Largest len() over all training samples.
# NOTE(review): x_train presumably still holds raw strings here, which would
# make this a character count rather than a token count - confirm.
# The next cell overrides max_len with a fixed value of 100.
max_len = max(len(sample) for sample in x_train)

# Number of target classes.
num_classes = 3

Loading data...
8989 number of train samples


In [17]:
# Vocabulary cap handed to the tokenizer and the embedding layer
# (original vocab_size=11315).
max_word = 10000

# Fixed length every sequence is padded/truncated to.
max_len = 100

In [18]:
## build the token index (uni-gram model) and turn every text into a
## fixed-length integer sequence
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

# `num_words` is the Keras 2 name of the vocabulary cap; the Keras 1 spelling
# `nb_words` is deprecated. Tokens are the space-separated units of each text.
tokenizer = Tokenizer(num_words=max_word, split=" ")
# The index is fitted on the training split only.
tokenizer.fit_on_texts(x_train)

# NOTE: x_train / x_test are overwritten below, so this cell is not
# re-runnable on its own - re-run the split cell first.
x_train = tokenizer.texts_to_sequences(x_train)  # texts -> lists of token ids
x_test = tokenizer.texts_to_sequences(x_test)

## pad/truncate every sequence to max_len so the result is a 2-D numpy array
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)

print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)



Pad sequences (samples x time)
x_train shape: (8989, 100)
x_test shape: (999, 100)


In [19]:
import keras
import numpy as np

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')

# The labels appear to include -1 (the evaluation cell maps predicted class 2
# back to -1). Shift negative labels by num_classes explicitly (-1 -> class 2)
# so the label -> column mapping is visible here instead of depending on how
# to_categorical happens to treat negative values.
y_train_idx = np.where(y_train < 0, y_train + num_classes, y_train)
y_test_idx = np.where(y_test < 0, y_test + num_classes, y_test)

# NOTE: y_train is overwritten with its one-hot matrix, so this cell is not
# re-runnable on its own; y_test keeps the raw labels for the sklearn metrics.
y_train = keras.utils.to_categorical(y_train_idx, num_classes)
y_test_bi = keras.utils.to_categorical(y_test_idx, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test_bi.shape)

Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (8989, 3)
y_test shape: (999, 3)


## build model

### vanilla CNN

In [8]:
from keras.models import Sequential  
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout,Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers.merge import concatenate

In [20]:
# vanilla cnn
# set parameters:
batch_size = 64        # mini-batch size used for fit / evaluate / predict
embedding_dims = 300   # size of each token embedding vector
nb_filters = 128       # number of convolution filters
kernel_size = 3        # positions covered by each filter
hidden_dims = 128      # width of the dense hidden layer


print('Build model...')
model = Sequential()

# Embedding layer: maps each vocabulary index into an embedding_dims-sized
# vector; inputs are integer sequences of length max_len.
model.add(Embedding(max_word,
                    embedding_dims,
                    input_length=max_len))

model.add(Dropout(0.2))

# 1-D convolution along the sequence: nb_filters filters, each looking at
# kernel_size consecutive positions.
model.add(Conv1D(nb_filters,
                 kernel_size,
                 activation='relu',
                 strides=1))
# Global max pooling keeps one value per filter.
model.add(GlobalMaxPooling1D())

# Vanilla fully connected hidden layer.
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# Output layer: one unit per class followed by softmax, matching the one-hot
# labels and the categorical_crossentropy loss. The width comes from
# num_classes instead of a hard-coded 3 so it cannot drift from the labels.
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


Build model...


In [21]:
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

num_epochs = 5

# fit model
print('Train...')
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" after the table).
model.summary()

# Stop training once val_loss has failed to improve for 2 epochs in a row.
es = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')

# Write a weights file (named after epoch and val_loss) each time val_loss improves.
# NOTE(review): the visible evaluation cell scores the in-memory model; the
# checkpoint written here is not reloaded in the code shown - confirm intent.
mc = ModelCheckpoint('weights.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_loss', verbose=0, save_best_only=True, mode='min', period=1)

# 10% of the training data is set aside as the validation set monitored above.
history = model.fit(x_train, y_train, epochs=num_epochs, batch_size=batch_size,
                    validation_split=0.1, callbacks=[es, mc])



Train...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 300)          3000000   
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 98, 128)           115328    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
activation_3 (Activation)    (None, 128)               0         
_

In [11]:
#            # save model
# from keras.layers import Dense
# from keras.models import model_from_json
# import numpy
# import os
# model_json = model.to_json()
# with open("wm_vanillacnn.json", "w") as json_file:
#     json_file.write(model_json)
# # serialize weights to HDF5
# model.save_weights("wm_vanillacnn.h5")


In [23]:
from sklearn.metrics import f1_score,classification_report

# evaluate: loss and accuracy of the model on the held-out split,
# scored against the one-hot test labels
score = model.evaluate(x_test, y_test_bi, batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

# Predicted class = index of the largest output in each row.
y_pred = np.argmax(model.predict(x_test, batch_size=batch_size), axis=1)
# Class index 2 stands for the original label -1; map it back so the
# predictions are comparable with the raw labels in y_test.
y_pred[y_pred == 2] = -1

# with codecs.open("output_multichlCnn.txt",'a',encoding='utf-8') as file:
#     for output in outputs:
#         file.writeline(output)

# Macro-averaged F1 plus the per-class precision/recall table.
print('F1-macro score for 3 classes:',
      f1_score(y_test, y_pred, average='macro'))
print(classification_report(y_test, y_pred))


Test score: 0.9968377252718111
Test accuracy: 0.6856856862823287
F1-macro score for 3 classes: 0.6849213644400951
             precision    recall  f1-score   support

         -1       0.66      0.74      0.70       324
          0       0.73      0.61      0.67       331
          1       0.68      0.70      0.69       344

avg / total       0.69      0.69      0.68       999

