# vanilla CNN

In [30]:
# Evaluation uses a single hold-out split: 90% train / 10% test.
# load data
import codecs
import os
import pandas as pd
import numpy as np

# Build the path with os.path.join so the notebook is not tied to the
# Windows path separator.
dt=pd.read_csv(os.path.join("data", "samsung_final_14may.csv"), encoding='utf-8',
               usecols=['id','text_original','text_preprocessed','polarityNum'],skip_blank_lines=True)
X=np.array(dt['text_preprocessed'])
y=np.array(dt['polarityNum']).astype('int')
print(X.shape)
print(y.shape)

from  sklearn.model_selection import train_test_split
# Split texts and labels in ONE call: the same shuffled indices are applied to
# both arrays, so samples and labels cannot drift apart. With the same
# random_state this yields the same partition as two separate calls did.
x_train,x_test,y_train,y_test=train_test_split(X, y, test_size=0.1, random_state=4)

(32466,)
(32466,)


In [31]:
# Brief look at the training split, plus two settings used by later cells.
import numpy as np

print('Loading data...')
print(len(x_train), 'number of train samples')

# len() of the longest training entry (replaced by a fixed value in the next cell).
max_len = max(map(len, x_train))
# Number of target classes.
num_classes = 3

Loading data...
29219 number of train samples


In [32]:
# Fixed sequence length used when padding/truncating the tokenised texts and
# as the Embedding input length; this replaces the data-derived max_len
# computed in the previous cell.
max_len = 100
# Disabled vocabulary cap, kept for reference:
#max_word=10000    # original vocab_size=11315

In [33]:
## Build the token index from the training texts only (tokens are separated by a single space)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

tokenizer = Tokenizer(split=" ")
#tokenizer=Tokenizer(num_words=max_word,split=" ")
tokenizer.fit_on_texts(x_train)
# Vocabulary size for the Embedding layer: one slot per indexed token plus one extra.
max_word = 1 + len(tokenizer.word_index)

## Turn every text into a list of token ids, then pad/truncate to max_len so
## both splits become fixed-width integer matrices.
# NOTE: x_train / x_test are overwritten here, so this cell cannot be re-run
# without re-running the data-loading cell first.
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=max_len)
x_test = sequence.pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=max_len)

print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (29219, 100)
x_test shape: (3247, 100)


In [34]:
import keras
import numpy as np
print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
# The polarity labels are -1 / 0 / 1. to_categorical expects indices in
# [0, num_classes), so map -1 to the last class index explicitly instead of
# relying on negative-index wraparound inside to_categorical. The evaluation
# cell performs the inverse mapping (class 2 -> -1) on the predictions.
# NOTE: y_train is overwritten here; re-run the data-loading cell before
# re-running this one.
y_train = keras.utils.to_categorical(np.where(y_train == -1, num_classes - 1, y_train), num_classes)
# y_test itself is kept as raw labels for the sklearn metrics later on.
y_test_bi = keras.utils.to_categorical(np.where(y_test == -1, num_classes - 1, y_test), num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test_bi.shape)

Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (29219, 3)
y_test shape: (3247, 3)


## Build Model

### vanilla CNN

In [35]:
from keras.models import Sequential  
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout,Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers.merge import concatenate

In [45]:
# vanilla cnn
# set parameters (values tried during tuning are listed in brackets):
batch_size = 64 #[32,64,128]
embedding_dims = 300 #[100,150,300]
nb_filters = 128  #[64,128,256]
kernel_size = 3 #[3,7]
hidden_dims = 128 #[64,128,256]


print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_word,
                    embedding_dims,
                    input_length=max_len))

model.add(Dropout(0.2)) #[0.2,0.5,0.8]

# we add a Conv1D layer, which will learn nb_filters
# token-group filters, each spanning kernel_size positions:
model.add(Conv1D(nb_filters,
                 kernel_size,
                 activation='linear', #[relu,tanh,linear]
                 strides=1))
# we use max pooling over the whole sequence, one value per filter:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# Output layer: one unit per class. Sized from num_classes so it always
# matches the width of the one-hot label matrix.
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


Build model...


## Fit Model

In [46]:
num_epochs = 1
# fit model
print('Train...')
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()
from keras.callbacks import EarlyStopping
# Stop training once val_loss has not improved for one epoch.
es=EarlyStopping(monitor='val_loss', min_delta=0, patience=1, verbose=0, mode='auto')
from keras.callbacks import ModelCheckpoint
# Write a checkpoint whenever val_loss reaches a new minimum. Checking every
# epoch is the default, so the deprecated `period` argument is not passed.
mc = ModelCheckpoint('model\\weights0.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_loss', verbose=0, save_best_only=True, mode='min')
# 10% of the training data is held out for the val_loss monitored above.
history=model.fit(x_train, y_train, epochs=num_epochs, batch_size=batch_size,validation_split=0.1,callbacks=[es,mc]) #
# NOTE: this stores the model as it is after the last epoch run, which is not
# necessarily the best checkpoint when num_epochs > 1.
model.save('model\\wm_cnn_d2_allwd_linear_e1.h5')


Train...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 100, 300)          8664600   
_________________________________________________________________
dropout_15 (Dropout)         (None, 100, 300)          0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 98, 128)           115328    
_________________________________________________________________
global_max_pooling1d_8 (Glob (None, 128)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 128)               16512     
_________________________________________________________________
dropout_16 (Dropout)         (None, 128)               0         
_________________________________________________________________
activation_15 (Activation)   (None, 128)               0         
_

## Test Model

In [47]:
# Reload the full model written by model.save() in the training cell.
# (To evaluate a specific checkpoint instead, load its weights as below.)
from keras.models import load_model
from sklearn.metrics import f1_score,classification_report

model = load_model('model\\wm_cnn_d2_allwd_linear_e1.h5')
#model.load_weights("model\\weights0.01-0.64.hdf5")

# Loss and accuracy on the held-out test split (one-hot targets).
score = model.evaluate(x_test, y_test_bi, batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

# Most probable class index per sample; index 2 is translated back to the
# label -1 so predictions are comparable with the raw y_test labels.
y_pred = model.predict(x_test, batch_size=batch_size).argmax(axis=1)
y_pred = np.where(y_pred == 2, -1, y_pred)

print('F1-macro score for 3 classes:', f1_score(y_test,y_pred,average='macro'))
print(classification_report(y_test, y_pred))


Test score: 0.6279504761021799
Test accuracy: 0.7320603632838828
F1-macro score for 3 classes: 0.7318716398751296
             precision    recall  f1-score   support

         -1       0.74      0.74      0.74      1092
          0       0.77      0.70      0.74      1162
          1       0.68      0.76      0.72       993

avg / total       0.74      0.73      0.73      3247

