# Vanilla CNN Model
    Applies to both binary and 3-class classification.

## load data 

In [15]:
# load data
import codecs
import pandas as pd
import numpy as np
import keras
# Read only the columns this notebook uses from the labelled CSV.
dt = pd.read_csv(
    "samsung_final_14may.csv",
    encoding='utf-8',
    usecols=['id', 'text_original', 'text_preprocessed', 'polarityNum'],
    skip_blank_lines=True,
)

# Binary setup: keep only rows with a non-zero polarity, i.e. filter out the
# neutral tweets. Comment this line out for the 3-class variant.
dt = dt[dt.polarityNum != 0]

X = np.array(dt['text_preprocessed'])
y = np.array(dt['polarityNum']).astype('int')

# Binary setup: relabel -1 as 0 so the two remaining labels are 0 and 1.
# Comment this line out for the 3-class variant.
y[y == -1] = 0


## Split data into train and test sets

In [2]:
from  sklearn.model_selection import train_test_split
# Split texts and labels in ONE call so every text stays paired with its label
# by construction (two separate calls only line up because they happen to share
# the same random_state). 10% of the rows are held out; the seed is fixed so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=4)

## process data

In [3]:

print(len(x_train), 'number of train samples')

# Number of distinct labels in the training split; later cells branch on this
# value (2 -> binary model, 3 -> multi-class model).
num_classes = len(np.unique(y_train))

# Fixed length every token sequence is padded/truncated to. The former
# `len(max(x_train, key=len))` was dropped: it was overwritten by this constant,
# it measured characters of the raw strings rather than tokens, and it raised
# on any non-string entry.
max_len = 100      # max length of input sequence
# Optional vocabulary cap (currently unused): max_word = 10000

19417 number of train samples


In [4]:
## build the word index, uni-gram model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer(split=" ")
# Capped-vocabulary alternative: Tokenizer(num_words=max_word, split=" ")
tokenizer.fit_on_texts(x_train)

# One more than the number of distinct tokens seen in training
# (presumably because index 0 is kept for padding -- confirm against the
# Keras Tokenizer documentation).
max_word = len(tokenizer.word_index) + 1

# Texts -> lists of integer token ids.
train_ids = tokenizer.texts_to_sequences(x_train)
test_ids = tokenizer.texts_to_sequences(x_test)

# Lists of ids -> fixed-width numpy matrices of shape (samples, max_len).
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(train_ids, maxlen=max_len)
x_test = sequence.pad_sequences(test_ids, maxlen=max_len)

print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Pad sequences (samples x time)
x_train shape: (19417, 100)
x_test shape: (2158, 100)


In [5]:
# One-hot targets are only needed for the 3-class model; in the binary setup
# this cell does nothing and the integer labels are used as they are.
# NOTE(review): in the 3-class setup the labels are presumably -1/0/1, and -1
# appears to end up in the last column (index 2), which the test cell maps back
# to -1 after argmax -- confirm.
if num_classes == 3:
    print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
    one_hot = keras.utils.to_categorical
    y_train = one_hot(y_train, num_classes)
    # The integer y_test is kept untouched for the sklearn metrics later on.
    y_test_bi = one_hot(y_test, num_classes)
    print('y_train shape:', y_train.shape)
    print('y_test shape:', y_test_bi.shape)

## build model

In [6]:
from keras.models import Sequential  
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout,Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers.merge import concatenate

In [11]:
# vanilla cnn for multi-class classification
def define_model_multi(length, vocab_size, num_classes=3):
    """Build and compile the vanilla CNN for multi-class classification.

    Architecture: Embedding -> Dropout -> Conv1D -> GlobalMaxPooling1D ->
    Dense -> Dropout -> ReLU -> softmax output.

    Args:
        length: number of token ids in each (padded) input sequence.
        vocab_size: number of rows of the embedding matrix.
        num_classes: number of units of the softmax output layer (default 3).

    Returns:
        The compiled Keras ``Model`` (adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    # set parameters (values tried during tuning are listed on the right):
    embedding_dims = 300  # [100,150,300]
    nb_filters = 128      # [64,128,250]
    kernel_size = 3       # [3,7]
    hidden_dims = 128     # [64,128,256]

    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions.
    # The function arguments are used here instead of the notebook globals
    # max_word / max_len, so the model is fully determined by its inputs.
    inputs = Input(shape=(length,))
    cnn = Embedding(vocab_size,
                    embedding_dims,
                    input_length=length)(inputs)

    cnn = Dropout(0.2)(cnn)  # [0.2,0.5,0.8]

    # we add a Convolution1D, which will learn
    # word group filters of size kernel_size:
    cnn = Conv1D(nb_filters,
                 kernel_size,
                 activation='linear',  # [relu,tanh,linear]
                 strides=1)(cnn)
    # we use max pooling:
    cnn = GlobalMaxPooling1D()(cnn)

    # We add a vanilla hidden layer:
    cnn = Dense(hidden_dims)(cnn)
    cnn = Dropout(0.2)(cnn)
    cnn = Activation('relu')(cnn)

    outputs = Dense(num_classes, activation='softmax')(cnn)
    model = Model(inputs=inputs, outputs=outputs)
    # compile
    model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])
    # summarize; summary() prints the table itself, so wrapping it in print()
    # only added a stray "None" line.
    model.summary()
    return model

In [8]:
# vanilla cnn for binary classification

def define_model_binary(length, vocab_size):
    """Build and compile the vanilla CNN for binary classification.

    Architecture: Embedding -> Dropout -> Conv1D(relu) -> GlobalMaxPooling1D ->
    Dense -> Dropout -> ReLU -> single sigmoid output.

    Args:
        length: number of token ids in each (padded) input sequence.
        vocab_size: number of rows of the embedding matrix.

    Returns:
        The compiled Keras ``Model`` (adam optimizer, binary cross-entropy
        loss, binary_accuracy metric).
    """
    # set parameters (values tried during tuning are listed on the right):
    embedding_dims = 300  # [100,150,300]
    nb_filters = 250      # [64,128,250]
    kernel_size = 3       # [3,7]
    hidden_dims = 250     # [64,128,250]

    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions.
    # The function arguments are used here instead of the notebook globals
    # max_word / max_len, so the model is fully determined by its inputs.
    inputs = Input(shape=(length,))
    cnn = Embedding(vocab_size,
                    embedding_dims,
                    input_length=length)(inputs)

    cnn = Dropout(0.2)(cnn)  # [0.2,0.5,0.8]

    # we add a Convolution1D, which will learn
    # word group filters of size kernel_size:
    cnn = Conv1D(nb_filters,
                 kernel_size,
                 activation='relu',  # [relu,tanh,linear]
                 strides=1)(cnn)
    # we use max pooling:
    cnn = GlobalMaxPooling1D()(cnn)

    # We add a vanilla hidden layer:
    cnn = Dense(hidden_dims)(cnn)
    cnn = Dropout(0.2)(cnn)
    cnn = Activation('relu')(cnn)

    outputs = Dense(1, activation='sigmoid')(cnn)
    model = Model(inputs=inputs, outputs=outputs)
    # compile
    model.compile('adam', 'binary_crossentropy', metrics=['binary_accuracy'])  # [rmsprop,adam]
    # summarize; summary() prints the table itself, so wrapping it in print()
    # only added a stray "None" line.
    model.summary()
    return model

## fit model and save intermediate results

In [12]:
import os

from keras.callbacks import EarlyStopping, ModelCheckpoint

num_epochs = 1 #optimal after earlystopping tuning
batch_size = 64 #[32,64,128]

# Pick the architecture that matches the number of labels in the training data.
if num_classes == 3:
    model = define_model_multi(max_len, max_word)
elif num_classes == 2:
    model = define_model_binary(max_len, max_word)
else:
    # Fail here with a clear message instead of a NameError on `model` below.
    raise ValueError('Unsupported number of classes: %d (expected 2 or 3)' % num_classes)

print('Train...')

# Make sure the output directory exists before anything is written to it.
# NOTE: the paths below use Windows separators; they are left unchanged so they
# keep matching the path the test cell loads the model from.
os.makedirs('model', exist_ok=True)

# Stop when val_loss has not improved for one epoch (has no effect while
# num_epochs is 1, but kept so that raising num_epochs is safe).
es = EarlyStopping(monitor='val_loss', min_delta=0, patience=1, verbose=0, mode='auto')
# Write a checkpoint after each epoch whose val_loss is the best seen so far.
mc = ModelCheckpoint('model\\weights.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_loss', verbose=0, save_best_only=True, mode='min', period=1)

# 10% of the training data is held out for validation.
history = model.fit(x_train, y_train, epochs=num_epochs, batch_size=batch_size,
                    validation_split=0.1, callbacks=[es, mc])
model.save('model\\wm_cnn.h5')


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 300)          6486300   
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 98, 250)           225250    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 250)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_4 (Dropout)          (None, 250)               0         
__________

## test model

In [13]:
from keras.models import load_model

# Reload the model saved at the end of training.
model = load_model('model\\wm_cnn.h5')
# To compare against the checkpoint with the lowest val_loss, load its weights:
#model.load_weights("model\\weights0.01-0.64.hdf5")

#evaluate
# The binary model is scored against the integer labels, the 3-class model
# against the one-hot matrix.
if num_classes == 2:
    score = model.evaluate(x_test, y_test,
                           batch_size=batch_size, verbose=1)
elif num_classes == 3:
    score = model.evaluate(x_test, y_test_bi,
                           batch_size=batch_size, verbose=1)

print('Test score:', score[0])
print('Test accuracy:', score[1])

y_pred = model.predict(x_test, batch_size=batch_size)

if num_classes == 2:
    # Flatten the single-column output into a plain Python list; thresholding
    # is done in the evaluation cell.
    y_pred = y_pred.ravel().tolist()
elif num_classes == 3:
    # Index of the largest output per sample; index 2 is relabelled as -1.
    y_pred = y_pred.argmax(axis=1)
    y_pred[y_pred == 2] = -1
    

Test score: 0.38027802670410765
Test accuracy: 0.8271547734903119


## evaluate model

In [14]:
from sklearn.metrics import f1_score,classification_report

if num_classes == 2:
    # Turn the predicted values into hard 0/1 labels with a plain 0.5 cut.
    # The previous Decimal-based loop rounded every value to two decimals
    # before comparing with 0.50, which moved the effective cut to about 0.495.
    threshold = 0.5
    y_output = [1 if p >= threshold else 0 for p in y_pred]
    print('F1 score for 2 classes:', f1_score(y_test, y_output, average='binary'))
    print(classification_report(y_test, y_output))

elif num_classes == 3:
    # y_pred already holds class labels here, so it is compared directly with
    # the integer y_test.
    print('F1-macro score for 3 classes:', f1_score(y_test, y_pred, average='macro'))
    print(classification_report(y_test, y_pred))

F1 score for 2 classes: 0.8329641116526364
             precision    recall  f1-score   support

          0       0.87      0.77      0.82      1096
          1       0.79      0.89      0.83      1062

avg / total       0.83      0.83      0.82      2158

