In [2]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate
from keras.models import load_model
from keras import regularizers

Using TensorFlow backend.


In [4]:
df = pd.read_csv('sentiment_5sent_clean')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,sentiment,author,content
0,1,1956967666,sadness,wannamama,layin bed headache ughwaitin
1,2,1956967696,sadness,coolfunky,funeral ceremonygloomy friday
2,4,1956968416,neutral,xkilljoyx,dannycastillo want trade houston ticket
3,5,1956968477,worry,xxxPEACHESxxx,repinge prom bc bf like friend
4,6,1956968487,sadness,ShansBee,sleep think old friend want s married damn amp...


In [6]:
df['content'] = df['content'].astype('str')

In [7]:
X = df['content']

In [8]:
y = pd.get_dummies(df['sentiment']).values

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

In [10]:
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [11]:
# calculate the maximum document length
def max_length(lines):
    return max([len(s.split()) for s in lines])

In [12]:
# encode a list of lines
def encode_text(tokenizer, lines, length):
    # integer encode
    encoded = tokenizer.texts_to_sequences(lines)
    # pad encoded sequences
    padded = pad_sequences(encoded, maxlen=length, padding='post')
    return padded

In [13]:
# create tokenizer
tokenizer = create_tokenizer(X_train)

In [14]:
length = max_length(X_train)
print('Max document length: %d' % length)
# calculate vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)

Max document length: 22
Vocabulary size: 32788


In [15]:
# encode data
trainX = encode_text(tokenizer, X_train, length)
testX = encode_text(tokenizer, X_test, length)

In [16]:
# define the model
def define_model(length, vocab_size):
    # channel 1
    inputs1 = Input(shape=(length,))
    embedding1 = Embedding(vocab_size, 100)(inputs1)
    conv1 = Conv1D(filters=32, kernel_size=4, activation='relu', kernel_regularizer=regularizers.l2(l=0.08))(embedding1)
    drop1 = Dropout(0.5)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    
    # channel 2
    inputs2 = Input(shape=(length,))
    embedding2 = Embedding(vocab_size, 100)(inputs2)
    conv2 = Conv1D(filters=32, kernel_size=6, activation='relu', kernel_regularizer=regularizers.l2(l=0.08))(embedding2)
    drop2 = Dropout(0.5)(conv2)
    pool2 = MaxPooling1D(pool_size=2)(drop2)
    flat2 = Flatten()(pool2)

    # channel 3
    inputs3 = Input(shape=(length,))
    embedding3 = Embedding(vocab_size, 100)(inputs3)
    conv3 = Conv1D(filters=32, kernel_size=8, activation='relu', kernel_regularizer=regularizers.l2(l=0.08))(embedding3)
    drop3 = Dropout(0.5)(conv3)
    pool3 = MaxPooling1D(pool_size=2)(drop3)
    flat3 = Flatten()(pool3)
    
    # merge
    merged = concatenate([flat1, flat2, flat3])
    # interpretation
    dense1 = Dense(5, activation='relu')(merged)
    outputs = Dense(5, activation='softmax')(dense1)
    model = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs)
    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize
    model.summary()
    plot_model(model, show_shapes=True, to_file='multichannel.png')
    return model

In [18]:
# define model
model = define_model(length, vocab_size)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 22)           0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 22)           0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 22)           0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 22, 100)      3278800     input_4[0][0]                    
____________________________________________________________________________________________

In [19]:
# fit model
model.fit([trainX,trainX,trainX], y_train, epochs=7, batch_size=16)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.callbacks.History at 0x21cd63b5188>

In [20]:
# save the model
model.save('model.h5')

In [21]:
# load the model
model = load_model('model.h5')

In [22]:
# evaluate model on training dataset
_, acc = model.evaluate([trainX,trainX,trainX], y_train, verbose=0)
print('Train Accuracy: %.2f' % (acc*100))

Train Accuracy: 81.13


In [23]:
# evaluate model on test dataset dataset
_, acc = model.evaluate([testX,testX,testX], y_test, verbose=0)
print('Test Accuracy: %.2f' % (acc*100))

Test Accuracy: 79.99
