In [1]:
import os

# Directory holding the stance dataset files. It can be overridden with the
# STANCE_DATA_DIR environment variable so the notebook is runnable on other
# machines; the default is the original hard-coded location.
STANCE_DATA_DIR = os.environ.get(
    'STANCE_DATA_DIR',
    '/home/mgimenez/Dev/corpora/Semeval2016/stancedataset/StanceDataset')

# Training and test splits (passed to load_data, which splits lines on tabs).
TRAIN_PATH = os.path.join(STANCE_DATA_DIR, 'train_ok.csv')
TEST_PATH = os.path.join(STANCE_DATA_DIR, 'test_ok.csv')

In [2]:
import pandas as pd

In [3]:
def load_data(path):
    """Read a tab-separated stance file and return its tweets and binary labels.

    The first line is treated as a header and skipped. Every remaining line
    is expected to hold exactly five tab-separated fields, in this order:
    tweet, target, stance, opinion, sentiment.

    Lines with a different number of fields are printed and skipped, so a
    single malformed row no longer aborts the whole load with a ValueError
    during unpacking. An empty file yields two empty lists.

    Args:
        path: Path of the file to read.

    Returns:
        A ``(tweets, stances)`` tuple of two lists of equal length.
        ``stances[i]`` is 0 when the stance field is exactly ``'AGAINST'``
        and 1 for any other value.
    """
    tweets = []
    stances = []
    with open(path) as f:
        # Skip the header row; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            fields = line.split('\t')
            if len(fields) != 5:
                # Report the offending row and move on instead of crashing.
                print(fields)
                continue
            # Only the tweet text and the stance label are used.
            tweet, _target, stance, _opinion, _sentiment = fields
            tweets.append(tweet)
            stances.append(0 if stance == 'AGAINST' else 1)
    return tweets, stances

In [4]:
# Parse both splits with the same loader: training first, then test.
(tweets_training, stance_training), (tweets_test, stance_test) = (
    load_data(split_path) for split_path in (TRAIN_PATH, TEST_PATH)
)

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

# Vocabulary size cap handed to the tokenizer.
MAX_NB_WORDS = 20000

# The vocabulary is fitted on the training tweets only; both splits are then
# encoded as integer sequences with that same vocabulary.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(tweets_training)
sequences_train, sequences_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (tweets_training, tweets_test)
)

# word -> integer index mapping learned from the training tweets.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Found 9427 unique tokens.


In [6]:
# Fixed input length of the network: every encoded tweet is padded or
# truncated to this many token ids.
MAX_SEQUENCE_LENGTH = 1000
data_train = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
data_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the 0/1 stance labels. num_classes is pinned to 2 so both
# splits always get two columns; without it to_categorical infers the width
# from the largest label present, which would differ between splits if one
# of them happened to contain a single class.
labels_train = to_categorical(np.asarray(stance_training), num_classes=2)
labels_test = to_categorical(np.asarray(stance_test), num_classes=2)
print('Shape of the training data tensor:', data_train.shape)
print('Shape of the testing data tensor:', data_test.shape)
print('\nShape of the training label tensor:', labels_train.shape)
print('Shape of the testing label tensor:', labels_test.shape)

Shape of the training data tensor: (2914, 1000)
Shape of the testing data tensor: (1956, 1000)

Shape of the training label tensor: (2914, 2)
Shape of the testing label tensor: (1956, 2)


In [7]:
import os

# Directory containing the pre-trained GloVe Twitter vectors. It can be
# overridden with the GLOVE_DIR environment variable; the default is the
# original hard-coded location.
GLOVE_DIR = os.environ.get(
    'GLOVE_DIR',
    '/home/mgimenez/Dev/resources/GloVe/twitter_dataset/')

# Width of each embedding vector; must agree with the vectors file that is
# loaded from GLOVE_DIR (the 100d file).
EMBEDDING_DIM = 100

In [8]:
from os.path import join
# Maps each token of the GloVe file to its vector (a float32 numpy array).
embeddings_index = {}

# Each line is "<token> <v1> <v2> ...". The file is opened in a `with` block
# so the handle is released even if parsing a line fails, and it is decoded
# explicitly as UTF-8 rather than with the platform's default encoding.
glove_path = join(GLOVE_DIR, 'glove.twitter.27B.100d.txt')
with open(glove_path, encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [9]:
from sklearn.metrics import f1_score

In [26]:
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Concatenate, Dropout
from keras.models import Model

In [11]:
print('Preparing embedding matrix.')

# One row per usable word index plus an extra row for index 0; rows start as
# zeros and are overwritten only for words that have a pre-trained vector.
num_words = min(MAX_NB_WORDS, len(word_index))
print(num_words)
embedding_matrix = np.zeros((num_words + 1, EMBEDDING_DIM))
for token, row in word_index.items():
    # Indices at or beyond the vocabulary cap are ignored; words missing from
    # the pre-trained index keep their all-zero row.
    vector = embeddings_index.get(token) if row < MAX_NB_WORDS else None
    if vector is not None:
        embedding_matrix[row] = vector

# Embedding layer initialised with the pre-trained matrix; trainable=False
# keeps those weights fixed during training.
embedding_layer = Embedding(
    num_words + 1,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

Preparing embedding matrix.
9427


In [12]:
# Sequential CNN over the frozen pre-trained embeddings: three
# Conv1D/MaxPooling1D stages followed by a dense classifier, trained with the
# 'rmsprop' optimizer.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
# 35 is the number of time steps left after the layers above when the input
# length is 1000, so this pool covers the whole sequence (global max pooling).
x = MaxPooling1D(35)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(2, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

model.fit(data_train, labels_train, epochs=10, batch_size=128)
predictions = model.predict(data_test)
# Score on class indices. argmax always selects exactly one class per tweet,
# whereas rounding the softmax output can produce an all-zero row when both
# probabilities are exactly 0.5. labels=[0, 1] keeps the macro average over
# both classes even if one of them is never predicted.
f1 = f1_score(np.argmax(labels_test, axis=1),
              np.argmax(predictions, axis=1),
              labels=[0, 1],
              average='macro')
print('\n F1 result: {}'.format(f1))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 100)         942800    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 35, 128)           82048     
__________

In [13]:
# Sequential CNN over the frozen pre-trained embeddings: three
# Conv1D/MaxPooling1D stages followed by a dense classifier, trained with the
# 'adagrad' optimizer.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
# 35 is the number of time steps left after the layers above when the input
# length is 1000, so this pool covers the whole sequence (global max pooling).
x = MaxPooling1D(35)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(2, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adagrad',
              metrics=['acc'])

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

model.fit(data_train, labels_train, epochs=10, batch_size=128)
predictions = model.predict(data_test)
# Score on class indices. argmax always selects exactly one class per tweet,
# whereas rounding the softmax output can produce an all-zero row when both
# probabilities are exactly 0.5. labels=[0, 1] keeps the macro average over
# both classes even if one of them is never predicted.
f1 = f1_score(np.argmax(labels_test, axis=1),
              np.argmax(predictions, axis=1),
              labels=[0, 1],
              average='macro')
print('\n F1 result: {}'.format(f1))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 100)         942800    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 35, 128)           82048     
__________

In [14]:
# Sequential CNN over the frozen pre-trained embeddings: three
# Conv1D/MaxPooling1D stages followed by a dense classifier, trained with the
# 'adam' optimizer.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
# 35 is the number of time steps left after the layers above when the input
# length is 1000, so this pool covers the whole sequence (global max pooling).
x = MaxPooling1D(35)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(2, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

model.fit(data_train, labels_train, epochs=10, batch_size=128)
predictions = model.predict(data_test)
# Score on class indices. argmax always selects exactly one class per tweet,
# whereas rounding the softmax output can produce an all-zero row when both
# probabilities are exactly 0.5. labels=[0, 1] keeps the macro average over
# both classes even if one of them is never predicted.
f1 = f1_score(np.argmax(labels_test, axis=1),
              np.argmax(predictions, axis=1),
              labels=[0, 1],
              average='macro')
print('\n F1 result: {}'.format(f1))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 100)         942800    
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 35, 128)           82048     
__________

In [34]:
# CNN with parallel convolution branches over the frozen pre-trained
# embeddings: one Conv1D/MaxPooling1D/Flatten branch per kernel size, with the
# flattened branches concatenated into a dense classifier ('adagrad').
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# These are convolution kernel sizes (the number of filters is fixed at 128);
# each size is also used as that branch's pooling window.
filter_sizes = [1, 3, 5]
convs = []
for kernel_size in filter_sizes:
    conv = Conv1D(128, kernel_size, activation='relu')(embedded_sequences)
    pool = MaxPooling1D(kernel_size)(conv)
    flatten = Flatten()(pool)
    convs.append(flatten)

# Branch outputs are flat vectors here, so axis=1 joins them feature-wise.
l_merge = Concatenate(axis=1)(convs)

x = Dense(128, activation='relu')(l_merge)
preds = Dense(2, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adagrad',
              metrics=['acc'])

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

model.fit(data_train, labels_train, epochs=10, batch_size=128)
predictions = model.predict(data_test)
# Score on class indices. argmax always selects exactly one class per tweet,
# whereas rounding the softmax output can produce an all-zero row when both
# probabilities are exactly 0.5. labels=[0, 1] keeps the macro average over
# both classes even if one of them is never predicted.
f1 = f1_score(np.argmax(labels_test, axis=1),
              np.argmax(predictions, axis=1),
              labels=[0, 1],
              average='macro')
print('\n F1 result: {}'.format(f1))

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_19 (InputLayer)            (None, 1000)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1000, 100)     942800      input_19[0][0]                   
____________________________________________________________________________________________________
conv1d_85 (Conv1D)               (None, 1000, 128)     12928       embedding_1[18][0]               
____________________________________________________________________________________________________
conv1d_86 (Conv1D)               (None, 998, 128)      38528       embedding_1[18][0]               
___________________________________________________________________________________________

  'precision', 'predicted', average, warn_for)


In [37]:
# CNN that first applies parallel convolutions with different kernel sizes to
# the frozen pre-trained embeddings, concatenates the pooled branches along
# the sequence axis, and then runs two further Conv1D/MaxPooling1D stages
# before the dense classifier ('adagrad').
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

filter_sizes = [3, 4, 5]
convs = []

for kernel_size in filter_sizes:
    l_conv = Conv1D(filters=128, activation="relu", kernel_size=kernel_size)(embedded_sequences)
    l_pool = MaxPooling1D(5)(l_conv)
    convs.append(l_pool)

# Branch outputs are still 3-D here, so axis=1 stacks them along the
# sequence (time-step) axis rather than the channel axis.
l_merge = Concatenate(axis=1)(convs)
l_conv1 = Conv1D(128, 5, activation='relu')(l_merge)
l_pool1 = MaxPooling1D(5)(l_conv1)
l_conv2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(30)(l_conv2)
l_flat = Flatten()(l_pool2)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(2, activation='softmax')(l_dense)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adagrad',
              metrics=['acc'])

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

model.fit(data_train, labels_train, epochs=10, batch_size=128)
predictions = model.predict(data_test)
# Score on class indices. argmax always selects exactly one class per tweet,
# whereas rounding the softmax output can produce an all-zero row when both
# probabilities are exactly 0.5. labels=[0, 1] keeps the macro average over
# both classes even if one of them is never predicted.
f1 = f1_score(np.argmax(labels_test, axis=1),
              np.argmax(predictions, axis=1),
              labels=[0, 1],
              average='macro')
print('\n F1 result: {}'.format(f1))

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_22 (InputLayer)            (None, 1000)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1000, 100)     942800      input_22[0][0]                   
____________________________________________________________________________________________________
conv1d_93 (Conv1D)               (None, 998, 128)      38528       embedding_1[21][0]               
____________________________________________________________________________________________________
conv1d_94 (Conv1D)               (None, 997, 128)      51328       embedding_1[21][0]               
___________________________________________________________________________________________