In [10]:
import os
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
from collections import Counter

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import  Dense, Embedding, LSTM, Bidirectional, Dropout, Activation,  Conv1D, GlobalMaxPooling1D
from keras import regularizers, initializers

In [81]:
def create_dictionary(texts, vocab_size):
    """
    Creates a dictionary that maps words to ids. More frequent words have lower ids.
    The dictionary contains at the vocab_size-1 most frequent words (and a placeholder '<unk>' for unknown words).
    The place holder has the id 0.
    """
    counter = Counter()
    for tokens in texts:
        counter.update(tokens)
    vocab = [w for w, c in counter.most_common(vocab_size - 1)]
    word_to_id = {w: (i + 1) for i, w in enumerate(vocab)}
    word_to_id[UNKNOWN_TOKEN] = 0
    return word_to_id


def to_ids(words, dictionary):
    """
    Takes a list of words and converts them to ids using the word2id dictionary.
    """
    ids = []
    for word in words:
        ids.append(dictionary.get(word, dictionary[UNKNOWN_TOKEN]))
    return ids


def read_data(train_file, dev_file):
    tokenizer = TweetTokenizer()
    trainDF = pd.read_csv(train_file, sep='\t')
    devDF = pd.read_csv(dev_file, sep='\t')

    allDF = pd.concat([trainDF, devDF], ignore_index=True)
    allDF = allDF.reindex(np.random.permutation(allDF.index))
    allDF.insert(1, 'tweet_tokenized', (allDF['Tweet'].apply(lambda x: tokenizer.tokenize(x))))

    word2id = create_dictionary(allDF["tweet_tokenized"], VOCAB_SIZE)

    allDF.insert(1, 'tweet_ids', (allDF['Tweet'].apply(lambda x: to_ids(x, dictionary=word2id))))

    allDF['all'] = allDF.iloc[:, -11:].values.tolist()
    total = len(allDF)
    trainend = int(total * 0.8)
    devend = trainend + int(total * 0.1)
    return allDF.iloc[:trainend, :], allDF.iloc[trainend:devend, :], allDF.iloc[devend:, :]

In [130]:
data_dir = 'D:/3_Programming/1_Studium/Python/SemEval2018_Task1_5/data/'
train_file = os.path.join(data_dir, '2018-E-c-En-train.txt')
dev_file = os.path.join(data_dir, '2018-E-c-En-dev.txt')

VOCAB_SIZE = 100000
MAX_LEN = 50
BATCH_SIZE = 64
EMBEDDING_SIZE = 20
HIDDEN_SIZE = 100
EPOCHS = 10  # Standard 10
UNKNOWN_TOKEN = "<unk>"
EMOTIONS = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
            'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

In [131]:
trainDF, devDF, testDF = read_data(train_file, dev_file)
x_train = sequence.pad_sequences(np.array(trainDF['tweet_ids']), maxlen=MAX_LEN)
x_dev = sequence.pad_sequences(np.array(devDF['tweet_ids']), maxlen=MAX_LEN)
x_test = sequence.pad_sequences(np.array(testDF['tweet_ids']), maxlen=MAX_LEN)
y_train = np.array([trainDF['all']])[0]
y_dev = np.array([devDF['all']])[0]


In [132]:
cnn_model = Sequential()
cnn_model.add(Embedding(VOCAB_SIZE, EMBEDDING_SIZE))
cnn_model.add(Conv1D(2 * HIDDEN_SIZE,
                     kernel_size=3,
                     activation='tanh',
                     strides=1,
                     padding='valid',
                     kernel_regularizer=regularizers.l1(0.001),))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dropout(0.2))
cnn_model.add(Dense(HIDDEN_SIZE, activation='tanh'))
cnn_model.add(Dense(y_train.shape[1], activation='sigmoid'))

In [133]:
from keras.optimizers import rmsprop
opt=rmsprop(lr=0.0001, decay=1e-6)
cnn_model.compile(loss='categorical_crossentropy',
                   optimizer=opt,
                   metrics=['accuracy'])
#opt war adam

In [135]:
cnn_model.fit(
    x_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(x_dev, y_dev),
    verbose=2
)

Train on 6179 samples, validate on 772 samples
Epoch 1/10


 - 6s - loss: 6.0573 - acc: 0.2763 - val_loss: 5.9521 - val_acc: 0.2591


Epoch 2/10


 - 5s - loss: 5.7743 - acc: 0.2275 - val_loss: 5.6991 - val_acc: 0.0972


Epoch 3/10


 - 5s - loss: 5.5889 - acc: 0.1622 - val_loss: 5.5839 - val_acc: 0.0570


Epoch 4/10


 - 5s - loss: 5.5000 - acc: 0.0918 - val_loss: 5.5144 - val_acc: 0.0570


Epoch 5/10


 - 5s - loss: 5.4364 - acc: 0.1055 - val_loss: 5.4566 - val_acc: 0.0570


Epoch 6/10


 - 5s - loss: 5.3821 - acc: 0.1172 - val_loss: 5.4079 - val_acc: 0.0570


Epoch 7/10


 - 5s - loss: 5.3374 - acc: 0.1134 - val_loss: 5.3674 - val_acc: 0.0570


Epoch 8/10


 - 5s - loss: 5.3012 - acc: 0.1168 - val_loss: 5.3356 - val_acc: 0.0596


Epoch 9/10


 - 5s - loss: 5.2742 - acc: 0.1787 - val_loss: 5.3122 - val_acc: 0.0583


Epoch 10/10


 - 5s - loss: 5.2532 - acc: 0.1316 - val_loss: 5.2961 - val_acc: 0.0829


<keras.callbacks.History at 0x27b0526ee80>

In [136]:
cnn_model.predict(x_test)

array([[ 0.79008341,  0.30411407,  0.80335492, ...,  0.63104343,
         0.10794483,  0.11674609],
       [ 0.78734958,  0.30633661,  0.80123734, ...,  0.63051552,
         0.10968722,  0.11920068],
       [ 0.78783017,  0.30551025,  0.80141258, ...,  0.63141537,
         0.10904864,  0.1185305 ],
       ..., 
       [ 0.78441739,  0.30887508,  0.79674363, ...,  0.62840843,
         0.1129552 ,  0.12210647],
       [ 0.78722936,  0.30642501,  0.80032092, ...,  0.62980491,
         0.10976435,  0.11918981],
       [ 0.76941305,  0.31876999,  0.78167897, ...,  0.61964798,
         0.12614734,  0.13672405]], dtype=float32)