In [1]:
import os
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
from collections import Counter

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout, Activation, Conv1D, GlobalMaxPooling1D
from keras import regularizers, initializers



Using TensorFlow backend.


In [27]:
def create_dictionary(texts, vocab_size):
    """
    Creates a dictionary that maps words to ids. More frequent words have lower ids.
    The dictionary contains at the vocab_size-1 most frequent words (and a placeholder '<unk>' for unknown words).
    The place holder has the id 0.
    """
    counter = Counter()
    for tokens in texts:
        counter.update(tokens)
    vocab = [w for w, c in counter.most_common(vocab_size - 1)]
    word_to_id = {w: (i + 1) for i, w in enumerate(vocab)}
    word_to_id[UNKNOWN_TOKEN] = 0
    return word_to_id


def to_ids(words, dictionary):
    """
    Takes a list of words and converts them to ids using the word2id dictionary.
    """
    ids = []
    for word in words:
        ids.append(dictionary.get(word, dictionary[UNKNOWN_TOKEN]))
    return ids


def read_data(train_file, dev_file):
    tokenizer = TweetTokenizer()
    trainDF = pd.read_csv(train_file, sep='\t')
    devDF = pd.read_csv(dev_file, sep='\t')

    allDF = pd.concat([trainDF, devDF], ignore_index=True)
    allDF = allDF.reindex(np.random.permutation(allDF.index))
    allDF.insert(1, 'tweet_tokenized', (allDF['Tweet'].apply(lambda x: tokenizer.tokenize(x))))

    word2id = create_dictionary(allDF["tweet_tokenized"], VOCAB_SIZE)

    allDF.insert(1, 'tweet_ids', (allDF['Tweet'].apply(lambda x: to_ids(x, dictionary=word2id))))

    allDF['all'] = allDF.iloc[:, -11:].values.tolist()

    # calc class weights
    # class_weights = compute_class_weights(allDF)
    class_weights = None    
    # print(class_weights)

    total = len(allDF)
    trainend = int(total * 0.8)
    devend = trainend + int(total * 0.1)
    return allDF.iloc[:trainend, :], allDF.iloc[trainend:devend, :], allDF.iloc[devend:, :], class_weights


def evaluate(predictions, y_test):
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    all_correct = 0
    for i, pred in enumerate(predictions):
        for j, em in enumerate(pred):
            if em >= 0.5:
                if y_test[i][j] == 1:
                    tp += 1
                else:
                    fp += 1
            if em < 0.5:
                if y_test[i][j] == 1:
                    fn += 1
                else:
                    tn += 1
            if tp + tn == y_test.shape[1]:
                all_correct += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * (precision * recall) / (precision + recall)

    print("F1: {}\nPrecision: {}\nRecall: {}\nCompletely correct: {}".format(f1, precision, recall, all_correct))


In [3]:
data_dir = 'D:/3_Programming/1_Studium/Python/SemEval2018_Task1_5/data/'
train_file = os.path.join(data_dir, '2018-E-c-En-train.txt')
dev_file = os.path.join(data_dir, '2018-E-c-En-dev.txt')

VOCAB_SIZE = 100000
MAX_LEN = 100
BATCH_SIZE = 32
EMBEDDING_SIZE = 100
HIDDEN_SIZE = 50
EPOCHS = 10  # Standard 10
UNKNOWN_TOKEN = "<unk>"
EMOTIONS = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
            'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

In [4]:
trainDF, devDF, testDF, class_weights = read_data(train_file, dev_file)
x_train = sequence.pad_sequences(np.array(trainDF['tweet_ids']), maxlen=MAX_LEN)
x_dev = sequence.pad_sequences(np.array(devDF['tweet_ids']), maxlen=MAX_LEN)
x_test = sequence.pad_sequences(np.array(testDF['tweet_ids']), maxlen=MAX_LEN)
y_train = np.array([trainDF['all']])[0]
y_dev = np.array([devDF['all']])[0]
y_test = np.array([testDF['all']])[0]


In [5]:
cnn_model = Sequential()
cnn_model.add(Embedding(VOCAB_SIZE, EMBEDDING_SIZE))
cnn_model.add(Conv1D(2 * HIDDEN_SIZE,
                     kernel_size=3,
                     activation='tanh',
                     strides=1,
                     padding='valid',
                     ))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(11, activation='sigmoid')) #11 = no of classes


In [35]:
from keras.optimizers import SGD, adam

opt = adam(lr=0.01)
cnn_model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'],
                  sample_weight_mode='temporal'
                  )


In [36]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

early_stopper = EarlyStopping(monitor='val_acc', patience=5, mode='max')
checkpoint = ModelCheckpoint(data_dir+'model.m', save_best_only=True, monitor='val_acc', mode='max')


In [8]:
def compute_class_weights(y_train):
    class_weights = np.sum(y_train, axis=0)
    sum_class_weights = np.sum(y_train)
    
    class_weights = [((sum_class_weights - i) / sum_class_weights)**10 for i in class_weights]
    
    return class_weights
compute_class_weights(y_train)

[0.18252407795206593,
 0.5302046507916747,
 0.1759452443364328,
 0.46780844029116636,
 0.17351598806386132,
 0.6274344503564779,
 0.2562887503102258,
 0.6074081062585732,
 0.2682319716618163,
 0.7973375187498287,
 0.8035216677691579]

In [33]:
def compute_sample_weights(y_train, factor):
    class_weights = np.array(y_train)
    class_weights[class_weights == 1] = factor
    class_weights[class_weights == 0] = 1
    # return np.reshape(class_weights, class_weights.shape + (1,))
    return class_weights


sample_weights = compute_sample_weights(y_train, 2)
print(sample_weights)
# print(x_train)
# print(y_train)
# print(sample_weights)
# y_train = np.reshape(y_train, y_train.shape + (1,))
# y_dev = np.reshape(y_dev, y_dev.shape + (1,))


[[2 1 2 ..., 1 1 1]
 [2 1 2 ..., 1 1 1]
 [2 1 2 ..., 1 1 1]
 ..., 
 [1 1 2 ..., 2 1 1]
 [2 1 2 ..., 1 1 1]
 [2 1 1 ..., 1 1 1]]


In [10]:
# print(y_train.shape)
# print(np.expand_dims(y_train,0))
# y_train = np.reshape(y_train,[1,y_train.shape[0],y_train.shape[1]])

In [37]:
cnn_model.fit(
    x_train,
    y_train,
    # batch_size=BATCH_SIZE,
    callbacks=[early_stopper, checkpoint],
    epochs=EPOCHS,
    validation_data=(x_dev, y_dev),
    class_weight=class_weights,
    # sample_weight=[sample_weights],
    verbose=1
)


ValueError: Found a sample_weight array for an input with shape (6179, 11). Timestep-wise sample weighting (use of sample_weight_mode="temporal") is restricted to outputs that are at least 3D, i.e. that have a time dimension.

In [30]:
from keras.models import load_model
best_model = load_model(data_dir+'model.m')

In [31]:
predictions = best_model.predict(x_test)
print(y_test[0])
print(predictions[0])

[0 0 0 0 0 1 0 0 1 0 0]
[ 0.10966518  0.06967428  0.13928385  0.05048795  0.56468403  0.29400647
  0.27725214  0.2429215   0.75032073  0.02757213  0.03927426]


In [32]:
evaluate(predictions,y_test)


F1: 0.49065743944636675
Precision: 0.6589219330855018
Recall: 0.3908489525909592
Completely correct: 2


In [25]:
F1: 0.4036478984932593
Precision: 0.6559278350515464
Recall: 0.29152348224513175
Completely correct: 2


SyntaxError: invalid syntax (<ipython-input-25-d7075f8dcc52>, line 4)

Unweighted 
	F1: 0.27632915678970543
	Precision: 0.3633125556544969
	Recall: 0.22295081967213115
	Completely correct: 2

In [None]:
Softmax
F1: 0.13215859030837004
Precision: 0.5844155844155844
Recall: 0.07450331125827815
Completely correct: 1

#Todo:Functional API class:weights 0,1 11 outputs
"It depends on your application. Class weights are useful when training on highly skewed data sets; for example, a classifier to detect fraudulent transactions. Sample weights are useful when you don't have equal confidence in the samples in your batch. A common example is performing regression on measurements with variable uncertainty." https://stackoverflow.com/questions/43459317/keras-class-weight-vs-sample-weights-in-the-fit-generator

            class_weight: Optional dictionary mapping class indices (integers)
                to a weight (float) value, used for weighting the loss function
                (during training only).
                This can be useful to tell the model to
                "pay more attention" to samples from
                an under-represented class.
            sample_weight: Optional Numpy array of weights for
                the training samples, used for weighting the loss function
                (during training only). You can either pass a flat (1D)
                Numpy array with the same length as the input samples
                (1:1 mapping between weights and samples),
                or in the case of temporal data,
                you can pass a 2D array with shape
                `(samples, sequence_length)`,
                to apply a different weight to every timestep of every sample.
                In this case you should make sure to specify
                `sample_weight_mode="temporal"` in `compile()`.


sample weights mode="temporal"
matrix mitgewicht für jede klasse in jedem sample