In [1]:
import numpy as np

from gensim.models import word2vec

from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D

from sklearn.model_selection import train_test_split
np.random.seed(2)

Using TensorFlow backend.


# Get the data

In [2]:
from os.path import join
import xml.etree.ElementTree as ET
import numpy as np

from corpus import *

def load_dataset(dataset_filepath, truth_filepath):
    """ Load a PAN author-profiling corpus into a Corpus object.

    Every non-blank line of the truth file is split on ':::'. The first three
    fields are read as the author id, the gender and the age range; fields
    3 to 7 are read as the five personality scores (extrovert, stable,
    agreeable, conscientious, open), one field per trait. For each author the
    file ``<author_id>.xml`` inside ``dataset_filepath`` is parsed and the
    text of every child of its root element is added to the author.

    Args:
        dataset_filepath (str): path where the PAN XML files are.
        truth_filepath (str): path where the ground truth is.

    Returns:
        Corpus: all the authors (with their tweets) within a Corpus class
    """
    dataset = Corpus()
    with open(truth_filepath) as truth_file:
        for line in truth_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at the end)
                continue

            line_split = line.split(':::')
            author_id, gender, age_range = line_split[:3]
            # Each trait has its own column: indexes 3, 4, 5, 6 and 7.
            personality = Personality(extrovert=float(line_split[3]),
                                      stable=float(line_split[4]),
                                      agreeable=float(line_split[5]),
                                      conscientious=float(line_split[6]),
                                      open_trait=float(line_split[7]))

            # Read the tweets for the author in hand
            author_tweets_filepath = join(dataset_filepath, author_id + ".xml")
            root = ET.parse(author_tweets_filepath).getroot()

            current_author = Author(author_id, gender, age_range, personality)
            for child in root:
                current_author.add(child.text)
            dataset.add(current_author)

    return dataset

def load_data(pan_dataset="/Users/maite/Dev/corpora/PAN/"):
    """ Load the English PAN15 training and test corpora.

    Args:
        pan_dataset (str): root directory that contains both the training
            folder ("pan15-author-profiling-training-dataset-english-2015-04-23")
            and the test folder ("pan-ap2015-test"). Defaults to the location
            originally hard-coded in this notebook; pass your own path to run
            it on another machine.

    Returns:
        tuple: (training_dataset, test_dataset), each one as returned by
            load_dataset.
    """
    # Training set: the truth file lives next to the XML files
    training_dir = join(pan_dataset, "pan15-author-profiling-training-dataset-english-2015-04-23")
    training_dataset = load_dataset(training_dir, join(training_dir, "truth.txt"))

    # Test set: XML files are under "en", the truth file is "en.txt" beside it
    test_root = join(pan_dataset, "pan-ap2015-test")
    test_dataset = load_dataset(join(test_root, "en"), join(test_root, "en.txt"))

    return training_dataset, test_dataset

In [3]:
# Load both corpora once; every later cell reads these two objects.
training_dataset, test_dataset = load_data()

In [4]:
# NOTE(review): Corpus.pad comes from the `corpus` module, which is not shown
# here. Presumably it pads every tweet to a common length and returns that
# length, which is then reused for the test set -- confirm against corpus.py.
max_sequence = training_dataset.pad()
_ = test_dataset.pad(max_sequence)

In [5]:
# Flatten the corpora: one entry per tweet, authors kept in their original order.
x_train = [tweet for author in training_dataset.samples for tweet in author.padded ]
x_test = [tweet for author in test_dataset.samples for tweet in author.padded ]
# Number of tweets of each test author (same order as x_test)
num_test = [len(author.padded) for author in test_dataset.samples]
# Length of the first training tweet
seq_len = len(x_train[0])

In [6]:
from collections import Counter
from itertools import chain

def build_vocab(dataset, to_lower=False):
    """ Create a lookup table and a list with the vocabulary.

    Args:
        dataset (list): a matrix with the words from the dataset
            (a list of sentences, each one a list of words)
        to_lower (bool): if True, every word is lower-cased before being
            added, so words differing only in case share a single entry
            (placed at the rank of its most frequent variant).

    Returns:
        vocab_sorted (list): list of the words sorted by its frequency,
            with the '<oov>' token as the last entry (unless '<oov>' already
            appears in the dataset, in which case it keeps its frequency rank)
        lookup (dict): a dictionary with the lookup table. The keys are
            the words and the values are the indexes.
    """
    # Count how many times a word appear in the dataset
    word_counts = Counter(chain.from_iterable(dataset))
    # Create a list with the most common words sorted.
    # The position will be the index of the lookup table.
    vocab_sorted = []
    # Set used only for membership tests, so each check is O(1) instead of
    # scanning the whole list
    seen = set()
    for word, _ in word_counts.most_common():
        if to_lower:
            word = word.lower()
        if word not in seen:
            seen.add(word)
            vocab_sorted.append(word)
    # Reserve an index for out-of-vocabulary words, without duplicating it
    if '<oov>' not in seen:
        vocab_sorted.append('<oov>')
    # Create a lookup table using a dictionary. Map each index with a word
    lookup = {word: index for index, word in enumerate(vocab_sorted)}

    # TODO: Move this to the unittest
    assert len(lookup) == len(vocab_sorted)

    return vocab_sorted, lookup

In [7]:
# The vocabulary is built from the training tweets only; words that appear
# only in the test set are mapped to '<oov>' by to_indexes.
vocab_sorted, lookup = build_vocab(x_train)

In [8]:
def to_indexes(sentences, vocabulary):
    """ Map every word of every sentence to its vocabulary index.

    Args:
        sentences: a list of sentences, each sentence being a list of words
        vocabulary: a dictionary from word to index; it has to contain the
            '<oov>' key whenever a sentence holds a word missing from it

    Returns:
        a numpy array built from the list of per-sentence index lists

    """
    def index_of(token):
        # Known words use their own index, any other word uses the <oov> one
        return vocabulary[token] if token in vocabulary else vocabulary['<oov>']

    return np.array([[index_of(token) for token in sentence]
                     for sentence in sentences])


In [9]:
# Replace the words by their index in `lookup` (train and test share the table)
x_train_idx = to_indexes(x_train, lookup)
x_test_idx = to_indexes(x_test, lookup)

In [10]:
# Number of tweets of each test author; group_prediction uses it to average
# the per-tweet predictions back into one value per author.
tweets_user = np.array([len(author.padded) for author in test_dataset.samples])

# Build the model

In [11]:
from keras import metrics
from keras.layers.merge import Concatenate
from sklearn.metrics import mean_squared_error

In [12]:
# Model Hyperparameters
sequence_length = len(x_train[0])
embedding_dim = 100
num_filters = 100
dropout_prob = (0.5,)
hidden_dims = (1024, 128)
filters_h = (3, 4, 5)

batch_size = 100
num_epochs = 10

output_classes =  9

In [13]:
from keras.wrappers.scikit_learn import KerasRegressor
from math import sqrt

def cnn_regressor(sequence_length, embedding_dim, num_filters, filtes_h, dropout_prob, 
                  hidden_dims, verbose=False, embedding_weights=None, is_trainable=False):
    """ Build and compile a multi-branch 1D CNN with a single output unit.

    The network embeds the input indexes, runs one convolution branch per
    kernel size (convolution + max pooling + flatten), concatenates the
    branches, applies dropout and two dense ReLU layers, and ends in a single
    sigmoid unit. It is compiled with mean squared error as loss and as
    metric, using the rmsprop optimizer.

    Args:
        sequence_length (int): number of word indexes per input sample.
        embedding_dim (int): size of the word embeddings.
        num_filters (int): number of filters of every convolution branch.
        filtes_h (iterable of int): kernel size of each convolution branch.
        dropout_prob (sequence of float): only element 0 is used, as the
            dropout rate applied after concatenating the branches.
        hidden_dims (sequence of int): elements 0 and 1 are the sizes of the
            first and the second dense layer.
        verbose (bool): if True, print the model summary.
        embedding_weights (list or None): passed as `weights` to the
            Embedding layer (e.g. ``[glove_w]``).
        is_trainable (bool): whether the embedding layer is updated during
            training.

    Returns:
        keras.models.Model: the compiled model.
    """
    input_shape = (sequence_length,)
    model_input = Input(shape=input_shape)
    # NOTE(review): the vocabulary size is taken from the notebook-level
    # `vocab_sorted`, so that cell has to be run before calling this function.
    embedding_input = Embedding(len(vocab_sorted),
                                embedding_dim,
                                input_length=sequence_length,
                                weights=embedding_weights,
                                trainable=is_trainable)(model_input)
    # CNNs: one independent branch per kernel size, all reading the embeddings
    convs = []
    for fh in filtes_h:
        conv = Convolution1D(filters=num_filters,
                             kernel_size=fh,
                             padding='valid',
                             activation='relu',
                             strides=1)(embedding_input)
        pool = MaxPooling1D(pool_size=2)(conv)
        flatten = Flatten()(pool)
        convs.append(flatten)

    concat_layer = Concatenate(axis=-1)(convs)
    dropout_cnn = Dropout(dropout_prob[0])(concat_layer)

    h1 = Dense(hidden_dims[0], activation="relu")(dropout_cnn)
    h2 = Dense(hidden_dims[1], activation="relu")(h1)
    # NOTE(review): a sigmoid output can only produce values in (0, 1). If the
    # trait scores can be negative (TODO confirm the label range), a linear
    # output would be required to be able to reach them.
    model_output = Dense(1, activation="sigmoid")(h2)

    model = Model(inputs=model_input, outputs=model_output)

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=[metrics.mean_squared_error])
    if verbose:
        # summary() prints the table itself and returns None, so wrapping it
        # in print() would add a stray "None" line to the output.
        model.summary()
    return model

In [14]:
def group_prediction(predictions_decoded, tweets_user):
    """ Average consecutive runs of predictions, one run per user.

    Args:
        predictions_decoded: sequence (or array) of per-tweet predictions,
            with the tweets of each user stored contiguously.
        tweets_user: iterable with the number of tweets of each user, in the
            same order as the predictions.

    Returns:
        list: the mean prediction of every user.
    """
    counts = list(tweets_user)
    # Running total of tweets: position right after the last tweet of each user
    ends = np.cumsum(counts)
    return [np.mean(predictions_decoded[end - size:end])
            for size, end in zip(counts, ends)]

## Get pre-trained word embeddings

In [15]:
# Read the pre-trained GloVe vectors into a dict: word -> float32 vector.
GLOVE_DIR = "../res/glove.6B"
embeddings_index = {}
with open(join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        # Each line is whitespace separated: the word first, then its coefficients
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
print('Found {} word vectors.'.format(len(embeddings_index)))

Found 400000 word vectors.


In [16]:
# Embedding matrix aligned with `lookup`: row i holds the vector of word i.
glove_w = np.zeros((len(vocab_sorted), embedding_dim))
for word, index in lookup.items():
    if word in embeddings_index:
        glove_w[index] = embeddings_index[word]
    else:
        # Words without a pre-trained vector get a small random vector; the
        # draw has to be stored, otherwise the row would stay all zeros.
        glove_w[index] = np.random.uniform(-0.25, 0.25, embedding_dim)

In [17]:
def encode_trait(trait, training_dataset, test_dataset):
    """ Build the target arrays of one personality trait.

    Args:
        trait (str): name of the attribute read from each author's
            `personality` object.
        training_dataset: object whose `samples` are the training authors.
        test_dataset: object whose `samples` are the test authors.

    Returns:
        tuple: (y_train, y_test, y_test_group). The first two repeat the
            author's score once per entry in `author.padded`; the last one
            holds a single score per test author.
    """
    def score_of(author):
        return getattr(author.personality, trait)

    def per_tweet_scores(corpus):
        # One label per tweet: the author's score repeated for each tweet
        return np.array([score_of(author)
                         for author in corpus.samples
                         for _ in author.padded])

    per_author_test = np.array([score_of(author) for author in test_dataset.samples])
    return per_tweet_scores(training_dataset), per_tweet_scores(test_dataset), per_author_test

# Train the model

In [None]:
# Train one independent regressor per personality trait, using the GloVe
# vectors as frozen ("static") embeddings, and collect the per-author RMSE.
# NOTE(review): these names are read with getattr() from Personality, whose
# constructor call in load_dataset uses the keywords `extrovert` and
# `open_trait` -- confirm the attributes 'extroverted' and 'open' exist.
traits = ['extroverted', 'stable', 'agreeable', 'conscientious', 'open']
rmses_static = []
verbose = True
for trait in traits:
    y_train, y_test, y_test_group = encode_trait(trait, training_dataset, test_dataset)
    model_static = cnn_regressor(sequence_length, embedding_dim, num_filters, filters_h, 
                                 dropout_prob, hidden_dims, verbose=verbose, 
                                 embedding_weights=[glove_w], is_trainable=False)
    # The first 13000 tweets are used for training and the rest for validation
    model_static.fit(x_train_idx[:13000], y_train[:13000], 
                 batch_size=batch_size, epochs=num_epochs, 
                 validation_data=(x_train_idx[13000:], y_train[13000:]), 
                     verbose=1)
    # evaluate() returns the loss and the compiled metric, which is the mean
    # squared error: `rmse` holds the MSE until sqrt() is applied below.
    loss, rmse = model_static.evaluate(x_test_idx, y_test)
    rmse_str = 'RMSE for trait {}: {:.4f} RMSE/per trait'
    print(rmse_str.format(trait, sqrt(rmse)))
    # Print the model summary only for the first trait
    verbose=False
    
    # Per-author score: average the per-tweet predictions of each test author
    predictions = model_static.predict(x_test_idx)
    grouped_predictions = group_prediction(predictions, tweets_user)
    rmse_person = np.sqrt(mean_squared_error(y_test_group, grouped_predictions))
    print("RMSE for the trait {} for each person is: {:.4f}".format(trait.upper(), rmse_person))
    rmses_static.append(rmse_person)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, 44)            0                                            
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 44, 100)       3324000                                      
____________________________________________________________________________________________________
conv1d_4 (Conv1D)                (None, 42, 100)       30100                                        
____________________________________________________________________________________________________
conv1d_5 (Conv1D)                (None, 41, 100)       40100                                        
___________________________________________________________________________________________

In [None]:
# Average of the per-author RMSE over the traits trained in the loop above
print("The mean RMSE of the system is {:.4f}".format(np.mean(np.array(rmses_static))))