In [1]:
import os
import re
from os.path import join

import numpy as np
import pandas as pd

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
from gensim.models import word2vec

from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D

Using TensorFlow backend.


In [3]:
# Folder holding the SemEval-2017 Subtask A gold files.
# Set the SEMEVAL_CORPORA_PATH environment variable to run the notebook on another
# machine; when it is unset the original author's location is used as a fallback.
CORPORA_PATH = os.environ.get(
    "SEMEVAL_CORPORA_PATH",
    "/home/mgimenez/Dev/corpora/SemEval/SemEval_2017/2017_English_final/2017_English_final/GOLD/Subtask_A",
)

In [4]:
# Full paths of the train and test splits, both located directly under CORPORA_PATH.
TRAIN_FILE, TEST_FILE = (
    join(CORPORA_PATH, split_filename)
    for split_filename in ("twitter-2016train-A.txt", "twitter-2016test-A.txt")
)

In [5]:
def clean_str(text):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character outside ``A-Za-z0-9(),!?'`#@`` is replaced by a space;
      2. English clitics ('s, 've, n't, 're, 'd, 'll) are detached from the word;
      3. ``, ! ( ) ?`` and the sequence ``c'`` are surrounded by spaces;
      4. runs of two or more whitespace characters collapse to a single space;
      5. a character repeated four or more times in a row is cut down to three.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text, stripped and lower-cased; tokens are separated by
        single spaces.
    """
    # Anything outside the whitelist becomes a space
    text = re.sub(r"[^A-Za-z0-9(),!?\'\`#@]", " ", text)

    # The replacements are plain text on purpose: re.sub leaves an unknown escape
    # such as "\(" untouched, which used to produce tokens with a literal
    # backslash ("\(", "\)", "\?") instead of the bare punctuation mark.
    rules = (
        (r"'s", " 's"),
        (r"'ve", " 've"),
        (r"n't", " n't"),
        (r"'re", " 're"),
        (r"'d", " 'd"),
        (r"'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"c'", " c' "),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)

    # Collapse runs of two or more whitespace characters into one space
    text = re.sub(r"\s{2,}", " ", text)
    # Shorten any character repeated four or more times to three occurrences
    text = re.sub(r'(.)\1{3,}', r'\1\1\1', text)
    return text.strip().lower()

In [12]:
def load_file(filepath, tokenize=False, labeled=True):
    """
    Load a tab-separated dataset file and return its tweets (and their labels),
    keeping only the first occurrence of each tweet key.

    Parameters
    ----------
    filepath : str
        Path of the file to read. Labeled files carry three columns per line
        (key, sentiment, tweet); unlabeled files carry two (key, tweet).
    tokenize : bool
        If true, every tweet goes through ``clean_str`` before being split.
    labeled : bool
        Whether the file contains the sentiment column.

    Returns
    -------
    (list of list of str, numpy.ndarray) when ``labeled`` is true: the tweets
    split on single spaces and their one-hot encoded labels.
    list of list of str when ``labeled`` is false.

    Raises
    ------
    KeyError
        If a labeled line carries a sentiment other than 'neutral',
        'negative' or 'positive'.
    """
    # Dictionary to encode the output variable
    labels_tags = {'neutral':[0, 1, 0], 'negative':[1, 0, 0], 'positive':[0, 0, 1]}
    # The number of columns depends on the kind of file. Checking for three
    # columns unconditionally made every unlabeled file come back empty.
    expected_fields = 3 if labeled else 2

    keys, tweets, labels = set(), [], []
    with open(filepath, encoding='utf-8') as dataset_file:
        for line in dataset_file:
            fields = line.strip().split('\t')
            # TODO: a wrong number of columns implies an error in the input file;
            # such lines are skipped silently.
            if len(fields) != expected_fields:
                continue

            if labeled:
                key, sentiment, tweet = fields
            else:
                key, tweet = fields

            # Keep only the first tweet seen for each key
            if key in keys:
                continue
            if tokenize:
                tweet = clean_str(tweet)
            tweets.append(tweet.split(" "))
            if labeled:
                labels.append(labels_tags[sentiment])
            keys.add(key)

    # TODO: Move this to the unittest
    if labeled:
        assert len(tweets) == len(labels)
    assert len(keys) == len(tweets)

    if labeled:
        return tweets, np.asarray(labels)
    return tweets

In [13]:
def pad_tweets(dataset, max_sequence=None):
    """
    Bring every tweet to exactly ``max_sequence`` tokens.

    Shorter tweets are right-padded with the '<pad>' token; longer tweets
    (possible when ``max_sequence`` comes from another dataset) are truncated,
    so the result is always rectangular. The input lists are not modified.

    Parameters
    ----------
    dataset : list of list of str
        Tokenized tweets.
    max_sequence : int, optional
        Target length. When omitted (or falsy) the length of the longest tweet
        in ``dataset`` is used; an empty dataset yields 0.

    Returns
    -------
    (list of list of str, int)
        The padded dataset and the sequence length that was applied.
    """
    if not max_sequence:
        max_sequence = max((len(tweet) for tweet in dataset), default=0)

    padded_dataset = []
    for tweet in dataset:
        # Slicing both truncates over-long tweets and copies the list
        clipped = tweet[:max_sequence]
        padded_dataset.append(clipped + ['<pad>'] * (max_sequence - len(clipped)))

    # TODO: Move this to the unittest
    # Compare lengths by value: identity checks on ints only hold for small numbers
    assert all(len(tweet) == max_sequence for tweet in padded_dataset)
    assert len(padded_dataset) == len(dataset)

    return padded_dataset, max_sequence

In [14]:
# Load and tokenize the dataset
x_train, y_train = load_file(TRAIN_FILE, True)
x_test, y_test = load_file(TEST_FILE, True)

# Pad the datasets
x_train_padded, max_sequence = pad_tweets(x_train)
x_test_padded, _ = pad_tweets(x_test, max_sequence)
print(len(x_train_padded[0]), len(x_test_padded[0]))

51 51


In [11]:
from collections import Counter
from itertools import chain

In [15]:
def build_vocab(dataset):
    """
    Given a list of lists of words, create a lookup table and a list with the vocabulary.

    Parameters
    ----------
    dataset : list of list of str
        Tokenized (and usually padded) tweets.

    Returns
    -------
    (list of str, dict)
        ``vocab_sorted``: the distinct words ordered from most to least frequent,
        followed by the '<oov>' token unless the data already contains it.
        ``lookup``: maps each word to its position in ``vocab_sorted``.
    """
    # Count how many times each word appears in the dataset
    word_counts = Counter(chain.from_iterable(dataset))
    # Most common words first; the position is the index used in the lookup table
    vocab_sorted = [word for word, _ in word_counts.most_common()]
    # Appending '<oov>' when it is already a corpus token would duplicate the
    # entry and leave the vocabulary and the lookup table out of sync.
    if '<oov>' not in word_counts:
        vocab_sorted.append('<oov>')
    # Create a lookup table using a dictionary. Map each word to its index
    lookup = {word: index for index, word in enumerate(vocab_sorted)}

    # TODO: Move this to the unittest
    assert len(lookup) == len(vocab_sorted)

    return vocab_sorted, lookup

In [16]:
# Vocabulary (most frequent words first) and word -> index lookup table, built from the
# padded training tweets, so any '<pad>' filler added by pad_tweets gets an index too.
vocab_sorted, lookup_table = build_vocab(x_train_padded)

In [17]:
def create_input(tweets, lookup_table):
    """
    Translate tokenized tweets into their vocabulary indexes.

    Every word is replaced by its entry in ``lookup_table``; a word that is not
    in the table is replaced by the index of the '<oov>' token instead.

    Returns a numpy array built from the list of index lists, one row per tweet.
    """
    encoded_tweets = [
        [lookup_table[token if token in lookup_table else '<oov>'] for token in tweet]
        for tweet in tweets
    ]

    # TODO: Move this to the unittest
    assert len(encoded_tweets) == len(tweets)

    return np.asarray(encoded_tweets)

In [40]:
def load_dataset(train_filepath, test_filepath):
    """
    Run the whole preparation pipeline on a labeled train/test pair.

    Both files are loaded and tokenized, padded to the length of the longest
    training tweet, and encoded as vocabulary indexes. The vocabulary is built
    from the padded training tweets only.

    Returns the tuple (x_train_indexes, y_train, x_test_indexes, y_test).
    """
    # Load and tokenize both splits
    train_tokens, y_train = load_file(train_filepath, tokenize=True, labeled=True)
    test_tokens, y_test = load_file(test_filepath, tokenize=True, labeled=True)

    # The sequence length is decided by the training split and reused for the test split
    train_padded, sequence_length = pad_tweets(train_tokens)
    test_padded = pad_tweets(test_tokens, sequence_length)[0]

    # Only the lookup table is needed here; the sorted vocabulary list is discarded
    lookup_table = build_vocab(train_padded)[1]

    # Matrices of indexes used to train and evaluate the system
    x_train_indexes = create_input(train_padded, lookup_table)
    x_test_indexes = create_input(test_padded, lookup_table)

    # Each split must have one label row per tweet
    assert x_train_indexes.shape[0] == y_train.shape[0]
    # Both splits must share the sequence length and the label width
    assert x_train_indexes.shape[1] == x_test_indexes.shape[1]
    assert x_test_indexes.shape[0] == y_test.shape[0]
    assert y_train.shape[1] == y_test.shape[1]
    return x_train_indexes, y_train, x_test_indexes, y_test

In [41]:
# Rebinds x_train / x_test, which held lists of tokens in the earlier cell, to the
# index matrices returned by load_dataset; y_train / y_test are the label arrays.
x_train, y_train, x_test, y_test = load_dataset(TRAIN_FILE, TEST_FILE) 