In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

import matplotlib.pyplot as plt

  'Matplotlib is building the font cache using fc-list. '


In [None]:
# Get some data

In [2]:
# n-grams
def n_gram_set(input_list, ngram_length=2):
    """Return the set of distinct contiguous n-grams found in ``input_list``.

    Args:
        input_list: a sliceable sequence of tokens.
        ngram_length: number of consecutive tokens per n-gram (default 2).

    Returns:
        A set of tuples, each holding ``ngram_length`` consecutive tokens.
        The set is empty when the sequence is shorter than ``ngram_length``.
    """
    # One shifted copy of the sequence per position inside the window:
    # offset 0, 1, ..., ngram_length - 1.
    shifted = [input_list[offset:] for offset in range(ngram_length)]
    # Zipping the shifted copies lines up consecutive tokens; zip stops at the
    # shortest copy, so only complete windows are produced.
    windows = zip(*shifted)
    return set(windows)

In [10]:
def add_n_gram_value(sequences, token_index, ngram_range=2):
    """Append the id of every known n-gram to a copy of each sequence.

    For each sequence, every contiguous n-gram of length 2..``ngram_range`` is
    looked up in ``token_index``; when present, its mapped value is appended
    after the original tokens. Ids are appended position by position, and for
    a given position from the shortest n-gram to the longest.

    Args:
        sequences: iterable of token sequences.
        token_index: mapping from n-gram tuple to the value to append.
        ngram_range: largest n-gram length to consider. Values below 2 add
            nothing and simply return list copies of the inputs.

    Returns:
        A list with one new list per input sequence. The inputs themselves
        are not modified.
    """
    new_sequences = []
    for input_list in sequences:
        # Read n-grams from an untouched copy so appended ids can never be
        # mistaken for original tokens.
        tokens = list(input_list)
        augmented = tokens[:]
        for start in range(len(tokens) - 1):
            # Bound each window by the tokens actually left, instead of
            # stopping the whole scan `ngram_range` tokens before the end;
            # otherwise short n-grams near the tail (e.g. the last bigram when
            # ngram_range=3) are silently skipped.
            longest = min(ngram_range, len(tokens) - start)
            for ngram_value in range(2, longest + 1):
                ngram = tuple(tokens[start:start + ngram_value])
                if ngram in token_index:
                    augmented.append(token_index[ngram])
        new_sequences.append(augmented)

    return new_sequences

In [3]:
from keras.datasets import imdb

Using TensorFlow backend.


In [12]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.datasets import imdb

In [13]:
# Settings for this experiment.
ngram_range = 1        # largest n-gram length to add as extra features; 1 = no n-gram features
max_features = 20000   # passed to imdb.load_data(num_words=...)
maxlen = 400           # passed to pad_sequences(maxlen=...)
# The next three are not used in this cell; presumably consumed by the
# model-building / training cells further down -- TODO confirm.
batch_size = 32
embedding_dims = 50
epochs = 5

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Create set of unique n-gram from the training set.
    # Uses this notebook's own helper `n_gram_set` (keyword `ngram_length`);
    # the previous call to `create_ngram_set(..., ngram_value=i)` referenced a
    # name that is never defined here and raised NameError.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = n_gram_set(input_list, ngram_length=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_features is the highest integer that could be found in the dataset.
    max_features = np.max(list(indice_token.keys())) + 1

    # Augmenting x_train and x_test with n-grams features
    # (`add_n_gram_value` is the helper defined in this notebook; `add_ngram`
    # was never defined).
    x_train = add_n_gram_value(x_train, token_indice, ngram_range)
    x_test = add_n_gram_value(x_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Loading data...
25000 train sequences
25000 test sequences
Average train sequence length: 238
Average test sequence length: 230
Pad sequences (samples x time)
x_train shape: (25000, 400)
x_test shape: (25000, 400)


In [22]:
# Load the manually labelled tweet CSV from a path relative to the notebook.
# Column names are supplied explicitly, so pandas treats the first line of the
# file as data rather than as a header row.
# index_col=2 uses the third parsed column as the row index.
# NOTE(review): the public Sentiment140 files are usually described as having
# six fields (including a date); confirm that these five names plus
# index_col=2 line the fields up as intended.
data = pd.read_csv("./trainingandtestdata/testdata.manual.2009.06.14.csv",
                   index_col=2,
                   names=['polarity', 'tweet_id', 'query', 'user', 'tweet_text'])

In [23]:
# Drop rows whose polarity is 2 (presumably the neutral class -- confirm
# against the dataset description), then integer-divide by 4 so that
# 0 stays 0 and 4 becomes 1.
# The explicit .copy() makes the filtered frame independent of the original,
# so the column assignment below does not trigger SettingWithCopyWarning.
# Caution: this cell is not idempotent -- running it a second time maps the
# 1 labels to 0 (1 // 4 == 0); reload `data` before re-running.
data = data[data['polarity'] != 2].copy()
data['polarity'] = data['polarity'] // 4

In [51]:
import re

In [52]:
def tokenize_usernames(tweet_text):
    """Replace every @-mention in ``tweet_text`` with the token ``USERNAME``.

    A mention is an ``@`` followed by a letter and at least one more letter,
    digit or underscore. Underscores are part of the handle, so
    ``@john_doe`` becomes ``USERNAME`` rather than ``USERNAME_doe``.
    The ``@`` must not directly follow a letter, digit, ``_``, ``-`` or ``.``,
    which keeps e-mail addresses such as ``bob@example.com`` untouched.

    Args:
        tweet_text: the tweet as a string.

    Returns:
        The text with each mention replaced by ``USERNAME``.
    """
    # Negative lookbehind: true at the start of the string or after any
    # character that is not a letter, digit, '_', '.' or '-'.
    return re.sub(r'(?<![A-Za-z0-9_.-])@[A-Za-z][A-Za-z0-9_]+', 'USERNAME', tweet_text)

def tokenize_hashtags(tweet_text):
    """Replace every #hashtag in ``tweet_text`` with the token ``HASHTAG``.

    A hashtag is a ``#`` followed by a letter and at least one more letter,
    digit or underscore. Underscores are part of the tag, so
    ``#machine_learning`` becomes ``HASHTAG`` rather than ``HASHTAG_learning``.
    The ``#`` must not directly follow a letter, digit, ``_``, ``-`` or ``.``,
    so fragments such as ``issue#42`` are left alone.

    Args:
        tweet_text: the tweet as a string.

    Returns:
        The text with each hashtag replaced by ``HASHTAG``.
    """
    # Negative lookbehind: true at the start of the string or after any
    # character that is not a letter, digit, '_', '.' or '-'.
    return re.sub(r'(?<![A-Za-z0-9_.-])#[A-Za-z][A-Za-z0-9_]+', 'HASHTAG', tweet_text)

In [69]:
# Normalise the tweet text in one pass over the column: @-mentions are
# replaced first, then #hashtags, each by its placeholder token.
data['tweet_text'] = (
    data['tweet_text']
    .apply(tokenize_usernames)
    .apply(tokenize_hashtags)
)

In [71]:
from keras.preprocessing.text import Tokenizer

In [83]:
# Small hand-written list of word tokens used to try out find_ngrams.
input_list = ['all', 'this', 'happened', 'more', 'or', 'less', 'this', 'dog']

def find_ngrams(input_list, n):
    """Return the distinct contiguous n-grams of ``input_list`` as a list.

    Args:
        input_list: a sliceable sequence of tokens. A plain string is treated
            as a sequence of characters.
        n: number of consecutive items per n-gram.

    Returns:
        A list of tuples of length ``n`` with duplicates removed. The order is
        whatever the intermediate set yields and should not be relied upon.
    """
    unique_grams = set()
    # Slide a window of width n by zipping n progressively shifted slices.
    unique_grams.update(zip(*(input_list[shift:] for shift in range(n))))
    return list(unique_grams)

In [84]:
# Print each distinct word bigram of input_list on its own line.
# find_ngrams de-duplicates through a set, so the printed order is arbitrary.
for gram in find_ngrams(input_list, 2):
    print(gram)

('this', 'happened')
('more', 'or')
('all', 'this')
('happened', 'more')
('or', 'less')
('less', 'this')
('this', 'dog')


In [85]:
# Rebinds input_list from the earlier list of words to a plain, un-split string.
input_list = 'cats and dogs and dogs and cats'

In [86]:
# With a str argument the helper slices and zips individual characters, so the
# result is the set of distinct character bigrams (spaces included), not word
# bigrams; split the string into words first to get word-level n-grams.
find_ngrams(input_list, 2)

[('a', 'n'),
 ('n', 'd'),
 ('a', 't'),
 ('d', ' '),
 ('o', 'g'),
 ('d', 'o'),
 ('s', ' '),
 ('g', 's'),
 ('c', 'a'),
 (' ', 'a'),
 (' ', 'd'),
 ('t', 's'),
 (' ', 'c')]

In [88]:
# Two small tokenised sentences, held as a list of token lists.
sequences = [['all', 'this', 'happened', 'more', 'or', 'less', 'this', 'dog'], ['all', 'this', 'happened', 'more', 'to', 'this', 'dog']]

In [89]:
# Append each sequence's distinct bigram tuples to that same list, in place.
# The bigrams are computed in full before the list is extended.
# Re-running this cell would also pair up the tuples added by the first run.
for seq in sequences:
    seq.extend(find_ngrams(seq, 2))