In [1]:
import os
import pickle
import random
import re

import numpy as np
import pandas as pd

### Path to a .csv with the tweets to learn the embeddings from

In [1]:
# CSV file with the raw tweets; cleanAllTweets reads its 'Tweet' column.
sourcePath = 'data/ALLtweets_2017.csv'

### Helper methods

#### log_progress is a helper method for displaying a progress bar

In [3]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while driving a notebook progress widget.

    Parameters
    ----------
    sequence : iterable
        The items to iterate over.
    every : int, optional
        The widget is refreshed on the first item and then on every
        ``every``-th item. When the size is known and ``every`` is omitted it
        defaults to 1 for at most 200 items and to ``int(size / 200)``
        otherwise.
    size : int, optional
        Total number of items. When omitted, ``len(sequence)`` is tried.
    name : str
        Prefix of the text label shown above the bar.

    Yields
    ------
    The items of ``sequence``, unchanged and in order.

    Raises
    ------
    AssertionError
        If the size cannot be determined and ``every`` was not given.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # True only when no size was passed AND the sequence has no len().
    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if unknown_length:
        # No total to measure against: show a full bar in the 'info' style.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    display(VBox(children=[label, progress]))

    def refresh(count):
        # Push the current position to the widgets.
        if unknown_length:
            label.value = '{name}: {index} / ?'.format(name=name, index=count)
        else:
            progress.value = count
            label.value = u'{name}: {index} / {size}'.format(
                name=name, index=count, size=size)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                refresh(index)
            yield record
    except BaseException:
        # Anything that interrupts the iteration turns the bar red, then propagates.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name, index=str(index or '?'))

def file_len(fname):
    """Return the number of lines in the text file ``fname``.

    An empty file yields 0. The previous implementation returned ``i + 1``
    after an ``enumerate`` loop, which left ``i`` unbound for an empty file
    and raised ``UnboundLocalError``.
    """
    with open(fname) as f:
        return sum(1 for _ in f)

In [4]:
def reduce_lengthening(text):
    '''
    Ad-hoc spell check for emphasised words.

    Any run of three or more identical characters is cut down to exactly two,
    regardless of the word it appears in. The rationale is that no Swedish word
    contains more than two identical characters in a row, so commonly stretched
    words such as "såååååå" all collapse to one spelling and end up in the
    vocabulary as a single entry.
    '''
    return re.sub(r"(.)\1{2,}", r"\1\1", text)

### Clean all tweets

In [None]:
def cleanAllTweets(sourcePath):
    """Read the 'Tweet' column of the CSV at ``sourcePath`` and clean every entry.

    The file is parsed with ',' as delimiter and '|' as quote character. Each
    tweet is passed through ``cleanTweet`` while ``log_progress`` displays the
    progress. Returns the cleaned tweets as a list, in file order.
    """
    frame = pd.read_csv(sourcePath, delimiter=',', quotechar='|')
    raw_tweets = frame['Tweet']
    return [cleanTweet(raw)
            for raw in log_progress(raw_tweets, size=len(raw_tweets))]

def cleanTweet(tweet):
    """Normalise one raw tweet into a lower-cased, space-separated token string.

    Steps, in order:
      1. URLs, mentions and hashtags become the placeholders <URL>, <ID>, <HASCHTAG>.
      2. Character runs of three or more are shortened (reduce_lengthening).
      3. '-' and '/' are padded with spaces; apostrophes and '&' are removed.
      4. Every character other than digits, a-z, A-Z, åäöÅÄÖ, space, '<' and '>'
         is replaced by a space.
      5. Tokens are lower-cased; a placeholder that repeats the token directly
         before it is dropped, and so is the token 'amp'.

    Each kept token is prefixed with a single space, so a non-empty result
    starts with a space; an input without tokens gives ''.
    """
    placeholder_rules = (
        (r"http\S+", " <URL>"),        # various URLs
        (r"@\S+", " <ID>"),            # mentions, anonymised
        (r"#\S+", " <HASCHTAG>"),      # hashtags, anonymised
    )
    text = tweet
    for pattern, replacement in placeholder_rules:
        text = re.sub(pattern, replacement, text)

    text = reduce_lengthening(text)

    # '-' and '/' are common in text, so make them stand alone before filtering.
    for separator in ("-", "/"):
        text = text.replace(separator, " " + separator + " ")

    text = re.sub('[\'&]+', '', text)
    # Remove everything that is not alphanumeric, a space or <>.
    text = re.sub('[^0-9a-zA-Z åäöÅÄÖ<>]+', ' ', text)

    collapsible = ('<id>', '<haschtag>', '<url>')
    kept = []
    previous = ''
    for token in text.split():
        token = token.lower()
        repeated_placeholder = token == previous and token in collapsible
        if token != 'amp' and not repeated_placeholder:
            kept.append(token)
        # Skipped tokens still count as "previous" for the next comparison.
        previous = token

    return ''.join(' ' + token for token in kept)

# Run the cleaning pipeline over the CSV at sourcePath and keep the cleaned tweets in memory.
cleaned_tweets = cleanAllTweets(sourcePath)

VBox(children=(HTML(value=''), IntProgress(value=0, max=13904701)))

In [None]:
def createVocabulary(tweets):
    """Count how often each whitespace-separated token occurs in ``tweets``.

    ``tweets`` is a sized collection of strings. Returns a plain dict mapping
    token -> number of occurrences, with keys in order of first appearance.
    """
    counts = {}
    for tweet in log_progress(tweets, size=len(tweets)):
        # split() without arguments already discards blanks and '\n'.
        for token in tweet.split():
            if token in counts:
                counts[token] += 1
            else:
                counts[token] = 1
    return counts
# Token -> occurrence count over all cleaned tweets.
vocabulary = createVocabulary(cleaned_tweets)

### Create the word2index and index2word mappers. These ensure that we can map a word to an index and vice versa

In [None]:
import collections

def createWordMapper(vocabulary, vocabSize):
    '''
    Build the word <-> index mappers for the most frequent words.

    Parameters
    ----------
    vocabulary : dict
        Maps word -> occurrence count.
    vocabSize : int
        Desired number of real words. If the vocabulary holds fewer distinct
        words, all of them are used (previously this raised IndexError, and
        a vocabSize of 0 failed on an unbound local).

    Returns
    -------
    (word2index, index2word) : two OrderedDicts
        Words are numbered 0, 1, 2, ... by decreasing frequency; words with the
        same count keep the order they have in ``vocabulary``. The entry
        '<UNK>' is always appended with the next free index.

    NB, once the mappers are created the original vocabulary is not needed to
    use them.
    '''
    # Most frequent first. sorted() is stable, also with reverse=True.
    sortedVocabulary = sorted(vocabulary.items(), key=lambda kv: kv[1], reverse=True)

    # Never ask for more words than actually exist (and never fewer than 0).
    limit = max(0, min(vocabSize, len(sortedVocabulary)))

    word2index = collections.OrderedDict()
    index2word = collections.OrderedDict()

    for word, _count in log_progress(sortedVocabulary[:limit], size=limit):
        index = len(word2index)
        word2index[word] = index
        index2word[index] = word

    # Derive the <UNK> slot from the mapper itself so it is valid even when
    # the loop above did not run.
    unk_index = len(word2index)
    word2index['<UNK>'] = unk_index
    index2word[unk_index] = '<UNK>'

    return word2index, index2word
    
# Number of rows in the embedding matrix: the most frequent words plus '<UNK>'.
# createWordMapper appends '<UNK>' itself, hence the "- 1" in the call below.
vocabularySize = 50000
word2index, index2word = createWordMapper(vocabulary, vocabularySize - 1)

# Number of entries actually mapped (including '<UNK>') versus the number of
# distinct tokens found in the cleaned tweets.
print('Size of vocabulary: ' + str(len(word2index)))
print('Maximum size of vocabulary: ' + str(len(vocabulary)))


### Word2Vec requires the training corpus to be one long document. [data] is one massive array containing the integer representation of all the tweets, stacked after each other.

In [None]:
# Translate every cleaned tweet into word indices and stack them into one long list.
# Words that are not in word2index are replaced by the index of '<UNK>'.
data = []

size = len(cleaned_tweets)
unk_index = word2index['<UNK>']

for tweet in log_progress(cleaned_tweets, size=size):
    # split() without an argument drops empty strings. cleanTweet returns text
    # with a leading space, so split(' ') produced an empty first token that
    # was then encoded as a spurious '<UNK>' at the start of every tweet.
    data.extend(word2index.get(word.lower(), unk_index) for word in tweet.split())

print('The first few words (represented by their integer) of all stacked tweets:')
data[:10]

In [None]:
# Total number of tokens in the stacked corpus, i.e. the length of `data`.
print('Amount of words in total in all tweets')
print(len(data))

In [None]:
# Limit tests to the first 10 million words.
data = data[:10000000]

In [None]:
# Start at the beginning of the document
data_index = 0

### Where to store the logs for the graph?

In [None]:
# Directory that receives the TensorBoard summaries and the model checkpoint.
# exist_ok=True creates it when missing and makes the cell safe to re-run.
log_dir = '/tf/data/twitter/logs'
os.makedirs(log_dir, exist_ok=True)

In [None]:

# Function to generate a training batch for the skip-gram model.

def generate_batch(batch_size, num_skips, skip_window):
    """Produce one skip-gram training batch from the module-level ``data`` list.

    A window of ``2 * skip_window + 1`` consecutive entries slides over
    ``data``, starting at the module-level cursor ``data_index``. For each
    window position the centre entry is the input and ``num_skips`` randomly
    chosen other entries of the window are the labels.

    Parameters
    ----------
    batch_size : int
        Number of (input, label) pairs; must be a multiple of ``num_skips``.
    num_skips : int
        Pairs generated per window position; at most ``2 * skip_window``.
    skip_window : int
        Number of entries considered on each side of the centre.

    Returns
    -------
    batch : int32 array of shape (batch_size,)
    labels : int32 array of shape (batch_size, 1)

    Side effect: ``data_index`` is advanced, wrapping around at the end of
    ``data``, so consecutive calls continue where the previous one stopped.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    # Fill the sliding window, restarting at 0 if it would run past the end.
    window = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    window.extend(data[data_index:data_index + span])
    data_index += span

    # Positions inside the window that may serve as context (all but the centre).
    context_positions = [pos for pos in range(span) if pos != skip_window]

    row = 0
    for _ in range(batch_size // num_skips):
        # Draw num_skips distinct context positions for this centre entry.
        for pos in random.sample(context_positions, num_skips):
            batch[row] = window[skip_window]
            labels[row, 0] = window[pos]
            row += 1
        # Slide the window one step; at the end of data start over from the front.
        if data_index == len(data):
            window.extend(data[0:span])
            data_index = span
        else:
            window.append(data[data_index])
            data_index += 1

    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

# Draw a small demo batch (batch_size = number of pairs generated per call)
# and print every pair as "input-id input-word -> label-id label-word".
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)

for i in range(8):
    target_id = batch[i]
    context_id = labels[i, 0]
    print(target_id, index2word.get(target_id), '->', context_id,
          index2word.get(context_id))


In [None]:
# Hyperparameters for building and training the skip-gram model.

batch_size = 128  # Number of (input, label) pairs fed per training step.
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.
num_sampled = 64  # Number of negative examples to sample.

# We pick a random validation set to sample nearest neighbors. Here we limit
# the validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# displaying model accuracy, they don't affect calculation.
# NOTE(review): no random seed is set in the visible cells, so the chosen
# validation words differ between runs.
valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)


In [None]:
len(data)

In [None]:
import tensorflow as tf
import math

In [None]:
# Dedicated graph; the model ops are added to it inside `graph.as_default()`.
graph = tf.Graph()

In [None]:
# Define the skip-gram model: inputs, embedding matrix, NCE loss, SGD step,
# a cosine-similarity op for the validation words, summaries, initializer and saver.
with graph.as_default():
    # Input data.
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        # The validation set consists of valid_size random integers drawn from the
        # first valid_window (i.e. top 100) word indices. The top_k nearest
        # neighbours are looked up for exactly these words.
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
        
    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        with tf.name_scope('embeddings'):
            # Create a matrix with vocabularySize rows and embedding_size columns,
            # initialised uniformly at random in [-1.0, 1.0].
            embeddings = tf.Variable(
                tf.random_uniform([vocabularySize, embedding_size], -1.0, 1.0))
            
            # From the embeddings matrix, grab the rows (embeddings) indexed by the train_inputs.
            # If train_inputs = [1, 4, 6], then grab rows [1, 4, 6] from the embeddings matrix.
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss
        with tf.name_scope('weights'):
            nce_weights = tf.Variable(
                tf.truncated_normal([vocabularySize, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))

        with tf.name_scope('biases'):
            nce_biases = tf.Variable(tf.zeros([vocabularySize]))
            
    # Compute the average NCE loss for the batch.
    # tf.nn.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    # Explanation of the meaning of NCE loss:
    #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(
                weights=nce_weights,
                biases=nce_biases,
                labels=train_labels,
                inputs=embed,
                num_sampled=num_sampled,
                num_classes=vocabularySize))
        
    # Add the loss value as a scalar to summary.
    tf.summary.scalar('loss', loss)

    # Construct the SGD optimizer using a learning rate of 0.5.
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Compute the cosine similarity between the validation words and all
    # embeddings: rows are scaled to unit L2 norm first, so the matrix product
    # below yields cosine similarities.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    
    # Grab the embeddings from normalized_embeddings with id/index/row specified in valid_dataset. 
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                              valid_dataset)
    # Compute similarity
    # [valid_size, embedding_size] * transpose([vocabularySize, embedding_size]) ~
    # ~ [valid_size, embedding_size] * [embedding_size, vocabularySize] -->
    # --> [valid_size, vocabularySize], i.e. [16, 50000] with this notebook's settings.
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)

    # Merge all summaries.
    merged = tf.summary.merge_all()

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # Create a saver.
    saver = tf.train.Saver()

In [None]:
# Number of full batches that fit into `data`, i.e. steps needed for one pass.
# NOTE(review): neither `epochs` nor `num_spest_per_epoch` is used by the
# training cell below, which runs a fixed `num_steps`. The name looks like a
# typo for "num_steps_per_epoch".
epochs = 5

num_spest_per_epoch = len(data) // batch_size
print(num_spest_per_epoch)

In [None]:
# Interactive session bound to `graph`; the training cell calls `init.run()`
# and `similarity.eval()` without passing a session explicitly.
session = tf.InteractiveSession(graph=graph)

In [None]:
# Step 5: Begin training.
# Runs num_steps SGD updates, writes a loss summary for every step to log_dir
# and periodically prints the average loss and the nearest neighbours of the
# validation words.
num_steps = 1500

# When not using a tf.InteractiveSession(), wrap this cell in:
#   with tf.Session(graph=graph) as session:
# Open a writer to write summaries.
# NOTE(review): the writer is not closed in any visible cell.
writer = tf.summary.FileWriter(log_dir, session.graph)

# We must initialize all variables before we use them.
init.run()
print('Initialized')

average_loss = 0
for step in range(num_steps):
    batch_inputs, batch_labels = generate_batch(batch_size, num_skips,
                                          skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

    # Define metadata variable.
    run_metadata = tf.RunMetadata()

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()
    # Also, evaluate the merged op to get all summaries from the returned
    # "summary" variable. Feed metadata variable to session for visualizing
    # the graph in TensorBoard.
    _, summary, loss_val = session.run([optimizer, merged, loss],
                                 feed_dict=feed_dict,
                                 run_metadata=run_metadata)
    average_loss += loss_val

    # Add returned summaries to writer in each step.
    writer.add_summary(summary, step)
    # Add metadata to visualize the graph for the last run.
    if step == (num_steps - 1):
        writer.add_run_metadata(run_metadata, 'step%d' % step)

    # NOTE(review): with num_steps = 1500 this condition is only true at
    # step 0, where the printed value is the loss of that single batch.
    if step % 10000 == 0:
        if step > 0:
            average_loss /= 10000
        # The average loss is an estimate of the loss over the last 10000
        # batches.
        print('Average loss at step ', step, ': ', average_loss)
        average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps)
    # NOTE(review): with num_steps = 1500 this also only runs at step 0.
    if step % 100000 == 0:
        sim = similarity.eval()
        for i in range(valid_size):
            valid_word = index2word.get(valid_examples[i])
            top_k = 8  # number of nearest neighbors
            # Sort by descending similarity; position 0 is skipped (presumably
            # the word itself, which has similarity 1 to itself).
            nearest = (-sim[i, :]).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for k in range(top_k):
                close_word = index2word.get(nearest[k])
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)


### Save embeddings and word2index mapper for use in train_eval-w2v.ipynb to train classifier

In [None]:
# Export the unit-length (normalized) embedding matrix and the word -> index
# mapper as pickles.
final_embeddings = normalized_embeddings.eval(session=session)

# Make sure the target directory exists, and use context managers so both
# files are flushed and closed (the handles were previously never closed).
os.makedirs("w2c-model", exist_ok=True)
with open("w2c-model/final_embeddings.pkl", "wb") as fh:
    pickle.dump(final_embeddings, fh)
with open("w2c-model/word2index.pkl", "wb") as fh:
    pickle.dump(word2index, fh)

In [None]:
final_embeddings[1]

In [None]:
# Write a checkpoint of the session's variables to log_dir, next to the summaries.
saver.save(session, os.path.join(log_dir, 'model.ckpt'))

In [None]:
# Close the session; ops can no longer be evaluated with it afterwards.
session.close()