# Problem 3: Don't predict, count!

Should only attempt after after class about word2vec (1/25).

Classical word-counting models still have some merit. They are easy to build and train and have surprisingly good results.

In this problem, you will build a context-counting model on the dataset of your choice. You can download the text8 dataset (downloaded to data/text8). The data is already processed, you can get tokens just by splitting the text at the spaces.

Steps to building your context-counting model:
1. Read in your data, build the vocabulary.
2. Build co-occurrence matrix.
3. Use SVD to reduce the dimensionality of your co-occurrence matrix to your embedding size. You should use tf.svd for this.

The resulting co-occurrence matrix is your embedding matrix, each row corresponds to the vector representation of one vector.

I advise you to keep your vocabulary under 10,000 words. You can choose the 9,999 most frequent tokens and replace any other tokens with UNK. The embedding size can be anywhere from 50 to 300.

In [2]:
from six.moves import urllib
import os
import zipfile
from collections import Counter
import tensorflow as tf
import numpy as np
from datetime import datetime

# Parameters for the model
VOCAB_SIZE = 1000
EMBED_SIZE = 150 # Between 50 and 300 is good.
SKIP_WINDOW = 1

# Parameters for downloading data
DOWNLOAD_URL = 'http://mattmahoney.net/dc/'
EXPECTED_BYTES = 31344016
DATA_FOLDER = 'data/'
FILE_NAME = 'text8.zip'

def download(file_name, expected_bytes):
    file_path = DATA_FOLDER + file_name
    if (os.path.exists(file_path)):
        print("Dataset ready")
        return file_path
    file_name, _ = urllib.request.urlretrieve(DOWNLOAD_URL + file_name, file_path)
    file_stat = os.stat(file_path)
    if (file_stat.st_size == expected_bytes):
        print("Successfully downloaded the file", file_name)
    else:
        raise Exception("File " + file_name + " might be corrupted. You should try downloading it with a browser.")
    return file_path

def read_data(file_path):
    with zipfile.ZipFile(file_path) as f:
        words = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return words

def build_vocab(words, vocab_size):
    dictionary = dict()
    count = [("UNK", -1)]
    count.extend(Counter(words).most_common(vocab_size - 1))
    index = 0
    with open("processed/vocab_1000.tsv", "w") as f:
        for word, _ in count:
            dictionary[word] = index
            if (index < 1000):
                f.write(word + "\n")
            index += 1
    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, index_dictionary

def convert_words_to_index(words, dictionary):
    return [dictionary[word] if word in dictionary else 0 for word in words]
    
def build_cooccurrence_matrix(dictionary, index_words, context_window_size):
    n = len(dictionary)
    mat = np.zeros((n, n))
    for center in index_words:
        for context in range(1, context_window_size+1):
            # process targets before the center word
            target = index_words[max(0, center - context)]
            mat[center, target] += 1
            # process targets after the center word
            target = index_words[min(center + context, len(index_words))]
            mat[center, target] += 1
    return mat

def process_data(vocab_size, skip_window):
    file_path = download(FILE_NAME, EXPECTED_BYTES)
    words = read_data(file_path)
    dictionary, index_dictionary = build_vocab(words, vocab_size)
    index_words = convert_words_to_index(words, dictionary)
    del words # to save memory
    print(index_words[:25])
    mat = build_cooccurrence_matrix(dictionary, index_words, 2)
    print(np.shape(mat))
    return mat

startTime = datetime.now()
mat = process_data(VOCAB_SIZE, SKIP_WINDOW)
print(np.shape(mat))
print("Processing data and constructing matrix took: {0}".format(datetime.now() - startTime))

Dataset ready
[0, 0, 12, 6, 195, 2, 0, 46, 59, 156, 128, 742, 477, 0, 134, 1, 0, 2, 1, 103, 855, 3, 1, 0, 0]
(1000, 1000)
(1000, 1000)
Processing data and constructing matrix took: 0:01:06.081463


In [None]:
def context_counting_model(mat, vocab_size, embed_size):
    print("1")
    cooccurrence_mat = tf.placeholder(tf.float32, shape = [vocab_size,
                                                          vocab_size],
                                     name = 'cooccurrence_matrix')
    print("2")
    s, u, v = tf.svd(cooccurrence_mat)
    print("3")
    smaller_mat = tf.matmul(u[:embed_size], tf.diag(s), b_is_sparse=True)
    print("4")
    another_mat = tf.matmul(smaller_mat, v[:, :embed_size])
    print("5")
    print("{0} x {0}".format(vocab_size, vocab_size))
    startTime = datetime.now()
    with tf.Session() as sess:
        another_mat.eval(feed_dict = {cooccurrence_mat: mat})
    print("Taking the svd took: {0}".format(datetime.now() - startTime))
    
context_counting_model(mat, VOCAB_SIZE, EMBED_SIZE)

1
2
3
4
5
1000 x 1000


## Warning: I debugged for hours and the issue was with Jupyter Notebook, not my Tensorflow model. Lesson: Don't run Tensorflow models in Jupyter notebook!