In [1]:
import sys
import os
from typing import Dict, List

import numpy as np
import tensorflow as tf
print(tf.__version__)

np.set_printoptions(threshold=sys.maxsize)
np.set_printoptions(linewidth=80) 

%load_ext line_profiler

2.3.0


In [None]:
# Padding token added around the corpus before indexing.
NIL = "<nil>"  # Lower letter as lower() will be applied.
# Number of words looked at on each side of a word.
# NOTE(review): input() makes "Restart & Run All" interactive; a plain constant
# would make the notebook reproducible without a prompt.
STRIDE = int(input("STRIDE size?"))
# Full window width: the word itself plus STRIDE words on either side.
CONTEXT_SIZE = 1 + (STRIDE * 2)

SPACE = ' '

# bool() of any non-empty string is True, so typing "False" or "0" also enables PTB.
USE_PTB = bool(input("USE_PTB? Just enter for False or any string for True"))
# True: index words with native_word_indexing(); False: use the Keras Tokenizer.
USE_NATIVE=True

In [None]:
# DEBUG picks debug_cooccurrence_matrix instead of create_cooccurrence_matrix
# in the vectorized-approach validation cell.
DEBUG = False
# VALIDATION gates the cells that build each matrix and assert that they agree.
VALIDATION = True

# Text to process

### Test sample text

In [None]:
#corpus = "The fool doth think he is wise, but the wise man knows himself to be a fool."
#corpus = "To be, or not to be, that is the question"
#corpus = "To to be be, or not not not not not to be, that is that the question that matters"
# Active sample text; it is used only when USE_PTB is False.
corpus = "To be, or not to be, that is the question that matters"
#corpus = "I know how to build an attention in neural networks. But I don’t understand how attention layers learn the weights that pay attention to some specific embedding. I have this question because I’m tackling a NLP task using attention layer. I believe it should be very easy to learn (the most important part is to learn alignments). However, my neural networks only achieve 50% test set accuracy. And the attention matrix is weird. I don’t know how to improve my networks."

## PTB (Penn Treebank) 

In [None]:
#coding: utf-8
import sys
import os
sys.path.append('..')
try:
    import urllib.request
except ImportError:
    raise ImportError('Use Python3!')
import pickle
import numpy as np


# Location of the raw PTB text files.
url_base = 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/'
# Split name -> raw text file name below url_base.
key_file = {
    'train':'ptb.train.txt',
    'test':'ptb.test.txt',
    'valid':'ptb.valid.txt'
}
# Split name -> cached word-id sequence saved with np.save().
save_file = {
    'train':'ptb.train.npy',
    'test':'ptb.test.npy',
    'valid':'ptb.valid.npy'
}
# Cached (word_to_id, id_to_word) pickle.
vocab_file = 'ptb.vocab.pkl'

# Directory holding the downloaded and cached files.
# Set the PTB_DATASET_DIR environment variable to relocate it; the default is
# the directory the notebook originally hard-coded.
dataset_dir = os.path.abspath(
    os.environ.get("PTB_DATASET_DIR", "/home/oonisim/dataset")
)
print(dataset_dir)

def _download(file_name):
    """Download one file from ``url_base`` into ``dataset_dir`` unless it already exists.

    Args:
        file_name: Name of the file below ``url_base``, also used as the local name.
    """
    from urllib.error import URLError

    file_path = os.path.join(dataset_dir, file_name)
    if os.path.exists(file_path):
        return

    # urlretrieve() does not create missing directories.
    os.makedirs(dataset_dir, exist_ok=True)

    print('Downloading ' + file_name + ' ... ')

    url = url_base + file_name
    try:
        urllib.request.urlretrieve(url, file_path)
    except URLError:
        # SECURITY NOTE(review): this retry skips TLS certificate verification.
        # The unverified context is passed to this one request only, instead of
        # replacing ssl._create_default_https_context for the whole process.
        import shutil
        import ssl
        unverified = ssl._create_unverified_context()
        with urllib.request.urlopen(url, context=unverified) as response, \
                open(file_path, 'wb') as target:
            shutil.copyfileobj(response, target)

    print('Done')


def load_text(data_type):
    """Return one PTB split as a single string.

    Args:
        data_type: Key of ``key_file`` ('train', 'test' or 'valid').
    Returns:
        The file content with every newline replaced by '<eos>' and the
        surrounding whitespace stripped.
    Raises:
        KeyError: if ``data_type`` is not a key of ``key_file``.
    """
    file_name = key_file[data_type]
    file_path = os.path.join(dataset_dir, file_name)

    # _download() returns immediately when the file is already present.
    _download(file_name)

    # Context manager so the handle is closed even if reading fails.
    with open(file_path) as text_file:
        return text_file.read().replace('\n', '<eos>').strip()
    
def load_vocab():
    """Load the PTB vocabulary, building and caching it on first use.

    Returns:
        (word_to_id, id_to_word): two dicts mapping word -> id and id -> word.
        Ids are assigned in order of first appearance in the training split.
    """
    vocab_path = os.path.join(dataset_dir, vocab_file)

    if os.path.exists(vocab_path):
        # NOTE(review): pickle.load() can execute code embedded in the file.
        # Only load a cache that this function wrote itself.
        with open(vocab_path, 'rb') as f:
            word_to_id, id_to_word = pickle.load(f)
        return word_to_id, id_to_word

    word_to_id = {}
    id_to_word = {}
    file_name = key_file['train']
    file_path = os.path.join(dataset_dir, file_name)

    _download(file_name)

    with open(file_path) as text_file:
        words = text_file.read().replace('\n', '<eos>').strip().split()

    for word in words:
        if word not in word_to_id:
            tmp_id = len(word_to_id)
            word_to_id[word] = tmp_id
            id_to_word[tmp_id] = word

    with open(vocab_path, 'wb') as f:
        pickle.dump((word_to_id, id_to_word), f)

    return word_to_id, id_to_word


def load_data(data_type='train'):
    '''Load one PTB split as a sequence of word ids.

        :param data_type: kind of data: 'train' or 'test' or 'valid' ('val' is accepted as an alias)
        :return: (sequence, word_to_id, id_to_word) where sequence is a numpy array of word ids
    '''
    if data_type == 'val': data_type = 'valid'
    save_path = os.path.join(dataset_dir, save_file[data_type])

    word_to_id, id_to_word = load_vocab()

    # Reuse the cached id sequence when a previous run saved it.
    if os.path.exists(save_path):
        sequence = np.load(save_path)
        return sequence, word_to_id, id_to_word

    file_name = key_file[data_type]
    file_path = os.path.join(dataset_dir, file_name)
    _download(file_name)

    with open(file_path) as text_file:
        words = text_file.read().replace('\n', '<eos>').strip().split()
    sequence = np.array([word_to_id[w] for w in words])

    np.save(save_path, sequence)
    return sequence, word_to_id, id_to_word



# Utilities

### Padding

To avoid boundary checks when iterating through the sequenced corpus, pad both ends of the source text with '<nil>'.
e.g. (when context is of size 5):    
From:
```
|B|X|Y|Z|...|P|Q|R|E|
```

To:
```
|<nil>|<nil>|B|X|Y|Z|...|P|Q|R|E|<nil>|<nil>| 
```

In [None]:
DELIMITER = " "
def pad(corpus:str) -> str:
    """Surround the corpus with STRIDE copies of the NIL word on each side.

    The NIL words and the corpus are joined with DELIMITER; the corpus itself
    is inserted unchanged as one piece.
    """
    assert corpus and len(corpus) > 0 and isinstance(corpus, str)

    margin = [ NIL ] * STRIDE
    pieces = [ *margin, corpus, *margin ]
    return DELIMITER.join(pieces)

#print("[{}]".format(pad("tako ika bin")))
# Build the expectation from STRIDE; a literal with two NIL words on each side
# only holds when STRIDE == 2.
assert pad("tako ika bin").split(DELIMITER) == [NIL] * STRIDE + ["tako", "ika", "bin"] + [NIL] * STRIDE

### co-occurrence check

In [None]:
def cooccurrence_words(co_occurrence_matrix, word, word_to_id, id_to_word):
    """List (co-occurring word, count) pairs for every column of the word's row."""
    row = co_occurrence_matrix[word_to_id[word]]
    pairs = []
    for column, count in enumerate(row):
        pairs.append((id_to_word[column], count))
    return pairs

def word_frequency(co_occurrence_matrix, word, word_to_id, context_size=None):
    """Number of times the word occurred in the sequence.

    Args:
        co_occurrence_matrix: matrix whose row index is the word id.
        word: word to look up.
        word_to_id: mapping from word to row index.
        context_size: context size the matrix was built with. Defaults to the
            notebook-level CONTEXT_SIZE.
    Returns:
        Row sum of the word divided by (context_size - 1).
    """
    if context_size is None:
        context_size = CONTEXT_SIZE

    # Each time the word occurs in the sequence, it sees (context_size - 1) words.
    # The value is now returned; without `return` the function always gave None.
    return co_occurrence_matrix[
        word_to_id[word]
    ].sum() / (context_size - 1)
    
def total_frequencies(co_occurrence_matrix, word_to_id):
    """Total word occurrences excluding NIL.

    Computed as (sum of all counts - sum of the NIL row) / (CONTEXT_SIZE - 1).
    """
    all_counts = co_occurrence_matrix.sum()
    nil_counts = co_occurrence_matrix[word_to_id[NIL]].sum()
    return (all_counts - nil_counts) / (CONTEXT_SIZE -1)
     

### Extract gapped slices

In [None]:
import numpy as np
def xslice(x, slices):
    """Extract multiple slices from an array-like and concatenate them.
    Args:
        x: array-like (numpy array or a sequence such as a list)
        slices: a slice, an integer index, or a tuple mixing both
            (e.g. ``np.s_[0:2, 5, 7:9]``)
    Return:
        Combined slices. A numpy array when x is a numpy array and slices is a
        tuple or a slice; a list for a tuple applied to other sequences and
        for a single integer index.
    """
    if isinstance(slices, tuple):
        if isinstance(x, np.ndarray):
            if not slices:
                # np.concatenate() rejects an empty list; return an empty selection.
                return x[:0]
            # An integer index gives a 0-d value that np.concatenate() rejects,
            # so promote each piece to at least 1-d first.
            return np.concatenate([np.atleast_1d(x[s]) for s in slices])

        # Linear-time accumulation; sum(..., []) re-copies the list for every piece.
        combined = []
        for s in slices:
            if isinstance(s, slice):
                combined.extend(x[s])
            else:
                combined.append(x[s])
        return combined

    if isinstance(slices, slice):
        return x[slices]

    return [x[slices]]

# Word indexing
Assign a numerical id to each word.

The row index of the co-occurrence matrix is a word index. The number of distinct words in the corpus can be smaller than the number of word indices, because meta-words such as OOV, UNK or NIL may be added to the original corpus.

Make sure **the co-occurrence matrix row index matches the word index**. If the two do not match, the difference must be adjusted for explicitly.

## Load the corpus text

In [None]:
# Pad the selected corpus with NIL words on both sides.
# NOTE(review): `corpus` is rebound to its padded form, so running this cell
# twice pads it twice; re-run the cell that defines `corpus` first.
if USE_PTB:
    corpus = pad(load_text('train'))
else:
    print("Original corpus: \n[{}]".format(corpus))
    corpus = pad(corpus)
    print("Padded corpus: \n[{}]".format(corpus))

## Word indexing

### Native word indexing

In [None]:
import re
def native_word_indexing(corpus):
    """Assign an integer id to each distinct whitespace-separated word.

    Args:
        corpus: A string including sentences to process.
    Returns:
        sequence:
            A numpy array of word indices, one per word of the corpus in the
            order the words appear. It preserves the original corpus as
            numerical indices.
        word_to_id: A dictionary to map a word to a word index
        id_to_word: A dictionary to map a word index to a word
        vocabulary_size: Number of distinct words identified in the corpus
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    # Leading/trailing whitespace makes re.split() emit '' tokens; they are
    # dropped so the empty string never becomes a vocabulary entry.
    words = [token for token in re.split(r'\s+', corpus) if token]

    word_to_id = {}
    id_to_word = {}

    # Ids start at 0 and follow the order of first appearance.
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word

    # Ids are contiguous from 0, so the size is simply the number of entries.
    vocabulary_size = len(word_to_id)

    # Explicit integer dtype keeps an empty corpus from yielding a float array.
    sequence = np.array([word_to_id[w] for w in words], dtype=np.int_)

    return sequence, word_to_id, id_to_word, vocabulary_size

### Native indexing

In [None]:
# Lower-case the corpus and replace each run of . , : ; with a single space.
corpus = re.sub('[.,:;]+', SPACE, corpus.lower())

In [None]:
if USE_NATIVE:
    (sequence, word_to_id, id_to_word, vocabulary_size) = native_word_indexing(corpus)

    # Report inside the branch: on a fresh kernel with USE_NATIVE False these
    # names do not exist yet and printing them would raise NameError.
    print(vocabulary_size)
    if not USE_PTB:
        print(word_to_id)

### Tensorflow Tokenizer indexing

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# The Tokenizer's default `filters` remove '<' and '>', which would turn the
# padding word "<nil>" into "nil" and break every later word_to_id[NIL] lookup.
# This is the default filter string without those two characters.
TOKENIZER_FILTERS = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
tokenizer = Tokenizer(oov_token="<OOV>", filters=TOKENIZER_FILTERS)

USE_TENSORFLOW = (not USE_NATIVE)
if USE_TENSORFLOW:
    # Each text in "texts" is a complete document as one string,
    # e.g "To be or not to be, that is the question."
    texts = [ corpus ]

    # fit_on_texts() processes multiple documents and handles all words in all the documents.
    tokenizer.fit_on_texts(texts)
    word_to_id = tokenizer.word_index
    # Reverse mapping; the later verification cells read id_to_word.
    id_to_word = tokenizer.index_word

    # texts_to_sequences() returns sequences, one sequence for each text in "texts".
    sequences = (tokenizer.texts_to_sequences(texts))
    # numpy array, the same type native_word_indexing() returns.
    sequence = np.array(sequences[0])

    print(len(sequences))
    print(len(word_to_id))

    # Index of tokenizer.word_index starts at 1, NOT 0.
    # e.g. {'<OOV>': 1, 'the': 2, 'fool': 3, 'wise': 4, 'doth': 5, ...}
    vocabulary_size = max(word_to_id.values()) + 1
    print(vocabulary_size)

## Verification

In [None]:
# Print the mappings and the indexed sequence of the sample corpus for inspection.
if not USE_PTB:
    print("word to id \n{}".format(word_to_id))
    print("id to word \n{}".format(id_to_word))
    print()
    print("corpus is \n[{}]".format(corpus))
    print("sequence is \n{}".format(sequence))
    print("corpus size is {} sequence size is {} expected sum is {}".format(
        # Raw string: '\s' in a normal literal is an invalid escape sequence.
        len(re.split(r'[\t\s]+', corpus)),
        len(sequence),
        (len(sequence) - (2*STRIDE)) * (2*STRIDE)  # Exclude NIL from the sequence
    ))
    #print([id_to_word[index] for index in sequence])
    print(np.array([id_to_word[index] for index in sequence]))

# Co-occurrence vector(s)

## DLFS2 iterative approach

### Original version

In [None]:
def dlfs2_create_co_matrix(sequence, vocabulary_size, context_size=3, nil_id=None):
    '''Generate co-occurrence matrix for the sequence.
    :param sequence: word index sequence of the corpus
    :param vocabulary_size: number of rows/columns of the matrix; every word index must be below it
    :param context_size:
        Odd window width. The number of words counted on either side of a word
        (window_size) is (context_size - 1) / 2.
    :param nil_id:
        Row index of the NIL padding word. Defaults to the notebook-level
        word_to_id[NIL.lower()], so existing calls keep working.
    :return: co-occurrence matrix (int32) whose NIL row is all zeros
    '''
    assert (context_size % 2) == 1

    sequence_size = len(sequence)
    co_matrix = np.zeros((vocabulary_size, vocabulary_size), dtype=np.int32)

    window_size = (context_size - 1) // 2
    for idx, word_id in enumerate(sequence):
        for i in range(1, window_size + 1):
            left_idx = idx - i
            right_idx = idx + i

            # Boundary checks: neighbours outside the sequence are ignored.
            if left_idx >= 0:
                left_word_id = sequence[left_idx]
                co_matrix[word_id, left_word_id] += 1

            if right_idx < sequence_size:
                right_word_id = sequence[right_idx]
                co_matrix[word_id, right_word_id] += 1

    # Fall back to the notebook globals only when the caller gave no id.
    if nil_id is None:
        nil_id = word_to_id[NIL.lower()]

    # Zero clear the co-occurrence words of NIL because NIL should not see other words.
    co_matrix[nil_id] = 0

    return co_matrix

In [None]:
#%%timeit -n 1 
# Build the reference matrix com0; the later validation cells compare against it.
if VALIDATION:
    com0 = dlfs2_create_co_matrix(sequence, vocabulary_size, CONTEXT_SIZE)
    print(com0.shape)
    
    if not USE_PTB:
        print(com0)
        print("com0.sum() {}".format(com0.sum()))

    # Total sum of all word occurrences except NIL must matches with the original corpus size.
    assert total_frequencies(com0, word_to_id) == len(sequence) - (CONTEXT_SIZE -1)
    assert com0.sum() == (len(sequence) - (2*STRIDE)) * (2*STRIDE)  # Exclude NIL from the sequence

In [None]:
#print(id_to_word[4])
#cooccurrence_words(com0, id_to_word[4], word_to_id, id_to_word)

In [None]:
%lprun \
    -T dlfs2_create_co_matrix.log \
    -f dlfs2_create_co_matrix \
    dlfs2_create_co_matrix(sequence, vocabulary_size, CONTEXT_SIZE)
    
print(open('dlfs2_create_co_matrix.log', 'r').read())

### Improved version
* With the NIL padded sequence, no need for the boundary checks e.g. left_idx >= 0.
* By limiting the position range to ```sequence[stride : ((n-1)-stride) +1]```, no need to zero-clear the co-occurrence words of NIL

In [None]:
def create_co_matrix(sequence, vocabulary_size, context_size=3):
    '''Generate co-occurrence matrix for a NIL-padded sequence.
    :param sequence: word index sequence, padded so every visited position has a full window
    :param vocabulary_size: number of rows/columns of the matrix
    :param context_size:
        Odd window width; (context_size - 1) / 2 words are counted on each side of a word.
    :return: co-occurrence matrix (int32)
    '''
    assert (context_size % 2) == 1

    total = len(sequence)
    counts = np.zeros((vocabulary_size, vocabulary_size), dtype=np.int32)

    half_window = int((context_size -1) / 2)
    # Only positions with a full window on both sides are visited, so no
    # boundary checks are needed inside the loop.
    centres = sequence[half_window : total - half_window]
    for shift, target in enumerate(centres):
        centre = half_window + shift
        for distance in range(1, half_window + 1):
            counts[target, sequence[centre - distance]] += 1
            counts[target, sequence[centre + distance]] += 1

    return counts

### Validation

In [None]:
#%%timeit -n 1 
# The boundary-check-free version must reproduce the reference matrix com0.
if VALIDATION:
    com1 = create_co_matrix(sequence, vocabulary_size, CONTEXT_SIZE)
    print(com1.shape)

    if not USE_PTB:
        print(com1)
        print("com1.sum() {}".format(com1.sum()))

    # Total sum of all word occurrences except NIL must matches with the original corpus size.
    assert total_frequencies(com1, word_to_id) == len(sequence) - (CONTEXT_SIZE -1)
    assert np.array_equal(com1, com0)

### Profiling

In [None]:
%lprun \
    -T create_co_matrix.log \
    -f create_co_matrix \
    create_co_matrix(sequence, vocabulary_size, CONTEXT_SIZE)

print(open('create_co_matrix.log', 'r').read())

## Re-implementation of the DLFS2 improved version

In [None]:
def simulate_create_co_matrix(sequence, co_occurrence_vector_size, context_size=3):
    """Count co-occurrences element by element, like the dlfs2 create_co_matrix.
    Args:
        sequence: word index sequence of the original corpus text
        co_occurrence_vector_size: number of rows/columns of the matrix
        context_size: context (N-gram size N) within to check co-occurrences.
    Returns:
        co_occurrence matrix
    """
    assert int(context_size %2) == 1

    length = len(sequence)
    shape = (co_occurrence_vector_size, co_occurrence_vector_size)
    matrix = np.zeros(shape, dtype=np.int32)

    half = int((context_size - 1)/2 )
    assert(length > half), "sequence_size {} is less than/equal to stride {}".format(
        length, half
    )

    # Profiling note kept from an earlier line_profiler run on this loop
    # (about 35 secs in total): the two element-wise "+= 1" updates below
    # accounted for roughly 32.6% and 32.0% of the time.
    for centre in range(half, length - half):
        row = sequence[centre]
        for step in range(1, half + 1):
            left_word = sequence[centre - step]
            matrix[row, left_word] += 1
            right_word = sequence[centre + step]
            matrix[row, right_word] += 1

    return matrix

### Validation

In [None]:
# The re-implementation must reproduce the reference matrix com0.
if VALIDATION:
    com2 = simulate_create_co_matrix(sequence, vocabulary_size, CONTEXT_SIZE)
    print(com2.shape)

    if not USE_PTB:
        print(com2)
        print("com2.sum() {}".format(com2.sum()))

    # Total sum of all word occurrences except NIL must matches with the original corpus size.
    assert total_frequencies(com2, word_to_id) == len(sequence) - (CONTEXT_SIZE -1)
    assert np.array_equal(com2, com0)

### Profiling

In [None]:
%lprun \
    -T simulate_create_co_matrix.log \
    -f simulate_create_co_matrix \
    simulate_create_co_matrix(sequence, vocabulary_size, CONTEXT_SIZE)

print(open('simulate_create_co_matrix.log', 'r').read())

## Vectorized approach

In [None]:
def debug(sequence, index, stride, flag=False):
    """Print the word at `index` and the words in its context window when `flag` is set.

    The window is clipped to the bounds of the sequence. Words are looked up
    in the notebook-level id_to_word mapping.
    """
    if flag:
        size = len(sequence)
        word = id_to_word[sequence[index]]
        first = max(0, (index-stride))
        last = min((index+stride) +1, size)
        context = [ id_to_word[i] for i in sequence[first : last]]
        print("word is {} and context is {}".format(word, context))

<img src="image/cooccurrence_matrix.png" align="left" width=1000 />

### Effect of padding with NIL to get F(w)
With NIL padding, the number of times F(w) that the word **w** occurred in the sequence can be read from the co-occurrence matrix.
<img src="image/co_occurrence_matrix_counting_with_nil.png" align="left" width=1000/>

In [None]:
def create_cooccurrence_matrix(sequence, co_occurrence_vector_size, context_size=3):
    """Build the co-occurrence matrix with one np.add.at call per position.

    Args:
        sequence: word index sequence of the original corpus text
        co_occurrence_vector_size: number of rows/columns of the matrix
        context_size: context (N-gram size N) within to check co-occurrences.
    Returns:
        co_occurrence matrix
    """
    assert int(context_size %2) == 1

    length = len(sequence)
    shape = (co_occurrence_vector_size, co_occurrence_vector_size)
    matrix = np.zeros(shape, dtype=np.int32)

    half = int((context_size - 1)/2 )
    assert(length > half), "sequence_size {} is less than/equal to stride {}".format(
        length, half
    )

    for centre in range(half, length - half):
        # The whole contiguous window is added in one call, centre word included.
        # np.add.at accumulates repeated indices, so a neighbour that appears
        # k times in the window gets +k:
        #   |X|X|W|X|X| -> +4 for X,   |X|X|W|Y|Y| -> +2 for X and +2 for Y.
        # An earlier line_profiler run attributed 61.5% of the time to this call.
        window = (
            sequence[centre],
            sequence[centre - half : centre + half + 1]
        )
        np.add.at(matrix, window, 1)

    # Each visited occurrence of a word also counted the word itself once, so a
    # row holds context_size counts per occurrence instead of (2 * half).
    # row sum / context_size is therefore the number of occurrences F(w), which
    # is removed from the diagonal in one step.
    self_counts = matrix.sum(axis=1) / context_size
    np.fill_diagonal(matrix, (matrix.diagonal() - self_counts))

    return matrix

### Debug version for trouble shooting

In [None]:
def debug_cooccurrence_matrix(sequence, co_occurrence_vector_size, context_size=3):
    """Build the matrix two ways in lock-step and stop at the first divergence.

    After every position the element-wise reference matrix is compared with
    the np.add.at based matrix. On a mismatch the state is printed and an
    AssertionError is raised.

    Args:
        sequence: word index sequence of the original corpus text
        co_occurrence_vector_size: number of rows/columns of the matrix
        context_size: context (N-gram size N) within to check co-occurrences.
    Returns:
        co_occurrence matrix (the np.add.at based one)
    Raises:
        AssertionError: when the two matrices differ after some position.
    """
    assert int(context_size %2) == 1

    n = len(sequence)
    shape = (co_occurrence_vector_size, co_occurrence_vector_size)
    co_occurrence_matrix = np.zeros(shape, dtype=np.int32)  # matrix under test
    co_matrix = np.zeros(shape, dtype=np.int32)              # element-wise reference

    stride = int((context_size - 1)/2 )
    assert(n > stride), "sequence_size {} is less than/equal to stride {}".format(
        n, stride
    )

    print("Starting comparison")
    for position in range(stride, (n-1) - stride +1):
        print(position)
        word_id = sequence[position]
        # Indices of all words in the window, including the word itself.
        window = sequence[position-stride : (position+stride) +1]

        # Reference: one increment per neighbour.
        for i in range(1, stride + 1):
            co_matrix[word_id, sequence[position - i]] += 1
            co_matrix[word_id, sequence[position + i]] += 1

        # --------------------------------------------------------------------------------
        # Under test: add the whole window at once. np.add.at accumulates
        # repeated indices. e.g. stride=2
        # |W|W|W|W|W| If co-occurrences are all same word W at the position, need +4 for W
        # |X|X|W|X|X| If co-occurrences are all same word X, need +4 for X
        # |X|X|W|Y|Y| If co-occurrences X x 2, Y x 2, then need +2 for X and Y respectively.
        # --------------------------------------------------------------------------------
        np.add.at(co_occurrence_matrix, (word_id, window), 1)
        # Remove the +1 self count of the word itself. This avoids the cost of
        # creating gapped indices that skip the centre position.
        co_occurrence_matrix[word_id, word_id] -=1

        if not np.array_equal(co_matrix, co_occurrence_matrix):
            print("sequence position is {}".format(position))
            # Index with (row, columns): a list mixing an int and an array is
            # not a valid index and would hide the real problem behind an
            # indexing error.
            print("Test matrix index \n{}\nmatrix {}\n".format(
                [word_id, window],
                co_matrix[word_id, window]
            ))
            print("co_occurrence_matrix index is \n{}\nmatrix is \n{}\n".format(
                [word_id, window],
                co_occurrence_matrix[word_id, window]
            ))
            debug(sequence, position, stride, True)
            print("diff \n{}".format(co_matrix - co_occurrence_matrix))

            # Explicit raise: `assert False` disappears under `python -O`.
            raise AssertionError(
                "co-occurrence matrices diverged at sequence position {}".format(position)
            )

    return co_occurrence_matrix

### Validation

In [None]:
#%%timeit -n 1 
# Pick the implementation to validate: the lock-step debug version or the normal one.
if DEBUG:
    f = debug_cooccurrence_matrix
else:
    f = create_cooccurrence_matrix

# The chosen implementation must reproduce the reference matrix com0.
if VALIDATION:
    com = f(sequence, vocabulary_size, CONTEXT_SIZE)
    print(com.shape)

    if not USE_PTB:
        print(com)
        print("com.sum() {}".format(com.sum()))

    assert total_frequencies(com, word_to_id) == len(sequence) - (CONTEXT_SIZE -1)
    assert np.array_equal(com0, com)

### Profiling

In [None]:
%lprun \
    -T create_cooccurrence_matrix.log \
    -f create_cooccurrence_matrix \
    create_cooccurrence_matrix(sequence, vocabulary_size, CONTEXT_SIZE)

print(open('create_cooccurrence_matrix.log', 'r').read())

# Research

In [None]:
def research_create_co_occurrence_matrix(sequence, co_occurrence_vector_size, context_size=3):
    """Scratch version for comparing ways to accumulate co-occurrence counts.

    The active variant adds the left and right neighbours with one np.add.at
    call per position, using xslice() to skip the centre word.

    Args:
        sequence: word index sequence of the original corpus text
        co_occurrence_vector_size: number of rows/columns of the matrix
        context_size: context (N-gram size N) within to check co-occurrences.
    Returns:
        co_occurrence matrix
    """
    assert int(context_size %2) == 1

    n = len(sequence)
    co_occurrence_matrix = np.zeros((co_occurrence_vector_size, co_occurrence_vector_size), dtype=np.int32)

    stride = int((context_size - 1)/2 )
    assert(n > stride), "sequence_size {} is less than/equal to stride {}".format(
        n, stride
    )

    # Whether the active variant also counts the centre word itself, so that
    # the diagonal must be corrected after the loop. The active variant
    # (np.add.at + xslice) skips the centre word, hence False.
    # It is bound before the loop: when the sequence is too short for the loop
    # body to run, the check after the loop would otherwise hit an unbound name.
    REQUIRE_REMOVE_SELF_COUNTING = False

    for position in range(stride, (n-1) - stride +1):
        # --------------------------------------------------------------------------------
        # A neighbour that appears several times in the window must be counted
        # that many times. e.g. stride=2
        # |W|W|W|W|W| If co-occurrences are all same word W at the position, need +4 for W
        # |X|X|W|X|X| If co-occurrences are all same word X, need +4 for X
        # |X|X|W|Y|Y| If co-occurrences X x 2, Y x 2, then need +2 for X and Y respectively.
        # --------------------------------------------------------------------------------

        # --------------------------------------------------------------------------------
        # Variants tried earlier, with line_profiler figures from those runs:
        #
        # (a) Updating a row vector element by element
        #     (co_occurrence_vector[index] += 1 for index in range(context_size)):
        #     super slow, approx 75% of execution time.
        #     Needs REQUIRE_REMOVE_SELF_COUNTING = True.
        #
        # (b) Element-wise "+= 1" for sequence[position - offset] and
        #     sequence[position + offset], offset in 1..stride:
        #     approx 35 secs; the two matrix updates took 32.6% and 32.0%.
        #     Needs REQUIRE_REMOVE_SELF_COUNTING = False.
        #
        # (c) ACTIVE: np.add.at with xslice() skipping the centre word.
        #     Total time: 22.0234 s; xslice() 36.1%, np.s_[...] 8.5%, np.add.at 31.1%.
        #     xslice() to create combined slices costs a lot.
        #
        # (d) np.add.at on the contiguous window
        #     sequence[position-stride : (position+stride) +1], centre included:
        #     np.add.at 61.5%, slicing 9.9%.
        #     Needs REQUIRE_REMOVE_SELF_COUNTING = True.
        #
        # (e) Removing the self count inside the loop with
        #     co_occurrence_matrix[sequence[position],sequence[position]] -=1:
        #     very slow, 33.5% of the time.
        # --------------------------------------------------------------------------------
        word_id = sequence[position]
        np.add.at(
            co_occurrence_matrix,
            (
                word_id,
                xslice(
                    sequence,
                    np.s_[position-stride: position, position+1 : position+stride +1]
                )
            ),
            1
        )

    # --------------------------------------------------------------------------------
    # Remove self counting of the word at position itself, once for the whole
    # matrix (1 hit, 0.9% of the time in the profiled run) instead of once per
    # position. Only for variants that count the centre word.
    # --------------------------------------------------------------------------------
    if REQUIRE_REMOVE_SELF_COUNTING:
        np.fill_diagonal(
            co_occurrence_matrix,
            (co_occurrence_matrix.diagonal() - co_occurrence_matrix.sum(axis=1) / context_size)
        )

    return co_occurrence_matrix

In [None]:
# The research variant must reproduce the reference matrix com0.
if VALIDATION:
    _matrix = research_create_co_occurrence_matrix(sequence, vocabulary_size, CONTEXT_SIZE)
    print(_matrix.shape)

    if not USE_PTB:
        print(_matrix)
        print("_matrix.sum() {}".format(_matrix.sum()))

    # Total sum of all word occurrences except NIL must matches with the original corpus size.
    assert total_frequencies(_matrix, word_to_id) == len(sequence) - (CONTEXT_SIZE -1)
    assert np.array_equal(_matrix, com0)

In [None]:
%lprun \
    -T research_create_co_occurrence_matrix.log \
    -f research_create_co_occurrence_matrix \
    research_create_co_occurrence_matrix(sequence, vocabulary_size, CONTEXT_SIZE)

print(open('research_create_co_occurrence_matrix.log', 'r').read())