In [1]:
import sys
import os
import re
from typing import Dict, List

import numpy as np
import tensorflow as tf
print(tf.__version__)

np.set_printoptions(threshold=sys.maxsize)
np.set_printoptions(linewidth=400) 

%load_ext line_profiler

2.3.0


# Constants

In [2]:
from preprocessing import (
    DELIMITER,
    SPACE,
    NIL
)

STRIDE = 2
CONTEXT_SIZE = 1 + (STRIDE * 2)

USE_PTB = False
USE_NATIVE=True

In [3]:
DEBUG = False
VALIDATION = True

# Text to process

### Test sample text

In [4]:
#corpus = "The fool doth think he is wise, but the wise man knows himself to be a fool."
#corpus = "To be, or not to be, that is the question"
#corpus = "To to be be, or not not not not not to be, that is that the question that matters"
corpus = "To be, or not to be, that is the question that matters"
#corpus = "You say goodbye and I say hello ."
#corpus = "I know how to build an attention in neural networks. But I don’t understand how attention layers learn the weights that pay attention to some specific embedding. I have this question because I’m tackling a NLP task using attention layer. I believe it should be very easy to learn (the most important part is to learn alignments). However, my neural networks only achieve 50% test set accuracy. And the attention matrix is weird. I don’t know how to improve my networks."

## PTB (Penn Treebank) 

In [5]:
from data.ptb import load_data

/home/oonisim/dataset


# Utilities

### Padding

To avoid boundary checking when iterate through the sequenced corpus, pad the source text with '<nil>'.
e.g. (when context is of size 5):    
From:
```
|B|X|Y|Z|...|P|Q|R|E|
```

To:
```
|<nil>|<nil>|B|X|Y|Z|...|P|Q|R|E|<nil>|<nil>| 
```

In [6]:
from preprocessing.text import (
    PAD_MODE_SQUEEZE,
    pad_text
)

# Word indexing
Assign a numerical id to each word.

The row index of co-occurrence matrix is a word index. The number of words in the corpus can be less than the number of word indices because additional meta-word such as OOV, UNK, NIL can be added to the original corpus.

Make sure **the co-occurrence matrix row index matches with the word index**, unless explicitly adjust when row-index and word-index do not match.

## Load and pad the corpus text

In [7]:
if USE_PTB:
    corpus = pad_text(
        corpus=load_text('train'), mode=PAD_MODE_SQUEEZE, delimiter=SPACE, padding=NIL, length=STRIDE
    )
else:
    print("Original corpus: \n[{}]".format(corpus))
    corpus = pad_text(
        corpus=corpus, mode=PAD_MODE_SQUEEZE, delimiter=SPACE, padding=NIL, length=STRIDE
    )
    print("Padded corpus: \n[{}]".format(corpus))

Original corpus: 
[To be, or not to be, that is the question that matters]
Padded corpus: 
[<nil> <nil> To be, or not to be, that is the question that matters <nil> <nil>]


### Effect of padding with NIL to get F((w)
Be able get the number of times when the word **w** occurred in the sequence from the co occurrence matrix.
<img src="image/co_occurrence_matrix_counting_with_nil.png" align="left" width=1000/>

## Word indexing

### Native word indexing

In [8]:
from preprocessing.text import (
    text_to_sequence
)

### Native indexing

In [9]:
corpus = re.sub(r'[.,:;]+', SPACE, corpus.lower())

In [10]:
if USE_NATIVE:
    (sequence, word_to_id, id_to_word, vocabulary_size) = text_to_sequence(corpus)

print(vocabulary_size)
if not USE_PTB:
    print(word_to_id)

10
{'<nil>': 0, 'to': 1, 'be': 2, 'or': 3, 'not': 4, 'that': 5, 'is': 6, 'the': 7, 'question': 8, 'matters': 9}


### Tensorflow Tokenizer indexing

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token="<OOV>")

USE_TENSORFLOW = (not USE_NATIVE)
if USE_TENSORFLOW:
    # Each text in "texts" is a complete document as one string, 
    # e.g "To be or not to be, that is the question."
    texts = [ corpus ]   

    # fit_on_texts() processes multiple documents and handles all words in all the documents.
    tokenizer.fit_on_texts(texts)
    word_to_id = tokenizer.word_index

    # texts_to_sequences() ruturns sequences, one sequence for each text in "texts".
    sequences = (tokenizer.texts_to_sequences(texts))
    sequence = sequences[0]

    print(len(sequences))
    print(len(word_to_id))
    
    # Index of tokenizer.word_index starts at 1, NOT 0.
    # e.g. {'<OOV>': 1, 'the': 2, 'fool': 3, 'wise': 4, 'doth': 5, ...}
    vocabulary_size = max(word_to_id.values()) + 1
    print(vocabulary_size)

## Verification

In [12]:
if not USE_PTB:
    print("word to id \n{}".format(word_to_id))
    print("id to word \n{}".format(id_to_word))
    print()
    print("corpus is \n[{}]".format(corpus))
    print("sequence is \n{}".format(sequence))
    print("corpus size is {} sequence size is {} expected sum is {}".format(
        len(re.compile('[\t\s]+').split(corpus)), 
        len(sequence), 
        (len(sequence) - (2*STRIDE)) * (2*STRIDE)  # Exclude NIL from the sequence
    ))
    #print([id_to_word[index] for index in sequence])
    print(np.array([id_to_word[index] for index in sequence]))

word to id 
{'<nil>': 0, 'to': 1, 'be': 2, 'or': 3, 'not': 4, 'that': 5, 'is': 6, 'the': 7, 'question': 8, 'matters': 9}
id to word 
{0: '<nil>', 1: 'to', 2: 'be', 3: 'or', 4: 'not', 5: 'that', 6: 'is', 7: 'the', 8: 'question', 9: 'matters'}

corpus is 
[<nil> <nil> to be  or not to be  that is the question that matters <nil> <nil>]
sequence is 
[0 0 1 2 3 4 1 2 5 6 7 8 5 9 0 0]
corpus size is 16 sequence size is 16 expected sum is 48
['<nil>' '<nil>' 'to' 'be' 'or' 'not' 'to' 'be' 'that' 'is' 'the' 'question' 'that' 'matters' '<nil>' '<nil>']


# Co-occurrence Matrix

<img src="image/cooccurrence_matrix.png" align="left" width=1000 />

In [13]:
from np.analytics import (
    create_cooccurrence_matrix,
    cooccurrence_words,
    word_frequency,
    total_frequencies
)

In [14]:
co_occurrece_matrix = create_cooccurrence_matrix(sequence, vocabulary_size, CONTEXT_SIZE)

if VALIDATION:
    print(co_occurrece_matrix.shape)

    if not USE_PTB:
        print(co_occurrece_matrix)
        print(".sum() {}".format(co_occurrece_matrix.sum()))

    assert total_frequencies(co_occurrece_matrix, word_to_id, CONTEXT_SIZE, NIL) == len(sequence) - (CONTEXT_SIZE -1)
    assert np.array_equal(co_occurrece_matrix, co_occurrece_matrix)

co_occurrece_matrix[1::, 1::]

(10, 10)
[[0 0 0 0 0 0 0 0 0 0]
 [2 0 2 2 1 1 0 0 0 0]
 [1 2 0 1 2 1 1 0 0 0]
 [0 2 1 0 1 0 0 0 0 0]
 [0 1 2 1 0 0 0 0 0 0]
 [1 1 1 0 0 0 1 2 1 1]
 [0 0 1 0 0 1 0 1 1 0]
 [0 0 0 0 0 2 1 0 1 0]
 [0 0 0 0 0 1 1 1 0 1]
 [2 0 0 0 0 1 0 0 1 0]]
.sum() 48


array([[0, 2, 2, 1, 1, 0, 0, 0, 0],
       [2, 0, 1, 2, 1, 1, 0, 0, 0],
       [2, 1, 0, 1, 0, 0, 0, 0, 0],
       [1, 2, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 1, 2, 1, 1],
       [0, 1, 0, 0, 1, 0, 1, 1, 0],
       [0, 0, 0, 0, 2, 1, 0, 1, 0],
       [0, 0, 0, 0, 1, 1, 1, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 1, 0]], dtype=int32)

# PPMI

##  Reference implementation from DSFS 2.

In [15]:
def dsfs2_ppmi(C, verbose=False, eps = 1e-8):
    '''PPMI（正の相互情報量）の作成

    :param C: 共起行列
    :param verbose: 進行状況を出力するかどうか
    :return:
    '''
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)
    S = np.sum(C, axis=0)
    total = C.shape[0] * C.shape[1]
    cnt = 0

    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[j]*S[i]) + eps)
            M[i, j] = max(0, pmi)

    return M

In [16]:
#%%timeit n=1000
M = dsfs2_ppmi(co_occurrece_matrix)[1::, 1::]

In [17]:
print(M)

[[0.         1.1926451  2.         1.         0.19264509 0.         0.         0.         0.        ]
 [1.1926451  0.         0.77760756 1.7776076  0.         0.77760756 0.         0.         0.        ]
 [2.         0.77760756 0.         1.5849625  0.         0.         0.         0.         0.        ]
 [1.         1.7776076  1.5849625  0.         0.         0.         0.         0.         0.        ]
 [0.19264509 0.         0.         0.         0.         0.77760756 1.7776076  0.77760756 1.7776076 ]
 [0.         0.77760756 0.         0.         0.77760756 0.         1.5849625  1.5849625  0.        ]
 [0.         0.         0.         0.         1.7776076  1.5849625  0.         1.5849625  0.        ]
 [0.         0.         0.         0.         0.77760756 1.5849625  1.5849625  0.         2.5849626 ]
 [0.         0.         0.         0.         1.7776076  0.         0.         2.5849626  0.        ]]


## Own implementation

## PMI

<img src="image/pmi_from_co_occurrence_matrix.png" width=650 align="left"/>

In [18]:
from np.analytics import (
    pmi,
    ppmi
)

In [19]:
#%%timeit n=1000
m = ppmi(co_occurrece_matrix)[1::, 1::]

In [20]:
print(m)

[[0.         0.58496199 1.58496147 0.58496147 0.         0.         0.         0.         0.        ]
 [0.58496199 0.         0.58496147 1.58496147 0.         0.58496147 0.         0.         0.        ]
 [1.58496147 0.58496147 0.         1.58496043 0.         0.         0.         0.         0.        ]
 [0.58496147 1.58496147 1.58496043 0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.58496147 1.58496147 0.58496147 0.58496147]
 [0.         0.58496147 0.         0.         0.58496147 0.         1.58496043 1.58496043 0.        ]
 [0.         0.         0.         0.         1.58496147 1.58496043 0.         1.58496043 0.        ]
 [0.         0.         0.         0.         0.58496147 1.58496043 1.58496043 0.         1.58496043]
 [0.         0.         0.         0.         0.58496147 0.         0.         1.58496043 0.        ]]


In [21]:
(M - m).astype(int)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 1, 0]])

In [22]:
from np.analytics import (
    create_context_set
)

contexts, labels = create_context_set(sequence, CONTEXT_SIZE)