In [6]:
import nltk
import string
import itertools
import os
import pickle
import itertools
import numpy as np
import gensim

In [2]:
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /home/march/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/march/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Métricas

In [3]:
def freq_one(word, tokens):
    'Frecuencia de una palabra en un texto (Church 1990)'
    return sum((1 for t in tokens if t == word)) / float(len(tokens))

In [4]:
def freq_two(first, second, tokens, win_size=5):
    '''
    Frecuencia de dos palabras en un texto (Church 1990)
    
    Aplica correccion de (w - 1) para compensar por múltiples conteos.
    
    O(|tokens| * win_size)
    '''
    count = 0
    for idx in range(len(tokens)):
        left = max(0, idx - win_size + 1)
        if tokens[idx] == second:
            for x in tokens[left:idx]:
                if x == first:
                    count += 1
    return count / (win_size - 1) / float(len(tokens))

In [5]:
def freq_two_fast(first_token_pos, second_token_pos, token_count, win_size=5):
    '''
    Equivalente a freq_two pero más veloz.
    
    Args:
        first_token_pos :: [int] lista de posiciones en el texto donde se encuentra el token a izq
        second_token_pos :: [int] lista de posiciones en el texto donde se encuentra el token a der
        token_count :: int cantidad te tokens en el texto
        
    O(#apariciones token 1 + #apariciones token 2)
    '''
    s_it = reversed(second_token_pos)
    f_it = reversed(first_token_pos)
    
    count = 0
    
    try:
        f_idx = next(f_it)
        s_idx = next(s_it)
        while True:
            while (f_idx > s_idx):
                f_idx = next(f_it)
            local_f_it, f_it = itertools.tee(f_it)
            local_f_idx = f_idx
            while (s_idx - local_f_idx < win_size):
                if (s_idx - local_f_idx > 0):
                    count += 1
                local_f_idx = next(local_f_it)
            s_idx = next(s_it)
    except StopIteration:
        return count / (win_size - 1) / token_count

In [6]:
def info(first, second, tokens, win_size=5):
    'Mutual information of two tokens (Church 1990)'
    f2 = freq_two(first, second, tokens, win_size)
    if f2 < 6:
        return None
    fa = freq_one(first, tokens)
    fb = freq_one(second, tokens)
    q = f2 / (fa * fb)
    return np.log2(q)

In [112]:
def q_fast(first_token_pos, second_token_pos, token_count, win_size=5):
    f2 = freq_two_fast(first_token_pos, second_token_pos, token_count, win_size)
    #if f2 < (6 / (win_size - 1) / token_count):
    #    return None
    if f2 < 1:
        return None
    fa = len(first_token_pos)
    fb = len(second_token_pos)
    q = f2 / (fa * fb)
    return q 

### Corpus

In [8]:
punc_translator = str.maketrans(dict(zip(string.punctuation, [None] * len(string.punctuation))))
def tokenize(string):
    'Returns list of tokens'
    # TODO: Cambiar split por: https://stackoverflow.com/questions/15547409/how-to-get-rid-of-punctuation-using-nltk-tokenizer
    words = string.split(' ')  # [Palabra]
    retoken = [ t.lower()
        for w in words
        for t in nltk.word_tokenize(w)
    ] 
    no_punt = [ t
        for t in retoken
        if t.translate(punc_translator) != ''
    ]
    return no_punt 

In [9]:
docno = []
corpus = []
toggle = 0
with open('ap/ap.txt','r') as f:
    for line in f:
        if line.startswith('<DOCNO>'):
            line_new = line.replace('<DOCNO> ','')
            line_new = line_new.replace(' </DOCNO>\n','')
            docno.append(line_new)
        if toggle:
            corpus.append(line)
            toggle = 0
        if line.startswith('<TEXT>'):
            toggle = 1        
docs = dict(zip(docno, corpus))      

#### Tests métricas

In [107]:
a = next(iter(docs.values()))

In [40]:
a_tokens = tokenize(a)

In [57]:
a_commons = [x[0] for x in Counter(sorted(a_tokens)).most_common(10)]

In [145]:
a_commons_pos = {
    token : [idx for idx, t in enumerate(a_tokens) if t == token]
    for token in a_commons
}

In [190]:
first = a_commons[0]
for j in range(10):
    second = a_commons[j]
    f = freq_two(first, second, a_tokens)
    f2 = freq_two_fast(a_commons_pos[first], a_commons_pos[second], len(a_tokens))
    print(first, second, f2 * len(a_tokens) * 4, f * len(a_tokens) * 4, f == f2)

the the 8.0 8.0 True
the a 3.0 3.0 True
the and 3.0 3.0 True
the said 4.0 4.0 True
the was 5.000000000000001 5.000000000000001 True
the of 7.0 7.0 True
the 's 7.0 7.0 True
the boy 9.0 9.0 True
the he 0.0 0.0 True
the school 6.0 6.0 True


###  Top 500

In [15]:
from collections import Counter
from collections import defaultdict

In [8]:
PICKLE_DIR = 'pickles'
EJ1_TOKENS_PATH = os.path.join(PICKLE_DIR, 'ej1_tokens.pkl')

In [9]:
if not os.path.isdir(PICKLE_DIR):
    os.mkdir(PICKLE_DIR)

In [10]:
if not os.path.isfile(EJ1_TOKENS_PATH):
    token_docs = {
        docno : tokenize(text)
        for docno, text in docs.items()
    }
    with open(EJ1_TOKENS_PATH, 'wb') as f:
        pickle.dump(token_docs, f)
else:
    with open(EJ1_TOKENS_PATH, 'rb') as f:
        token_docs = pickle.load(f)

In [11]:
all_tokens = [t for tokens in token_docs.values() for t in tokens]

In [12]:
top_tokens_counts = Counter(sorted(all_tokens)).most_common(500)

In [13]:
top_tokens = [x[0] for x in top_tokens_counts]

In [14]:
token_docs = {
    docno : tokens
    for docno, tokens in token_docs.items()
    if len(tokens) > 0
}

In [17]:
top_tokens_set = set(top_tokens)

In [75]:
# Armamos la matriz de co-frecuencias
win_size = 5
co_freq = defaultdict(lambda : defaultdict(int))
freq = defaultdict(int)

for tokens in token_docs.values():
    for i in range(len(tokens)):
        if tokens[i] in top_tokens_set:
            freq[tokens[i]] += 1
            for k in range(1, min(len(tokens) - i, win_size - 1)):
                if tokens[i + k] in top_tokens_set:
                    dists.append(i - i + k)
                    co_freq[tokens[i]][tokens[i+k]] += 1

In [76]:
# Ajustamos el valor por (win_size - 1)
for t1 in co_freq:
    for t2 in co_freq[t1]:
        co_freq[t1][t2] /= (win_size - 1)

In [77]:
# Calculamos N
N = sum([len(tokens) for tokens in token_docs.values()])

In [78]:
relations = {
    (t1, t2) : np.log2((co_freq[t1][t2] / N) / (freq[t1] * freq[t2] / (N * N)))
    for t1 in co_freq
    for t2 in co_freq[t1]
    if co_freq[t1][t2] * (win_size - 1) > 5
}

In [79]:
top_relations = sorted([r for r in relations.items() if r[0] != 0.0], 
                       key=lambda kv: kv[1], reverse=True)

In [80]:
top_relations[:15]

[(('prime', 'minister'), 8.6837530270268442),
 (('human', 'rights'), 8.5526180603043116),
 (('interest', 'rates'), 8.3645610699064417),
 (('south', 'africa'), 8.169449800993279),
 (('eastern', 'europe'), 7.9623518920134355),
 (('stock', 'exchange'), 7.6810756686074395),
 (('west', 'german'), 7.6212728580275799),
 (('district', 'judge'), 7.5351932278390263),
 (('news', 'conference'), 7.528537573654515),
 (('united', 'states'), 7.442257478147325),
 (('west', 'germany'), 7.3638379235443185),
 (('air', 'force'), 7.3537929912865181),
 (('told', 'reporters'), 7.3484801332298364),
 (('communist', 'party'), 7.3327744825370926),
 (('white', 'house'), 7.3271189120223053)]