In [129]:
import gensim
import itertools
import re
import nltk
import os
import pickle

#### Parseamos TASA

In [91]:
# Read the entire 'tasa' corpus file into memory as a single string.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used — confirm the file's actual encoding.
with open('tasa', 'r') as f:
    all_tasa = f.read()

In [144]:
# Split the corpus on blank lines and drop the empty chunks.
tasa_documents = [chunk for chunk in all_tasa.split('\n\n') if chunk]

In [103]:
def get_tas_doc_id(tasa_doc):
    """Return the bracketed id that opens a TASA document.

    The first line of ``tasa_doc`` must start with ``[<id>]``, where ``<id>``
    consists of ASCII letters, digits and dots (it may be empty).

    Args:
        tasa_doc: Raw text of a single document.

    Returns:
        The id string, without the surrounding brackets.

    Raises:
        ValueError: If the first line does not start with a bracketed id.
    """
    first_line = tasa_doc.split('\n', 1)[0]
    # Raw string: in a plain literal '\[' is an invalid escape sequence.
    header = re.match(r'\[([a-zA-Z0-9\.]*)\]', first_line)
    if header is None:
        raise ValueError(
            'TASA document does not start with a [doc_id] header: %r' % first_line
        )
    return header.group(1)

In [125]:
import string
# Deletion table: every character of string.punctuation maps to None.
punc_translator = str.maketrans('', '', string.punctuation)
def tokenize(string):
    """Return the list of lowercase tokens of a text, without punctuation-only tokens.

    The text is split with ``nltk.word_tokenize``. A token is kept when at
    least one character remains after deleting punctuation characters from
    its lowercase form; kept tokens are returned in lowercase, unmodified
    otherwise.
    """
    # The parameter name shadows the `string` module inside this function,
    # which is why the translation table is built at module level.
    kept = []
    for raw_token in nltk.word_tokenize(string):
        lowered = raw_token.lower()
        if lowered.translate(punc_translator):
            kept.append(lowered)
    return kept

In [109]:
def get_tasa_doc_tokens(tasa_doc):
    """Tokenize the body of a TASA document.

    Everything before the first ``[S]`` marker is discarded. Each remaining
    ``[S]``-delimited piece is trimmed and has its newlines removed; the
    pieces are joined with single spaces and handed to ``tokenize``.
    """
    # NOTE(review): inner newlines are deleted rather than replaced by a
    # space, so the words on either side of a line break are glued
    # together — confirm this is intended.
    pieces = []
    for segment in tasa_doc.split('[S]')[1:]:
        pieces.append(segment.strip().replace('\n', ''))
    return tokenize(' '.join(pieces))

In [177]:
# Location of the on-disk cache holding the tokenized corpus.
PICKLES_DIR = 'pickles'
EJ3_TASA_TOKENS_PATH = os.path.join(PICKLES_DIR, 'tasa_tokens.pkl')
os.makedirs(PICKLES_DIR, exist_ok=True)

if os.path.isfile(EJ3_TASA_TOKENS_PATH):
    # Cache hit: reuse the stored {doc_id: tokens} mapping. The cache is never
    # invalidated here; delete the pickle file to force a re-tokenization.
    with open(EJ3_TASA_TOKENS_PATH, 'rb') as f:
        tasa_tokens = pickle.load(f)
else:
    # Cache miss: tokenize every document and store the result.
    # A repeated doc id keeps only the last document's tokens.
    tasa_tokens = {
        get_tas_doc_id(tasa_doc): get_tasa_doc_tokens(tasa_doc)
        for tasa_doc in tasa_documents
    }
    with open(EJ3_TASA_TOKENS_PATH, 'wb') as f:
        pickle.dump(tasa_tokens, f)

#### Entrenamos LSI (Latent Semantic Indexing)

In [178]:
# Build the gensim vocabulary (token <-> integer id) from the token lists.
tasa_dict = gensim.corpora.Dictionary(documents=tasa_tokens.values())

In [179]:
# Bag-of-words vector of every document, keyed by document id.
tasa_corpus = dict(
    (tasa_id, tasa_dict.doc2bow(doc_tokens))
    for tasa_id, doc_tokens in tasa_tokens.items()
)

In [180]:
# Fit an LSI model with 300 topics on the bag-of-words corpus.
tasa_lsi = gensim.models.LsiModel(
    corpus=tasa_corpus.values(),
    id2word=tasa_dict,
    num_topics=300,
)

#### Parseamos WordSim

In [185]:
import pandas as pd
# Load the WordSim pairs. `DataFrame.from_csv` was deprecated and later removed
# from pandas; `read_csv` with no index column reads the same table.
ws = pd.read_csv('wordsim/combined.csv', index_col=None)

In [211]:
# Give the three columns short names (the frame is modified in place, so the
# CSV must contain exactly three columns).
ws.columns = ['w1', 'w2', 'human_dist']

In [289]:
# Preview the first rows of the renamed frame.
ws.head()

Unnamed: 0,w1,w2,human_dist
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.0
3,book,paper,7.46
4,computer,keyboard,7.62


In [232]:
from scipy.spatial.distance import cosine

In [405]:
def tasa_dist(word_1, word_2, tasa_dict=tasa_dict, tasa_lsi=tasa_lsi):
    """Score how similar two words are in LSI space, on a 0-10 scale.

    Each word is looked up in ``tasa_dict``, projected into the topic space
    of ``tasa_lsi``, and the cosine similarity of the two projections is
    rescaled linearly from [-1, 1] to [0, 10]. Despite the name, a larger
    value means the words are more similar.

    Args:
        word_1: First word.
        word_2: Second word.
        tasa_dict: Dictionary used to turn a word into a bag-of-words vector.
            The default is bound when the function is defined.
        tasa_lsi: LSI model used for the projection. The default is bound
            when the function is defined.

    Returns:
        The score as a float, or ``None`` when either word is not in the
        dictionary or projects to an all-zero vector (cosine undefined).
    """
    def _project(word):
        """Return the dense topic vector of ``word``, or None if unusable."""
        bow = tasa_dict.doc2bow([word])
        if not bow:
            # The corpus vocabulary is lowercased by `tokenize`, so a
            # capitalised word can only match in its lowercase form.
            bow = tasa_dict.doc2bow([word.lower()])
        if not bow:
            return None
        # Size the vector from the model instead of a hard-coded 300.
        dense = gensim.matutils.sparse2full(tasa_lsi[bow], tasa_lsi.num_topics)
        return dense if dense.any() else None

    vec_1 = _project(word_1)
    vec_2 = _project(word_2)
    if vec_1 is None or vec_2 is None:
        return None
    # scipy's `cosine` is a distance; 1 - distance is the similarity.
    return ((1 + (1 - cosine(vec_1, vec_2))) / 2) * 10

In [406]:
# LSI score of every word pair; pairs for which tasa_dist returns None show up
# as missing values in the column.
ws['lsi'] = [tasa_dist(first, second) for first, second in zip(ws['w1'], ws['w2'])]

In [407]:
# Show the human ratings next to the LSI scores.
# NOTE(review): this renders the whole frame; `ws.head(10)` would keep the
# output short.
ws

Unnamed: 0,w1,w2,human_dist,lsi
0,love,sex,6.77,5.981308
1,tiger,cat,7.35,5.346739
2,tiger,tiger,10.00,10.000000
3,book,paper,7.46,5.132282
4,computer,keyboard,7.62,9.404235
5,computer,internet,7.58,
6,plane,car,5.77,4.766024
7,train,car,6.31,7.285066
8,telephone,communication,7.50,6.127763
9,television,radio,6.77,7.504021
