In [1]:
import gensim
import itertools
import re
import nltk
import os
import pickle

In [2]:
import numpy as np

#### Parseamos TASA

In [3]:
# Load the whole TASA corpus dump into memory as one string.
with open('tasa', 'r') as tasa_file:
    all_tasa = tasa_file.read()

In [4]:
# Documents are separated by a blank line. A fragment made only of whitespace
# (e.g. a stray newline at the end of the file) has no "[id]" header and would
# make get_tas_doc_id fail, so such fragments are skipped as well as empty ones.
tasa_documents = [doc for doc in all_tasa.split('\n\n') if doc.strip()]

In [5]:
def get_tas_doc_id(tasa_doc):
    """Return the id found at the start of a TASA document's first line.

    The first line must begin with the id wrapped in square brackets; the id
    may contain ASCII letters, digits and dots (it may also be empty).

    Args:
        tasa_doc: raw text of a single TASA document.

    Returns:
        The text between the leading square brackets.

    Raises:
        ValueError: if the first line does not start with a bracketed id.
    """
    first_line = tasa_doc.split('\n', 1)[0]
    # Raw string: '\[' and '\.' are not valid escapes in a regular literal.
    match = re.match(r'\[([a-zA-Z0-9\.]*)\]', first_line)
    if match is None:
        raise ValueError('TASA document has no [id] header: %r' % first_line)
    return match.group(1)

In [6]:
import string

# Translation table that deletes every character in string.punctuation.
punc_translator = str.maketrans('', '', string.punctuation)


def tokenize(string):
    """Return the lowercase tokens of a text, without punctuation-only tokens.

    The text is split with nltk.word_tokenize and every token is lowercased.
    A token is dropped when nothing is left of it after deleting all
    string.punctuation characters; the tokens that are kept are returned
    unchanged (any punctuation inside them is preserved).

    Note: the parameter name shadows the `string` module inside this function.
    """
    # TODO: consider replacing this filter with a punctuation-aware tokenizer:
    # https://stackoverflow.com/questions/15547409/how-to-get-rid-of-punctuation-using-nltk-tokenizer
    lowered = (token.lower() for token in nltk.word_tokenize(string))
    return [token for token in lowered if token.translate(punc_translator)]

In [7]:
def get_tasa_doc_tokens(tasa_doc):
    """Tokenize the body of one TASA document.

    Everything before the first '[S]' marker is discarded. Each remaining
    piece is stripped, has its newlines deleted, and the pieces are joined
    with single spaces before being handed to tokenize().
    """
    pieces = tasa_doc.split('[S]')[1:]
    # NOTE(review): inner newlines are deleted rather than replaced by a
    # space, so words separated only by a line break end up glued together --
    # confirm that a piece never wraps across lines.
    cleaned = (piece.strip().replace('\n', '') for piece in pieces)
    return tokenize(' '.join(cleaned))

In [8]:
# Directory holding every cached artifact of this notebook.
PICKLES_DIR = 'pickles'
EJ3_TASA_TOKENS_PATH = os.path.join(PICKLES_DIR, 'tasa_tokens.pkl')
os.makedirs(PICKLES_DIR, exist_ok=True)

if os.path.isfile(EJ3_TASA_TOKENS_PATH):
    # Cache hit: reuse the tokens computed by a previous run.
    with open(EJ3_TASA_TOKENS_PATH, 'rb') as f:
        tasa_tokens = pickle.load(f)
else:
    # Cache miss: tokenize every document, keyed by its document id, and
    # store the resulting mapping for later runs.
    tasa_tokens = {}
    for tasa_doc in tasa_documents:
        doc_id = get_tas_doc_id(tasa_doc)
        tasa_tokens[doc_id] = get_tasa_doc_tokens(tasa_doc)

    with open(EJ3_TASA_TOKENS_PATH, 'wb') as f:
        pickle.dump(tasa_tokens, f)

#### Entrenamos LSI (Latent Semantic Indexing)

In [9]:
# File the trained LSI model is saved to and loaded from.
TASA_LSI_PATH = os.path.join(PICKLES_DIR, 'tasa_lsi.lsi')

In [10]:
# Train the LSI model once and cache it on disk; later runs load the cache.
# Both branches must define `tasa_lsi` AND `tasa_dict`, because later cells
# use the dictionary to turn words into bag-of-words vectors.
if not os.path.isfile(TASA_LSI_PATH):
    # Token <-> integer id mapping built from every tokenized document.
    tasa_dict = gensim.corpora.Dictionary(tasa_tokens.values())

    # Bag-of-words representation of each document, keyed by document id.
    tasa_corpus = {
        doc_id: tasa_dict.doc2bow(tokens)
        for doc_id, tokens in tasa_tokens.items()
    }

    tasa_lsi = gensim.models.lsimodel.LsiModel(corpus=tasa_corpus.values(),
                                               id2word=tasa_dict,
                                               num_topics=300)

    tasa_lsi.save(TASA_LSI_PATH)
else:
    tasa_lsi = gensim.models.LsiModel.load(TASA_LSI_PATH)
    # Reuse the dictionary stored with the model (the `id2word` it was
    # trained with) so the token ids are guaranteed to match the model.
    tasa_dict = tasa_lsi.id2word

#### Parseamos WordSim

In [11]:
import pandas as pd
# `DataFrame.from_csv` is deprecated (and removed in pandas 1.0); `read_csv`
# is the supported reader. With index_col=None no column is used as index,
# so the frame gets a default integer index, as before.
ws = pd.read_csv('wordsim/combined.csv', index_col=None)

In [12]:
# Rename the three columns positionally: first word, second word, human score.
# NOTE(review): despite its name, `human_dist` looks like a similarity score
# (the identical pair tiger/tiger is rated 10.0) -- confirm.
ws.columns = ['w1', 'w2', 'human_dist']

In [13]:
# Preview the first rows to check the renamed columns.
ws.head()

Unnamed: 0,w1,w2,human_dist
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.0
3,book,paper,7.46
4,computer,keyboard,7.62


In [14]:
from scipy.spatial.distance import cosine

In [15]:
def tasa_dist(word_1, word_2, tasa_dict=tasa_dict, tasa_lsi=tasa_lsi):
    """Score how related two words are in LSI space, on a 0-10 scale.

    Each word is treated as a one-word document, projected with the LSI model,
    and the cosine similarity of the two projections (in [-1, 1]) is mapped
    linearly onto [0, 10].

    Args:
        word_1, word_2: the words to compare. They are looked up verbatim, so
            a word whose exact form is not in `tasa_dict` counts as unknown.
        tasa_dict: dictionary used to turn a word into a bag-of-words vector.
        tasa_lsi: LSI model used for the projection.
            Both defaults are bound when this function is defined, so
            `tasa_dict` and `tasa_lsi` must already exist at that point.

    Returns:
        A float in [0, 10], or None when either word is missing from the
        dictionary or has an all-zero projection (the cosine is undefined).
    """
    bow_1 = tasa_dict.doc2bow([word_1])
    bow_2 = tasa_dict.doc2bow([word_2])
    if not bow_1 or not bow_2:
        return None

    # Size the dense vectors from the model instead of hard-coding 300.
    num_topics = tasa_lsi.num_topics
    vec_1 = gensim.matutils.corpus2dense([tasa_lsi[bow_1]], num_topics).reshape(-1)
    vec_2 = gensim.matutils.corpus2dense([tasa_lsi[bow_2]], num_topics).reshape(-1)

    # The cosine of a zero vector is undefined; report it like an unknown word
    # instead of propagating NaN.
    if not vec_1.any() or not vec_2.any():
        return None

    # scipy's `cosine` is a distance, so 1 - distance is the similarity.
    similarity = 1 - cosine(vec_1, vec_2)
    return ((1 + similarity) / 2) * 10

In [16]:
# LSI score of every word pair; tasa_dist returns None for pairs it cannot score.
ws['lsi'] = [tasa_dist(w1, w2) for w1, w2 in zip(ws['w1'], ws['w2'])]

### Word2Vec

In [None]:
# File the trained Word2Vec model is saved to and loaded from.
TASA_WV_PATH = os.path.join(PICKLES_DIR, 'tasa_wv.wv')

In [None]:
# Train Word2Vec once and cache it on disk; later runs load the cached model.
# The guard must test for the Word2Vec *file*: checking `isdir` on the LSI
# path is never true, so the model would be retrained on every run.
if not os.path.isfile(TASA_WV_PATH):
    tasa_wv = gensim.models.Word2Vec(list(tasa_tokens.values()), workers=4)
    tasa_wv.save(TASA_WV_PATH)
else:
    tasa_wv = gensim.models.Word2Vec.load(TASA_WV_PATH)

In [None]:
# Word2Vec cosine similarity of every word pair (None when a word is unknown).
# `similarity` lives on the model's keyed vectors (`.wv`); fall back to the
# model itself for gensim versions that have no `.wv` attribute.
wv_vectors = getattr(tasa_wv, 'wv', tasa_wv)

word_sim_wv = []
for w1, w2 in zip(ws['w1'], ws['w2']):
    try:
        word_sim_wv.append(wv_vectors.similarity(w1, w2))
    except KeyError:
        # At least one of the two words is not in the Word2Vec vocabulary.
        word_sim_wv.append(None)

# Store the scores next to the LSI ones; the comparison below reads ws['wv'].
# Note: these are raw cosine similarities, not rescaled to 0-10 like ws['lsi'].
ws['wv'] = word_sim_wv

In [None]:
# Positions of the rows scored by exactly one of the two models.
lsi_missing = ws['lsi'].isnull()
wv_missing = ws['wv'].isnull()
dif_nan = np.nonzero((lsi_missing ^ wv_missing).values)

In [None]:
# Show the word pairs for which exactly one of the two models produced a score.
ws.iloc[dif_nan]