In [None]:
import json
import nltk
from nltk import *
from nltk.corpus import stopwords
from nltk.stem.porter import *
from stemming.porter2 import stem

In [None]:
from collections import Counter
import scipy.sparse as sps
import numpy as np
from sparsesvd import sparsesvd
from scipy.stats import spearmanr

In [None]:
def save_sparse_csr(filename, array):
    """Write the CSR components of ``array`` to ``filename`` via ``np.savez``.

    The archive holds four entries -- ``data``, ``indices``, ``indptr`` and
    ``shape`` -- read straight from the attributes of ``array``.
    """
    components = {
        'data': array.data,
        'indices': array.indices,
        'indptr': array.indptr,
        'shape': array.shape,
    }
    np.savez(filename, **components)

def load_sparse_csr(filename):
    """Rebuild a CSR matrix from an ``.npz`` archive.

    The archive must contain the entries ``data``, ``indices``, ``indptr``
    and ``shape``. The archive is opened in a ``with`` block so the
    underlying file handle is released as soon as the arrays are read.

    Returns:
        scipy.sparse.csr_matrix built from the stored components.
    """
    with np.load(filename) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        # Stored as an integer array; hand SciPy a plain tuple of ints.
        shape = tuple(int(dim) for dim in loader['shape'])
    return sps.csr_matrix((data, indices, indptr), shape=shape)

In [None]:
import pandas as pd

# Word-pair table; later cells read its "Word 1", "Word 2" and
# "Human (mean)" columns.
wordsim_csv = './temp/wordsim353/combined.csv'
df1 = pd.read_csv(wordsim_csv)

In [None]:
# Raw JSON text is in the temp directory.
# No empty-dict placeholder is assigned first: if the load fails, the name
# stays undefined and later cells fail loudly instead of silently
# processing an empty corpus.
with open('./temp/content0.raw') as json_file:
    loaded_data = json.load(json_file)

In [None]:
## Create index files for articles and words.
# article2ind: article key -> column index, in iteration order of loaded_data.
# word2ind: processed token -> row index, in order of first appearance.
lemma = WordNetLemmatizer()
porter = PorterStemmer()
stop_words = set(stopwords.words('english'))
article2ind = {}
word2ind = {}
for article, doc in loaded_data.items():
    article2ind[article] = len(article2ind)
    tokens = word_tokenize(doc['text'])
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatise first, then Porter-stem (same order as the other cells).
    tokens = [porter.stem(lemma.lemmatize(token)) for token in tokens]
    for token in tokens:
        # len() is evaluated before insertion, so a new token receives the
        # next free index and an already-known token keeps its index.
        word2ind.setdefault(token, len(word2ind))

with open('./temp/article2ind.json', 'w') as outfile:
    json.dump(article2ind, outfile)
with open('./temp/word2ind.json', 'w') as outfile:
    json.dump(word2ind, outfile)

In [None]:
# Document frequency: for every processed token, the number of articles in
# which it occurs at least once.
word_count = {}
for doc in loaded_data.values():
    doc_tokens = [tok for tok in word_tokenize(doc['text'])
                  if tok not in stop_words]
    doc_tokens = [porter.stem(lemma.lemmatize(tok)) for tok in doc_tokens]
    # Iterating a Counter visits each distinct token once per article.
    for term in Counter(doc_tokens):
        word_count[term] = word_count.get(term, 0) + 1

with open('./temp/word_count.json', 'w') as outfile:
    json.dump(word_count, outfile)

In [None]:
# Build the word-by-article TF-IDF matrix (rows: word2ind, cols: article2ind).
# Entry (i, j) = count of word i in article j * log(n_articles / doc_freq(i)).
n_articles = len(article2ind)
n_words = len(word2ind)

# Collect (row, col, value) triplets and build the CSR matrix in one shot;
# assigning element by element into a DOK matrix is much slower.
rows, cols, weights = [], [], []
for article, doc in loaded_data.items():
    j = article2ind[article]
    tokens = word_tokenize(doc['text'])
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [porter.stem(lemma.lemmatize(token)) for token in tokens]
    # Counter yields each distinct token once, so (i, j) pairs never repeat.
    for word, count in Counter(tokens).items():
        # float() keeps this a true division regardless of interpreter.
        weight = count * np.log(float(n_articles) / word_count[word])
        if weight:
            # Zero weights (word present in every article) are left
            # unstored, as the DOK-based construction did.
            rows.append(word2ind[word])
            cols.append(j)
            weights.append(weight)

mat = sps.csr_matrix((weights, (rows, cols)),
                     shape=(n_words, n_articles), dtype=np.float32)
save_sparse_csr("./temp/mat.npz", mat)

In [None]:
# Reload the matrix that was saved to ./temp/mat.npz.
esa = load_sparse_csr("./temp/mat.npz")

In [None]:
# ESA evaluation: cosine similarity between the matrix rows of each word
# pair in df1, rank-correlated (Spearman) with the human scores.
with open('./temp/word2ind.json') as json_file:
    word2ind = json.load(json_file)

lemma = WordNetLemmatizer()
porter = PorterStemmer()
esa_sim = []
word_sim = []
for index, row in df1.iterrows():
    # word2ind keys were lemmatised AND Porter-stemmed when the index was
    # built, so the query words must go through the same two steps.
    word1 = porter.stem(lemma.lemmatize(row["Word 1"].lower()))
    word2 = porter.stem(lemma.lemmatize(row["Word 2"].lower()))
    try:
        i = word2ind[word1]
        j = word2ind[word2]
    except KeyError:
        # Out-of-vocabulary pair: skip it. Any other error should surface.
        continue
    vec_i = esa[i, :]
    vec_j = esa[j, :]
    # Use the sparse matrix's own element-wise product; NumPy functions
    # such as np.dot are not reliable on scipy.sparse operands.
    norm_product = (float(vec_i.multiply(vec_i).sum()) *
                    float(vec_j.multiply(vec_j).sum()))
    if norm_product == 0:
        # An all-zero row would give NaN and poison the correlation.
        continue
    sim = float(vec_i.multiply(vec_j).sum()) / np.sqrt(norm_product)
    esa_sim.append(float(sim))
    word_sim.append(row["Human (mean)"])
corr, p_value = spearmanr(esa_sim, word_sim)
print(corr)

In [None]:
# Truncated SVD of the transposed matrix, keeping 50 factors.
# NOTE(review): presumably sparsesvd requires CSC input -- confirm.
smat = sps.csc_matrix(esa.transpose())
ut, s, vt = sparsesvd(smat, 50)

In [None]:
# LSA evaluation: cosine similarity between the SVD reconstructions of each
# word pair in df1, rank-correlated (Spearman) with the human scores.
lemma = WordNetLemmatizer()
porter = PorterStemmer()
lsa_sim = []
word_sim = []
for index, row in df1.iterrows():
    word1 = porter.stem(lemma.lemmatize(row["Word 1"].lower()))
    word2 = porter.stem(lemma.lemmatize(row["Word 2"].lower()))
    try:
        i = word2ind[word1]
        j = word2ind[word2]
    except KeyError:
        # Out-of-vocabulary pair: skip it. Any other error should surface.
        continue
    # Scale the word's factor vector by the singular values, then project
    # through ut.
    # NOTE(review): assumes vt is (factors, words) and ut is
    # (factors, articles), per the sparsesvd call on esa.T -- confirm.
    l = np.matmul(np.multiply(vt.T[i], s), ut)
    r = np.matmul(np.multiply(vt.T[j], s), ut)
    norm_product = np.dot(l, l) * np.dot(r, r)
    if norm_product == 0:
        # A zero vector would give NaN and poison the correlation.
        continue
    sim = np.dot(l, r) / np.sqrt(norm_product)
    lsa_sim.append(float(sim))
    word_sim.append(row["Human (mean)"])
corr, p_value = spearmanr(lsa_sim, word_sim)
print(corr)

In [None]:
import pandas as pd
import seaborn as sns
import imgkit

# Read the four ESA relation tables, in the order syn, hyp, ant, mer.
syn, hyp, ant, mer = [
    pd.read_csv(csv_path)
    for csv_path in (
        './temp/synonyms_esa.csv',
        './temp/hypernyms_esa.csv',
        './temp/antonmys_esa.csv',
        './temp/meronyms_esa.csv',
    )
]

In [None]:
# Render each ESA table as a colour-graded HTML table and save it as
# <name>_esa.png through imgkit.
names = ['syn','hyp','ant','mer']
for name, df in zip(names, [syn, hyp, ant, mer]):
    styler = df.style.background_gradient(cmap='RdBu', axis=0)
    # Styler.render() was removed in pandas 2.0; use to_html() when the
    # installed pandas provides it and fall back to render() otherwise.
    html = styler.to_html() if hasattr(styler, 'to_html') else styler.render()
    imgkit.from_string(html, name + '_esa.png')

In [None]:
# Load the four LSA relation tables and save each one as <name>_lsa.png,
# a colour-graded rendering produced through imgkit.
syn = pd.read_csv('./temp/synonyms_lsa.csv')
hyp = pd.read_csv('./temp/hypernyms_lsa.csv')
ant = pd.read_csv('./temp/antonmys_lsa.csv')
mer = pd.read_csv('./temp/meronyms_lsa.csv')
names = ['syn','hyp','ant','mer']
for name, df in zip(names, [syn, hyp, ant, mer]):
    styler = df.style.background_gradient(cmap='RdBu', axis=0)
    # Styler.render() was removed in pandas 2.0; use to_html() when the
    # installed pandas provides it and fall back to render() otherwise.
    html = styler.to_html() if hasattr(styler, 'to_html') else styler.render()
    imgkit.from_string(html, name + '_lsa.png')