In [1]:
import numpy as np
import re

In [6]:
from sklearn.datasets import fetch_20newsgroups as getData

In [3]:
from scipy.sparse import lil_matrix

In [7]:
data = getData(subset='train',remove=('headers','footers','quotes'))
X=data.data

In [9]:
x = ' '.join(X)


In [11]:
len(x)

13793298

In [12]:
x = x.lower()

In [13]:
corpus_list = re.split('\W',x)

In [14]:
len(corpus_list)

3807553

In [102]:
cutOffValue = 50
from collections import defaultdict
frequency = defaultdict(int)
for token in corpus_list:
    frequency[token] += 1
processedCorpus_list = [token for token in corpus_list if frequency[token] > cutOffValue ]

In [103]:
len(processedCorpus_list)

3423482

In [104]:
allWords = np.array(list(frequency.keys()))
allCounts = np.array(list(frequency.values()))

In [105]:
vocab = allWords[allCounts >= cutOffValue]
wordCounts = allCounts[allCounts >= cutOffValue]

In [195]:
len(processedCorpus_list)

3423482

In [106]:
def computeWordContextMatrix(corpus_list,vocab=None,window_size=2):
    if vocab is None:
        vocab=sorted(list(set(corpus_list)))
    numWords = len(vocab)
    #M = np.zeros((numWords,numWords))
    M = lil_matrix((numWords,numWords))
    W2I = dict(zip(vocab,np.arange(numWords)))
    I2w = dict(zip(np.arange(numWords),vocab))
    doc = corpus_list
    curIdx = 0
    docLen = len(doc)
    while curIdx < docLen:
        left = max(curIdx-window_size,0)
        right = min(curIdx+window_size+1,docLen)
        wordsInContext = doc[left:curIdx] + doc[curIdx+1:right]
        currentWord=doc[curIdx]
        currentWordIdx = W2I[currentWord]
        for word in wordsInContext:
            contextIdx = W2I[word]
            M[currentWordIdx,contextIdx] += 1
        curIdx += 1
    return M,W2I,I2w
            

In [174]:
M,W2I,I2W = computeWordContextMatrix(processedCorpus_list,vocab)

In [120]:
sorted(np.sum(M,axis=1))[-1]

matrix([[5601541.]])

In [116]:
I2W[0]

'i'

In [None]:
np.outer()

In [None]:
np.isinf()

In [177]:
M2 = M.copy()

In [214]:
def pmi(M,positive=True):
    col_totals = np.sum(M,axis=0)
    total = col_totals.sum()
    row_totals = np.sum(M, axis=1)
    #print(col_totals,'\n',row_totals)
    expected = np.outer(row_totals,col_totals) / total
    #print(expected)
    
    M = M / expected
    with np.errstate(divide='ignore'):
        M = np.log(M)
    #M[np.isinf(M)] = 0.0
    
    
    
    if positive:
        M[M<0]=0.0
    return M

In [215]:
M2 = pmi(M)

  return np.true_divide(self.todense(), other)


In [196]:
M2.shape

(4327, 4327)

In [218]:
np.min(M2)

nan

In [169]:
# def calculate_pmi(matrix,positive=True):
#     # Convert the matrix to probabilities
#     p_xy = matrix / np.sum(matrix)
#     p_x = np.sum(matrix, axis=1) / np.sum(matrix)
#     p_y = np.sum(matrix, axis=0) / np.sum(matrix)
    
#     # Avoid division by zero
#     p_x[p_x == 0] = 1
#     p_y[p_y == 0] = 1
    
#     # Calculate PMI
#     pmi_matrix = np.log2(p_xy / (np.outer(p_x, p_y)))
    
# #     if positive:
# #         pmi_matrix[pmi_matrix<0] = 0.0
#     return pmi_matrix

In [173]:
M = calculate_pmi(M)

In [219]:
M2 = np.nan_to_num(M2,nan=0.0)

In [220]:
np.min(M2)

0.0

In [200]:
M2[np.isnan(M2)]

(1, 643425)

(1, 0)

In [5]:
from sklearn.decomposition import TruncatedSVD,PCA,IncrementalPCA

In [221]:
transformer = TruncatedSVD(n_components=100)
M_reduced = transformer.fit_transform(M2)



In [222]:
# transformerPca = IncrementalPCA(n_components=100)
# M_reduced_pca = transformerPca.fit_transform(M)
# M_reduced_pca[0]

In [325]:
np.min(M_reduced)

-14.71154543042965

In [206]:
M.shape

(4327, 4327)

In [326]:
np.sum(np.isnan(M_reduced))

0

In [254]:
def getNorms(E):
    if E.ndim == 1:
        E = E[np.newaxis,:]
    nrms =  np.sum(E**2,axis=1)**0.5
    return nrms

In [256]:
print(getNorms(M_reduced[1,:]))

[34.32019928]


In [271]:
def normalize(E):
 #   if E.ndim ==1:
#        E = E[np.newaxis,:]
#     for i in range(E.shape[0]):
#         nrms = getNorms(E[i][np.newaxis,:])
#         print(nrms,np.sqrt(nrms))
#         E[i] = E[i]/(np.sqrt(nrms))
    nrms = getNorms(E)

#     print(nrms[:,np.newaxis])
#     print(nrms.shape)
#     print(E.shape)
    return E / nrms[:,np.newaxis]

In [262]:
E = M_reduced[1,:]
norm_E = normalize(E)
print(np.sum(norm_E,axis=1))

[[34.32019928]]
(1,)
(100,)
[0.47660618]


In [253]:
np.sum(norm_E,axis=1)

array([0.47660618])

In [264]:
np.sum(M_reduced[1])

16.357219017403168

In [267]:
np.sum(M_reduced[1]**2)**0.5

34.32019927633177

In [268]:
getNorms(norm_E)

array([1.])

In [330]:
def cosineSimilarity(E,v):
    E = normalize(E)
    v = normalize(v)
    scores = E.dot(v.T)
    scores[np.isnan(scores)] = 0.0
    return scores

In [361]:
def getMostSimilarWords(E,word,W2I,topn=10):
    v = E[W2I[word],]
    scores = cosineSimilarity(E,v)
    print(scores)
    sortedScores = np.sort(scores.reshape(-1))[::-1]
    print(sortedScores.shape)
    print(np.sum(np.isnan(sortedScores)))
    idx = np.argsort(scores.reshape(-1))[::-1]
    topNScores = sortedScores[1:topn]
    topNWordsIdx = idx[1:topn]
    return topNScores,topNWordsIdx
        

In [362]:
E.shape

(100,)

In [363]:
scores, idx = getMostSimilarWords(M_reduced,'good',W2I)

[[ 0.34593316]
 [ 0.5032537 ]
 [ 0.23720749]
 ...
 [ 0.01849561]
 [-0.00428651]
 [ 0.05945247]]
(4327,)
0


  return E / nrms[:,np.newaxis]


In [364]:
print(scores)
print(idx)

[0.80788386 0.79602826 0.78773826 0.76289219 0.7561541  0.75012522
 0.74995983 0.71025756 0.70943284]
[ 185 1510  995 2367 1442  183  451   18  165]


In [365]:
scores

array([0.80788386, 0.79602826, 0.78773826, 0.76289219, 0.7561541 ,
       0.75012522, 0.74995983, 0.71025756, 0.70943284])

In [366]:
idx

array([ 185, 1510,  995, 2367, 1442,  183,  451,   18,  165], dtype=int64)

In [368]:
for i in idx:
    print(I2W[i])

great
bad
best
decent
excellent
better
nice
a
like
