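The TF-IDF score of term $i$ in document $j$ is computed by $$w_{i,j} = tf_{i,j} \times \log\left(\frac{N}{df_{i}}\right)$$ where $tf_{i,j}$ is the number of times term $i$ occurs in document $j$, $df_{i}$ is the number of documents that contain term $i$, and $N$ is the total number of documents. (scikit-learn's `TfidfVectorizer`, used below, applies a smoothed variant of this formula and L2-normalizes each document row by default, so its numbers differ from a hand calculation.)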

In [3]:
# Toy corpus: four short sentences used for the TF-IDF examples below.
documents = [
    'The sky is blue',
    'The sun is bright',
    'The sun in the sky is bright',
    'we can see the shining sun, the bright sun',
]

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
# Fit a TF-IDF model on the toy corpus, dropping English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
# Uncomment to inspect raw term counts instead of TF-IDF weights:
# vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)
print(X.toarray())
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. .tolist() turns the returned array into a plain list of
# str so the printed vocabulary looks the same as before.
print(vectorizer.get_feature_names_out().tolist())
print(X.shape)

[[0.78528828 0.         0.         0.6191303  0.        ]
 [0.         0.70710678 0.         0.         0.70710678]
 [0.         0.53256952 0.         0.65782931 0.53256952]
 [0.         0.36626037 0.57381765 0.         0.73252075]]
['blue', 'bright', 'shining', 'sky', 'sun']
(4, 5)


In [33]:
# Sum down each column: one total TF-IDF weight per term across all documents.
# The result is still two-dimensional (a single row), as the printed output shows.
tfidf_matrix = X.sum(axis=0)
print(tfidf_matrix)

[[0.78528828 1.60593668 0.57381765 1.27695961 1.97219705]]


In [34]:
# Flatten the single-row result into a 1-D array of per-term scores.
tfidf_scores = np.ravel(tfidf_matrix)
print(tfidf_scores)

[0.78528828 1.60593668 0.57381765 1.27695961 1.97219705]


In [35]:
# Vocabulary in column order. get_feature_names() was removed in
# scikit-learn 1.2; use its replacement and convert the array to a plain list.
vectorizer.get_feature_names_out().tolist()

['blue', 'bright', 'shining', 'sky', 'sun']

In [36]:
# Pair every term with its summed score. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out(); .tolist() keeps the dict
# keys as ordinary Python strings.
tfidf_dict = dict(zip(vectorizer.get_feature_names_out().tolist(), tfidf_scores))
print(tfidf_dict)

{'blue': 0.7852882757103967, 'bright': 1.605936677684143, 'shining': 0.5738176501168697, 'sky': 1.27695960978985, 'sun': 1.9721970507561841}


In [37]:
# Rank the (term, score) pairs by score, highest first.
sorted(tfidf_dict.items(), key=lambda x: x[1], reverse=True)

[('sun', 1.9721970507561841),
 ('bright', 1.605936677684143),
 ('sky', 1.27695960978985),
 ('blue', 0.7852882757103967),
 ('shining', 0.5738176501168697)]

In [41]:
def keywords_sklearn(docs, k):
    """Return the k terms with the largest summed TF-IDF weight over docs.

    Args:
        docs: iterable of raw text documents.
        k: maximum number of (term, score) pairs to return.

    Returns:
        A list of (term, score) tuples sorted by score, highest first.
        English stop words are excluded from the vocabulary.
    """
    vec = TfidfVectorizer(stop_words='english')
    _matrix = vec.fit_transform(docs)
    # Column sums give one total weight per term; ravel flattens the
    # single-row result into a 1-D array aligned with the vocabulary.
    _scores = np.ravel(_matrix.sum(axis=0))
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() replaces it. .tolist() yields plain str terms.
    _terms = vec.get_feature_names_out().tolist()
    # Terms are unique, so the pairs can be ranked directly.
    return sorted(zip(_terms, _scores), key=lambda x: x[1], reverse=True)[:k]

In [42]:
# Top three keywords of the toy corpus.
print(keywords_sklearn(documents, 3))

[('sun', 1.9721970507561841), ('bright', 1.605936677684143), ('sky', 1.27695960978985)]


In [50]:
import math
def distance(p1, p2):
    """Return the Euclidean distance between two points.

    Args:
        p1: iterable of numeric coordinates.
        p2: iterable of numeric coordinates with the same length as p1.

    Returns:
        The distance as a float (0.0 for two empty points).

    Raises:
        ValueError: if the points have different numbers of coordinates.
    """
    coords1, coords2 = list(p1), list(p2)
    # zip() would silently drop the extra coordinates of the longer point and
    # return a misleading distance, so reject mismatched dimensions up front.
    if len(coords1) != len(coords2):
        raise ValueError(
            f'points must have the same dimension, got {len(coords1)} and {len(coords2)}'
        )
    return math.sqrt(sum(math.pow(a - b, 2) for a, b in zip(coords1, coords2)))

In [51]:
def all_distances(word_dict):
    """Compute the distance between every unordered pair of words.

    Args:
        word_dict: mapping from a word to its coordinate vector.

    Returns:
        A dict keyed by 'first | second' (words in the mapping's iteration
        order, each pair listed once) whose values come from distance().
    """
    vocabulary = list(word_dict)
    return {
        f'{first} | {second}': distance(word_dict[first], word_dict[second])
        for position, first in enumerate(vocabulary)
        # Only look at words after the current one so each pair appears once.
        for second in vocabulary[position + 1:]
    }

In [52]:
# Hand-made 2-D points for checking all_distances().
test_words = {
    'pizza': [1, 1],
    'food': [2, 2],
    'sport': [5, 6],
}

In [53]:
# Sanity check on the hand-made points: one entry per unordered pair.
all_distances(test_words)

{'pizza | food': 1.4142135623730951,
 'pizza | sport': 6.4031242374328485,
 'food | sport': 5.0}

Euclidean distance between two $n$-dimensional word vectors $v_i$ and $v_j$: $$d_{i,j} = \sqrt{(v_{i1} - v_{j1})^2 + (v_{i2} - v_{j2})^2 + \ldots + (v_{in} - v_{jn})^2 }$$

In [55]:
import codecs

# word -> embedding vector (list of floats) read from the GloVe file.
targets = {}

# Words still to be found; the scan stops as soon as this set is empty.
wanted = {'pizza', 'food', 'sport'}

# Each line of the file is a token followed by its vector components.
# Open it explicitly as UTF-8 instead of relying on the platform's default
# encoding, which is not UTF-8 everywhere.
with open('../datasets/glove.840B.300D.txt', 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        # `fields and ...` skips blank lines instead of failing on fields[0].
        if fields and fields[0] in wanted:
            targets[fields[0]] = [float(value) for value in fields[1:]]
            wanted.discard(fields[0])
        if not wanted:
            break

In [56]:
# Inspect what was loaded.
# NOTE(review): this echoes every component of every vector; a truncated view
# such as {w: v[:5] for w, v in targets.items()} would be easier to read.
targets

{'food': [-0.43512,
  0.028351,
  0.4911,
  -0.35168,
  -0.11578,
  1.0369,
  -0.09755,
  0.086624,
  -0.1789,
  2.4555,
  -1.2798,
  0.021074,
  -0.03225,
  0.094673,
  -0.14,
  -0.52143,
  0.00066447,
  1.8051,
  -0.22604,
  0.33227,
  0.00041163,
  0.062654,
  0.14973,
  -0.5026,
  0.089701,
  -0.26908,
  -0.083594,
  -0.16677,
  -0.17036,
  -0.32049,
  -0.23586,
  -0.40395,
  0.32683,
  -0.21712,
  0.098576,
  0.47552,
  0.092994,
  -0.061034,
  0.12673,
  0.60856,
  -0.0067936,
  -0.21831,
  0.021751,
  -0.24858,
  -0.035244,
  0.13692,
  -0.37109,
  0.54421,
  0.040017,
  0.13992,
  0.039967,
  -0.31745,
  0.24408,
  -0.2355,
  0.24884,
  -0.31929,
  0.11282,
  -0.010198,
  -0.050538,
  -0.1155,
  0.30273,
  -0.61441,
  0.016135,
  0.010675,
  0.15108,
  -1.1759,
  0.097104,
  0.071706,
  0.19795,
  0.27253,
  -0.22122,
  0.64478,
  -0.066252,
  -0.29403,
  0.16281,
  -0.0078554,
  -0.14986,
  -0.11364,
  0.36459,
  0.13723,
  0.46612,
  0.26157,
  0.0065022,
  -0.67068,
  -0.075

In [57]:
# Pairwise distances between the word vectors loaded so far.
all_distances(targets)

{'food | sport': 8.303718155175723,
 'food | pizza': 6.312737677708338,
 'sport | pizza': 8.817056623492523}

In [59]:
# Adds to the existing `targets` dict, so vectors loaded by earlier cells stay.
# Words still to be found; the scan stops as soon as this set is empty.
wanted = {'run', 'food', 'sport'}

# Open explicitly as UTF-8 instead of relying on the platform's default encoding.
with open('../datasets/glove.840B.300D.txt', 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        # `fields and ...` skips blank lines instead of failing on fields[0].
        if fields and fields[0] in wanted:
            targets[fields[0]] = [float(value) for value in fields[1:]]
            wanted.discard(fields[0])
        if not wanted:
            break

In [60]:
# `targets` still holds the words loaded by earlier cells, so the result
# covers those as well as the word just added.
all_distances(targets)

{'food | sport': 8.303718155175723,
 'food | pizza': 6.312737677708338,
 'food | run': 7.821521246238745,
 'sport | pizza': 8.817056623492523,
 'sport | run': 7.331488158214085,
 'pizza | run': 8.136990289820696}

In [61]:
# Adds to the existing `targets` dict, so vectors loaded by earlier cells stay.
# Words still to be found; the scan stops as soon as this set is empty.
wanted = {'marathon', 'food', 'sport'}

# Open explicitly as UTF-8 instead of relying on the platform's default encoding.
with open('../datasets/glove.840B.300D.txt', 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        # `fields and ...` skips blank lines instead of failing on fields[0].
        if fields and fields[0] in wanted:
            targets[fields[0]] = [float(value) for value in fields[1:]]
            wanted.discard(fields[0])
        if not wanted:
            break

In [62]:
# `targets` has accumulated every word loaded so far, so all of them are compared.
all_distances(targets)

{'food | sport': 8.303718155175723,
 'food | pizza': 6.312737677708338,
 'food | run': 7.821521246238745,
 'food | marathon': 8.493424334228525,
 'sport | pizza': 8.817056623492523,
 'sport | run': 7.331488158214085,
 'sport | marathon': 7.622491120715408,
 'pizza | run': 8.136990289820696,
 'pizza | marathon': 8.208846187664582,
 'run | marathon': 6.529200301849037}