In [3]:
import numpy as np
import re

# Read the corpus: every line of the file becomes one trimmed, lower-cased entry.
with open('data/03. sentences.txt') as f_snts:
    raw_sentences = [line.strip().lower() for line in f_snts]

# Preview a naive split of the first entry on runs of non-word characters
# (a delimiter at the very end leaves an empty string as the last item).
re.split(r'\W+', raw_sentences[0])

['in',
 'comparison',
 'to',
 'dogs',
 'cats',
 'have',
 'not',
 'undergone',
 'major',
 'changes',
 'during',
 'the',
 'domestication',
 'process',
 '']

In [4]:
# Split each sentence on every character outside a-z and drop the empty pieces
# that adjacent delimiters leave behind.
tokenized_sentences = [
    list(filter(None, re.split(r'[^a-z]', sentence)))
    for sentence in raw_sentences
]

In [5]:
from collections import Counter

# Word frequencies over the whole corpus.
words = Counter(word for sentence in tokenized_sentences for word in sentence)

# Two-way mapping between a word and its position in the vocabulary list.
ind_2_word = list(words.keys())
word_2_ind = {word: index for index, word in enumerate(ind_2_word)}

In [6]:
# Count matrix: one row per sentence, one column per vocabulary word;
# entry (i, j) is the number of times word j occurs in sentence i.
A = np.zeros((len(tokenized_sentences), len(words)), dtype=np.float32)
for i, sentence in enumerate(tokenized_sentences):
    for word in sentence:
        A[i, word_2_ind[word]] += 1

In [7]:
# Display the sentence-by-word count matrix (NumPy abbreviates large arrays with "...").
A

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 2., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 1., 1., 1.]], dtype=float32)

In [8]:
# Euclidean (L2) length of every row of the count matrix A.
norms = np.sqrt(np.sum(np.square(A), axis=1))
# A row with no counted words has length 0; dividing by it would yield a NaN
# row and NaN similarities for every pair involving it. Divide such rows by 1
# instead so they simply stay all-zero.
safe_norms = norms.copy()
safe_norms[safe_norms == 0] = 1
A_norm = A / safe_norms[:, np.newaxis]
# Sanity check: the squared entries of a normalized row should sum to ~1.
sum(A_norm[0, :] ** 2)

0.9999999403953552

In [9]:
# Each row of A_norm was divided by its L2 norm, so the pairwise dot products
# of the rows are the cosine similarities between sentences.
cosine = A_norm @ A_norm.T

In [10]:
# Similarities of sentence 0 to every sentence, rounded to 3 decimals for display.
# np.round is vectorized and keeps the array dtype, so no Python-level loop is needed.
np.round(cosine[0, :], 3)

array([1.   , 0.047, 0.136, 0.105, 0.223, 0.06 , 0.267, 0.074, 0.116,
       0.094, 0.167, 0.12 , 0.16 , 0.13 , 0.126, 0.056, 0.159, 0.043,
       0.056, 0.111, 0.157, 0.175], dtype=float32)

In [32]:
# Exploratory check: with kth=-1 argpartition only guarantees that the final
# index points at the largest value in the row; the order of the remaining
# indices is unspecified, so this is not a full ranking.
np.argpartition(cosine[0,:], -1)

array([21,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20,  0], dtype=int64)

In [12]:
# Rank neighbours on a copy whose diagonal is -inf, so a sentence can never be
# picked as its own neighbour. Relying on the self-similarity always sorting
# last breaks when a duplicate sentence ties with it or when a row is all zeros.
similarity_no_self = cosine.copy()
np.fill_diagonal(similarity_no_self, -np.inf)
# argsort is ascending: the last two columns hold the two most similar
# sentences, with the closest one in the final column.
nearest = np.argsort(similarity_no_self, axis=1)[:, -2:]
nearest

array([[ 4,  6],
       [12,  9],
       [ 9,  8],
       [ 4,  6],
       [13,  9],
       [12,  6],
       [13,  5],
       [ 2, 10],
       [10,  2],
       [ 4,  2],
       [ 8, 12],
       [ 8,  2],
       [ 6, 10],
       [17,  6],
       [21, 16],
       [17, 21],
       [14, 17],
       [21, 18],
       [21, 17],
       [21, 18],
       [17, 21],
       [14, 17]], dtype=int64)

In [33]:
# Similarity values of each sentence to its selected neighbours, in the same
# column order as `nearest`.
[list(cosine[row, cols]) for row, cols in enumerate(nearest)]

[[0.22291128, 0.26726124],
 [0.24748738, 0.28125],
 [0.44821072, 0.4635525],
 [0.16357216, 0.19611613],
 [0.25286087, 0.36860487],
 [0.2236068, 0.31304953],
 [0.29104277, 0.31304953],
 [0.28128433, 0.28915745],
 [0.391254, 0.4635525],
 [0.36860487, 0.44821072],
 [0.391254, 0.4170288],
 [0.2581989, 0.34016803],
 [0.28, 0.4170288],
 [0.2754113, 0.29104277],
 [0.36004117, 0.38650063],
 [0.30442953, 0.31851098],
 [0.38650063, 0.38692066],
 [0.38939574, 0.5073825],
 [0.31851098, 0.5073825],
 [0.27300942, 0.3043478],
 [0.28632814, 0.29957232],
 [0.36004117, 0.38939574]]

___

## Submission

In [40]:
# Write the two neighbour indices found for sentence 0, space-separated,
# to the submission file and echo the same text in the notebook.
with open('out/03. submission.txt', 'w') as f_out:
    output = '{0} {1}'.format(*nearest[0, :2])
    print(output)
    f_out.write(output)

4 6
