# Singular Value Decomposition

In [44]:
from collections import defaultdict, Counter
import numpy as np
from heapq import nlargest

In [23]:
hamlet_path = 'hamlet.txt'
with open(hamlet_path, 'r') as hamlet:
    lines = hamlet.readlines()

In [17]:
lines = [i.strip().lower() for i in lines if i.strip()]
lines[:20]

['the tragedy of hamlet, prince of denmark',
 'by william shakespeare',
 'dramatis personae',
 'claudius, king of denmark.',
 'marcellus, officer.',
 'hamlet, son to the former, and nephew to the present king.',
 'polonius, lord chamberlain.',
 'horatio, friend to hamlet.',
 'laertes, son to polonius.',
 'voltemand, courtier.',
 'cornelius, courtier.',
 'rosencrantz, courtier.',
 'guildenstern, courtier.',
 'osric, courtier.',
 'a gentleman, courtier.',
 'a priest.',
 'marcellus, officer.',
 'bernardo, officer.',
 'francisco, a soldier',
 'reynaldo, servant to polonius.']

In [18]:
len(lines)

4163

Posiadamy około 4100 linii z Hamleta, podzielimy to na 500 dokumentów

In [30]:
documents = defaultdict(list)
bag_of_words = set()

index = 0

for i, value in enumerate(lines):
    documents[index] += value.split()
    bag_of_words |= set(value.split())
    if (i + 1) % 8 == 0:
        index += 1
len(documents)

558

Następnie obliczmy częstotliwość wystąpienia słowa dla każdego dokumentu

In [31]:
freq = []
for _, words_list in documents.items():
    bag_of_words_for_doc = dict.fromkeys(bag_of_words, 0)
    for word in words_list:
        bag_of_words_for_doc[word] += 1
    freq.append(bag_of_words_for_doc)

### Term-by-document matrix

In [32]:
def term_by_document():
    amount_of_words = len(bag_of_words)
    res = np.zeros(shape=(amount_of_words, 558))
    for i in range(558):
        for index, word in enumerate(list(bag_of_words)):
            res[index, i] = freq[i][word]
    return res

In [33]:
term_by_document_matrix = term_by_document()

### IDF - redukcja często występujących słów

In [35]:
amount_of_words = len(bag_of_words)
idf = []
for index, words_list in enumerate(list(bag_of_words)):
    words_num = 0
    for i in range(558):
        if term_by_document_matrix[index, i] > 0:
            words_num += 1
    if words_num > 0:
        idf.append(np.log10(558 / words_num))

### Znajdowanie $k$ dokumentów zbliżonych do zapytania

In [45]:
def rate_of_similarity(sentence, k):
    query = sentence.strip().lower().split()
    
    q = np.zeros(amount_of_words)
    j = 0
    for word in query:
        if word in bag_of_words:
            q[j] = 1
        j += 1
    similarity_rate = {}
    for i in range(558):
        dj = term_by_document_matrix[:,[i]]
        q_norm = np.linalg.norm(q)
        dj_norm = np.linalg.norm(dj)
        cosj = np.dot(q,dj)/(q_norm*dj_norm)
        similarity_rate.update({i:cosj})
        
    return nlargest(k, similarity_rate, key=similarity_rate.get)

In [46]:
sentence = "For food and diet"
simple = rate_of_similarity(sentence, 10)

In [47]:
simple

[230, 507, 264, 0, 1, 2, 3, 4, 5, 6]

### SVG

In [49]:
U, s, V = np.linalg.svd(term_by_document_matrix, full_matrices=False)
S = np.diag(s)
np.allclose(term_by_document_matrix, np.dot(U, np.dot(S, V)))
term_by_document_matrix = np.dot(U, np.dot(S, V))

In [52]:
withSVD = rate_of_similarity(sentence, 10)
withSVD

[230, 507, 264, 521, 390, 261, 395, 295, 109, 209]

### Na koniec IDF

In [53]:
for i in range(558):
    for j in range(len(idf)):
        term_by_document_matrix[j, i] *= idf[j]
        
withIDF = rate_of_similarity(sentence, 10)
withIDF

[230, 507, 264, 521, 390, 261, 256, 512, 295, 365]

## Wnioski
Wszystkie metody się różnią ostatnimi elementami listy, natomiast początkowe są takie same. Świadczy to o poprawności implementacji.