In [1]:
from data_manager import Vector, TruncatedMatrix
import json
import pickle
from math import log10 as log
import numpy as np
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

In [2]:
# Load the vocabulary: a JSON mapping from word to integer index. The index is
# later used as the row position of that word in the term-document matrix.
with open("data/alphabet.json", "r", encoding="latin-1") as alphabet_file:
    alphabet = json.loads(alphabet_file.read())

In [3]:
# The bag-of-words file is a stream of independently pickled objects, so keep
# unpickling until the stream is exhausted.
# NOTE: pickle.load can execute arbitrary code; only read files you trust.
vectors: list[Vector] = []
with open("data/Bag_of_words.pickle", "rb") as pickle_stream:
    try:
        while True:
            vectors.append(pickle.load(pickle_stream))
    except EOFError:
        # Reaching the end of the file is the normal way out of the loop.
        pass

In [4]:
# M is the vocabulary size (matrix rows); N is the number of loaded document
# vectors (matrix columns).
M, N = len(alphabet), len(vectors)
print(M, N)

97308 182513


In [5]:
# Document frequency: for every word of the alphabet, the number of documents
# whose bag-of-words contains it (each document counts at most once per word).
word_frequency = dict.fromkeys(alphabet, 0)
for vec in vectors:
    for word in vec.vector:
        word_frequency[word] += 1

# Inverse document frequency, log10(N / document frequency).
# A word that appears in no document would divide by zero; it can never
# contribute to a document column, so give it a neutral weight of 0.0.
idf = {
    word: log(N / doc_count) if doc_count else 0.0
    for word, doc_count in word_frequency.items()
}

In [31]:
def create_sparse_matrix(filename="sparse_matrix.npz", use_idf=False):
    """Build the term-document matrix from ``vectors`` and save it as ``.npz``.

    Each document becomes one column of an ``(M, N)`` sparse matrix; the row of
    a word is ``alphabet[word]``. Every column is scaled to unit Euclidean
    length so that a dot product with a unit-length query is a cosine.

    Parameters
    ----------
    filename : str
        File name (inside ``data/matrices/``) the matrix is written to.
    use_idf : bool
        When true, each word count is multiplied by ``idf[word]`` before the
        column is normalised.

    Reads the module-level ``vectors``, ``alphabet``, ``idf``, ``M`` and ``N``.
    Returns nothing; the result is written to disk.
    """
    no_of_entries = sum(len(v.vector) for v in vectors)

    # Preallocate the COO triplets (value, row, column) in one go.
    data = np.zeros(no_of_entries, dtype=float)
    row = np.zeros(no_of_entries, dtype=int)
    col = np.zeros(no_of_entries, dtype=int)

    ind = 0
    for i, v in enumerate(vectors):
        start = ind
        for w in v.vector:
            data[ind] = v.vector[w] * (idf[w] if use_idf else 1)
            row[ind] = alphabet[w]
            col[ind] = i
            ind += 1
        # Normalise this document's slice. A column whose weights are all zero
        # (e.g. every word has idf 0) is left as zeros instead of becoming NaN.
        norm = np.linalg.norm(data[start:ind])
        if norm > 0:
            data[start:ind] /= norm

    matrix_to_save = sparse.csr_matrix((data, (row, col)), shape=(M, N), dtype=float)
    sparse.save_npz(f"data/matrices/{filename}", matrix_to_save)

In [None]:
# Build the count-based (no IDF) matrix and save it under the default file name.
create_sparse_matrix()

In [8]:
# Build the IDF-weighted matrix and save it next to the count-based one.
create_sparse_matrix("sparse_matrix_idf.npz", use_idf=True)

In [5]:
# Reload the two saved term-document matrices so the search cells below can run
# without rebuilding them.
matrix = sparse.load_npz("data/matrices/sparse_matrix.npz")  # plain counts, columns normalised
matrix_idf = sparse.load_npz("data/matrices/sparse_matrix_idf.npz")  # IDF-weighted, columns normalised

In [6]:
def save_truncated_svd(truncation_level, random_state=None):
    """Fit a truncated SVD of ``matrix_idf`` and pickle its factors.

    Parameters
    ----------
    truncation_level : int
        Number of singular components to keep.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``TruncatedSVD``. Pass an int to make the fit
        reproducible; the default ``None`` keeps the previous unseeded
        behaviour.

    The result is stored as a ``TruncatedMatrix`` holding the truncation level,
    the transformed matrix, the singular values, the components and the
    per-column lengths of ``diag(sigma) @ components``. It is written to
    ``data/matrices/svd_matrix_{truncation_level}.pickle``.
    """
    svd = TruncatedSVD(n_components=truncation_level, random_state=random_state).fit(matrix_idf)
    u_s = svd.transform(matrix_idf)
    sigma = svd.singular_values_
    v = svd.components_
    # Scaling row i of v by sigma[i] yields the same values as
    # np.diag(sigma) @ v without the k x k by k x N dense product.
    vector_lengths = np.linalg.norm(sigma[:, np.newaxis] * v, axis=0)
    svd_truncation_level = TruncatedMatrix(truncation_level, u_s, sigma, v, vector_lengths)
    with open(f"data/matrices/svd_matrix_{truncation_level}.pickle", "wb") as file:
        pickle.dump(svd_truncation_level, file)

In [7]:
def load_truncated_svd(truncation_level):
    """Return the object pickled in ``data/matrices/svd_matrix_{truncation_level}.pickle``.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    pickle_path = f"data/matrices/svd_matrix_{truncation_level}.pickle"
    with open(pickle_path, "rb") as stream:
        return pickle.load(stream)

In [26]:
%%time
# Fit and pickle the 20-component truncated SVD of the IDF-weighted matrix.
save_truncated_svd(20)

CPU times: total: 26.3 s
Wall time: 32.6 s


In [58]:
%%time
# Fit and pickle the 50-component truncated SVD of the IDF-weighted matrix.
save_truncated_svd(50)

CPU times: total: 49.3 s
Wall time: 55.8 s


In [28]:
%%time
# Fit and pickle the 100-component truncated SVD of the IDF-weighted matrix.
save_truncated_svd(100)

CPU times: total: 1min 30s
Wall time: 1min 38s


In [29]:
%%time
# Fit and pickle the 200-component truncated SVD of the IDF-weighted matrix.
save_truncated_svd(200)

CPU times: total: 2min 15s
Wall time: 2min 13s


In [30]:
%%time
# Fit and pickle the 400-component truncated SVD of the IDF-weighted matrix.
save_truncated_svd(400)

CPU times: total: 4min 4s
Wall time: 3min 33s


In [8]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.probability import FreqDist
from string import punctuation
from math import sqrt

# Lower-cased English stop words, kept in a set for fast membership tests.
stop_words = {stop_word.lower() for stop_word in stopwords.words('english')}
# Stemmer applied to every query token.
snowball_stemmer = SnowballStemmer('english')

def normalize_vector(vector):
    """Scale the values of ``vector`` to unit Euclidean length.

    The mapping is modified in place and the same object is returned. An empty
    mapping is returned unchanged.
    """
    length = sqrt(sum(value ** 2 for value in vector.values()))
    for key in vector:
        vector[key] /= length
    return vector

def process_text(text: str) -> dict[str, float]:
    """Turn free text into a unit-length bag-of-words over the known alphabet.

    Steps: replace punctuation with spaces, tokenise, lower-case, drop English
    stop words, stem, keep only stems present in ``alphabet``, count them and
    normalise the counts to unit Euclidean length.

    Returns a dict mapping each kept stem to its normalised weight (a float);
    the dict is empty when no token survives the filtering.
    """
    # Map every ASCII punctuation character to a space in a single pass.
    text = text.translate(str.maketrans({mark: " " for mark in punctuation}))
    tokens = (
        token.lower()
        for sentence in sent_tokenize(text)
        for token in word_tokenize(sentence)
    )
    # Stop words are removed before stemming, matching how they are stored.
    stems = (snowball_stemmer.stem(token) for token in tokens if token not in stop_words)
    known_stems = [stem for stem in stems if stem in alphabet]
    return normalize_vector(dict(FreqDist(known_stems)))

In [9]:
def convert_to_sparse(query):
    """Convert a word -> weight mapping into a ``(1, M)`` sparse row vector.

    Each word's weight is placed in column ``alphabet[word]``.
    """
    size = len(query)
    values = np.zeros(size, dtype=float)
    columns = np.zeros(size, dtype=int)
    # A single-row matrix: every entry lives in row 0.
    rows = np.zeros(size, dtype=int)

    for position, word in enumerate(query):
        values[position] = query[word]
        columns[position] = alphabet[word]

    return sparse.csr_matrix((values, (rows, columns)), shape=(1, M), dtype=float)

def k_best_vectors(vector: sparse.csr_matrix):
    """Return the ``k`` largest entries of the first row of ``vector``.

    ``k`` is read from the module-level variable of that name. The result is a
    list of ``(column_index, value)`` pairs ordered from largest to smallest
    value. If the row has fewer than ``k`` columns, all of them are returned.
    """
    # Densify once and sort once; the values are looked up through the order.
    scores = np.asarray(vector.todense())[0]
    order = np.argsort(scores)[::-1][:k]
    return [(index, scores[index]) for index in order]

def find_closest_vectors(query):
    """Rank documents of the count-based ``matrix`` against ``query``.

    ``query`` is a word -> weight mapping. Returns the ``k`` best
    ``(document_index, score)`` pairs, where the score is the absolute value
    of the dot product between the query and the document column.
    """
    similarities = convert_to_sparse(query) @ matrix
    return k_best_vectors(np.abs(similarities))

def find_closest_vectors_idf(query):
    """Rank documents of the IDF-weighted ``matrix_idf`` against ``query``.

    ``query`` is a word -> weight mapping. Returns the ``k`` best
    ``(document_index, score)`` pairs, where the score is the absolute value
    of the dot product between the query and the document column.
    """
    similarities = convert_to_sparse(query) @ matrix_idf
    return k_best_vectors(np.abs(similarities))

In [10]:
def convert_to_dense(query):
    """Convert a word -> weight mapping into a dense vector of length ``M``.

    Position ``alphabet[word]`` holds the word's weight; all other positions
    are zero.
    """
    dense = np.zeros(M, dtype=float)
    for word in query:
        dense[alphabet[word]] = query[word]
    return dense

def find_closest_vectors_svd(query, svd: TruncatedMatrix):
    """Rank documents against ``query`` using a truncated SVD.

    The dense query is multiplied by ``svd.U_S`` and then ``svd.V``, and each
    resulting entry is divided by the matching entry of
    ``svd.vector_lengths``. Returns the ``k`` best ``(document_index, score)``
    pairs ordered from largest to smallest absolute score (``k`` is the
    module-level variable).
    """
    dense_query = convert_to_dense(query)
    projected = (dense_query @ svd.U_S) @ svd.V
    lengths = np.asarray(svd.vector_lengths, dtype=float)

    # A document with zero length would give 0/0 = NaN, and NaN sorts last, so
    # after the reversal it would be ranked first. Score such documents 0.
    scores = np.zeros_like(projected, dtype=float)
    np.divide(projected, lengths, out=scores, where=lengths > 0)

    best = np.abs(scores)
    # Sort once and read the values through the ordering.
    order = np.argsort(best)[::-1][:k]
    return list(zip(order, best[order]))

In [11]:
def print_answers(query_text, function, *args):
    """Run a search function on ``query_text`` and print its ranked results.

    Parameters
    ----------
    query_text : str
        Free-text query; converted with ``process_text``.
    function : callable
        Called as ``function(processed_query, *args)``; must return an iterable
        of ``(document_index, score)`` pairs.
    *args
        Extra positional arguments forwarded to ``function``.

    For every result the rank, the score, the question title and the Stack
    Overflow URL of ``vectors[document_index]`` are printed.
    """
    q = process_text(query_text)
    url = "https://stackoverflow.com/questions/"
    for rank, (ind, similarity) in enumerate(function(q, *args), start=1):
        document = vectors[ind]
        print(f"Result number: {rank}")
        # The score is |cos| of the angle (a similarity), not the angle itself.
        print(f"Absolute value of cosine of angle between query and result: {similarity}")
        print(f"Question: {document.question_title}")
        print(f"Question site url: {url}{document.question_id}")
        print("--------------------------------------")

In [12]:
# Number of top-ranked results the search helpers return (read as a global).
k = 100

In [13]:
# Example query used by the search cells below.
question = "What are generators"

In [15]:
# Search the IDF-weighted sparse matrix directly (no SVD) and print the top k.
print_answers(question, find_closest_vectors_idf)

Result number: 1
Absolut value of angle between query and result: 0.31333871876767555
Question: Python optimization through bytecode
Question site url: https://stackoverflow.com/questions/20940935
--------------------------------------
Result number: 2
Absolut value of angle between query and result: 0.25668561660644856
Question: How to validate Python bytecode?
Question site url: https://stackoverflow.com/questions/23267970
--------------------------------------
Result number: 3
Absolut value of angle between query and result: 0.25199111924524653
Question: Python bytecode and .pyc file format specification
Question site url: https://stackoverflow.com/questions/35229387
--------------------------------------
Result number: 4
Absolut value of angle between query and result: 0.24871896548908304
Question: Debugging python bytecode when source is not available
Question site url: https://stackoverflow.com/questions/32486204
--------------------------------------
Result number: 5
Absolut val

In [17]:
# Load the 200-component truncated SVD pickled by save_truncated_svd(200).
truncated_idf = load_truncated_svd(200)

In [18]:
# Same query, ranked through the truncated SVD instead of the full matrix.
print_answers(question, find_closest_vectors_svd, truncated_idf)

Result number: 1
Absolut value of angle between query and result: 0.07376743893757974
Question: Python optimization through bytecode
Question site url: https://stackoverflow.com/questions/20940935
--------------------------------------
Result number: 2
Absolut value of angle between query and result: 0.0732394107487622
Question: Is it easy to fully decompile python compiled(*.pyc) files?
Question site url: https://stackoverflow.com/questions/3464326
--------------------------------------
Result number: 3
Absolut value of angle between query and result: 0.0717861899463477
Question: What are Python disassemblers used for?
Question site url: https://stackoverflow.com/questions/18647897
--------------------------------------
Result number: 4
Absolut value of angle between query and result: 0.07112096289087391
Question: Difference between loading time and running time in python?
Question site url: https://stackoverflow.com/questions/6803126
--------------------------------------
Result numb