In [1]:
from data_manager import TruncatedMatrix, Question
import json
import pickle
import numpy as np
from scipy import sparse

In [2]:
# Vocabulary lookup: the functions below test `w in alphabet` and use `alphabet[w]`
# as the column index of stemmed term `w`.
# NOTE(review): the file is decoded as latin-1 rather than UTF-8 — confirm this
# matches the encoding it was written with.
with open("../data/alphabet.json", encoding="latin-1") as f:
    alphabet = json.load(f)
    
def load_truncated_svd(truncation_level):
    """Unpickle the stored SVD object for the given truncation level.

    Args:
        truncation_level: Value interpolated into the file name
            ``../data/matrices/svd_matrix_{truncation_level}.pickle``.

    Returns:
        Whatever object was pickled into that file.

    Raises:
        FileNotFoundError: If no file exists for ``truncation_level``.
    """
    # pickle can execute arbitrary code while loading; only open files that
    # were produced by this project.
    with open(f"../data/matrices/svd_matrix_{truncation_level}.pickle", "rb") as file:
        return pickle.load(file)

In [3]:
# All objects that were pickled, one after another, into questions.pickle.
questions: list[Question] = []
with open("../data/questions.pickle", "rb") as f:
    try:
        # The file holds a sequence of independent pickles; keep reading
        # until the stream is exhausted.
        while True:
            questions.append(pickle.load(f))
    except EOFError:
        # pickle.load signals end-of-file with EOFError; this is the normal exit.
        pass

In [4]:
# M is the number of columns of the query vectors built by convert_to_sparse /
# convert_to_dense.
# NOTE(review): N is not referenced in the visible cells — presumably the number
# of indexed questions; confirm.
N = 607282
M = 231286

In [5]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.probability import FreqDist
from string import punctuation
from math import sqrt


# Lower-cased English stop words, kept in a set for the membership test in process_text.
stop_words = {word.lower() for word in stopwords.words('english')}
# Stemmer applied to every surviving query token in process_text.
snowball_stemmer = SnowballStemmer('english')

def normalize_vector(vector):
    """Scale a key -> weight mapping in place to unit Euclidean length.

    Args:
        vector: Mutable mapping whose values are numbers. It is modified
            in place.

    Returns:
        The same ``vector`` object, with every value divided by the L2 norm
        of the original values. A mapping whose norm is zero (empty, or all
        weights zero) is returned unchanged instead of raising
        ``ZeroDivisionError``.
    """
    length = sqrt(sum(weight ** 2 for weight in vector.values()))
    if length == 0:
        # Nothing to scale; dividing would raise for an all-zero vector.
        return vector
    for w in vector:
        vector[w] /= length
    return vector

def process_text(text: str) -> dict[str, float]:
    """Turn raw query text into a unit-length bag-of-stems vector.

    Steps, in order: replace every character of ``string.punctuation`` with a
    space, tokenize into lower-cased words, drop stop words, stem with the
    Snowball stemmer, keep only stems present in ``alphabet``, count them and
    normalise the counts with ``normalize_vector``.

    Args:
        text: The query text.

    Returns:
        Mapping from stem to its count divided by the L2 norm of all counts.
        Empty when no token survives the filters.
    """
    # One translation table instead of a per-character lambda + join.
    punctuation_to_space = str.maketrans(punctuation, " " * len(punctuation))
    text = text.translate(punctuation_to_space)
    tokens = (w.lower() for sentence in sent_tokenize(text) for w in word_tokenize(sentence))
    # Stop words are filtered before stemming, vocabulary membership after.
    stems = (snowball_stemmer.stem(w) for w in tokens if w not in stop_words)
    words = [w for w in stems if w in alphabet]
    return normalize_vector(dict(FreqDist(words)))

In [10]:
def convert_to_sparse(query):
    """Build a 1 x M CSR row vector from a term -> weight mapping.

    Args:
        query: Mapping whose keys are present in ``alphabet``; ``alphabet[key]``
            is used as the column index and ``query[key]`` as the stored value.

    Returns:
        ``scipy.sparse.csr_matrix`` of shape ``(1, M)`` and dtype float.
    """
    terms = list(query)
    weights = np.array([query[term] for term in terms], dtype=float)
    columns = np.array([alphabet[term] for term in terms], dtype=int)
    # Single-row matrix, so every entry sits in row 0.
    rows = np.zeros(len(terms), dtype=int)
    return sparse.csr_matrix((weights, (rows, columns)), shape=(1, M), dtype=float)

def k_best_vectors(vector: sparse.csr_matrix, k: int):
    """Return the ``k`` largest entries of the first row of ``vector``.

    Args:
        vector: Sparse matrix; only row 0 is inspected.
        k: Maximum number of entries to return. Values larger than the number
            of columns are clamped (the same way slicing clamps in
            ``find_closest_vectors_svd``); values <= 0 give an empty list.

    Returns:
        List of ``(column_index, value)`` pairs ordered by descending value.
    """
    # Densify once and sort once; the values are gathered through the index order.
    dense = np.asarray(vector.todense())
    order = np.argsort(dense, axis=1)[0, ::-1]
    return [(idx, dense[0, idx]) for idx in order[:max(k, 0)]]

def convert_to_dense(query):
    """Expand a term -> weight mapping into a dense vector of length M.

    Args:
        query: Mapping whose keys are present in ``alphabet``; ``alphabet[key]``
            is the position that receives ``query[key]``.

    Returns:
        1-D float ``numpy.ndarray`` of length ``M``, zero everywhere except at
        the positions of the query terms.
    """
    # np.zeros allocates the array directly instead of first building an
    # M-element Python list on every call.
    data = np.zeros(M, dtype=float)
    for w in query:
        data[alphabet[w]] = query[w]
    return data

def find_closest_vectors_svd(query, svd: TruncatedMatrix, k: int):
    """Rank stored vectors against a query using the truncated SVD factors.

    The score of each stored vector is
    ``abs(((q @ svd.U_S) @ svd.V) / svd.vector_lengths)``, where ``q`` is the
    dense form of ``query``.

    Args:
        query: Term -> weight mapping, as produced by ``process_text``.
        svd: Object exposing ``U_S``, ``V`` and ``vector_lengths``.
            Presumably ``U_S`` is (M, r), ``V`` is (r, n_documents) and
            ``vector_lengths`` holds one norm per document — TODO confirm
            against ``TruncatedMatrix``.
        k: Number of results to return.

    Returns:
        List of ``(index, score)`` pairs for the ``k`` highest scores, in
        descending score order.
    """
    dense_query = convert_to_dense(query)
    scores = np.abs(((dense_query @ svd.U_S) @ svd.V) / svd.vector_lengths)
    # NOTE(review): a zero in svd.vector_lengths would yield inf/nan here, and
    # argsort places NaN last, i.e. first after the reversal — confirm the
    # lengths are strictly positive.
    # One argsort is enough: the scores are gathered through the same order
    # instead of sorting the full score vector a second time.
    top = np.argsort(scores)[::-1][:k]
    return list(zip(top, scores[top]))

def print_answers(query_text, function, *args):
    """Run a search function on a text query and print the hits.

    Args:
        query_text: Raw query string; it is passed through ``process_text``.
        function: Callable invoked as ``function(processed_query, *args)``; it
            must return an iterable of ``(index, score)`` pairs where ``index``
            selects an entry of ``questions``.
        *args: Extra positional arguments forwarded to ``function``.

    The score is printed under the label "angle"; it is whatever number
    ``function`` returns for that hit.
    """
    base_url = "https://stackoverflow.com/questions/"
    hits = function(process_text(query_text), *args)
    for rank, (ind, angle) in enumerate(hits, start=1):
        print(f"Result number: {rank}")
        print(f"Absolut value of angle between query and result: {angle}")
        hit = questions[ind]
        print(f"Question: {hit.title}")
        print(f"Question site url: {base_url}{hit.id}")
        print("--------------------------------------")

In [11]:
# Free-text query for the first search demo.
question = "What are generators"

In [12]:
# Loads ../data/matrices/svd_matrix_100.pickle (truncation level 100).
truncated_idf = load_truncated_svd(100)

In [13]:
# Print the 20 highest-scoring questions for the current `question`, ranked with the loaded SVD.
print_answers(question, find_closest_vectors_svd, truncated_idf, 20)

Result number: 1
Absolut value of angle between query and result: 0.14018989411145244
Question: Recursive python generators: why does the yield need to be iterated over?
Question site url: https://stackoverflow.com/questions/25141510
--------------------------------------
Result number: 2
Absolut value of angle between query and result: 0.13884618695721068
Question: Fast way for getting unique values from iterators
Question site url: https://stackoverflow.com/questions/27440441
--------------------------------------
Result number: 3
Absolut value of angle between query and result: 0.13701741582915355
Question: Are Python/ES6 Generators also Coroutines?
Question site url: https://stackoverflow.com/questions/31892128
--------------------------------------
Result number: 4
Absolut value of angle between query and result: 0.13679040610320933
Question: Itertools.permutations create n random solutions to the TSP
Question site url: https://stackoverflow.com/questions/25805138
----------------

In [14]:
# Free-text query for the second search demo (rebinds `question`).
question = "Switch case in python"

In [15]:
# Print the 20 highest-scoring questions for the current `question`, ranked with the loaded SVD.
print_answers(question, find_closest_vectors_svd, truncated_idf, 20)

Result number: 1
Absolut value of angle between query and result: 0.04546931845836813
Question: Are there technical reasons a Ruby DSL like RSpec couldn't be rewritten in Python?
Question site url: https://stackoverflow.com/questions/7079855
--------------------------------------
Result number: 2
Absolut value of angle between query and result: 0.04511185468400673
Question: design of python: why is assert a statement and not a function?
Question site url: https://stackoverflow.com/questions/13390401
--------------------------------------
Result number: 3
Absolut value of angle between query and result: 0.044239640026830215
Question: Do we have something similar/equivalent to Jdbctemplate in Python?
Question site url: https://stackoverflow.com/questions/7494640
--------------------------------------
Result number: 4
Absolut value of angle between query and result: 0.044065791374514926
Question: Terse error-checking in Python
Question site url: https://stackoverflow.com/questions/2096421