In [1]:
from data_manager import Vector, TruncatedMatrix
import json
import pickle
from math import log10 as log
import numpy as np
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

In [2]:
# Load the alphabet (indexed by word further down when the matrix is built);
# the JSON file is decoded as latin-1.
with open("data/alphabet.json", "r", encoding="latin-1") as alphabet_file:
    alphabet = json.loads(alphabet_file.read())

In [None]:
# The pickle file holds a stream of separately pickled objects: keep reading
# one object at a time until the end of the file is reached.
# NOTE(review): pickle.load executes code from the file -- only use trusted data.
vectors: list[Vector] = []
with open("data/Bag_of_words.pickle", "rb") as bag_file:
    try:
        while True:
            vectors.append(pickle.load(bag_file))
    except EOFError:
        # Raised by pickle.load once nothing is left to read.
        pass

In [6]:
# M: number of alphabet entries, N: number of loaded vectors.
M, N = len(alphabet), len(vectors)
print(M, N)

231286 607282


In [7]:
# Document frequency: for every alphabet word, how many vectors list it.
# (Each vector adds one per word yielded by iterating `vec.vector`.)
word_frequency = dict.fromkeys(alphabet, 0)
for vec in vectors:
    for word in vec.vector:
        word_frequency[word] += 1

# Inverse document frequency: log10(N / document frequency).
# An alphabet word that occurs in no vector has a count of 0 and would raise
# ZeroDivisionError. Such a word is never weighted later (only words present
# in a vector are looked up), so it gets a neutral 0.0 instead of crashing.
idf = {
    word: log(N / count) if count else 0.0
    for word, count in word_frequency.items()
}

In [10]:
def create_sparse_matrix(filename="sparse_matrix.npz", use_idf=False):
    """Build the normalised word-by-vector sparse matrix and save it to disk.

    Reads the notebook globals `vectors`, `alphabet`, `idf`, `M` and `N`.
    Entry (alphabet[w], i) holds the weight of word `w` in `vectors[i]`:
    the stored value `vectors[i].vector[w]`, multiplied by `idf[w]` when
    `use_idf` is true. Each column is then scaled to unit Euclidean norm.

    Parameters
    ----------
    filename : str
        File name of the output, written under ``data/matrices/``.
    use_idf : bool
        When true, multiply every stored value by the word's idf weight.

    Returns
    -------
    None
        The matrix is written with `scipy.sparse.save_npz`, not returned.
    """
    no_of_entries = sum(len(v.vector) for v in vectors)

    # Preallocate the COO triplets directly instead of going through
    # temporary Python lists.
    data = np.zeros(no_of_entries, dtype=float)
    row = np.zeros(no_of_entries, dtype=int)
    col = np.zeros(no_of_entries, dtype=int)

    ind = 0
    for i, v in enumerate(vectors):
        start = ind
        weights = v.vector
        for w in weights:
            data[ind] = weights[w] * (idf[w] if use_idf else 1)
            row[ind] = alphabet[w]
            col[ind] = i
            ind += 1
        # Normalise this vector's slice. If every weight is zero (e.g. all of
        # its words have idf 0) the norm is 0 and dividing would fill the
        # column with NaN, so such a column is left as zeros.
        norm = np.linalg.norm(data[start:ind])
        if norm > 0:
            data[start:ind] /= norm

    matrix_to_save = sparse.csr_matrix((data, (row, col)), shape=(M, N), dtype=float)
    sparse.save_npz(f"data/matrices/{filename}", matrix_to_save)

In [11]:
# Build the idf-weighted matrix and write it to data/matrices/sparse_matrix_idf.npz.
create_sparse_matrix("sparse_matrix_idf.npz", use_idf=True)

In [3]:
# Load the saved idf-weighted matrix back from disk; save_truncated_svd reads
# it through this global.
matrix_idf = sparse.load_npz("data/matrices/sparse_matrix_idf.npz")

In [4]:
def save_truncated_svd(truncation_level, random_state=None):
    """Compute a truncated SVD of the global `matrix_idf` and pickle the result.

    Parameters
    ----------
    truncation_level : int
        Number of components to keep (passed to `TruncatedSVD` as
        `n_components`); also used in the output file name.
    random_state : int, RandomState instance or None, default=None
        Forwarded to `TruncatedSVD`. Pass an int to make the factorisation
        reproducible between runs; None keeps the previous unseeded behaviour.

    Returns
    -------
    None
        A `TruncatedMatrix` built from the truncation level, the transformed
        matrix, the singular values, the components and the per-column lengths
        is written to ``data/matrices/svd_matrix_{truncation_level}.pickle``.
    """
    svd = TruncatedSVD(truncation_level, random_state=random_state).fit(matrix_idf)
    u_s = svd.transform(matrix_idf)
    sigma = svd.singular_values_
    v = svd.components_
    # Scale row i of `v` by sigma[i] and take the Euclidean norm of every
    # column. This gives the same values as np.diag(sigma) @ v without
    # forming the diagonal matrix or doing a dense matrix product.
    vector_lengths = np.linalg.norm(sigma[:, np.newaxis] * v, axis=0)
    svd_truncation_level = TruncatedMatrix(truncation_level, u_s, sigma, v, vector_lengths)
    with open(f"data/matrices/svd_matrix_{truncation_level}.pickle", "wb") as file:
        pickle.dump(svd_truncation_level, file)

In [None]:
%%time
# Truncation level 20; the result is pickled to data/matrices/svd_matrix_20.pickle.
save_truncated_svd(20)

In [None]:
%%time
# Truncation level 50; the result is pickled to data/matrices/svd_matrix_50.pickle.
save_truncated_svd(50)

In [None]:
%%time
# Truncation level 100; the result is pickled to data/matrices/svd_matrix_100.pickle.
save_truncated_svd(100)