# Latent Semantic Indexing project

## Importing the libraries

In [69]:
import os
import argparse
import math
import pandas as pd
import numpy as np
from numpy.linalg import norm
from collections import Counter
from scipy.sparse import csr_matrix, coo_matrix, find, save_npz, load_npz
import string
import re
# import Natural Language Toolkit for preprocessing the data
import nltk

nltk.download("wordnet")
nltk.download("punkt")

from nltk import word_tokenize  # tokenizer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import normalize

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/pettepiero/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/pettepiero/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Setting up the notebook

In [70]:
LOAD = True
SKLEARN = False
COMPARE = False
VERBOSE = True

## Loading data

In [71]:
with open("data/time/TIME.ALL", "r") as f:
    lines = f.read().split("*TEXT ")
    lines = lines[1:]

documents_list = []  # list of tuples of strings (headline, content)

for i, article in enumerate(lines):
    parts = article.split("\n\n", 1)
    documents_list.append((parts[0], parts[1].lstrip("\n").lower()))

# Importing stop words from TIME.STP dataset into a list
with open("data/time/TIME.STP", "r") as f:
    sw = f.read().split("\n\n")
stop_words = [word.lower() for word in sw]

del lines
del sw

## Preprocessing

In [72]:
def remove_contractions(words: list):
    """
    Removes contractions from a list of words.

    Keyword arguments:
    words -- list of words to remove contractions from
    Returns:
    A list of words without contractions.
    """
    # List of common English contractions
    contractions = [
        "'s",
        "'re",
        "'ve",
        "'d",
        "'ll",
        "'m",
        "'em",
        "n't",
        "'clock",
        "'tis",
        "'twas",
    ]
    words_without_contractions = [word for word in words if word not in contractions]

    return words_without_contractions


def remove_thousands_separator(input_string: str):
    """
    Removes commas used as thousands separators from a string 
    when surrounded by numbers.

    Keyword arguments:
    input_string -- The input string with commas as thousands separators.
    Returns:
    The input string with appropriate commas removed.
    """
    result = ""
    for i, char in enumerate(input_string):
        if (
            char == ","
            and i > 0
            and i < len(input_string) - 1
            and input_string[i - 1].isdigit()
            and input_string[i + 1].isdigit()
        ):
            continue  # Skip the comma if it's between two digits
        result += char
    return result

In [73]:
# This preprocessing part uses some built-in functions from nltk.
def clean_text(headline: str):
    """
    Cleans a string by removing punctuation, tokenizing, lowercasing,
    lemmatizing, removal of contractions and stop words. Also removes
    words whose length is 0.
    
    Keyword arguments:
    headline -- string to clean
    Returns:
    A string
    """
    le = WordNetLemmatizer()  # from WordNet
    word_tokens = word_tokenize(headline)
    word_tokens = [
        w.lower()  # necessary because used for query too
        for w in word_tokens
        if w not in string.punctuation and w not in ["``"]
    ]
    word_tokens = remove_contractions(word_tokens)
    tokens = [
        le.lemmatize(w) for w in word_tokens if w not in stop_words and len(w) > 0
    ]
    # for w in word_tokens:
    #     if '-' in w:
    #         # Use regular expression to split words containing hyphens
    #         subtokens = re.split(r'[-\s]', w)
    #         subtokens = [le.lemmatize(subtoken) for subtoken in subtokens if subtoken not in stop_words and len(subtoken) > 0]
    #         tokens.extend(subtokens)
    #     else:
    #         tokens.append(le.lemmatize(w))

    cleaned_text = " ".join(tokens)
    return cleaned_text


def str_to_lst(sentence: str):
    """
    Converts a string to a list of words.

    Keyword arguments:
    sentence -- string to convert
    Returns:
    A list of words.
    """
    lst = sentence.split()
    lst = [word.strip(string.punctuation) for word in lst]
    return lst

In [74]:
def str_df_to_lst_df(df: pd.DataFrame):
    """
    Converts a dataframe with a column of strings
    to a dataframe with a column of lists.
    
    Keyword arguments:
    df -- dataframe to convert
    Returns:
    A dataframe with a column of lists.
    """
    # Very extremist function that only works for this specific case
    df["Listed content"] = df["Cleaned content"].apply(str_to_lst)
    return df


def preprocess_TIME_data(docs_list: list):
    """
    Preprocesses the TIME.ALL dataset.
    Keyword arguments:
    docs_list -- list of tuples of strings (headline, content)
    Returns:
    A dataframe with a column of strings and a column of lists.
    """
    df = pd.DataFrame(data=docs_list)
    df.columns = ["Article", "Content"]
    df.drop(["Article"], axis=1, inplace=True)
    df["Cleaned content"] = df["Content"].apply(clean_text)
    df["Cleaned content"] = df["Cleaned content"].apply(remove_thousands_separator)
    df = str_df_to_lst_df(df)

    return df

In [75]:
data = preprocess_TIME_data(docs_list=documents_list)
if VERBOSE:
    print("Finished loading data and created dataframe.")

Finished loading data and created dataframe.


## Creating a dictionary

In [76]:
words = [
    word for sentence in data["Listed content"] for word in sentence if len(word) > 1
]
word_counts = Counter(words)
cols_dict = {word: idx for idx, (word, _) in enumerate(word_counts.items())}
if VERBOSE:
    print(f"Created dictionary of length {len(cols_dict)}:")
    print(cols_dict)

del word_counts
del words

Created dictionary of length 20510:


This dictionary will be used from now on to map words to their index in the matrices.

### Creating pipeline

In [77]:
# Creating sklearn pipeline
custom_pattern = r"\b\w+\b|\b\w+-\w+\b"
pipe = Pipeline(
    [
        (
            "count",
            CountVectorizer(vocabulary=cols_dict, token_pattern=custom_pattern),
        ),
        ("tfidf", TfidfTransformer()),
    ]
)

print(f"Type of pipe: {type(pipe)}")
print(f"Type of pipe['count'] = {type(pipe['count'])}")
print(f"Type of pipe['tfidf'] = {type(pipe['tfidf'])}")

Type of pipe: <class 'sklearn.pipeline.Pipeline'>
Type of pipe['count'] = <class 'sklearn.feature_extraction.text.CountVectorizer'>
Type of pipe['tfidf'] = <class 'sklearn.feature_extraction.text.TfidfTransformer'>


## Create term/document matrix

A dictionary is used to store the count of appereances of each word.

In [78]:
def count_words(sentence: str):
    """
    Counts the number of appearances of each word in a sentence.
    Returns a dictionary with the words as keys and the number 
    of appearances as values.

    Keyword arguments:
    sentence -- string
    Returns:
    A dictionary
    """
    counter_dict = {}
    words = re.findall(r"\b[\w-]+\b", sentence)
    for word in words:
        word = word.strip(string.punctuation)  # Remove leading and trailing punctuation
        # Split words containing hyphens
        if "-" in word:
            subwords = word.split("-")
            for subword in subwords:
                counter_dict[subword] = counter_dict.get(subword, 0) + 1
        else:
            counter_dict[word] = counter_dict.get(word, 0) + 1

    return counter_dict

In [79]:
def create_doc_term_matrix(corpus: list, mapper_dictionary: dict):
    """
    Calculates document (rows) - term (cols) matrix.
    Each cell is the number of appearances. Uses a dictionary,
    so only elements in that dictionary will be counted.
    Keyword arguments:
    corpus -- list of strings (documents)
    mapper_dictionary -- dict to assign every known word to a column
    Returns:
    A sparse matrix
    """
    nz_tuples = []
    for d_idx, doc in enumerate(corpus):
        counter_dict = count_words(doc)
        for word in counter_dict:
            if word in mapper_dictionary:
                word_index = mapper_dictionary[word]
                nz_tuples.append((d_idx, word_index, counter_dict[word]))

    rows, cols, values = zip(*nz_tuples)
    temporary_matrix = coo_matrix((values, (rows, cols)))

    return temporary_matrix.tocsr()  # Convert COO matrix to CSR matrix and return


def get_idf(term_idx: int, count_matrix: csr_matrix):
    """
    Calculates the idf for a term given its index in the count matrix.

    Keyword arguments:
    term_idx -- index of the term in the count matrix
    count_matrix -- Term frequency csr matrix
    Returns:
    A float
    """
    n = count_matrix.shape[0]
    # Set of non zero docs for this term
    docs = set(count_matrix[:, term_idx].nonzero()[0])
    df = len(docs)
    tsidfs = []  # List of idf for a term
    for doc in docs:
        tf = count_matrix[doc, term_idx]
        idf = math.log((1 + n) / (1 + df), 10) + 1  # Formula from sklearn
        tsidfs.append(tf * idf)


    return tsidfs

In [80]:
def calc_tf_idf(count_matrix: csr_matrix):
    """
    Calculates the tf-idf matrix given a count matrix.
    The aim is to obtain the same results as sklearn.TfidfTransformer

    Keyword arguments
    count_matrix -- Term frequency csr matrix
    Returns:
    A sparse matrix
    """
    if VERBOSE:
        print(
            "\nYou've chosen the extremely slow algorithm... this is going to take a while."
        )
    tuples = []
    n_terms = count_matrix.shape[1]
    for term in range(n_terms):
        if VERBOSE:
            if term % 100 == 0:
                print(f"Current term: {term}/{n_terms}")
        idfs = get_idf(term, count_matrix)
        docs = set(count_matrix[:, term].nonzero()[0])
        if len(idfs) == len(docs):
            for doc, idf in zip(docs, idfs):
                tuples.append((doc, term, idf))
        else:
            print("ERROR inside 'calc_tf_idf' -> len(idfs) != len(docs)")
        rows, cols, values = zip(*tuples)
        temporary = coo_matrix((values, (rows, cols)))
        normalized_matrix = normalize(temporary, norm="l2", axis=1)

    return normalized_matrix.tocsr()


def scikit_matr(dataframe: pd.DataFrame, pipeline: Pipeline):
    """
    Create document/term and tfidf matrix using
    scikit-learn CountVectorizer and TfidfTransformer.

    Keyword arguments:
    dataframe --    pandas df where column "Listed content" is the one
                    containing the list of words for each document.
    pipeline --     sklearn pipeline object
    Returns:
    A tuple containing the sparse document/term matrix,
    the sparse tfidf matrix and the pipeline object.
    """
    doc_t_matrix = pipeline["count"].fit_transform(dataframe["Cleaned content"])
    tfidf = pipeline.fit_transform([doc for doc in dataframe["Cleaned content"]])
    return doc_t_matrix, tfidf, pipeline

### What happens in scikit_matr?

1) **pipeline["count"].transform(*list of strings*):**
   
   takes the *list of strings* and creates a mapping from words to feature indices. This is done with a dictionary, which will contain each word as keys and the mapping as values. This dictionary is stored in the 'vocabulary_' attribute of the CountVectorizer object, i.e. pipeline["count"].

2) **pipeline.fit_transform(*list of strings*):**

   This is actually fitting all of the transformers one after the other and sequentially transforming the data. Therefore, pipeline["count"] is fitted again. This time, the returned value is that of the last element of the pipeline. The fit_transform of pipeline["tfidf"] object computes the IDF values for each term in the dataset and stores these values in the "idf_" attribute.

In [81]:
def print_sparse_matrix_difference(
    matrix1: csr_matrix, matrix2: csr_matrix, epsilon: float = 0.05
):
    """
    Prints differences between two sparse matrices. A cell is considered
    different if the absolute value of the difference between the two
    values is greater than 0.05.

    Keyword arguments:
    matrix1 -- First sparse matrix
    matrix2 -- Second sparse matrix
    epsilon -- Threshold for the difference between two values
    """
    counter = 0
    rows1, cols1, values1 = find(
        matrix1
    )  # Find non-zero elements and their coordinates
    rows2, cols2, values2 = find(matrix2)
    set1 = set(zip(rows1, cols1, values1))
    set2 = set(zip(rows2, cols2, values2))

    num_elements_to_print = 15
    list1 = sorted(set1)
    list2 = sorted(set2)
    if VERBOSE:
        print("\tFirst 15 elements for each matrix.")
        for i in range(min(num_elements_to_print, len(list1))):
            print(f"\t\tset1[i] = {list1[i]} \t set2[i] = {list2[i]}")

    differences = set1.symmetric_difference(set2)
    # Find the differences between the two sets
    for diff in differences:
        row, col, _ = diff
        value1 = matrix1[row, col] if diff in set1 else 0
        value2 = matrix2[row, col] if diff in set2 else 0
        if abs(value1 - value2) >= 0.05:
            counter += 1
    if VERBOSE:
        print(f"\tNumber of differences = {counter}\n")

## Loading matrices from files or creating them

In [82]:
def load_all_matrices():
    """
    Function to load all sparse matrices from './matrices/'.
    Returns the matrices in the following order:
    doc_t_mtx, true_doc_term_matrix, my_tfidf, tfidf
    """
    if LOAD:
        doc_t_mtx = load_npz("./matrices/my_dt.npz")
        true_doc_t_mtx = load_npz("./matrices/true_dt.npz")
        # TF-IDF matrices
        my_tfidf = load_npz("./matrices/my_tfidf.npz")
        tfidf = load_npz("./matrices/true_tfidf.npz")
        if VERBOSE:
            print("\tLoaded term count matrix from './matrices/my_dt.npz'")
            print("\tLoaded true doc-term matrix from './matrices/true_dt.npz'")
            print("\tLoaded tf-idf matrix from './matrices/my_tfidf.npz'")
            print("\tLoaded true tfidf matrix from './matrices/true_tfidf.npz'")

        return doc_t_mtx, true_doc_t_mtx, my_tfidf, tfidf
    else:
        print("ERROR: load_all_matrices shouldn't have been called.")
        return 0
    

In [83]:
if LOAD:
    print("Set to load matrices mode.")
    doc_t_mtx, true_doc_term_matrix, my_tfidf, tfidf = load_all_matrices()

    if COMPARE:
        print("Set to matrix compare mode.")
        if VERBOSE:
            print(
                f"\tCustom document/term matrix number of non-zero elements: {doc_t_mtx.nnz}"
            )
            print(
                f"\tpipe['count'].transform(data['Cleaned content']) number of non-zero elements: {true_doc_term_matrix.nnz}"
            )
            print(
                "\tComparing sklearn document/term sparse matrix (set1) vs custom one (set2)'"
            )
        print_sparse_matrix_difference(true_doc_term_matrix, doc_t_mtx)
        # TF-IDF matrix comparison
        if VERBOSE:
            print("Comparing sklearn tf-idf matrix (set1) vs custom one (set2)")
        print_sparse_matrix_difference(tfidf, my_tfidf)
        print("\tNote:\tthis sounds like a terrible number but the calculated")
        print("\t\tvalues and the sklearn ones are actually very close!")

else:
    if COMPARE:
        print("Set to matrix compare mode.")

    if SKLEARN or COMPARE:
        print("Using scikit-learn library.")
        print("Creating document/term matrix...\n")
        true_doc_term_matrix, tfidf, pipe = scikit_matr(dataframe=data, pipeline=pipe)
        save_npz("./matrices/true_dt.npz", true_doc_term_matrix)
        print("Saved doc/term matrix to './matrices/true_dt.npz'")
        save_npz("./matrices/true_tfidf.npz", tfidf)

        idf = pipe["tfidf"].idf_
        dd = dict(zip(pipe.get_feature_names_out(), idf))
        sorted_dict = sorted(dd, key=dd.get)

    if not SKLEARN or COMPARE:
        print("Creating document/term matrix...\n")
        doc_t_mtx = create_doc_term_matrix(data["Cleaned content"], cols_dict)
        save_npz("./matrices/my_dt.npz", doc_t_mtx)
        print("Saved matrix to './matrices/my_dt.npz'")

        print("\nUsing custom algorithm to calculate tf-idf")
        my_tfidf = calc_tf_idf(doc_t_mtx)
        save_npz("./matrices/my_tfidf.npz", my_tfidf)
        print("Saved matrix to './matrices/my_tfidf.npz'")

        print(f"tfidf = \n {my_tfidf}")

Set to matrix compare mode.
Using scikit-learn library.
Creating document/term matrix...



Saved doc/term matrix to './matrices/true_dt.npz'
Creating document/term matrix...

Saved matrix to './matrices/my_dt.npz'

Using custom algorithm to calculate tf-idf

You've chosen the extremely slow algorithm... this is going to take a while.
Current term: 0/20510
Current term: 100/20510
Current term: 200/20510
Current term: 300/20510
Current term: 400/20510
Current term: 500/20510
Current term: 600/20510
Current term: 700/20510
Current term: 800/20510
Current term: 900/20510
Current term: 1000/20510
Current term: 1100/20510
Current term: 1200/20510
Current term: 1300/20510
Current term: 1400/20510
Current term: 1500/20510
Current term: 1600/20510
Current term: 1700/20510
Current term: 1800/20510
Current term: 1900/20510
Current term: 2000/20510
Current term: 2100/20510
Current term: 2200/20510
Current term: 2300/20510
Current term: 2400/20510
Current term: 2500/20510
Current term: 2600/20510
Current term: 2700/20510
Current term: 2800/20510
Current term: 2900/20510
Current term: 300

## Latent Semantic Analysis

Create a truncated SVD model with 100 components. That is, only the first 100 singular values and vectors will be used.

In [84]:
lsa_model = TruncatedSVD(
    n_components=100, algorithm="randomized", n_iter=10, random_state=42
)

if SKLEARN:
    # sklearn's tfidf matrix is still going to be used from now on
    _, tfidf, pipe = scikit_matr(dataframe=data, pipeline=pipe)
else:
    print("Using custom tfidf matrix.")
    tfidf = my_tfidf

lsa_matrix = lsa_model.fit_transform(tfidf)
print("Created latent semantic analysis model")

Using custom tfidf matrix.
Created latent semantic analysis model


In [85]:
print("Showing the singular values:\n")
print(lsa_model.singular_values_)

Showing the singular values:

[4.14303868 2.67376226 2.51540967 2.44209524 2.27237358 2.06886913
 1.9893948  1.94434177 1.90992711 1.87625195 1.82627205 1.78856187
 1.74918883 1.68235583 1.65264002 1.61227931 1.5842544  1.55865416
 1.53768507 1.53030635 1.50941562 1.49341566 1.4648994  1.46322646
 1.44058998 1.43506419 1.41478355 1.39291776 1.37973024 1.37635248
 1.3660572  1.34366197 1.32876537 1.319917   1.31328251 1.304815
 1.29787292 1.28644573 1.28444218 1.26861683 1.26190984 1.24530059
 1.23989991 1.23846371 1.23517532 1.22804525 1.21974027 1.21551644
 1.21046041 1.20755668 1.19997333 1.19572069 1.19159995 1.18785557
 1.18362313 1.17826129 1.17216329 1.16504036 1.1607677  1.16001246
 1.15313795 1.1493255  1.14631575 1.14243459 1.13890031 1.13502194
 1.12994578 1.12726198 1.12645685 1.11710585 1.11353018 1.10831617
 1.10584373 1.10142825 1.09750016 1.09493394 1.09084608 1.08727334
 1.08479382 1.07962474 1.07769482 1.07624892 1.0742022  1.07053256
 1.06716262 1.06621365 1.06204274 

Let's try to see what the most important terms are in the most important "topics".

In [86]:
pipe.fit([doc for doc in data["Cleaned content"]])
vocab = pipe.get_feature_names_out()

for i, comp in enumerate(lsa_model.components_[:3]):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key=lambda x: x[1], reverse=True)[:10]

    if VERBOSE:
        print(
            "\tTopic "
            + str(i)
            + f" of weight: {lsa_model.singular_values_[i]} - Top 10 most important words: "
        )
        for t in sorted_words:
            print(f"{t[0]}", end=" ")
        print("\n")

	Topic 0 of weight: 4.143038678937661 - Top 10 most important words: 
government de party communist new red gaulle viet minister soviet 

	Topic 1 of weight: 2.673762263809813 - Top 10 most important words: 
viet diem buddhist nam south saigon cong nhu government vietnamese 

	Topic 2 of weight: 2.5154096697669344 - Top 10 most important words: 
de gaulle viet france diem buddhist europe french nam south 



In [96]:
terms = pipe["count"].get_feature_names_out()
print(len(terms))
print(lsa_model.components_.shape)
weights_per_term = pd.DataFrame(lsa_model.components_, columns=terms)
print(weights_per_term[:3])

20510
(100, 20510)
       ally    nassau  december      1960           u.s  proposed      help  \
0  0.034961  0.007682  0.007557  0.015843 -1.739247e-22  0.013908  0.031753   
1 -0.038249 -0.014990 -0.006612 -0.001396  1.616808e-18 -0.005684  0.004900   
2  0.028258  0.016876  0.006527 -0.000659  4.454269e-18  0.003608 -0.000741   

       nato   develop   nuclear  ...  garnered  d.r.p  45-seat  assemblyman  \
0  0.059715  0.006957  0.060691  ...  0.000722   -0.0     -0.0     0.000722   
1 -0.080739 -0.006886 -0.080309  ...  0.000801    0.0      0.0     0.000801   
2  0.073361  0.000263  0.043715  ...  0.000259    0.0      0.0     0.000259   

   innauguration       236  stabilizing  palatable       113     omaha  
0       0.000722  0.000722     0.000722   0.000722  0.001867  0.001244  
1       0.000801  0.000801     0.000801   0.000801  0.001846 -0.000580  
2       0.000259  0.000259     0.000259   0.000259  0.000461 -0.000737  

[3 rows x 20510 columns]


In [88]:
if VERBOSE:
    print(f"\tlsa_matrix:\n\t {lsa_matrix}")
    print(f"\tlsa_matrix.shape: {lsa_matrix.shape}")
    print(f"\ttf_transformer.get_feature_names_out(): {pipe.get_feature_names_out()}")

	lsa_matrix:
	 [[ 0.35757215 -0.33058819  0.3561878  ...  0.01529527 -0.01853001
  -0.0127625 ]
 [ 0.15155977 -0.03062733 -0.14254489 ... -0.10965427  0.07995239
   0.00839892]
 [ 0.11578728  0.00066753 -0.03160293 ...  0.03460429 -0.06264624
   0.04801383]
 ...
 [ 0.11544345  0.03247569 -0.01153102 ...  0.0056115   0.00982517
   0.01432413]
 [ 0.17186565  0.07946109  0.02269839 ...  0.02544908 -0.00430022
   0.00836932]
 [ 0.10455973 -0.02030834 -0.02282379 ...  0.03061291  0.0327303
  -0.09795992]]
	lsa_matrix.shape: (423, 100)
	tf_transformer.get_feature_names_out(): ['ally' 'nassau' 'december' ... 'palatable' '113' 'omaha']


In [89]:
for i, comp in enumerate(lsa_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key=lambda x: x[1], reverse=True)[:10]

    if VERBOSE:
        print("\tTopic " + str(i) + f" of weight:{lsa_model.singular_values_[i]} - Top 10 most important words: ")
        for t in sorted_words:
            print(f"{t[0]}", end=" ")
        print("\n")

	Topic 0 of weight:4.143038678937661 - Top 10 most important words: 
government de party communist new red gaulle viet minister soviet 

	Topic 1 of weight:2.673762263809813 - Top 10 most important words: 
viet diem buddhist nam south saigon cong nhu government vietnamese 

	Topic 2 of weight:2.5154096697669344 - Top 10 most important words: 
de gaulle viet france diem buddhist europe french nam south 

	Topic 3 of weight:2.442095237641751 - Top 10 most important words: 
nasser arab baath syria egypt yemen iraq syrian cairo saudi 

	Topic 4 of weight:2.2723735775455713 - Top 10 most important words: 
tory labor election party macmillan wilson britain christine minister socialist 

	Topic 5 of weight:2.068869132082329 - Top 10 most important words: 
nenni socialist party fanfani christian democrat italy election nasser apertura 

	Topic 6 of weight:1.9893948046005239 - Top 10 most important words: 
nenni lao fanfani neutralist socialist christian tshombe democrat communist premier 

	To

### Dealing with the query

In [90]:
def get_query():
    """
    Gets query from user and returns it after cleaning it.
    
    Returns:
    A string
    """
    print("\n********************************************************")
    query_str = input("Write your free-text query: ")
    print(query_str)
    query = clean_text(query_str)
    if VERBOSE:
        print(f"Nice, what I'm going to use is: {query}")

    return query


def transform_query(query: str):
    """
    Transforms query into a vector using the pipeline
    
    Keyword arguments:
    query -- string
    Returns:
    A vector
    """
    query_vector = pipe.transform([query])
    if VERBOSE:
        print(f"query_vector: {query_vector}")
        print(f"query_vector.shape: {query_vector.shape}")
    query_lsa = lsa_model.transform(query_vector).reshape(-1)
    
    return query_lsa

In [91]:
def cos_similarity(v1: np.ndarray, v2: np.ndarray):
    """
    Calculates the cosine similarity between two vectors.

    Keyword arguments:
    v1 -- first vector
    v2 -- second vector
    Returns:
    A float
    """
    return np.dot(v1, v2) / (norm(v1) * norm(v2))


def sim_measures(query_vector: np.ndarray, docs_matr: np.ndarray):
    """
    Calculates the cosine similarity between a query vector and
    a matrix of documents.

    Keyword arguments:
    query_vector -- vector representing the query
    docs_matr -- matrix of documents
    Returns:
    A list of floats
    """
    measures = []
    for doc in docs_matr:
        sim = cos_similarity(query_vector, doc)
        measures.append(sim)
        
    return measures

In [92]:
def print_top_results(res_df: pd.DataFrame, n_results: int =5):
    """
    Prints top n_results from an ordered dataframe.

    Keyword arguments:
    res_df -- ordered pd.Dataframe
    n_results -- number of results to print
    """
    print(f"{res_df.iloc[:n_results]}")

    if VERBOSE:
        topn_doc_indices = res_df.iloc[:n_results, 0].index.tolist()
        for doc in topn_doc_indices:
            print(f"\n\n{documents_list[doc][0]}")
            print(f"{documents_list[doc][1]}")

In [93]:
query = get_query()
query_v = transform_query(query)

results = sim_measures(query_v, lsa_matrix)
res_df = pd.DataFrame(data=results)
res_df.index = res_df.index + 1
res_df.columns = ["Similarity measure"]
res_df.index.name = "DocID"
res_df = res_df.sort_values(by="Similarity measure", ascending=False)

print_top_results(res_df, 5)


********************************************************



Nice, what I'm going to use is: 
query_vector: 
query_vector.shape: (1, 20510)
       Similarity measure
DocID                    
1                     NaN
2                     NaN
3                     NaN
4                     NaN
5                     NaN


018 01/04/63 PAGE 021
russia who's in charge here ? it was in 1954 that nikita

khrushchev launched his grandiose " virgin lands " gamble . part of the

plan was to plow up 32 million acres of marginal land in kazakhstan,

and settle it with communist " pioneers, " who were to plant and

produce huge quantities of desperately needed grain within two years .

nikita's scheme flopped . there was not enough rainfall, and the

pioneers did not take to tractor life on the bleak frontier . except

for 1958, each harvest has been lower than the previous year's . worst

year of all was 1962, when the virgin lands delivered only half their

quotas . naturally, khrushchev takes none of the blame for the fiasco .

three years ago he foun

  return np.dot(v1, v2) / (norm(v1) * norm(v2))


In [94]:
VERBOSE = True
print_top_results(res_df, 5)

       Similarity measure
DocID                    
1                     NaN
2                     NaN
3                     NaN
4                     NaN
5                     NaN


018 01/04/63 PAGE 021
russia who's in charge here ? it was in 1954 that nikita

khrushchev launched his grandiose " virgin lands " gamble . part of the

plan was to plow up 32 million acres of marginal land in kazakhstan,

and settle it with communist " pioneers, " who were to plant and

produce huge quantities of desperately needed grain within two years .

nikita's scheme flopped . there was not enough rainfall, and the

pioneers did not take to tractor life on the bleak frontier . except

for 1958, each harvest has been lower than the previous year's . worst

year of all was 1962, when the virgin lands delivered only half their

quotas . naturally, khrushchev takes none of the blame for the fiasco .

three years ago he found a scapegoat in kazakhstan party boss nikolai

belyaev, fired him for his " err