# MLlab_LSI_query

In [180]:
import spacy
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.de.stop_words import STOP_WORDS
from sklearn.cluster import KMeans
import time
import numpy as np
from scipy.sparse.linalg import svds, eigs

In [181]:
# Execute the project helper scripts and the preprocessing notebook before anything else.
# NOTE(review): get_clean_data (called further down) is not defined or imported anywhere
# in this notebook, so it presumably comes from one of these three files - confirm which.
%run src/file_utils.py
%run src/configuration.py
%run 'load_and_prepro_document.ipynb'

## LSI
LSI (latent semantic indexing) applies a truncated SVD to the TF-IDF matrix to obtain a low-rank approximation of it with far fewer dimensions.
Here I use LSI to build a simple information-retrieval application.

In [182]:
# Gather the bare file names (no directory part) of everything below the JSON data
# folder, sub-folders included and visited bottom-up, then keep at most the first 1000
# in the order os.walk produced them.
import os
documents_list = [
    file_name
    for _root, _dirs, file_names in os.walk("./LabShare/data/all/json", topdown=False)
    for file_name in file_names
][:1000]

In [183]:
# Load the cleaned documents and fit the TF-IDF model, timing the whole step.
start_time = time.time()
# Identity replacement for preProcess(): the input data is already preprocessed, so no
# further cleaning is wanted.
# NOTE(review): the original note said this overrides the preProcess() "in fit_transform()".
# TfidfVectorizer() is created with default arguments here and is not handed this function,
# so it presumably shadows a preProcess helper brought in by the %run cells - confirm.
def preProcess(s):
    """Return the input unchanged."""
    return s
# NOTE(review): get_clean_data is not defined in this notebook; it presumably returns the
# document texts and their matching names - confirm against its definition.
my_doc, my_doc_name = get_clean_data(documents_list)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform( my_doc )
# Print the elapsed wall-clock time in seconds.
print (time.time() - start_time)

7.5903236865997314


In [184]:
# Build the query's TF-IDF vector and project it into the k-dimensional LSI space.
query = 'Sportbekleidung schuh '   # query string

# step 1: preprocess the query (per the original author, in the same way as the
# documents): drop stop words, digits and punctuation, then lemmatise, strip and
# lower-case what is left.
# NOTE(review): the "de" shortcut name only resolves on spaCy 2.x installs; newer releases
# need the full package name (e.g. "de_core_news_sm") - confirm the environment.
nlp = spacy.load("de")
sentence = nlp(query, disable=['parser', 'ner'])
cleaned_lemmas = (
    token.lemma_.replace('\n', '').strip().lower()
    for token in sentence
    if token.lower_ not in STOP_WORDS
    and not token.is_digit
    and token.pos_ != 'PUNCT'
)
new_vocab = {lemma for lemma in cleaned_lemmas if lemma}  # empty strings carry no term

# step 2: generate the query's tf-idf vector. Each distinct query term counts once,
# so its weight is simply the term's idf; the vector is then L2-normalised.
print("search query is: ")
print(new_vocab)

# vocabulary_ maps every term to its column in tfidf_matrix. A dict lookup replaces the
# linear list.index() scan, works on scikit-learn releases that no longer provide
# get_feature_names(), and lets a term the corpus never saw be skipped instead of
# aborting the whole cell with "... is not in list".
term_to_column = vectorizer.vocabulary_
idf = vectorizer.idf_
known_terms = sorted(term for term in new_vocab if term in term_to_column)
unknown_terms = sorted(new_vocab.difference(known_terms))
if unknown_terms:
    print("ignored (not in corpus vocabulary): " + ", ".join(unknown_terms))
if not known_terms:
    # Without this guard the normalisation below would divide by a zero norm.
    raise ValueError("none of the query terms occur in the corpus vocabulary: " + repr(query))

query_vector_ori = np.zeros(tfidf_matrix.shape[1])  # one slot per vocabulary term
for term in known_terms:
    idx = term_to_column[term]
    query_vector_ori[idx] = idf[idx]
query_vector_ori = query_vector_ori / np.linalg.norm(query_vector_ori)

# step 3: transfer the original vector to the low-dimensional space.
# tfidf_matrix is transposed first so the decomposed matrix is term x document.
# k is the number of retained dimensions; it is capped because svds (default solver)
# only accepts k strictly below the smaller matrix dimension.
k = min(100, min(tfidf_matrix.shape) - 1)
u, s, vt = svds(tfidf_matrix.T, k=k)
# q_hat = S^-1 * U^T * q. S is diagonal, so dividing by the singular values replaces
# building and inverting a dense k x k matrix.
query_vector_low_dim = u.T.dot(query_vector_ori) / s

# step4, compute the similarity
def calculate_simility(q1, q2):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        q1: NumPy vector.
        q2: NumPy vector of the same length as q1.

    Returns:
        q1 . q2 / (|q1| * |q2|), or 0.0 when either vector has zero norm
        (the ratio is undefined there and would otherwise come out as NaN).
    """
    denom = np.linalg.norm(q1) * np.linalg.norm(q2)
    if denom == 0:
        return 0.0
    return q1.dot(q2) / denom
# Score the query against every column of vt (one column per document in the
# reduced space).
sim = np.array([
    calculate_simility(query_vector_low_dim, vt[:, col])
    for col in range(vt.shape[1])
])

# step5: report the 10 most similar documents. argsort sorts ascending, so the scores
# are negated to obtain a descending ranking.
top_idx = np.argsort(-sim)[:10]
print('------------------------------------')
print('related document: \t related score')
# NOTE(review): relies on my_doc_name[i] naming the document in row i of tfidf_matrix -
# confirm against get_clean_data.
for doc_pos in top_idx:
    print(my_doc_name[doc_pos] + ':\t' + str(sim[doc_pos]))

search query is: 
{'sportbekleidung', 'schuh'}
------------------------------------
related document: 	 related score
PUMA-QuarterlyReport-2012-Q3.json:	0.9705766715598944
PUMA-QuarterlyReport-2012-Q2.json:	0.9584773368670201
PUMA-QuarterlyReport-2015-Q1.json:	0.9543901309787146
PUMA-QuarterlyReport-2014-Q3.json:	0.9507635544600765
PUMA-QuarterlyReport-2010-Q2.json:	0.9435183542968074
PUMA-QuarterlyReport-2011-Q3.json:	0.9304272544834216
PUMA-QuarterlyReport-2010-Q1.json:	0.9128649922630564
PUMA-AnnualReport-2013.json:	0.8812087556023984
Adidas-AnnualReport-2016.json:	0.3034457396028766
Zalando-AnnualReport-2015.json:	0.21861967178319974
