# MLlab_svd_query

In [2]:
import spacy
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.de.stop_words import STOP_WORDS
from sklearn.cluster import KMeans
import time
import numpy as np
from scipy.sparse.linalg import svds, eigs

In [3]:
%run src/file_utils.py
%run src/configuration.py

In [4]:
#document_test = ['BMW-AnnualReport-2016.json']

In [5]:
#vocab_documents = ['BMW-AnnualReport-2016.json', 'CarlZeissMeditec-AnnualReport-2016.json', 'BVB-AnnualReport-2016.json']

In [13]:
documents = ['BMW-AnnualReport-2015.json', 
             'BMW-AnnualReport-2016.json', 
             'BMW-AnnualReport-2017.json', 
             'CarlZeissMeditec-AnnualReport-2015.json', 
             'CarlZeissMeditec-AnnualReport-2016.json', 
             'CarlZeissMeditec-AnnualReport-2017.json',
             'BVB-AnnualReport-2015.json', 
             'BVB-AnnualReport-2016.json', 
             'BVB-AnnualReport-2017.json',
             'Aareal-AnnualReport-2010.json',
             'Adidas-AnnualReport-2010.json',
             'AdlerRealEstate-AnnualReport-2014.json',
             'ADOProperties-QuarterlyReport-2017-Q1.json',
             'Airbus-AnnualReport-2012.json',
             'Aixtron-AnnualReport-2016.json',
             'Allianz-AnnualReport-2015.json',
             'alstria-AnnualReport-2012.json',
             'AmadeusFiRe-AnnualReport-2013.json'
            ]

In [14]:
TYPE = 'type'
PARAGRAPH = 'paragraph'
CONTENT = 'content'

In [15]:
def readContentOfFile(file_name):
    content = ''
    try:
        with open(file_name) as f:
            data = json.load(f)
            for item in data:
                typeDoc = item[TYPE]
                if typeDoc == PARAGRAPH:
                    content += item[CONTENT]
    except:
        FileUtils.fix_json(file_name)
        with open(file_name) as f:
            data = json.load(f)
            for item in data:
                typeDoc = item[TYPE]
                if typeDoc == PARAGRAPH:
                    content += item[CONTENT]
    return content

## try LSI

In [16]:
# given documents list, to upload the current after preprocessing document and vocabularly
# input parameter:
#    documents: a list, contain documents' name
# output:
#    document_prepro: a list, contain strings, which contain after preprocessing input document's content.
#    vocabularly: a set, contains vocabularly we get from input document list
def update_preprocess_data_and_vocabularly(documents):
    vocabularly = set()
    document_prepro = list()
    nlp = spacy.load("de")
    for document in documents:
        content_of_document = readContentOfFile(FILE_PATH+document)
        sentence = nlp(content_of_document)
        filtered_words = [word for word in sentence if word.lower_ not in STOP_WORDS]
        filtered_words_withoutdigits = [word for word in filtered_words if not word.is_digit]
        filtered_words_withoutcurrency = [word for word in filtered_words_withoutdigits if not word.is_currency]
        filtered_words_withoutverbs = [word for word in filtered_words_withoutcurrency if word.pos_ != 'VERB']
        filtered_words_withoutnum = [word for word in filtered_words_withoutverbs if word.pos_ != 'NUM']
        filtered_words_withoutsym = [word for word in filtered_words_withoutnum if word.pos_ != 'SYM']
        filtered_words_withoutpunc = [word for word in filtered_words_withoutsym if word.pos_ != 'PUNCT']
        filtered_lemmas = [word.lemma_ for word in filtered_words_withoutpunc]
        tmp = set()
        for word in filtered_lemmas:
            tmp.add(word.replace('\n', '').strip().lower())
        new_vocab = set()
        for u in tmp:
            if u != '':
                new_vocab.add(u)
        vocabularly.update(new_vocab)
        lemmatized_content = " ".join(item for item in filtered_lemmas)
        document_prepro.append(lemmatized_content.lower())
       
    vocabularly.remove('million')
    vocabularly.remove('tausend')
    vocabularly.remove('eur')
    vocabularly.remove('teur')
    vocabularly.remove('*')
    vocabularly.remove('+')
    vocabularly.remove('&')
    vocabularly.remove('%')
    return vocabularly, document_prepro
    

In [17]:
start_time = time.time()
my_voc, my_doc = update_preprocess_data_and_vocabularly(documents)

vectorizer = TfidfVectorizer(vocabulary=my_voc)
tfidf_matrix = vectorizer.fit_transform( my_doc)
print (time.time() - start_time)

112.49612021446228


In [18]:


# now compute the input query's vector.
query = "europe "  #query string


# step 1, do preprosseing for this query
nlp = spacy.load("de")
sentence = nlp(query)
filtered_words = [word for word in sentence if word.lower_ not in STOP_WORDS]
filtered_words_withoutdigits = [word for word in filtered_words if not word.is_digit]
filtered_words_withoutpunc = [word for word in filtered_words_withoutdigits if word.pos_ != 'PUNCT']
filtered_lemmas = [word.lemma_ for word in filtered_words_withoutpunc]
vocabularly = set()
for word in filtered_lemmas:
    vocabularly.add(word.replace('\n', '').strip().lower())
new_vocab = set()
for u in vocabularly:
    if u != '':
        new_vocab.add(u)
        

# step 2, generate query's tf-idf vector
query_vector_ori = np.zeros(tfidf_matrix.shape[1]) #initilize the query vector
idf = vectorizer.idf_
feature_name = vectorizer.get_feature_names()
#print(idf)
# find my words in this feature_name list, and its corresponding index
print(new_vocab)
for words in new_vocab:
    idx = feature_name.index(words)
    query_vector_ori[idx] = idf[idx]
    print(query_vector_ori[idx])
    print(idx)
# do normalize
query_vector_ori = query_vector_ori/np.linalg.norm(query_vector_ori)
    
# step3, transfer the origin vector to low_dim space
k = 3
u, s, vt = svds(tfidf_matrix.T, k=k)  # transpose the tfidf_matrix, get item*document
#here k is the remaining dimension. could from 1 to (number of document-1), could try take all paragraph as document
# k could be a parameter
# d_hat = s.inv*U.t*d    may be not right?
s_dig = np.diag(s)
query_vector_low_dim = ((np.linalg.inv(s_dig)).dot(u.T)).dot(query_vector_ori)
# get Vk
#tfidf_matrix_low_dim = (u.dot(s_dig)).dot(vt)

# step4, compute the similarity
def calculate_simility(q1,q2):
    sim = q1.dot(q2)/(np.linalg.norm(q1)*np.linalg.norm(q2))
    return sim
sim = np.zeros(vt.shape[1])
for i in range(0,vt.shape[1]):
    sim[i] = calculate_simility(query_vector_low_dim,vt[:,i])

# step5, take top 3 similar document
print(sim)
top_3_idx = np.argsort(-sim)[0:3]  # here -sim, since I want t get decending order sort,and get the top 3 index
print(top_3_idx)
# try to find some way to connect document and this index

{'kranken', 'arzt'}
2.3350010667323398
19003
2.55814461804655
3214
[-0.14607688 -0.14500215 -0.14500215  0.99172208  0.99237067  0.99168017
 -0.15109107 -0.16220985 -0.14787144  0.53310718  0.63828439  0.50989149
  0.59832618  0.54703209  0.57013083  0.62034624  0.53400373  0.82639627]
[4 3 5]


ValueError: 'auto' is not in list

5021

18958