# Wyszukiwarka

#### Plan:
tremin: 20.12.2017
1. Dictionary
    - Stop words removed
    - Stemming (Porter Stemmer, Porter Stemmer 2)
    - Reduction (optional)
2. Budowa BackOfWords (indeksowanie)
    - documentTermMatrix (wektory słów dla każdego artykułu)
    - Inverse Document Frequency (IDF(w) - przemnożenie kolumny słowa w w documentTermMatrix przez IDF(w)
    - Normalizacja (sprowadzenie wektora dla każdego dokumentu do jednostkowego - przydatne przy danych o zróżnicowanej długości)
3. Query
    - Przekleństwo wymiaru
    - iloczyn skalarny
4. SVD (LRMA)
    - biblioteka, która liczy pierwsze k wektorów 
    - biblioteka dla danych żadkich
5. * Semantyka w nietrywialny sposób
    - Latent semantic indexing
    - Latent Dirichlet approximation

In [9]:
import pandas as pd
from tqdm import tqdm
import re
import numpy as np
from nltk.stem import PorterStemmer, snowball
from nltk.stem import WordNetLemmatizer
from scipy.sparse import csr_matrix, lil_matrix, diags
from scipy.sparse.linalg import svds
import scipy
import itertools
from math import log
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.externals import joblib

### Params

In [10]:
# filename = "data.tsv"
# filename = "articles2.csv"
filename = 'PythonApp/buzzfeed.csv'
stopwords_filename = "stopwords"

In [11]:
if filename == 'PythonApp/buzzfeed.csv':
    column_name = 'article'
    texts = pd.read_csv(filename)
    texts = texts.fillna('')
    texts['title'].add_suffix(' ')
    texts[column_name] = texts['title'] + " " + texts['first_paragraph'] + " " + texts['text']
elif filename == 'articles2.csv':
    texts = pd.read_csv(filename)
    column_name = 'Article'
elif filename == 'data.tsv':
    texts = pd.read_csv(filename, header=0, delimiter="\t")
    column_name = 'review'

In [12]:
texts.shape

(44064, 6)

In [13]:
with open(stopwords_filename, "r") as file:
    stopwords = set(file.read().splitlines()[1:])

In [14]:
def parse_words(tokens, ps, wnl):
    for t in tokens:
        t = t.lower()
        t = wnl.lemmatize(t)
        t = re.sub('[^a-z]', '', t)
        t = ps.stem(t)
        yield t

In [15]:
class CustomVectorizer(TfidfVectorizer):
    """TfidfVectorizer is a CountVectorizer with IDF transformation and normalisation:
        vectorizer = CustomVectorizer()
        vectorizer.fit_transform(texts)
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(X)
    """
    
    def __init__(self, stop_words='english'):
        super(CustomVectorizer, self).__init__(stop_words=stop_words)
#         ps = PorterStemmer()
        self.ps = snowball.EnglishStemmer()
        self.wnl = WordNetLemmatizer()
    def build_tokenizer(self):
        tokenize = super(CustomVectorizer, self).build_tokenizer()
        return lambda doc: list(parse_words(tokenize(doc), self.ps, self.wnl))

In [16]:
vectorizer = CustomVectorizer(stop_words=stopwords)

In [17]:
texts = texts.head(10000)

In [18]:
bag_of_words = vectorizer.fit_transform(tqdm(texts[column_name]))

100%|██████████| 10000/10000 [01:37<00:00, 102.84it/s]


## SVD

In [19]:
# SVD AND LOW RANK APPROXIMATION FOR DATA MATRIX
U, s, V = svds(bag_of_words)
#print(U.shape, V.shape, s.shape)

approx = 30
lra = len(s)*approx//100
s[len(s) - lra:] = [0] * lra

S = diags(s)

bag_of_words_svd = U @ (S @ V)

# Save model:

In [20]:
def save_sparse_csr(filename,array):
    np.savez(filename,data = array.data ,indices=array.indices,
             indptr =array.indptr, shape=array.shape )

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((  loader['data'], loader['indices'], loader['indptr']),
                         shape = loader['shape'])

In [21]:
save_sparse_csr('PythonApp/bag_of_words', bag_of_words)

In [22]:
np.save('PythonApp/bag_of_words_svd', bag_of_words_svd)

In [23]:
joblib.dump(vectorizer, 'PythonApp/vectorizer.pkl') 

['PythonApp/vectorizer.pkl']

# Search

In [24]:
def count_non_zero(bag_of_words):
    if type(bag_of_words) == scipy.sparse.csr.csr_matrix:
        return bag_of_words.count_nonzero()
    else:
        return np.count_nonzero(bag_of_words)

In [25]:
def get_best_result(query, bag_of_words, k=10):
    query_vec = vectorizer.transform([query])[0]
    query_length = query_vec.count_nonzero()
    number_of_docs = len(texts)
    matches = {i: 0 for i in range(number_of_docs)}
    for i in tqdm(range(number_of_docs)):
        matches[i] = query_vec.multiply(bag_of_words[i]).sum()/(query_length * count_non_zero(bag_of_words[i]))
#         matches[i] = [word * backOfWords[i,j] for j,word in enumerate(query_vec.nonzero()[1])][0].sum()/(query_length * backOfWords[i].count_nonzero())
    return list(dict(sorted(matches.items(), key=lambda x: x[1], reverse=True)))[:k]

In [26]:
def print_urls(best_results):
    best_results = list(map(lambda x: texts['url'][x], best_results))
    for i, result in enumerate(best_results):
        print(i, ': ', result)

In [33]:
best_results = get_best_result("Trump becomes the president", bag_of_words)

100%|██████████| 10000/10000 [00:06<00:00, 1565.07it/s]


In [34]:
best_results

[6432, 2438, 1176, 1875, 7668, 3015, 6877, 1054, 3591, 5691]

In [35]:
print_urls(best_results)

0 :  https://www.buzzfeed.com/sophmason/how-donald-is-your-trump-2t6pk
1 :  https://www.buzzfeed.com/buzzfeednews/inauguration-day-emoji
2 :  https://www.buzzfeed.com/kristimreed/learn-more-about-president-elect-trump-2ra6b
3 :  https://www.buzzfeed.com/wooishgurl7/who-said-it-donald-trump-or-michael-scott-2g7qh
4 :  https://www.buzzfeed.com/kerihw/you-know-what-uranium-is-right-nwsm
5 :  https://www.buzzfeed.com/trvsndvl/do-you-know-the-real-donald-trump-2rhoy
6 :  https://www.buzzfeed.com/deenazaidi/immigration-marine-le-pen-and-the-trump-connecti-2hg33
7 :  https://www.buzzfeed.com/csdoingthings/trump-your-cat-40-cats-with-trump-hair-2rsa2
8 :  https://www.buzzfeed.com/ferneine/states-of-the-trump-lqro
9 :  https://www.buzzfeed.com/leonardos415b01eb6/i-was-inspired-by-trump-success-2t2br


In [36]:
best_results = get_best_result("Trump becomes the president", bag_of_words_svd)

100%|██████████| 10000/10000 [00:04<00:00, 2039.23it/s]


In [37]:
best_results

[1583, 7746, 6432, 3894, 7642, 7915, 4084, 5695, 502, 8181]

In [38]:
print_urls(best_results)

0 :  https://www.buzzfeed.com/annehelenpetersen/why-donald-trump-didnt-get-elected-the-first-time-around
1 :  https://www.buzzfeed.com/jimwaterson/british-mp-says-uk-is-pimping-out-the-queen-for-donald-trump
2 :  https://www.buzzfeed.com/sophmason/how-donald-is-your-trump-2t6pk
3 :  https://www.buzzfeed.com/jimwaterson/theresa-may-won-trump-over-but-the-special-relationship-is-n
4 :  https://www.buzzfeed.com/buzzfeednews/whats-going-on-around-the-world-today-feb-17-17
5 :  https://www.buzzfeed.com/tomnamako/donald-trump-presser-media
6 :  https://www.buzzfeed.com/maryanngeorgantopoulos/trump-wall-and-immigration-executive-orders
7 :  https://www.buzzfeed.com/tariniparti/trump-world-including-steve-bannon-is-already-looking-at-the
8 :  https://www.buzzfeed.com/buzzfeednews/whats-going-on-around-the-world-today-jan-12-17
9 :  https://www.buzzfeed.com/emmaloop/lindsey-graham-says-he-will-look-into-trumps-wiretapping-cla
