# Wyszukiwarka

#### Plan:
tremin: 20.12.2017
1. Dictionary
    - Stop words removed
    - Stemming (Porter Stemmer, Porter Stemmer 2)
    - Reduction (optional)
2. Budowa BackOfWords (indeksowanie)
    - documentTermMatrix (wektory słów dla każdego artykułu)
    - Inverse Document Frequency (IDF(w) - przemnożenie kolumny słowa w w documentTermMatrix przez IDF(w)
    - Normalizacja (sprowadzenie wektora dla każdego dokumentu do jednostkowego - przydatne przy danych o zróżnicowanej długości)
3. Query
    - Przekleństwo wymiaru
    - iloczyn skalarny
4. SVD (LRMA)
    - biblioteka, która liczy pierwsze k wektorów 
    - biblioteka dla danych żadkich
5. * Semantyka w nietrywialny sposób
    - Latent semantic indexing
    - Latent Dirichlet approximation

# Preprocessing

### Drop non english articles to file

In [1]:
import csv
from langdetect import detect, lang_detect_exception


def drop_non_en(path):
    """Drop."""
    with open(path, encoding='utf-8', errors="ignore") as csvfile:
        with open('./droped.csv', 'w') as outfile:
            writer = csv.writer(outfile)
            reader = csv.reader(csvfile)
            writer.writerow(next(reader))
            for row in tqdm(reader):
                try:
                    if detect(row[4]) == 'en':
                        writer.writerow(row)
                except lang_detect_exception.LangDetectException:
                    pass


drop_non_en(filename)

NameError: name 'filename' is not defined

In [2]:
import pandas as pd
from tqdm import tqdm
import re
import numpy as np
from nltk.stem import PorterStemmer, snowball
from nltk.stem import WordNetLemmatizer
from scipy.sparse import csr_matrix, lil_matrix, diags
from scipy.sparse.linalg import svds
import scipy
import itertools
from math import log
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.externals import joblib
import heapq

### Params

In [3]:
# filename = "data.tsv"
# filename = "articles2.csv"
filename = 'PythonApp/buzzfeed.csv'
stopwords_filename = "stopwords"

In [4]:
if filename == 'PythonApp/buzzfeed.csv':
    column_name = 'article'
    texts = pd.read_csv(filename)
    texts = texts.fillna('')
    texts['title'].add_suffix(' ')
    texts[column_name] = texts['title'] + " " + texts['first_paragraph'] + " " + texts['text']
elif filename == 'articles2.csv':
    texts = pd.read_csv(filename)
    column_name = 'Article'
elif filename == 'data.tsv':
    texts = pd.read_csv(filename, header=0, delimiter="\t")
    column_name = 'review'

In [5]:
texts.shape

(36909, 6)

In [6]:
with open(stopwords_filename, "r") as file:
    stopwords = set(file.read().splitlines()[1:])

In [7]:
def parse_words(tokens, ps, wnl):
    for t in tokens:
        t = t.lower()
        t = wnl.lemmatize(t)
        t = re.sub('[^a-z]', '', t)
        t = ps.stem(t)
        yield t

In [8]:
class CustomVectorizer(TfidfVectorizer):
    """TfidfVectorizer is a CountVectorizer with IDF transformation and normalisation:
        vectorizer = CustomVectorizer()
        vectorizer.fit_transform(texts)
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(X)
    """
    
    def __init__(self, stop_words='english'):
        super(CustomVectorizer, self).__init__(stop_words=stop_words)
#         ps = PorterStemmer()
        self.ps = snowball.EnglishStemmer()
        self.wnl = WordNetLemmatizer()
    def build_tokenizer(self):
        tokenize = super(CustomVectorizer, self).build_tokenizer()
        return lambda doc: list(parse_words(tokenize(doc), self.ps, self.wnl))

In [9]:
vectorizer = CustomVectorizer(stop_words=stopwords)

In [10]:
texts = texts.head(15000)

In [11]:
bag_of_words = vectorizer.fit_transform(tqdm(texts[column_name]))

100%|██████████| 15000/15000 [02:45<00:00, 90.71it/s] 


## SVD

In [14]:
# SVD AND LOW RANK APPROXIMATION FOR DATA MATRIX
U, s, V = svds(bag_of_words)
#print(U.shape, V.shape, s.shape)

approx = round(0.3)
lra = len(s)*approx
s[len(s) - lra:] = [0] * lra

S = diags(s)

bag_of_words_svd = U @ (S @ V)

# Save model:

In [15]:
def save_sparse_csr(filename,array):
    np.savez(filename,data = array.data ,indices=array.indices,
             indptr =array.indptr, shape=array.shape )

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((  loader['data'], loader['indices'], loader['indptr']),
                         shape = loader['shape'])

In [16]:
save_sparse_csr('PythonApp/bag_of_words', bag_of_words)

In [17]:
np.save('PythonApp/bag_of_words_svd', bag_of_words_svd)

In [18]:
joblib.dump(vectorizer, 'PythonApp/vectorizer.pkl') 

['PythonApp/vectorizer.pkl']

# Load model

In [12]:
bag_of_words = load_sparse_csr('PythonApp/bag_of_words.npz')

In [13]:
vectorizer = joblib.load('PythonApp/vectorizer.pkl')

In [14]:
bag_of_words_svd = np.load('PythonApp/bag_of_words_svd.npy')

# Search

In [19]:
def count_non_zero(bag_of_words):
    if type(bag_of_words) == scipy.sparse.csr.csr_matrix:
        return bag_of_words.count_nonzero()
    else:
        return np.count_nonzero(bag_of_words)

In [20]:
# def get_best_result(query, bag_of_words, k=10):
#     query_vec = vectorizer.transform([query])[0]
#     query_length = query_vec.count_nonzero()
#     number_of_docs = len(texts)
#     matches = {i: 0 for i in range(number_of_docs)}
#     for i in tqdm(range(number_of_docs)):
#         matches[i] = query_vec.multiply(bag_of_words[i]).sum()/(query_length * count_non_zero(bag_of_words[i]))
# #         matches[i] = [word * backOfWords[i,j] for j,word in enumerate(query_vec.nonzero()[1])][0].sum()/(query_length * backOfWords[i].count_nonzero())
#     return list(dict(sorted(matches.items(), key=lambda x: x[1], reverse=True)))[:k]

In [21]:
def get_best_result(query, bag_of_words, k=10):
    query_vec = vectorizer.transform([query])[0]
    query_length = query_vec.count_nonzero()
    number_of_docs = texts.shape[0]
    matches = dict()
    for i in tqdm(range(number_of_docs)):
        matches[i] = 0
        for word_index in query_vec.indices:
            matches[i] += bag_of_words[i, word_index]
        matches[i] /= (query_length * count_non_zero(bag_of_words[i]))
    return list(dict(heapq.nlargest(k, matches.items(), key=lambda x: x[1])))

In [22]:
def print_urls(best_results):
    best_results = list(map(lambda x: texts['url'][x], best_results))
    for i, result in enumerate(best_results):
        print(i, ': ', result)

In [23]:
best_results = get_best_result("Trump becomes the president", bag_of_words)

100%|██████████| 15000/15000 [00:04<00:00, 3441.27it/s]


In [24]:
best_results

[5313, 11703, 1586, 5633, 11800, 4703, 14267, 878, 6419, 2409]

In [25]:
print_urls(best_results)

0 :  https://www.buzzfeed.com/sophmason/how-donald-is-your-trump-2t6pk
1 :  https://www.buzzfeed.com/luxuryhd/trumps-luxury-hotels-top-10-aa-1vd0x
2 :  https://www.buzzfeed.com/wooishgurl7/who-said-it-donald-trump-or-michael-scott-2g7qh
3 :  https://www.buzzfeed.com/deenazaidi/immigration-marine-le-pen-and-the-trump-connecti-2hg33
4 :  https://www.buzzfeed.com/kevinjamesshay/ode-to-mr-trump-1wwul
5 :  https://www.buzzfeed.com/leonardos415b01eb6/i-was-inspired-by-trump-success-2t2br
6 :  https://www.buzzfeed.com/cinamatics/top-5-things-you-may-not-know-about-donald-trump-2zrei
7 :  https://www.buzzfeed.com/katiejoyxox/the-trump-transition-1hg4y
8 :  https://www.buzzfeed.com/catonb/did-donald-trump-really-say-this-or-did-bugs-bunny-snyg
9 :  https://www.buzzfeed.com/mallorymuratore/who-said-it-donald-trump-or-corinne-from-the-bach-nwve


In [26]:
best_results = get_best_result("Trump becomes the president", bag_of_words_svd)

100%|██████████| 15000/15000 [00:04<00:00, 3400.34it/s]


In [27]:
best_results

[1339, 13496, 12989, 13309, 11504, 3252, 6339, 6259, 4707, 6485]

In [28]:
print_urls(best_results)

0 :  https://www.buzzfeed.com/annehelenpetersen/why-donald-trump-didnt-get-elected-the-first-time-around
1 :  https://www.buzzfeed.com/jimdalrympleii/trump-highly-classified-information-russians
2 :  https://www.buzzfeed.com/buzzfeednews/trump-first-foreign-trip
3 :  https://www.buzzfeed.com/tariniparti/exhausted-republicans-are-starting-to-get-fed-up-with-the
4 :  https://www.buzzfeed.com/henrygomez/republicans-are-waiting-for-trumps-help-or-wrath-as-key
5 :  https://www.buzzfeed.com/jimwaterson/theresa-may-won-trump-over-but-the-special-relationship-is-n
6 :  https://www.buzzfeed.com/jimwaterson/british-mp-says-uk-is-pimping-out-the-queen-for-donald-trump
7 :  https://www.buzzfeed.com/buzzfeednews/whats-going-on-around-the-world-today-feb-17-17
8 :  https://www.buzzfeed.com/tariniparti/trump-world-including-steve-bannon-is-already-looking-at-the
9 :  https://www.buzzfeed.com/tomnamako/donald-trump-presser-media
