# Search Engine full stack 

## Building an Indexer for the crawled webpages

In [1]:
import os, elasticsearch

In [2]:
article_path = "newyorktimes/newyorktimes/articles"

docs = [] 
doc_names = []
doc_dic ={}
for root, dirs, files in os.walk(article_path):
    for file in files:
        with open(os.path.join(root,file), 'r') as f:
            doc_dic[file]= f.read()
            
docs = [*doc_dic.values()]
doc_names = [*doc_dic.keys()] 

In [3]:
es = elasticsearch.Elasticsearch()

for i in range(len(docs)):
    title = ''.join(doc_names[i].split('-')[1:]).split('.txt')[0]
    authors = docs[i].split('Author(s): ')[1].split('\n')[0].split(', ')
    url = docs[i].split('url: ')[1].split('\n')[0]
    text = ''.join(docs[i].split(url)[1].split('\n')[2:])

    es.index(index='scrapped', doc_type='article', id=i, body={
        'title': title,
        'authors': authors,
        'url': url,
        'text': text
    })

In [4]:
def search_text(word):
    res = es.search(index='scrapped', body={"query": {"match": {"text": word}}})
    return list(map(lambda x: x['_source'], res['hits']['hits']))

In [5]:
def search_title(word):
    res = es.search(index='scrapped', body={"query": {"match": {"title": word}}})
    return list(map(lambda x: x['_source'], res['hits']['hits']))

In [6]:
def search_author(word):
    res = es.search(index='scrapped', body={"query": {"match": {"authors": word}}})
    return list(map(lambda x: x['_source'], res['hits']['hits']))

In [7]:
def search_fuzzy(word):
    res = es.search(index='scrapped', body={"query": {"fuzzy": {"text": word}}})
    return list(map(lambda x: x['_source'], res['hits']['hits']))

In [8]:
def search_compound_fields(field1, value1, field2, value2):
    res = es.search(index='scrapped', body={ "query": { "bool": { "must": [ { "match": { field1: value1 } }, { "match": { field2: value2 } } ] } } })
    return list(map(lambda x: x['_source'], res['hits']['hits']))

In [9]:
list(map(lambda article: article['title'], search_text('Ahmed')))

['Syrian Military Seizes Mosque in Dara’a That Was Focus of Uprising',
 'In Libya, Reinforcements Help Rebels Hold Their Ground',
 'Libyan Forces Chase Rebel Convoy Into Tunisia, a Rebel Fighter Says',
 'Libyan Forces Chase Rebel Convoy Into Tunisia, a Rebel Fighter Says',
 'Syrian Resort Town Is Stronghold for Alawites',
 'Aleppo Residents, Battered by War, Struggle to Survive']

In [10]:
list(map(lambda article: article['title'], search_title('Qaddafi')))

['Allies Pressure Qaddafi Forces',
 'Rebels Storm Qaddafi Compound',
 'Rebels Storm Qaddafi Compound',
 'Rebels Storm Qaddafi Compound',
 'Allies Pressure Qaddafi Forces',
 'Rebels Storm Qaddafi Compound',
 'Allies Pressure Qaddafi Forces',
 'Allies Pressure Qaddafi Forces',
 'Rebels Storm Qaddafi Compound',
 'Rebels Storm Qaddafi Compound']

In [11]:
list(map(lambda article: article['title'], search_author('Scott')))

['Thousands Fleeing Qaddafi Find Hospitality in Tunisia',
 'The Fake Americans Russia Created to Influence the Election',
 'Thousands Fleeing Qaddafi Find Hospitality in Tunisia',
 'Libyan Forces Chase Rebel Convoy Into Tunisia, a Rebel Fighter Says',
 'Libyan Forces Chase Rebel Convoy Into Tunisia, a Rebel Fighter Says',
 'Libyan Forces Chase Rebel Convoy Into Tunisia, a Rebel Fighter Says']

In [12]:
list(map(lambda article: article['title'], search_fuzzy('austrio')))

['European Union Bans Syrian Oil as Crackdown Continues',
 'Trump Declares Opioid Crisis a ‘Health Emergency’ but Requests No Funds',
 'In Arabian Desert, a Sustainable City Rises',
 'Through the Outback']

In [13]:
list(map(lambda article: article['title'], search_compound_fields('title', 'russia', 'authors', 'scott')))

['The Fake Americans Russia Created to Influence the Election']

## Summarizer 

In [14]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer

LANGUAGE = "english"
SENTENCES_COUNT = 4

def summarize_text(text):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)

    sentences = []
    
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        sentences.append(' '.join(sentence.words))
    
    return ' '.join(sentences)

## Snippet surrounding

In [15]:
import re
import numpy as np

def get_surrounding(text, word, n):
    wordregx = r"\W*([\w]+)"
    groups =  np.array(re.search(r'{}\W*{}{}'.format(wordregx*n, word,wordregx*n), text).groups())
    return ' '.join(np.hstack(( groups[:n], [word], groups[n:] )).ravel())

In [16]:
article_text = list(map(lambda article: article['text'], search_text('weather')))[0]

In [17]:
summarize_text(article_text)

'But outside experts say and anecdotal evidence suggests that old intrigues thought to have been suppressed are at work again With a retro campaign encouraging residents to sing patriotic songs those initiatives delighted leftists and won broad support among vast throngs of citizens left behind in China s race toward among others the programs raised uncomfortable parallels to Mao s disastrous Cultural Revolution crusade against capitalism in which millions died or were persecuted and smacked of Bo s basic problem is that he does not operate according to the party s established practices said a journalist for one Communist Party news outlet speaking anonymously for fear of reprisal Instead they will be chosen by a consensus of elite some accounts Bo s ouster weakens the loose political coalition that is still maintained by Jiang Zemin who left China s presidency a decade ago but still wields power through officials he has placed in the leadership bureaucracy View all New York Times prob

In [18]:
get_surrounding(article_text, 'critical', 3)

'This is a critical political issue perhaps'