# Search Engine full stack 

## Building an Indexer for the crawled webpages

In [1]:
import os, elasticsearch

In [2]:
scrapped = {}

article_path = "newyorktimes/newyorktimes/articles"

docs = [] 
doc_names = []
doc_dic ={}
for root, dirs, files in os.walk(article_path):
    for file in files:
        with open(os.path.join(root,file), 'r') as f:
            doc_dic[file]= f.read()
            
docs = [*doc_dic.values()]
doc_names = [*doc_dic.keys()] 

In [3]:
es = elasticsearch.Elasticsearch()

for i in range(len(docs)):
    title = ''.join(doc_names[i].split('-')[1:]).split('.txt')[0]
    authors = docs[i].split('Author(s): ')[1].split('\n')[0].split(', ')
    url = docs[i].split('url: ')[1].split('\n')[0]
    text =''.join(docs[i].split('url: ')[1].split('\n')[2:])

    es.index(index='scrapped', doc_type='article', id=i, body={
        'title': title,
        'authors': authors,
        'url': url,
        'text': text
    })

In [4]:
def search_text(word):
    res = es.search(index='scrapped', body={"query": {"match": {"text": word}}})
    return list(map(lambda x: x['_source']['title'], res['hits']['hits']))

In [5]:
def search_title(word):
    res = es.search(index='scrapped', body={"query": {"match": {"title": word}}})
    return list(map(lambda x: x['_source']['title'], res['hits']['hits']))

In [6]:
def search_author(word):
    res = es.search(index='scrapped', body={"query": {"match": {"authors": word}}})
    return list(map(lambda x: x['_source']['title'], res['hits']['hits']))

In [7]:
def search_fuzzy(word):
    res = es.search(index='scrapped', body={"query": {"fuzzy": {"text": word}}})
    return list(map(lambda x: x['_source']['title'], res['hits']['hits']))

In [8]:
def search_compound_fields(field1, value1, field2, value2):
    res = es.search(index='scrapped', body={ "query": { "bool": { "must": [ { "match": { field1: value1 } }, { "match": { field2: value2 } } ] } } })
    return list(map(lambda x: x['_source']['title'], res['hits']['hits']))

In [9]:
search_text('Ahmed')

['Syrian Military Seizes Mosque in Dara’a That Was Focus of Uprising',
 'Libyan Forces Chase Rebel Convoy Into Tunisia, a Rebel Fighter Says',
 'Libyan Forces Chase Rebel Convoy Into Tunisia, a Rebel Fighter Says',
 'In Libya, Reinforcements Help Rebels Hold Their Ground',
 'Aleppo Residents, Battered by War, Struggle to Survive',
 'Syrian Resort Town Is Stronghold for Alawites']

In [10]:
search_title('Qaddafi')

['Rebels Storm Qaddafi Compound',
 'Rebels Storm Qaddafi Compound',
 'Allies Pressure Qaddafi Forces',
 'Rebels Storm Qaddafi Compound',
 'Rebels Storm Qaddafi Compound',
 'Allies Pressure Qaddafi Forces',
 'Rebels Storm Qaddafi Compound',
 'Rebels Storm Qaddafi Compound',
 'Qaddafi at Large as Forces Fight to Control Compound',
 'Allies Pressure Qaddafi Forces']

In [11]:
search_author('Scott')

['The Fake Americans Russia Created to Influence the Election',
 'Thousands Fleeing Qaddafi Find Hospitality in Tunisia',
 'Thousands Fleeing Qaddafi Find Hospitality in Tunisia',
 'Libyan Forces Chase Rebel Convoy Into Tunisia, a Rebel Fighter Says',
 'Libyan Forces Chase Rebel Convoy Into Tunisia, a Rebel Fighter Says',
 'Libyan Forces Chase Rebel Convoy Into Tunisia, a Rebel Fighter Says']

In [12]:
search_fuzzy('austrio')

['European Union Bans Syrian Oil as Crackdown Continues',
 'Trump Declares Opioid Crisis a ‘Health Emergency’ but Requests No Funds',
 'In Arabian Desert, a Sustainable City Rises',
 'Through the Outback']

In [13]:
search_compound_fields('title', 'russia', 'authors', 'scott')

['The Fake Americans Russia Created to Influence the Election']