# Search Engine full stack 

## Building an Indexer for the crawled webpages

In [1]:
import os, elasticsearch

In [2]:
article_path = "newyorktimes/newyorktimes/articles"

docs = [] 
doc_names = []
doc_dic ={}
for root, dirs, files in os.walk(article_path):
    for file in files:
        with open(os.path.join(root,file), 'r') as f:
            doc_dic[file]= f.read()
            
docs = [*doc_dic.values()]
doc_names = [*doc_dic.keys()] 

In [3]:
es = elasticsearch.Elasticsearch()

for i in range(len(docs)):
    title = ''.join(doc_names[i].split('-')[1:]).split('.txt')[0]
    authors = docs[i].split('Author(s): ')[1].split('\n')[0].split(', ')
    url = docs[i].split('url: ')[1].split('\n')[0]
    text =''.join(docs[i].split('url: ')[1].split('\n')[2:])

    es.index(index='scrapped', doc_type='article', id=i, body={
        'title': title,
        'authors': authors,
        'url': url,
        'text': text
    })

In [4]:
def search_text(word):
    res = es.search(index='scrapped', body={"query": {"match": {"text": word}}})
    return list(map(lambda x: x['_source']['title'], res['hits']['hits']))

In [5]:
def search_title(word):
    res = es.search(index='scrapped', body={"query": {"match": {"title": word}}})
    return list(map(lambda x: x['_source']['title'], res['hits']['hits']))

In [6]:
def search_author(word):
    res = es.search(index='scrapped', body={"query": {"match": {"authors": word}}})
    return list(map(lambda x: x['_source']['title'], res['hits']['hits']))

In [7]:
def search_fuzzy(word):
    res = es.search(index='scrapped', body={"query": {"fuzzy": {"text": word}}})
    return list(map(lambda x: x['_source']['title'], res['hits']['hits']))

In [8]:
def search_compound_fields(field1, value1, field2, value2):
    res = es.search(index='scrapped', body={ "query": { "bool": { "must": [ { "match": { field1: value1 } }, { "match": { field2: value2 } } ] } } })
    return list(map(lambda x: x['_source']['title'], res['hits']['hits']))

In [9]:
search_text('Ahmed')

['Syrian Military Seizes Mosque in Dara’a That Was Focus of Uprising',
 'In Libya, Reinforcements Help Rebels Hold Their Ground',
 'Libyan Forces Chase Rebel Convoy Into Tunisia, a Rebel Fighter Says',
 'Libyan Forces Chase Rebel Convoy Into Tunisia, a Rebel Fighter Says',
 'Syrian Resort Town Is Stronghold for Alawites',
 'Aleppo Residents, Battered by War, Struggle to Survive']

In [10]:
search_title('Qaddafi')

['Allies Pressure Qaddafi Forces',
 'Rebels Storm Qaddafi Compound',
 'Rebels Storm Qaddafi Compound',
 'Rebels Storm Qaddafi Compound',
 'Allies Pressure Qaddafi Forces',
 'Rebels Storm Qaddafi Compound',
 'Rebels Storm Qaddafi Compound',
 'Rebels Storm Qaddafi Compound',
 'Allies Pressure Qaddafi Forces',
 'Allies Pressure Qaddafi Forces']

In [11]:
search_author('Scott')

['Thousands Fleeing Qaddafi Find Hospitality in Tunisia',
 'The Fake Americans Russia Created to Influence the Election',
 'Thousands Fleeing Qaddafi Find Hospitality in Tunisia',
 'Libyan Forces Chase Rebel Convoy Into Tunisia, a Rebel Fighter Says',
 'Libyan Forces Chase Rebel Convoy Into Tunisia, a Rebel Fighter Says',
 'Libyan Forces Chase Rebel Convoy Into Tunisia, a Rebel Fighter Says']

In [12]:
search_fuzzy('austrio')

['European Union Bans Syrian Oil as Crackdown Continues',
 'Trump Declares Opioid Crisis a ‘Health Emergency’ but Requests No Funds',
 'In Arabian Desert, a Sustainable City Rises',
 'Through the Outback']

In [13]:
search_compound_fields('title', 'russia', 'authors', 'scott')

['The Fake Americans Russia Created to Influence the Election']

## Bonus 

In [14]:
#from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 2

if __name__ == "__main__":
    
    #url = "https://www.nytimes.com/2017/11/08/world/europe/brexit-britain-patel-johnson.html"
    #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    
    parser = PlaintextParser.from_string("Hence, a ragged government is “moving from one crisis to the next crisis,” in the words of Nicholas Crowson, professor of contemporary British history at the University of Birmingham. With Brexit now the key dividing line in British politics, he said, management of a cabinet has become “close to impossible.”Having already lost her defense secretary, Michael Fallon, and with two other ministers under threat in the sleaze allegations, Mrs. May has declared Ms. Patel’s case closed. And on Tuesday, Mr. Johnson brushed aside calls for his resignation.Ms. Patel held the meetings in Israel without telling the Foreign Office, a clear breach of protocol. The appointments were set up with the help of Stuart Polak, the honorary president of the Conservative Friends of Israel, a lobbying organization.Ms. Patel also asked officials from her department to examine whether public money could support humanitarian operations by the Israeli Defense Forces in parts of the Golan Heights. As she ought to have predicted, the answer was no, because Britain regards the Golan Heights as occupied territory.In addition, Ms. Patel was forced to issue an embarrassing clarification, walking back the false impression she gave The Guardian that Mr. Johnson was aware of her meetings in advance.The foreign secretary has problems of his own over comments he made when speaking to a parliamentary committee late last month about a British citizen of Iranian descent, Nazanin Zaghari-Ratcliffe, who is imprisoned in Iran. Mr. Johnson described her conviction for sedition as a mockery of justice but also said she had been “simply teaching people journalism.”PhotoMs. Zaghari-Ratcliffe is serving a five-year prison term on charges of seeking to overthrow the government, claims her supporters have called absurd. Days after Mr. Johnson’s comments, she was taken to a new court hearing, where the minister’s statement was cited as evidence that she had been engaged in “propaganda against the regime,” a serious offense.Advertisement Continue reading the main storyOn Tuesday Mr. Johnson said that in a phone conversation with his Iranian counterpart, Mohammad Javad Zarif, he had expressed concern that his words had been used against Ms. Zaghari-Ratcliffe, but was told that was not the case.“I accept that my remarks could have been clearer,” Mr. Johnson said. But he declined to apologize and he called on critics to direct their ire at Iran instead.Newsletter Sign Up Continue reading the main story Please verify you're not a robot by clicking the box. Invalid email address. Please re-enter. You must select a newsletter to subscribe to. Sign Up You agree to receive occasional updates and special offers for The New York Times's products and services. Thank you for subscribing. An error has occurred. Please try again later. View all New York Times newsletters.That failed to satisfy opposition lawmakers, including Ben Bradshaw, who said that if Britain “had a prime minister who wasn’t so weak,” both of the cabinet ministers “would have been sacked.”The shadow foreign secretary, Emily Thornberry, told lawmakers that she hoped that “no lasting damage has been done to Nazarin as a result of his blunder.”Mr. Johnson was always a controversial choice of foreign secretary, having before his appointment suggested that President Barack Obama had an “ancestral dislike of the British Empire,” written a poem insinuating that Turkey’s president had sexual relations with a goat and likened the European Union to the Third Reich.Ms. Patel’s appointment raised eyebrows, too, because she told The Daily Telegraph in 2013 that the department she now runs should be scrapped and replaced with a trade-focused body to help businesses invest in the developing world.Yet, Mr. Johnson and Ms. Patel were leading campaigners for Brexit in the referendum, and Mrs. May therefore found them cabinet jobs — a big one in the case of Mr. Johnson, one of the public faces of the campaign to leave the European Union.The resignation from the defense post by Mr. Fallon, a “remain” supporter in the referendum, seems to have been prompted in part by complaints from a Brexit-enthusiast, Andrea Leadsom, who is the leader of the House of Commons.When Mr. Fallon stood down last week, the internal balance of the cabinet meant that he had to be replaced by someone who — like him — was loyal to Mrs. May and who had campaigned for remain during the referendum. Mrs. May opted to promote the chief whip, Gavin Williamson, a decision that provoked fierce complaints that such a senior position had gone to someone with no experience of a big government department or knowledge of the military.Advertisement Continue reading the main storyPerhaps the only consolation for Mrs. May is that, at a time when her leadership is under continual threat, several potential successors are — for now at least — out of the picture. Mr. Fallon had been talked of a potential caretaker leader, while Mr. Johnson’s latest misstep error has focused attention on his competence.On Tuesday one lawmaker, Anna Soubry, wrote on Twitter that Mr. Johnson’s “lack of contrition is as shameful as the original error” over Ms. Zaghari-Ratcliffe. She said, “Boris Johnson doesn’t understand magnitude of the job & responsibility he holds.”And Ms. Soubry is a fellow Conservative.", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)

With Brexit now the key dividing line in British politics, he said, management of a cabinet has become “close to impossible.”Having already lost her defense secretary, Michael Fallon, and with two other ministers under threat in the sleaze allegations, Mrs. May has declared Ms. Patel’s case closed.
Mr. Johnson described her conviction for sedition as a mockery of justice but also said she had been “simply teaching people journalism.”PhotoMs.
