In [None]:
"""Merge Index on Elasticsearch cloud"""

In [1]:
import glob
import os
import re
import sys
from elasticsearch import Elasticsearch, helpers
import pickle
from bs4 import BeautifulSoup

In [2]:
# The connection URL (which carries the username and password) is read from the
# environment so that secrets never live in the notebook or its history.
# Example: export ES_HOST='https://<user>:<password>@<cluster-host>:9243'
# NOTE(review): a password was previously committed in this cell; it should be rotated.
host = os.environ.get("ES_HOST")
if not host:
    raise RuntimeError("Set the ES_HOST environment variable to the Elasticsearch URL")

# `es` is the module-level client used by yield_docs() further down.
es = Elasticsearch([host], timeout=3000)
print(es.ping())

True


In [3]:
"""
Function: get_files_in_dir()
Input: folder_path: a path to a folder of files to be indexed
Returns: file_path_list: a list of paths to each file in the folder
Does: Gets the names of all files in the folder then appends each
file's path name to a list to return
"""
def get_files_in_dir(folder_path):
    # gets all names of files in directory
    file_list = os.listdir(folder_path)

    # append them to list with their full paths
    file_path_list = []
    for file in file_list:
        file_path_list.append(os.path.join(folder_path, file))

    return file_path_list


"""
Function: get_data_from_text_file()
Input: file: a single file that may contain multiple documents to be indexed
Returns: data: a list of lists; each sub-list is a line from the file
Does: reads each line of the file and appends it in a list to data
"""
def get_data_from_text_file(file):
    # declare an empty list for the data
    data = []
    for line in open(file, encoding="ISO-8859-1", errors='ignore'):
        data += [str(line)]
    return data

In [5]:
"""
Function: yield_docs()
Input: files: a list of each file path that we want to index (each file contains one doc)
Returns: null
Does: For each file, get the fields that we need and do some text clean up. Check if the doc is already in the corpus.
If it is, update the author and inlinks. If it isn't stage it to be indexed. 
"""
def yield_docs(files):
    
    # each file contains one doc
    for count, file in enumerate(files):
        
        # retrieve data from file
        doc = get_data_from_text_file(file)
        doc = "".join(doc)
        
        # get doc no
        docno_s = doc.find("<DOCNO>") + len("<DOCNO>") 
        docno_e = doc.find("</DOCNO>")
        docno = doc[docno_s:docno_e].strip()

        # get title
        title_s = doc.find("<TITLE>") + len("<TITLE>") 
        ttile_e = doc.find("</TITLE>")
        title = doc[title_s:ttile_e].strip()

        # find author
#         author_s = doc.find("<AUTHOR>") + len("<AUTHOR>") 
#         author_e = doc.find("</AUTHOR>")
#         author = doc[author_s:author_e].strip()
        author = "Melanie"

        # find text
        text_s = doc.find("<TEXT>") + len("<TEXT>") 
        text_e = doc.find("</TEXT>")
        text = doc[text_s:text_e].strip()
        text = re.sub(r'\n+', '\n', text).strip()
        text = text.lower()
        
        # text cleaning
        text_start_cut = text.find("jump to search")
        if text_start_cut != -1:
            text = text[text_start_cut+len("jump to search"):]
        text_end_cut3 = text.find("sources[edit]")
        if text_end_cut3 != -1:
            text = text[:text_end_cut3]
        text_end_cut4 = text.find("this page was last edited")
        if text_end_cut4 != -1:
            text = text[:text_end_cut4]
        text_end_cut5 = text.find("navigation menu")
        if text_end_cut5 != -1:
            text = text[:text_end_cut5]
        text = text.replace("[edit]", " ")

        # find outlinks
        out_s = doc.find("<OUTLINKS>") + len("<OUTLINKS>") 
        out_e = doc.find("</OUTLINKS>")
        outlinks = doc[out_s:out_e].strip()
        outlinks = outlinks.split("http://")
        outlinks_final = []
        for each in outlinks:
            temp = "http://" + each
            outlinks_final.append(temp)
        if "http://" in outlinks_final:
            outlinks_final.remove("http://")

        # find inlinks
        in_s = doc.find("<INLINKS>") + len("<INLINKS>") 
        in_e = doc.find("</INLINKS>")
        inlinks = doc[in_s:in_e].strip()
        inlinks = inlinks.split("http://")
        inlinks_final = []
        for each in inlinks:
            temp = "http://" + each
            inlinks_final.append(temp)
        if "http://" in inlinks_final:
            inlinks_final.remove("http://")

        # find raw html
        raw_s = doc.find("<RAW_HTML>") + len("<RAW_HTML>") 
        raw_e = doc.find("</RAW_HTML>")
        raw = doc[raw_s:raw_e].strip()
        
        # figure out if doc is already in index
        body_check = {
            "query": {
                "match": {
                    "_id": docno
                }
            }
        }
        resp = es.search(index="corpus_wwii", body=body_check)
        if resp["hits"]["total"]["value"] != 0:
            
            # get inlinks and update
            r_author = resp["hits"]["hits"][0]["_source"]["author"]
            if "Melanie" not in r_author:
                new_author = r_author + " " + author
                
                r_inlinks = resp["hits"]["hits"][0]["_source"]["inlinks"]
                r_inlinks_set = set(resp["hits"]["hits"][0]["_source"]["inlinks"].split("\n"))
                r_inlinks_new = r_inlinks + "\n"
                for each in inlinks_final:
                    if each not in r_inlinks_set:
                        r_inlinks_new += each + "\n"

                # use update() API to add new inlinks and author
                source_to_update = {
                    "doc" : {
                        "inlinks" : r_inlinks_new,
                        "author" : new_author
                    }
                }

                es.update(index='corpus_wwii', doc_type="_doc", id=docno, body=source_to_update)
            
        else:
            # yield doc for upload
            formatted_inlinks = "\n".join(inlinks_final)
            formatted_outlinks = "\n".join(outlinks_final)
            doc_source = {
                "docno": docno,
                "title": title,
                "text": text,
                "outlinks": formatted_outlinks,
                "inlinks": formatted_inlinks,
                "rawhtml": raw,
                "author": author
            }
#             add_body = {
#                 "_index": "corpus_wwii",
#                 "_id": docno,
#                 "_source": doc_source
#             }
            es.index(index='corpus_wwii', doc_type='_doc', id=docno, body=doc_source)


In [9]:
# For the test merge we should see that the melanie1-4 docs were added, and that
# history9 gained an additional inlink and had Melanie added to its author field.

# Folder of crawled documents. Set the DOCS_DIR environment variable to point at
# another location instead of editing this cell; the default is the original path.
file_path = os.environ.get("DOCS_DIR", "C:/6200-IR/hw3-mplatt27/docs/")
all_files = get_files_in_dir(file_path)

In [10]:
# Sanity check: number of paths returned by get_files_in_dir() for file_path
print(len(all_files))

40000


In [11]:
# Run the merge over every collected path. For each document yield_docs() either
# indexes it as new or updates the stored inlinks/author, so this cell writes to
# the Elasticsearch index.
yield_docs(all_files)