## Imports

In [None]:
from itertools import islice
import xml.etree.cElementTree as ET
import datetime
import requests
import json
import time

## Show file

In [None]:
# TODO: download the file from its URL if it does not exist locally

# Dump file to load and the number of leading lines to preview.
name = "plwiki-latest-abstract.xml"
N = 50

# islice stops after N lines, so only the start of the file is read here.
with open(name, encoding="utf8") as dump_file:
    head = list(islice(dump_file, N))

# Lines keep their trailing newline, so print() leaves a blank line after each.
for line in head:
    print(line)

## Parse the file into an XML tree and show the number of nodes

In [None]:
# Parse the whole XML file and keep the root element of the resulting tree.
root = ET.parse(name).getroot()
# Number of direct child elements of the root.
len(root)

## Some articles were not exported cleanly to the XML file, so they should be skipped. I use a simple rule: if the description is not long enough, I ignore it. This also helps with further exploration of the documents, because entries that are too short are worthless

In [None]:
# An abstract must have more than this many non-space characters to be kept.
MIN_CONTENT_LENGTH = 40

# Entries that survive filtering, each a dict with 'Title' and 'Content' keys.
datas = []

for doc in root.findall('doc'):
    # findtext() returns None when the child element is missing and '' when it
    # has no text, so malformed <doc> nodes are skipped instead of raising.
    title = doc.findtext('title')
    content = doc.findtext('abstract')
    if not title or not content:
        continue
    # Spaces are not counted towards the length of the abstract.
    if len(content.replace(" ", "")) <= MIN_CONTENT_LENGTH:
        continue
    datas.append({
        'Title': title.replace("Wikipedia: ", ""),
        'Content': content,
    })

## Preview of the prepared structure that will be added to the Elasticsearch database

In [None]:
# Show the title and content of the first ten entries, one value per line.
for entry in datas[:10]:
    print(entry['Title'], entry['Content'], sep="\n")

## The loaded XML tree should be deleted once it is no longer needed, because it takes up a lot of memory

In [None]:
# Remove the root's sub-elements, attributes and text, then drop the name
# so this cell no longer keeps the parsed tree alive.
root.clear()
del root

## Number of entries left after filtering by length. It is worth checking how many entries have been ignored

In [None]:
# Number of entries currently held in `datas`.
len(datas)

## We can now add the entries to the database using the Elasticsearch REST API

In [None]:
base_push_url = "http://192.168.1.107:9200/wiki_abstract_pl/"
push_url = base_push_url + "_doc/"
headers = {
    'Content-type': 'application/json'
}
# Seconds to wait for the server before a request counts as failed; without a
# timeout a stalled connection would block the loop indefinitely.
REQUEST_TIMEOUT = 30


def _post_entry(data):
    """Send one entry to the index as a JSON document.

    Args:
        data: dict with 'Title' and 'Content' keys, as stored in `datas`.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts,
            or when the server answers with a 4xx/5xx status.
    """
    body = {
        "title": data['Title'],
        "content": data['Content'],
        # %M is minutes (%m would insert the month number instead).
        "post_date": datetime.datetime.today().strftime('%Y-%m-%dT%H:%M:%S'),
    }
    response = requests.post(
        push_url,
        data=json.dumps(body),
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    # requests does not raise on HTTP error statuses by itself; without this a
    # rejected document would be counted as stored.
    response.raise_for_status()


start_time = time.time()

## TODO:
## Bulk add http://queirozf.com/entries/elasticsearch-bulk-inserting-examples

# Entries whose first attempt failed; they get exactly one retry below.
failed_datas = []

for data in datas:
    try:
        _post_entry(data)
    except requests.exceptions.RequestException:
        print("error for:" + json.dumps(data))
        # Pause before continuing so a struggling server can recover.
        time.sleep(10)
        failed_datas.append(data)

# Second and final attempt for everything that failed above.
for data in failed_datas:
    try:
        _post_entry(data)
    except requests.exceptions.RequestException:
        print("permanent error for:" + json.dumps(data))
        time.sleep(3)

print("done")

elapsed_time = time.time() - start_time
print("elapsed time: " + str(elapsed_time) + " seconds")

## Check if the index exists (response status = 200)

In [None]:
# HEAD on the index URL is the existence check: 200 means the index exists,
# 404 means it does not. POST is not an allowed method on a bare index URL.
response = requests.head(base_push_url, timeout=30)
response.status_code