# Maintenance HEDA

### Import modules

In [167]:
from elasticsearch import Elasticsearch

import PyPDF2
import json
import pandas as pd
import numpy  as np

In [168]:
# CONSTANTS
# Directory prefix used when opening the source PDFs for text extraction
PDF_PATH = "./../pdfs/"

# Target file that receives the generated menu JSON
MENU_ARRAY = "../client/src/components/menuArray.json"

# Prefix put in front of every PDF path stored in the menu JSON
PDF_PATH_JS = ""

## Import Settings

In [169]:
# Read the semicolon-separated settings file and turn missing cells into ""
# so that the later comparisons against "" behave as intended.
df_setting = (
    pd.read_csv('./setting.csv', sep=';')
      .replace(np.nan, '', regex=True)
)
df_setting.head()

Unnamed: 0,title,parent,description,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,Aktuelles,,,,,,
1,sub11,Aktuelles,,,,,
2,sub12,Aktuelles,,,,,
3,sub13,Aktuelles,Hier könnte ihre beschreibung stehen Hier könn...,Vertrag 1,ex06.pdf,Vertrag 2,5-3.pdf
4,sub111,sub11,Hier könnte ihre beschreibung stehen Hier könn...,Vertrag 1,ex06.pdf,,


## ElasticSearch
### Read data from the settings file

In [170]:
def createPdfArray(row):
    """Collect the PDF file names listed in one settings row.

    Everything from the fourth column onwards is read as consecutive
    (pdf title, pdf file) pairs; a trailing unpaired column is ignored.
    A pair is kept when its title cell is not the empty string.

    NOTE: a later cell defines another ``createPdfArray`` that returns
    dicts instead of plain file names. Once that cell has run, this
    definition is shadowed, so the Elasticsearch cells must be executed
    before it (or after re-running this cell).

    Parameters
    ----------
    row : pandas.Series
        One row of the settings frame, as produced by ``iterrows()``.

    Returns
    -------
    list
        The file cell of every pair whose title cell is not "".
    """
    # Use .iloc for positional access: plain row[3] on a label-indexed Series
    # relies on the integer-as-position fallback that pandas has deprecated.
    pairs = row.iloc[3:]
    pdfArray = []
    for i in range(len(pairs) // 2):
        if pairs.iloc[2 * i] != "":
            pdfArray.append(pairs.iloc[2 * i + 1])
    return pdfArray

def getChildren(df, dfEs, parent="", path=""):
    """Walk the settings tree below ``parent`` and record the leaf PDFs.

    For every row whose ``parent`` column equals ``parent`` the slash-joined
    path is extended by the row's title. Rows that have children of their own
    are descended into recursively; rows without children add one line to
    ``dfEs`` per PDF returned by ``createPdfArray`` (with an empty "content").

    Parameters
    ----------
    df : pandas.DataFrame
        Settings frame with at least "title", "parent" and "description".
    dfEs : pandas.DataFrame
        Result frame; it is extended in place via ``.loc`` and also returned.
    parent : str
        Title whose direct children are processed ("" selects the roots).
    path : str
        Path of ``parent`` ("" for the roots).

    Returns
    -------
    pandas.DataFrame
        The same ``dfEs`` object that was passed in.
    """
    for _, entry in df[df['parent'] == parent].iterrows():
        pdfFiles = createPdfArray(entry)

        name = entry["title"]
        pathChild = name if path == "" else path + "/" + name

        # Inner node: only its descendants contribute rows.
        if len(df[df["parent"] == name]) != 0:
            getChildren(df, dfEs, name, pathChild)
            continue

        # Leaf: one result row per attached PDF.
        for pdfFile in pdfFiles:
            dfEs.loc[len(dfEs)] = name, pathChild, entry["description"], "", pdfFile

    return dfEs

def getData(df):
    """Return one row per (leaf entry, PDF) of the settings frame ``df``.

    Creates an empty frame with the columns "title", "path", "description",
    "content" and "file" and lets ``getChildren`` fill it, starting at the
    root entries.
    """
    columns = ("title", "path", "description", "content", "file")
    return getChildren(df, pd.DataFrame(columns = columns))

### Read files into DataFrame

In [171]:
def extractPdfFiles(df):
    """Merge the rows of each title and extract the text of their PDFs.

    All rows sharing a title are collapsed into one output row: the text of
    every page of every PDF is concatenated (no separator) into "content" and
    the file names are joined with ", " into "file". "title", "path" and
    "description" are taken from the last row of the group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns "title", "path", "description" and "file";
        "file" is appended to the module-level ``PDF_PATH`` to locate a PDF.

    Returns
    -------
    pandas.DataFrame
        One row per unique title with the columns
        "title", "path", "description", "content", "file".

    Raises
    ------
    OSError
        If a referenced PDF cannot be opened.
    """
    finalDf = pd.DataFrame(columns = ("title", "path", "description", "content", "file"))

    # NOTE(review): grouping is by title only, so equal titles under different
    # parents would be merged into one row - confirm titles are unique.
    for title in df.title.unique():

        dfTemp = df[df["title"] == title]
        if dfTemp.empty:
            # e.g. a NaN title never compares equal to itself; nothing to merge
            continue

        pageTexts = []
        for fileName in dfTemp["file"]:
            # The context manager closes the handle even when parsing fails;
            # all reading happens inside it because the reader uses the file.
            with open(PDF_PATH + fileName, 'rb') as pdfFileObj:
                pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
                for p in range(pdfReader.numPages):
                    pageTexts.append(pdfReader.getPage(p).extractText())

        lastRow = dfTemp.iloc[-1]
        finalDf.loc[len(finalDf)] = (
            lastRow["title"],
            lastRow["path"],
            lastRow["description"],
            ''.join(pageTexts),
            ', '.join(dfTemp["file"]),
        )

    return finalDf

In [172]:
# Settings -> one row per (leaf, PDF) -> one row per title with extracted text
dfEs = getData(df_setting).pipe(extractPdfFiles)
dfEs.head()

Unnamed: 0,title,path,description,content,file
0,sub111,Aktuelles/sub11/sub111,Hier könnte ihre beschreibung stehen Hier könn...,FundamentalsofMachineLearning\nWinterSemester2...,ex06.pdf
1,sub112,Aktuelles/sub11/sub112,Hier könnte ihre beschreibung stehen Hier könn...,FundamentalsofMachineLearning\nWinterSemester2...,"ex06.pdf, 5-3.pdf"
2,sub121,Aktuelles/sub12/sub121,Hier könnte ihre beschreibung stehen Hier könn...,FundamentalsofMachineLearning\nWinterSemester2...,ex06.pdf
3,sub122,Aktuelles/sub12/sub122,Hier könnte ihre beschreibung stehen Hier könn...,FundamentalsofMachineLearning\nWinterSemester2...,"ex06.pdf, 5-3.pdf"
4,sub13,Aktuelles/sub13,Hier könnte ihre beschreibung stehen Hier könn...,FundamentalsofMachineLearning\nWinterSemester2...,"ex06.pdf, 5-3.pdf"


### Create Index in Elasticsearch

In [173]:
# Client created without arguments, i.e. with the library's default connection
# settings; the request logged by the indexing cell went to localhost:9200.
es = Elasticsearch()

In [174]:
# Index every row of dfEs as one document; all values are cast to str.
# NOTE: the recorded run of this cell failed with a 403
# cluster_block_exception (index read-only), see the output below.
col_names = dfEs.columns
for row in range(len(dfEs)):
    record = dfEs.iloc[row]
    body = {name: str(record[name]) for name in col_names}
    es.index(index = 'pypdf_test2', doc_type = 'files', body = body)

POST http://localhost:9200/pypdf_test2/files [status:403 request:0.009s]


AuthorizationException: AuthorizationException(403, 'cluster_block_exception', 'blocked by: [FORBIDDEN/12/index read-only / allow delete (api)];')

## Create JSON Array

### Build the menu entries from the settings

In [175]:
def createNewEntry(path, title, description, pdfArray, submenu):
    """Build one menu node as a JSON-serialisable dict.

    The node has two keys: "values" (title, path, an id derived as
    ``title + "-id"``, description and the PDF list under "pdfpath") and
    "submenu" (the child nodes passed in).
    """
    values = {
        "title": title,
        "path": path,
        "id": title + "-id",
        "description": description,
        "pdfpath": pdfArray,
    }
    return {"values": values, "submenu": submenu}

def createPdfArray(row):
    """Collect the PDFs of one settings row as ``{"title", "path"}`` dicts.

    Everything from the fourth column onwards is read as consecutive
    (pdf title, pdf file) pairs; a trailing unpaired column is ignored.
    A pair is kept when its title cell is not the empty string, and its
    file name is prefixed with the module-level ``PDF_PATH_JS``.

    NOTE: this redefines the ``createPdfArray`` from the Elasticsearch section,
    which returns plain file names. After this cell has run, ``getChildren``
    picks up this version, so re-running the Elasticsearch cells requires
    re-running their definition cell first.

    Parameters
    ----------
    row : pandas.Series
        One row of the settings frame, as produced by ``iterrows()``.

    Returns
    -------
    list of dict
        One ``{"title": ..., "path": ...}`` entry per kept pair.
    """
    # Use .iloc for positional access: plain row[3] on a label-indexed Series
    # relies on the integer-as-position fallback that pandas has deprecated.
    pairs = row.iloc[3:]
    pdfArray = []
    for i in range(len(pairs) // 2):
        pdfTitle = pairs.iloc[2 * i]
        if pdfTitle != "":
            pdfPath = pairs.iloc[2 * i + 1]
            pdfArray.append({"title": pdfTitle, "path": PDF_PATH_JS + pdfPath})
    return pdfArray

# create json array
def createSub(df, jsonArray, parent="", path=""):
    """Build the nested menu structure below ``parent``.

    Every row whose ``parent`` column equals ``parent`` becomes one node
    (see ``createNewEntry``) appended to ``jsonArray``. The node's path is the
    slash-joined chain of titles; its "submenu" is the recursively built list
    of its own children, or an empty list when it has none.

    Parameters
    ----------
    df : pandas.DataFrame
        Settings frame with at least "title", "parent" and "description".
    jsonArray : list
        List that receives the nodes; it is extended in place and returned.
    parent : str
        Title whose direct children are processed ("" selects the roots).
    path : str
        Path of ``parent`` ("" for the roots).

    Returns
    -------
    list
        The same ``jsonArray`` object that was passed in.
    """
    for _, entry in df[df['parent'] == parent].iterrows():
        name = entry["title"]
        text = entry["description"]
        pdfs = createPdfArray(entry)

        pathChild = name if path == "" else path + "/" + name

        # A row is a leaf when no other row names it as parent.
        isLeaf = len(df[df["parent"] == name]) == 0
        children = [] if isLeaf else createSub(df, [], name, pathChild)

        jsonArray.append(createNewEntry(pathChild, name, text, pdfs, children))

    return jsonArray

#### Create JSON and write to file

In [176]:
# Build the complete menu tree, starting at the entries with parent == ""
jsonArray = createSub(df_setting, [])

# Write the tree to MENU_ARRAY as JSON with 4-space indentation
with open(MENU_ARRAY, mode='w') as outfile:
    json.dump(obj=jsonArray, fp=outfile, indent=4)