# 1. Basic Setup
    a. Elasticsearch config (ES_CONFIG)
    b. Input file path (PATH)
    c. Necessary imports

In [1]:
import logging
# Root logger format and threshold. With the level at WARNING, the
# logging.info(...) progress messages emitted by prep_training_data are
# filtered out; lower the level to INFO to see them.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARNING)
from collections import namedtuple
import spacy
from rcare.nlp.customize_nlp import rcare
# NOTE: this rebinds the imported name `rcare` (the callable imported from
# rcare.nlp.customize_nlp) to the object that callable returns, so the callable
# is no longer reachable under that name after this line.
rcare = rcare(spacy.load('en_core_web_sm'))
# Module-level pipeline used below: passed to get_sections and called directly
# on each section's text in prep_training_data.
nlp = rcare.nlp
from rcare.nlp.doc2vec_rcare import SentenceDocument
from nltk.tokenize import sent_tokenize 

import pandas as pd
from rcare.readutils import DataFrameUtils as Utils
from collections import namedtuple
from rcare.nlp.sections_rcare import get_sections

from rcare.es_helper import ESHelper
# Settings handed to ESHelper: a list with one node entry (host and port are
# both strings) plus a "log" setting.
ES_CONFIG = {
    "host": [{"host": "localhost", "port": "9200"}],
    "log": "error",
}

# Module-level helper built from ES_CONFIG; used further down for the bulk upload.
esHelper = ESHelper(ES_CONFIG)
import pandas as pd
# The triple-quoted block below is a bare string literal, not executed code.
# It preserves an earlier, disabled way of choosing input files: a SQL query
# over a `meta_data` table followed by slicing the resulting file-name list.
'''
from rcare.sqlutils import SqlUtils
sqlUtils = SqlUtils("/home/paperspace/dev/sqllite-node/app/data/training.db")
path = "/home/paperspace/dev/en-nlp/data/lexpredict-contraxsuite-samples/cca_2011_Q3/text/"
pd_txtFileNames = sqlUtils.select("select txtFileName from meta_data where useForTraining = 1 \
and agreementFileName like 'creditcardagreement_%' and lineCount > 200 \
order by agreementFileName")
filenames = list(Utils.flatten(pd_txtFileNames[:30].values )) 
filenames = filenames[18:19]
#filenames[18:19]
'''
#PATH SETUP for files
# Directory prefix for the input files. The prep functions build each file path
# as "{path}{name}" with no separator inserted, so the trailing slash matters.
PATH = "/home/paperspace/dev/en-nlp/data/demo/"  



# 2. Function Definitions 

In [2]:
# Immutable record types filled in by prep_training_data.

# One input file: its raw text plus the sections, sentences and entities
# extracted from it (with their counts). `rank` is left as None when built here.
Document = namedtuple(
    'Document',
    ['id', 'name', 'filePath', 'text', 'sectionCount', 'sections',
     'sentCount', 'sents', 'ents', 'rank'],
)

# One section of a document; `index` holds a lower-cased "<name>_<sectionId>" key.
Section = namedtuple(
    'Section',
    ['id', 'sectionId', 'text', 'sentCount', 'index', 'rank'],
)

# One sentence within a section, carrying the text of the section it came from.
Sentence = namedtuple(
    'Sentence',
    ['id', 'sectionId', 'sentId', 'text', 'startChar', 'endChar',
     'sectionText', 'rank'],
)

# One named entity found in a section, with its character offsets and label.
Entity = namedtuple(
    'Entity',
    ['sectionId', 'text', 'startChar', 'endChar', 'label'],
)

def __tokens__(self, text, spacy_obj = False):
    """Reduce a text to a filtered list of token strings.

    Args:
        self: object exposing an ``nlp`` callable; only used when
            ``spacy_obj`` is false.
        text: raw text, or an already-parsed iterable of tokens when
            ``spacy_obj`` is true.
        spacy_obj: when true, ``text`` is iterated as-is instead of being
            passed through ``self.nlp`` first.

    Returns:
        list of str, built per token in order:
          * digits, number-like tokens and tokens flagged ``_.is_cardinal``
            are dropped (``is_cardinal`` is a custom extension attribute,
            presumably registered by the rcare pipeline -- confirm);
          * punctuation is kept verbatim;
          * stop words are kept lower-cased;
          * URL-like tokens are dropped;
          * tokens whose lemma is ``'-PRON-'`` are kept lower-cased;
          * anything else contributes its lemma, unless its stripped text is
            empty or a lone ``'$'``.
    """
    doc = text if spacy_obj else self.nlp(text)
    # After strip() only '' and '$' can actually match; the newline entries are
    # kept so the comparison is exactly the original one.
    ignored_texts = ('\n', '$','\n\n' , '\n\n\n', '\n\n\n\n','' )

    kept = []
    for tok in doc:
        # Numeric material of any kind never reaches the output.
        if tok.is_digit or tok.like_num or tok._.is_cardinal:
            continue

        if tok.is_punct:
            kept.append(tok.text)
        elif tok.is_stop:
            kept.append(tok.lower_)
        elif tok.like_url:
            continue
        elif tok.lemma_ == '-PRON-':
            kept.append(tok.lower_)
        elif tok.text.strip() not in ignored_texts:
            kept.append(tok.lemma_)

    return kept
    
def get_id(item):
    """Return the record's ``'name'`` entry rendered as a string.

    Passed as the ``get_id`` callback of the bulk upload, so the value returned
    here is what identifies each uploaded record.
    """
    name = item['name']
    # format(x) with an empty spec is what "{}".format(x) does under the hood.
    return format(name)

def prep_sent_tokenize( filenames, path, numfiles = None):
    """Return every sentence of the given files as one flat list.

    Args:
        filenames: forwarded unchanged to ``Utils.read_data_gen``.
        path: forwarded unchanged to ``Utils.read_data_gen``.
        numfiles: forwarded unchanged to ``Utils.read_data_gen``.

    Returns:
        list: the sentences produced by NLTK's ``sent_tokenize`` for each
        file's ``text``, in the order the files are yielded. Empty when no
        file is yielded.
    """
    sents = []
    for fileData in Utils.read_data_gen(filenames = filenames, path = path, numfiles = numfiles):
        # extend() grows the list in place; the former `sents = sents + ...`
        # re-copied everything collected so far for every file.
        sents.extend(sent_tokenize(fileData.text))
    return sents
        
def prep_training_data( filenames, path, numfiles = None):
    """Build one ``Document`` record per input file.

    Each file yielded by ``Utils.read_data_gen`` is split into sections with
    ``get_sections`` (``score_threshold=0.6``, module-level ``nlp``). Every
    section is run through ``nlp`` to collect its entities and through NLTK's
    ``sent_tokenize`` to collect its sentences.

    Args:
        filenames: forwarded unchanged to ``Utils.read_data_gen``.
        path: forwarded to ``Utils.read_data_gen``; also prefixed to each file
            name to form ``Document.filePath`` (plain concatenation, no
            separator is inserted).
        numfiles: forwarded unchanged to ``Utils.read_data_gen``.

    Returns:
        list of Document: one per file, in the order yielded. ``rank`` is None
        on every record, and ``Sentence.startChar`` / ``endChar`` are always -1
        (no offsets are computed here).
    """
    taggedDocs = []
    for fileData in Utils.read_data_gen(filenames = filenames, path = path, numfiles = numfiles):
        name = fileData.name
        text = fileData.text
        filePath = "{}{}".format(path, name)

        logging.info("Processing {}".format(name))
        sections = []
        docSents = []
        docEnts = []
        for sectionId, sectionText in enumerate(get_sections(text, score_threshold=0.6, nlp = nlp)):
            # A whitespace-only section is replaced by a single space so the
            # pipeline and the sentence tokenizer both get non-empty input.
            stripped = sectionText.strip() or " "
            section = nlp(stripped)

            sents = [
                Sentence("{}_{}_{}".format(name, sectionId, sentId), sectionId, sentId, sent,
                         -1, -1, section.text, None)
                for sentId, sent in enumerate(sent_tokenize(stripped))
            ]

            # Section.text keeps the raw (unstripped) section text, whereas
            # Sentence.sectionText above carries the parsed, stripped text.
            sections.append(Section("{}_{}".format(name, sectionId),
                                    sectionId, sectionText, len(sents),
                                    "{}_{}".format(name.lower(), sectionId), None))
            docSents.extend(sents)
            docEnts.extend(
                Entity(sectionId, ent.text, ent.start_char, ent.end_char, ent.label_)
                for ent in section.ents
            )

        taggedDocs.append(Document("{}".format(name), name, filePath, text,
                                   len(sections), sections,
                                   len(docSents), docSents, docEnts, None))
    logging.info("prep_training_data done ")
    return taggedDocs

# 3. Prepares the file data for Elasticsearch

In [3]:
# Build the Document records for the single demo file located under PATH.
tdocs = prep_training_data(
    ["demo_bokvisaplat.txt"],
    path = PATH,
    numfiles = None,
)


# 4. Upserts the data into Elasticsearch

In [4]:
 # Turn each Document namedtuple into a {field: value} record (via a DataFrame
 # whose columns are the namedtuple's fields) and bulk-send the records to the
 # "demo.meta" index; get_id derives each record's id from its 'name' field.
 # NOTE: tdocs[0] raises IndexError when no document was prepared.
 esHelper.bulk_stream_collection(
     pd.DataFrame(tdocs, columns = tdocs[0]._fields).to_dict(orient='records'),
     index = "demo.meta",
     doc_type = "data",
     get_id = get_id,
 )

Success count: 1, Failure count: 0


# 5. Debug (if any)

In [5]:
# Spot-check the first prepared document: its name and totals, then one
# delimited line per section with a short text preview, id and sentence count.
doc = tdocs[0]
print(doc.name, doc.sectionCount, doc.sentCount)

for sec in doc.sections:
    preview = sec.text[:40]  # first 40 characters only; may span several lines
    print("++++++++++++++")
    print(preview, sec.sectionId, sec.sentCount)
    print("**************")

#tdocs[0].sents[0].id



demo_bokvisaplat.txt 23 183
++++++++++++++
BBOK VISA CLASSIC & VISA PLATINUM  
CARD 0 9
**************
++++++++++++++
3. This agreement governs any account fo 1 12
**************
++++++++++++++
6. You authorize us to charge to your ac 2 9
**************
++++++++++++++
9. You agree to pay us within 25 days fr 3 17
**************
++++++++++++++
B. CASH ADVANCES  A Finance Charge will  4 4
**************
++++++++++++++
11. INTEREST RATE
A. PURCHASES 
If your  5 6
**************
++++++++++++++
B. CASH ADVANCES  If your account is sub 6 4
**************
++++++++++++++
12. OTHER CHARGES – In addition to the F 7 2
**************
++++++++++++++
A. LATE PAYMENT FEE
If you do not pay th 8 2
**************
++++++++++++++
B. RETURN CHECK FEE
We will assess a ret 9 2
**************
++++++++++++++
C. INTERNATIONAL TRANSACTION FEE
Foreign 10 3
**************
++++++++++++++
E. All costs of collections including re 11 1
**************
++++++++++++++
F. ATM USAGE FEE
A fee may be imposed by 12 2
*******