In [1]:
from sqlite_db import thesisDB, corpusDB, enclose, docData
from pynlpl.clients.frogclient import FrogClient
from lexisnexisparse import LexisParser
import json
import pandas as pd
from ipywidgets import FloatProgress
from IPython.display import display
import re
import os

# Machine-specific configuration: location of the settings file and the
# address of the Frog server that the later cells connect to.
# NOTE(review): these are hard-coded local values; adjust them per environment.
settings_file = 'D:/thesis/settings - nl_final.json'
frogip = '192.168.33.10'
frogport = 9772

#Read settings (the context manager closes the file handle once parsing is done)
with open(settings_file) as settings_fh:
    settings = json.load(settings_fh)["settings"]
dbAddress = settings['db_file']

# Every gzipped JSON file in the configured folder, as a full path.
# os.path.join copes with a folder setting that lacks a trailing separator, and
# sorting makes the processing order reproducible (os.listdir gives no ordering
# guarantee).
datasets = sorted(
    os.path.join(settings['json_folder'], fname)
    for fname in os.listdir(settings['json_folder'])
    if fname.lower().endswith(".json.gz")
)

# Handle on the database that the ingestion cell writes to.
db = thesisDB(dbAddress)

In [28]:
# Open the client connection to the Frog server configured in the first cell.
# NOTE(review): returnall=True presumably makes process() return every output
# column per token; the ingestion loop indexes columns 0, 1, 3 and 4 - confirm
# against the pynlpl FrogClient documentation.
frog = FrogClient(
    frogip,
    frogport,
    returnall=True,
)

In [30]:
# Ingestion: run every document of every dataset through Frog and store the
# document metadata, its tokens and its named entities in the database.

# The 'documents' insert of every COMMIT_EVERY-th document is issued without the
# explicit False flag (see below); everything else is flushed by db.commit()
# at the end of each dataset.
COMMIT_EVERY = 10
# Extension shared by all dataset files; stripped for the progress-bar label.
ARCHIVE_SUFFIX = '.json.gz'

#Progressbar! One widget, relabelled and reset for each dataset.
f = FloatProgress(min = 0, max = 1, bar_style = 'success')
display(f) # display the bar

for dataset in datasets:
    df = pd.read_json(dataset, compression = 'gzip').sort_index()
    # Label the bar with the bare file name (no folder, no extension) instead of
    # slicing at a fixed offset that only fits one particular folder path.
    f.description = os.path.basename(dataset)[:-len(ARCHIVE_SUFFIX)]
    f.value = 0
    f.max = len(df)

    counter = 0 # documents handled so far in this dataset
    for index, row in df.iterrows():
        counter += 1

        ##First input document, save its rowid for cross-reference
        document = {'date':str(row['DATE_dt']),
               'medium':enclose(row['MEDIUM']),
               'headline':enclose(row['HEADLINE']),
               'length':str(row['LENGTH'])}
        if row['BYLINE']: #sometimes byline is None
            document['byline'] = enclose(row['BYLINE'])
        if row['SECTION']: #sometimes sections is None
            document['section'] = enclose(row['SECTION'])
        # NOTE(review): the third insertRow argument presumably controls whether
        # the insert is committed immediately - confirm against sqlite_db.
        if counter % COMMIT_EVERY == 0:
            lastRow = db.insertRow('documents',document)
        else:
            lastRow = db.insertRow('documents',document,False)

        paragraph_no = 1

        entities = []        # finished [text, category] pairs of this document
        entity = ['','']     # entity currently being assembled; '' text = none open

        for paragraph in row['TEXT']:

            position = 1
            # A distinct loop variable keeps the DataFrame `row` intact.
            for token_row in frog.process(paragraph):
                # Rows without a token are skipped and do not consume a position.
                if token_row[0] is None:
                    continue

                token, lemma = token_row[0], token_row[1]
                if token == '"':
                    # A bare double quote is stored under a placeholder name.
                    token = lemma = 'DOUBLE_QUOTE'

                data = {
                    'token':enclose(token),
                    'lemma':enclose(lemma),
                    'paragraph_no':str(paragraph_no),
                    'position':str(position),
                    'docid':str(lastRow),
                    # Only the leading upper-case head of the POS tag is stored;
                    # the long form is left out because it takes storage and is
                    # not needed.
                    'pos':enclose(re.search('^[A-Z]+',token_row[3])[0])
                }

                db.insertRow('tokens',data,False)

                ner_tag = token_row[4]
                if ner_tag == 'O':
                    # Outside any entity: close the open one so that a later
                    # stray continuation tag cannot be glued onto it.
                    if entity[0] != '':
                        entities.append(entity)
                        entity = ['','']
                elif ner_tag.startswith('B-') or entity[0] == '':
                    # A new entity starts here (also when a continuation tag
                    # shows up with nothing open). Save the previous one first.
                    if entity[0] != '':
                        entities.append(entity)
                    entity = [token, re.search('[A-Z]+$',ner_tag)[0]]
                else:
                    entity[0] += ' '+token #append next term of entity

                position += 1

            # Each paragraph is tagged by its own Frog call, so an entity cannot
            # continue into the next paragraph.
            if entity[0] != '':
                entities.append(entity)
                entity = ['','']
            paragraph_no += 1

        for entity_text, category in entities:
            data = {'entity':enclose(entity_text),
                   'category':enclose(category),
                   'docid':str(lastRow)}
            db.insertRow('entities',data,False)


        f.value += 1
    # Flush whatever is still pending for this dataset.
    db.commit()

A Jupyter Widget