# This notebook is for the project Portugal Overseas Identity - Historical Network Research

@Agatha, @Michal: feel free to add anything, like explanations about the project, comments, etc.

## Workflow:

1- To improve accuracy on existing entity types. The entity types already supported by the spaCy xx_ent_wiki_sm model (multi-language, including Portuguese) are listed here: https://spacy.io/docs/usage/entity-recognition#entity-types

2- To extend the named entity recognizer with new entity types. The extended model will be saved as pt_poihnr. Entity types to be added:
- Role
- Title?
- Type of document?

3- Once the NER model is satisfactory, to extract senders and recipients, as well as their roles, from the data.

## Importing packages

In [1]:
import pandas as pd
import json
import spacy
import xx_ent_wiki_sm
import sys
import random
import re
import csv
import os

##### Creating a new file with only the full_text column in it and without square brackets

In [None]:
def create_fulltext_file():
    """Write the bracket-free ``full_text`` column to ``poi_full_text.csv``.

    Reads ``portugal_documents.csv`` from the working directory, removes every
    ``[`` and ``]`` character from each ``full_text`` value and writes the
    result, one document per row under a ``full_text`` header, to
    ``poi_full_text.csv``.

    Rows whose ``full_text`` cell is missing are written as an empty string,
    so the output keeps one row per input row.
    """
    df = pd.read_csv('portugal_documents.csv')
    # pandas returns missing cells as float NaN, which the regex substitution
    # cannot process; replace them with an empty string first.
    full_text = df['full_text'].fillna('').values
    brackets = re.compile(r'[\[\]]')

    with open('poi_full_text.csv', 'w') as outcsv:
        writer = csv.writer(outcsv)
        writer.writerow(['full_text'])
        for text in full_text:
            writer.writerow([brackets.sub('', text)])

In [None]:
# Generates poi_full_text.csv from portugal_documents.csv (both in the working directory).
create_fulltext_file()

## Reading CSV file and creating JSON/TXT files with the data to train the model

In [8]:
df = pd.read_csv('poi_full_text.csv') #reads all the data and stores it in a dataframe
full_text = df['full_text'].values    #gets only the 'full_text' column data as a numpy array

In [9]:
##### DUMPING IN A JSON FILE #####
def dump_json(full_text, sample_size=200, max_index=169221):
    """Dump a random sample of documents and their entities to a JSON file.

    Each sampled document is run through the spaCy ``xx_ent_wiki_sm`` model.
    The entities it reports, followed by ``ROLE`` entities matched from
    ``roles_poi.csv``, are collected per document and written to
    ``train_poi_data.json`` under the ``train_data`` key.

    DATE correction: when an entity's text is one of the Portuguese month
    names, it is replaced by the whole ``YYYY, month, DD`` span found in the
    document by regex; if the document has no such span, the entity is kept
    and only relabelled ``DATE``.

    Written for Python 2 (uses ``reload(sys)`` / ``sys.setdefaultencoding``).

    Args:
        full_text: indexable collection of UTF-8 encoded byte strings, one
            per document.
        sample_size: number of documents to draw. Draws are independent, so
            the same document can be drawn more than once.
        max_index: highest index of ``full_text`` that may be drawn
            (inclusive). The default is the previously hard-coded value;
            presumably the last index of the full corpus - confirm when
            running on a different data set.
    """
    # Identified mixed labels: DATE as PER
    months = ['Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho', 'Julho',
              'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro']

    # NOTE(review): the mixed byte-string/unicode handling below appears to
    # depend on this default-encoding override - confirm before removing it.
    reload(sys)
    sys.setdefaultencoding('utf8')

    nlp = xx_ent_wiki_sm.load()  # NLP model for 'international' languages

    # Roles manually listed in the csv file. Each entry is used as a regular
    # expression, compiled once here instead of once per document.
    # NOTE(review): entries are not escaped, so regex metacharacters in a
    # role are interpreted as such - confirm this is intended.
    roles = pd.read_csv('roles_poi.csv')['roles'].values
    role_patterns = [re.compile(role.decode('utf-8')) for role in roles]

    # Date written as: YYYY, month, DD
    date_pattern = re.compile(r'\d{4},(.*),\ \d{1,2}')

    train_poi = []
    random.seed(2001)  # fixed seed so the sample is reproducible
    for _ in range(sample_size):
        index = random.randint(0, max_index)  # random sampling
        text = full_text[index].decode('utf-8')
        doc = nlp(text)
        date_match = date_pattern.search(text)

        # Entities found by the model, with the DATE correction applied.
        entity_list = []
        for ent in doc.ents:
            ent_text = ent.text.decode('utf-8')
            entity_dict = {'entity_text': ent_text, 'start': ent.start_char,
                           'end': ent.end_char, 'label': ent.label_.encode('utf-8')}

            if ent_text in months:
                if date_match is not None:
                    entity_dict = {'entity_text': date_match.group(0),
                                   'start': date_match.span()[0],
                                   'end': date_match.span()[1],
                                   'label': 'DATE'}
                else:
                    entity_dict['label'] = 'DATE'

            entity_list.append(entity_dict)

        # ROLE entities: only the first occurrence of each role is recorded.
        for pattern in role_patterns:
            role_match = pattern.search(text)
            if role_match is not None:
                entity_list.append({'entity_text': role_match.group(0),
                                    'start': role_match.span()[0],
                                    'end': role_match.span()[1],
                                    'label': 'ROLE'})

        train_poi.append({'text': text, 'entity': entity_list})

    output = {
        'train_data': train_poi
    }

    with open('train_poi_data.json', 'wb') as jsonfile:
        json.dump(output, jsonfile, ensure_ascii=False, indent=2)

In [10]:
##### WRITING THE JSON FILE #####
# Writes train_poi_data.json from the module-level full_text array.
dump_json(full_text)

In [None]:
##### DUMPING IN A TXT FILE #####
def dump_txt(full_text, sample_size=4230, max_index=169221):
    """Dump a random sample of documents and their entities to one TXT file.

    For every sampled document, ``train_poi_data.txt`` receives the document
    text (leading spaces stripped), then one ``(text; start; end; LABEL)``
    line per entity reported by the spaCy ``xx_ent_wiki_sm`` model, then two
    blank lines. No ROLE entities are written.

    DATE correction: an entity whose text is one of the Portuguese month
    names is written as the whole ``YYYY, month, DD`` span found in the
    document by regex, labelled ``DATE``.
    NOTE(review): when the document has no such span, the month entity is
    not written at all, whereas ``dump_json`` keeps it relabelled as DATE -
    confirm which behaviour is intended.

    Written for Python 2 (uses ``reload(sys)`` / ``sys.setdefaultencoding``).

    Args:
        full_text: indexable collection of UTF-8 encoded byte strings, one
            per document.
        sample_size: number of documents to draw; the default was described
            as a sample of 2.5% of the whole corpus. Draws are independent,
            so the same document can be drawn more than once.
        max_index: highest index of ``full_text`` that may be drawn
            (inclusive). The default is the previously hard-coded value;
            presumably the last index of the full corpus - confirm when
            running on a different data set.
    """
    # NOTE(review): the mixed byte-string/unicode handling below appears to
    # depend on this default-encoding override - confirm before removing it.
    reload(sys)
    sys.setdefaultencoding('utf8')

    # Identified mixed labels: DATE as PER
    months = ['Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho', 'Julho',
              'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro']

    nlp = xx_ent_wiki_sm.load()

    # Date written as: YYYY, month, DD
    date_pattern = re.compile(r'\d{4},(.*),\ \d{1,2}')

    random.seed(2001)  # fixed seed so the sample is reproducible
    # The context manager closes the file even if a document fails midway.
    with open('train_poi_data.txt', 'w') as g:
        for _ in range(sample_size):
            index = random.randint(0, max_index)  # random sampling
            text = full_text[index].decode('utf-8')
            doc = nlp(text)
            date_match = date_pattern.search(text)

            g.write(doc.text.lstrip(' '))
            g.write('\n')

            for ent in doc.ents:
                if ent.text in months:
                    # Correct the DATE entity for the training file.
                    if date_match is not None:
                        g.write('(' + date_match.group(0) + '; ' + str(date_match.span()[0])
                                + '; ' + str(date_match.span()[1]) + '; DATE)')
                        g.write('\n')
                else:
                    g.write('(' + ent.text + '; ' + str(ent.start_char) + '; '
                            + str(ent.end_char) + '; ' + ent.label_.encode('utf-8') + ')')
                    g.write('\n')

            # Two blank lines separate consecutive documents.
            g.write('\n')
            g.write('\n')

In [None]:
##### WRITING THE TXT FILE #####
# Writes train_poi_data.txt from the module-level full_text array.
dump_txt(full_text)

In [None]:
##### DUMPING INDIVIDUAL TXT FILES IN A FOLDER #####
def dump_txt_individual(full_text, sample_size=4230, max_index=169221):
    """Dump a random sample of documents to individual TXT files.

    Creates the ``train_poi_data`` folder if needed and writes one file per
    sampled document, named ``<draw number>_<document id>.txt``. The document
    id is the run of digits at the start of the text (after leading spaces)
    that is followed by ``-`` or ``.``; when there is none the file is named
    ``<draw number>_noid.txt``.

    Each file holds the document text (leading spaces stripped) followed by
    one ``(text; start; end; LABEL)`` line per entity reported by the spaCy
    ``xx_ent_wiki_sm`` model. No ROLE entities are written.

    DATE correction: an entity whose text is one of the Portuguese month
    names is written as the whole ``YYYY, month, DD`` span found in the
    document by regex, labelled ``DATE``.
    NOTE(review): when the document has no such span, the month entity is
    not written at all, whereas ``dump_json`` keeps it relabelled as DATE -
    confirm which behaviour is intended.

    Written for Python 2 (uses ``reload(sys)`` / ``sys.setdefaultencoding``).

    Args:
        full_text: indexable collection of UTF-8 encoded byte strings, one
            per document.
        sample_size: number of documents to draw; the default was described
            as a sample of 2.5% of the whole corpus. Draws are independent,
            so the same document can be drawn more than once.
        max_index: highest index of ``full_text`` that may be drawn
            (inclusive). The default is the previously hard-coded value;
            presumably the last index of the full corpus - confirm when
            running on a different data set.
    """
    # NOTE(review): the mixed byte-string/unicode handling below appears to
    # depend on this default-encoding override - confirm before removing it.
    reload(sys)
    sys.setdefaultencoding('utf8')

    # Identified mixed labels: DATE as PER
    months = ['Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho', 'Julho',
              'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro']

    nlp = xx_ent_wiki_sm.load()

    # Date written as: YYYY, month, DD
    date_pattern = re.compile(r'\d{4},(.*),\ \d{1,2}')
    # Leading document id: digits followed by '-' or '.'
    id_pattern = re.compile(r'(\d+)[-.]')

    out_dir = 'train_poi_data'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    random.seed(2001)  # fixed seed so the sample is reproducible
    for i in range(sample_size):
        index = random.randint(0, max_index)  # random sampling
        text = full_text[index].decode('utf-8')
        doc = nlp(text)

        date_match = date_pattern.search(text)
        id_match = id_pattern.match(text.lstrip(' '))

        suffix = id_match.group(1) if id_match is not None else 'noid'
        path = os.path.join(out_dir, str(i) + '_' + suffix + '.txt')

        # The context manager closes each file even if writing fails midway.
        with open(path, 'w') as g:
            g.write(doc.text.lstrip(' '))
            g.write('\n')

            for ent in doc.ents:
                if ent.text in months:
                    # Correct the DATE entity for the training file.
                    if date_match is not None:
                        g.write('(' + date_match.group(0) + '; ' + str(date_match.span()[0])
                                + '; ' + str(date_match.span()[1]) + '; DATE)')
                        g.write('\n')
                else:
                    g.write('(' + ent.text + '; ' + str(ent.start_char) + '; '
                            + str(ent.end_char) + '; ' + ent.label_.encode('utf-8') + ')')
                    g.write('\n')

In [None]:
##### CREATING THE FOLDER AND WRITING INDIVIDUAL TXT FILES IN THERE #####
# Writes one file per sampled document into train_poi_data/ from the module-level full_text array.
dump_txt_individual(full_text)

# Testing Area!