# This notebook is for the project Portugal Overseas Identity - Historical Network Research

@Agatha, @Michal: feel free to add anything, like explanations about the project, comments, etc.

## Workflow:

1- To improve accuracy on the existing entity types. The entity types that already exist in the spaCy xx_ent_wiki_sm model (multi-language, including Portuguese) are listed here: https://spacy.io/docs/usage/entity-recognition#entity-types
UPDATE: spaCy has since been updated to version 2.0, which ships a trained Portuguese model. For our purposes, however, that model is slightly less accurate, so re-training is even more important now.

2- To extend the named entity recognizer. New entity types to be added to the model. Model will be saved as pt_poihnr. Entities to be added:
- Role as `ROLE`
- Type of document as `TYPE`
- Affiliation as `AFF`
- Date as `DATE`


3- After having a satisfactory NER model, to get senders and recipients from data as well as their roles. This is done using a mix of previous scripts using regular expressions and NER.

## Importing packages

In [1]:
import pandas as pd
import json
import spacy
import sys
import random
import re
import csv
import os

## Preparing data for training the NER model

In [None]:
# This function reads 'portugal_documents.csv' (the original csv document of the project) and
# uses only the 'full_text' column to create a new file with this information edited without square
# brackets.

def create_fulltext_file():
    """Write 'poi_full_text.csv' holding the bracket-free 'full_text' column.

    Reads 'portugal_documents.csv' from the working directory, removes every
    '[' and ']' character from each 'full_text' value and writes the cleaned
    values, one per row, below a single 'full_text' header row.
    """
    source_texts = pd.read_csv('portugal_documents.csv')['full_text'].values

    # Character class that matches either square bracket.
    brackets = ['[', ']']
    bracket_rx = '[' + re.escape(''.join(brackets)) + ']'

    with open('poi_full_text.csv', 'w') as outcsv:
        writer = csv.writer(outcsv)
        writer.writerow(['full_text'])
        for raw_text in source_texts:
            cleaned = re.sub(bracket_rx, '', raw_text)
            writer.writerow([cleaned])

In [None]:
# Generates 'poi_full_text.csv' from 'portugal_documents.csv' in the working directory.
create_fulltext_file()

In [None]:
# This function reads the new 'poi_full_text.csv' created from function 'create_fulltext_file'. 
# It creates 4230 txt files, with only the abstract of the full_text (no technical information) picked
# randomly.
# These files feed maxQDA for tagging the entities, their types and their position in
# the text. Moreover, they will serve as the training data for the NER model. 

def create_abstract_files():
    """Write the randomly sampled abstract files used as input for maxQDA.

    Reads 'poi_full_text.csv' and, for 4230 random draws (fixed seed 2001,
    row indices 0..169221), writes one text file per draw into
    'train_poi_data_abstract_only/'. Each file is named
    '<draw index>_<document id>.txt', or '<draw index>_noid.txt' when the text
    does not start with a numeric id followed by '-' or '.'.

    Only the abstract is written: the text up to the first '. ' that is
    followed by a word starting with 'An' + 3-4 word characters, 'AH', 'N' or
    'Obs'. When no such marker is found the whole text is written unchanged.

    NOTE: the seed, the number of draws and the index range determine the file
    names that the tagged maxQDA data refers to, so they are kept as they are.
    """
    # Python 2 only: makes implicit str/unicode conversions use UTF-8 so the
    # decoded text can be written to a plain file object.
    reload(sys)
    sys.setdefaultencoding('utf8')

    df = pd.read_csv('poi_full_text.csv')
    full_text = df['full_text'].values

    if not os.path.exists('train_poi_data_abstract_only'):
        os.makedirs('train_poi_data_abstract_only')

    # Compiled once instead of on every iteration.
    id_pattern = re.compile(r'(\d+)[-.]')
    abstract_pattern = re.compile(r'.+?(?=\.\s\b(An\w{3,4}|AH|N|Obs))')

    random.seed(2001)
    for i in range(4230):  #sample of 2.5% of the whole corpus

        a = random.randint(0,169221) #random sampling
        text = full_text[a].decode('utf-8')

        id_match = id_pattern.match(text.lstrip(' '))
        if id_match is not None:
            file_name = str(i)+'_'+id_match.group(1)+'.txt'
        else:
            file_name = str(i)+'_noid.txt'

        match = abstract_pattern.search(text)

        # 'with' guarantees the file is closed even if a write fails.
        with open('train_poi_data_abstract_only/'+file_name,'w') as g:
            if match:
                g.write(match.group(0))
                g.write('.')
                g.write('\n')
            else:
                g.write(text)

In [None]:
# Writes the sampled abstract files into 'train_poi_data_abstract_only/'.
create_abstract_files()

### After maxQDA

As a result of the work done with maxQDA, two '.rar' files were created with several excel files (corresponding to whoever used maxQDA for tagging the entities): AGATA MAXQDA.rar and OUTROS.rar. 

In [None]:
# Renaming the class of entities for training the NER model later
# Folder names (under 'maxqda_tagging/') that hold the maxQDA exports, one per entity class.
ner_folders = ['DATE', 'INSTITUTION', 'LOCALIZATION', 'PERSON', 'ROLE', 'TYPE', 'AFFILIATION']
# Labels written to the training files; ners[i] is the label used for ner_folders[i].
ners = ['DATE', 'ORG', 'LOC', 'PER', 'ROLE', 'TYPE', 'AFF']

In [None]:
# First the files are rearranged in folders by entity class: AFFILIATION, DATE, INSTITUTION, 
# LOCALIZATION, PERSON, ROLE AND TYPE (type of the document).
# Obs: some data cleaning was needed in this stage 

def rearrange_maxQDA_files(ner_folders):
    """Move the maxQDA export files into one folder per entity class.

    Walks every sub-folder of 'maxqda_tagging/AGATA MAXQDA' and
    'maxqda_tagging/OUTROS' (in sorted order) and moves each file whose name
    contains one of the given class names to 'maxqda_tagging/<class name>/'.
    Files whose name contains none of the class names are left in place.

    Args:
        ner_folders: class names to look for in the file names; each one is
            also the name of the target folder under 'maxqda_tagging/'.

    Raises:
        OSError: if one of the two source folders is missing or a move fails.
    """
    for source in ('AGATA MAXQDA', 'OUTROS'):
        source_root = 'maxqda_tagging/' + source
        for folder in sorted(os.listdir(source_root)):
            folder_path = source_root + '/' + folder
            for document in sorted(os.listdir(folder_path)):
                for ner in ner_folders:
                    if ner not in document:
                        continue
                    target_dir = 'maxqda_tagging/' + ner
                    # os.rename does not create missing target folders.
                    if not os.path.isdir(target_dir):
                        os.makedirs(target_dir)
                    os.rename(folder_path + '/' + document, target_dir + '/' + document)
                    # The file is gone from the source folder now; trying a
                    # second matching class would fail on the missing file.
                    break

In [None]:
# Pass the folder names, not the short labels: write_training_files() reads the
# rearranged files from 'maxqda_tagging/<ner_folders[i]>'.
rearrange_maxQDA_files(ner_folders)

In [None]:
# Writing training files with maxQDA output files
# Obs: INPUT are the maxQDA files that NEED editing for corrections. 
# UPDATE 13-03-2018: AFFILIATION files OK!
# UPDATE 24-04-2018: all entity classes OK! - Agata

def write_training_files(ner_folders,ners):
    """Build the NER training files from the tagged maxQDA spreadsheets.

    For every Excel file in 'maxqda_tagging/<ner_folders[i]>/', the first
    sheet is read without a header; the first row is skipped (presumably the
    header row - confirm against the exports). Column 1 gives the abstract
    file name (without '.txt') and column 4 the tagged text fragment.

    Each fragment is looked up literally in the first line of
    'train_poi_data_abstract_only/<name>.txt'. When found, a line
    '(<fragment>; <start>; <end>; <label>)' is appended to
    'train_poi_data_maxqda/<name>.txt'; a newly created output file starts
    with the abstract line itself. Fragments that are not found are skipped.

    Args:
        ner_folders: folder names under 'maxqda_tagging/', one per class.
        ners: labels written to the training files; ners[i] belongs to
            ner_folders[i].
    """
    # Python 2 only: makes implicit str/unicode conversions use UTF-8.
    reload(sys)
    sys.setdefaultencoding('utf8')

    # The output folder is not created anywhere else in this notebook.
    if not os.path.exists('train_poi_data_maxqda'):
        os.makedirs('train_poi_data_maxqda')

    for folder, label in zip(ner_folders, ners):

        for excel in os.listdir('maxqda_tagging/'+folder):

            print(excel)
            workbook = pd.ExcelFile('maxqda_tagging/'+folder+'/'+excel)
            sheet = workbook.parse(workbook.sheet_names[0], header=None,index_col=None)
            filenames = sheet.iloc[:,1].values[1:]
            fragments = sheet.iloc[:,4].values[1:]

            for name, fragment in zip(filenames, fragments):

                # 'with' closes the abstract file even if decoding fails.
                with open('train_poi_data_abstract_only/'+name+'.txt','r') as f:
                    text = f.readline().decode('utf-8')

                # Drop one leading and one trailing space left by the tagging.
                if fragment.startswith(' '):
                    fragment = fragment[1:]
                if fragment.endswith(' '):
                    fragment = fragment[:-1]
                if not fragment:
                    continue

                # The fragment is plain text, not a pattern: escape it so that
                # characters such as '(', '.' or '?' are matched literally.
                match = re.search(re.escape(fragment.decode('utf-8')), text)
                if match is None:
                    continue

                out_path = 'train_poi_data_maxqda/'+name+'.txt'
                is_new_file = not os.path.isfile(out_path)

                with open(out_path, 'a') as g:
                    if is_new_file:
                        g.write(text)
                    g.write('('+match.group(0)+'; '+str(match.span()[0])+'; '+str(match.span()[1])+'; '+label+')')
                    g.write('\n')

In [None]:
# Writes the training files into 'train_poi_data_maxqda/' from the tagged maxQDA spreadsheets.
write_training_files(ner_folders,ners)

## Training NER

First, new entity classes to the already existing model 'pt' need to be added:
- File: training_new_entities.py  

New labels are in the list: LABEL = ['DATE','ROLE','TYPE','AFF']

Second, new model with new entity classes:
- File: training_model.py

## Finding sender/receiver and export json file

In [None]:
# Builds one record per document (type, sender, recipient, date, other
# entities) with regular expressions plus the trained NER model, and dumps all
# records to a JSON file under 'network_data/'.

# Python 2 only: makes implicit str/unicode conversions use UTF-8.
reload(sys)  
sys.setdefaultencoding('utf8')

doc_id = pd.read_csv('portugal_documents.csv')['id'].values #documents id
full_text = pd.read_csv('poi_full_text.csv')['full_text'] #documents text pre-processed

#roles = pd.read_csv('roles_poi.csv')['roles'].values  #reads the list of roles manually 
                                                       #added to the csv file - NOT IN USE!

#types = pd.read_csv('roles_poi.csv')['docs_avoided'].values #reads the list of doc to be avoided
#types = types[:18]

network_poi = []

nlp = spacy.load('trained_models/pt_exist_af') #loads trained model

# indices for splitting the run into batches of documents
start = 0
stop = len(doc_id)

# testing first with few documents
for i in range(start,stop,1):
    
    text = full_text[i].decode('utf-8')
    
    # Keep only the abstract: the text up to the first '. ' that is followed by
    # a word starting with 'An' + 3-4 word characters, 'AH', 'N' or 'Obs'.
    # NOTE(review): the pattern does not depend on i and could be compiled once
    # before the loop.
    pattern = re.compile(r'.+?(?=\.\s\b(An\w{3,4}|AH|N|Obs))')
    match = re.search(pattern,text)

    if match:
        text = match.group(0)
    else:
        text = text
    
    #matching doc type to filter types not wanted in the network - NOT IN USE NOW (CHECK WITH AGATA)
    # The document type is the first run of words made of 2+ upper-case
    # letters; all spaces are removed from it ('NA' when nothing matches).
    # NOTE(review): because the spaces are removed, a multi-word type presumably
    # no longer occurs verbatim in 'text', so the sender/recipient patterns
    # built from doc_type and the text.replace(doc_type,'') below would not
    # match for such documents - confirm whether this is intended.
    doc_pattern = re.compile(r'\s?\b([A-ZÃÁÀÂÇÉÊÍÕÓÔÚÜ]{2,}\s?)+\b')
    doc_type = re.search(doc_pattern, text)
    if doc_type:
        doc_type = doc_type.group(0).replace(' ','')
    else:
        doc_type = 'NA'
    
    # variable to continue loop avoiding certain types of document
    # (always 0 while the filter below stays commented out)
    avoid = 0
    #for doc in types:
        #if doc.decode('utf-8') == doc_type:
            #avoid = 1
    
    if avoid:
        continue
            
    else:
        #doc = nlp(text)
        # Record for this document; the 'date' key is only added further down
        # when a date is found.
        doc_dict = {'doc_id':doc_id[i], 'doc_type':doc_type, 'text':text, \
                    'sender':{'names':[],'roles':[], 'aff':[], 'org':[]}, \
                    'recipient':{'names':[],'roles':[], 'aff':[], 'org':[]}, \
                    'others':[]}
        
        # Sender: group 2 is the text between the first preposition after the
        # document type (do/pelo/de/dos/da) and the following à/a/ao/aos/para.
        # NOTE(review): doc_type is inserted into the pattern unescaped.
        s_pattern = re.compile(doc_type + r'.*?\b(do|pelo|de|dos|da)\b\s(.*?)\s+(à|a|ao|aos|para)\s?(.+)(\b.+ndo\b|\b\w+ar\b)?')
        sender = re.search(s_pattern, text)
        
        # Recipient: group 4 is the text after à/a/ao/aos/para, up to a word
        # ending in 'ndo', 'a <word ending in r>', 'sobre' or 'em que'.
        r_pattern = re.compile(doc_type + r'.*?\b(do|pelo|de|dos|da)\b\s(.*?)\s+(à|a|ao|aos|para)\s(.+)(\b.+ndo\b|\ba\b\s\w+r\b|sobre|em que)')
        recipient = re.search(r_pattern, text)
        
        #treating sender and recipients (NERs involved: PER, ROLE, AFF, ORG)
        if sender:
            doc = nlp(sender.group(2))
            for ent in doc.ents:
                if ent.label_ == 'PER':
                    doc_dict['sender']['names'].append(ent.text)
                if ent.label_ == 'ROLE':
                    doc_dict['sender']['roles'].append(ent.text)
                if ent.label_ == 'AFF':
                    doc_dict['sender']['aff'].append(ent.text)
                if ent.label_ == 'ORG':
                    doc_dict['sender']['org'].append(ent.text)
            # Remove the sender span so it is not picked up again below.
            text = text.replace(sender.group(2),'')
                    
        if recipient:
            doc = nlp(recipient.group(4))
            for ent in doc.ents:
                if ent.label_ == 'PER':
                    doc_dict['recipient']['names'].append(ent.text)
                if ent.label_ == 'ROLE':
                    doc_dict['recipient']['roles'].append(ent.text)
                if ent.label_ == 'AFF':
                    doc_dict['recipient']['aff'].append(ent.text)
                if ent.label_ == 'ORG':
                    doc_dict['recipient']['org'].append(ent.text)
            # Remove the recipient span so it is not picked up again below.
            text = text.replace(recipient.group(4),'')
        
        #treating type of document (NER involved: TYPE)
        text = text.replace(doc_type,'')
        
        #treating date of document (NER involved: DATE)
        # The regex is tried first; the NER model is only used as a fallback.
        date_match = re.search(r'\d{4},(.*),\ \d{1,2}', text.decode('utf-8')) #search for date: YYYY, month, DD
        if date_match is not None:
            doc_dict['date'] = date_match.group(0)
            text = text.replace(date_match.group(0),'')
        else:
            doc = nlp(text)
            for ent in doc.ents:
                if ent.label_ == 'DATE':
                    # With several DATE entities the last one wins; all of
                    # them are removed from the text.
                    doc_dict['date'] = ent.text
                    text = text.replace(ent.text,'')
                    
        #treating other NERs
        # Everything the model still finds in the remaining text, whatever its label.
        doc = nlp(text)
        for ent in doc.ents:
            doc_dict['others'].append(ent.text)
            
            
    network_poi.append(doc_dict)            
                    

### In .json format    
output = {
    'network_data': network_poi
}

### Saves file
# The file name carries the value of 'stop' and the name of the model used.
with open('network_data/network_poihnr_'+str(stop)+'_pt_exist_af.json', 'wb') as jsonfile:
    json.dump(output,jsonfile,ensure_ascii=False, indent=2)

## Old scripts: can be deleted

## Reading CSV file and creating JSON/TXT files with the data to train the model

Update 13-03-2018: This was the first attempt to create training files for the NER model. It was not used in the end.

In [8]:
df = pd.read_csv('poi_full_text.csv') # reads all the data and stores it in a dataframe
full_text = df['full_text'].values    # keeps only the 'full_text' column, as a numpy array

In [9]:
##### DUMPING IN A JSON FILE #####
##### DUMPING IN A JSON FILE #####
def dump_json(full_text):
    """Write 'train_poi_data.json' with automatically tagged sample documents.

    For 4230 random draws (fixed seed 2001, row indices 0..169221) the text is
    run through the spaCy 'xx_ent_wiki_sm' model and every entity found is
    recorded with its text, character offsets and label. Two corrections are
    applied:

    * an entity whose text is a Portuguese month name is replaced by the full
      date matched by the 'YYYY, <month>, DD' regex, labelled 'DATE'; when the
      regex finds no date, only the label is changed to 'DATE';
    * every entry of the 'roles' column of 'roles_poi.csv' that is found in
      the text (the entry is used as a regular expression) is added as a
      'ROLE' entity.

    The result is saved as {'train_data': [{'text': ..., 'entity': [...]}]}.

    Args:
        full_text: indexable collection of UTF-8 encoded document texts.
    """
    #Identified mixed labels
    #DATE as PER
    months = ['Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho', 'Julho', \
              'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro']

    # Python 2 only: makes implicit str/unicode conversions use UTF-8.
    reload(sys)
    sys.setdefaultencoding('utf8')

    # 'xx_ent_wiki_sm' is never imported in this notebook, so the former
    # xx_ent_wiki_sm.load() raised a NameError; load the installed model
    # package by name instead.
    nlp = spacy.load('xx_ent_wiki_sm') #loads NLP model for 'international' languages
    train_poi = []

    roles = pd.read_csv('roles_poi.csv')['roles'].values  #reads the list of roles manually added to the csv file

    random.seed(2001)
    for i in range(4230):  #sample of 2.5% of the whole corpus
        a = random.randint(0,169221) #random sampling
        text = full_text[a].decode('utf-8')  # decoded once, reused below
        doc = nlp(text)
        doc_dict = {'text': text, 'entity': 0}

        date_match = re.search(r'\d{4},(.*),\ \d{1,2}', text) #search for date: YYYY, month, DD

        #iterate through all the entities found
        entity_list = []
        for ent in doc.ents:
            ent_text = ent.text.decode('utf-8')

            entity_dict = {'entity_text': ent_text, 'start': ent.start_char, \
                           'end': ent.end_char, 'label': ent.label_.encode('utf-8')}

            #correcting DATE entity for the training file
            if ent_text in months:
                if date_match is not None:
                    entity_dict = {'entity_text': date_match.group(0), 'start': date_match.span()[0], \
                           'end': date_match.span()[1], 'label': 'DATE'}
                else:
                    entity_dict['label'] = 'DATE'

            entity_list.append(entity_dict)

        ### Adds ROLE entities based on roles_poi.csv file to the entity list
        for role in roles:
            role_match = re.search(role.decode('utf-8'), text)
            if role_match is not None:
                dict_role = {'entity_text': role_match.group(0), 'start': role_match.span()[0], \
                               'end': role_match.span()[1], 'label': 'ROLE'}
                entity_list.append(dict_role)

        ### Stores the entity list of this document
        doc_dict['entity'] = entity_list
        train_poi.append(doc_dict)

    ### In .json format
    output = {
        'train_data': train_poi
    }

    ### Saves file
    with open('train_poi_data.json', 'wb') as jsonfile:
        json.dump(output,jsonfile,ensure_ascii=False, indent=2)

In [10]:
##### WRITING THE JSON FILE #####
# Legacy cell: writes 'train_poi_data.json' to the working directory.
dump_json(full_text)

In [None]:
##### DUMPING IN A TXT FILE #####
def dump_txt(full_text):
    """Write 'train_poi_data.txt' with automatically tagged sample documents.

    For 4230 random draws (fixed seed 2001, row indices 0..169221) the text is
    run through the spaCy 'xx_ent_wiki_sm' model. Per draw, an empty line is
    written, then one line '(<text>; <start>; <end>; <label>)' per entity,
    then two empty lines.

    An entity whose text is a Portuguese month name is replaced by the full
    date matched by the 'YYYY, <month>, DD' regex, labelled DATE; when the
    regex finds no date, that entity is not written at all.

    Args:
        full_text: indexable collection of UTF-8 encoded document texts.
    """
    # Python 2 only: makes implicit str/unicode conversions use UTF-8.
    reload(sys)
    sys.setdefaultencoding('utf8')

    #Identified mixed labels
    #DATE as PER
    months = ['Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho', 'Julho', \
              'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro']

    # 'xx_ent_wiki_sm' is never imported in this notebook, so the former
    # xx_ent_wiki_sm.load() raised a NameError; load the installed model
    # package by name instead.
    nlp = spacy.load('xx_ent_wiki_sm')

    # 'with' closes the output file even if tagging or writing fails.
    with open('train_poi_data.txt','w') as g:

        random.seed(2001)
        for i in range(4230):  #sample of 2.5% of the whole corpus
            a = random.randint(0,169221) #random sampling
            text = full_text[a].decode('utf-8')  # decoded once, reused below
            doc = nlp(text)

            date_match = re.search(r'\d{4},(.*),\ \d{1,2}', text) #search for date: YYYY, month, DD

            g.write('\n')

            for ent in doc.ents:

                #correcting DATE entity for the training file
                if ent.text in months:
                    if date_match is not None:
                        g.write('('+date_match.group(0)+'; '+str(date_match.span()[0])+'; '+str(date_match.span()[1])+'; DATE)')
                        g.write('\n')

                else:
                    g.write('('+ent.text+'; '+str(ent.start_char)+'; '+str(ent.end_char)+'; '+ent.label_.encode('utf-8')+')')
                    g.write('\n')

            g.write('\n')
            g.write('\n')

In [None]:
##### WRITING THE TXT FILE #####
# Legacy cell: writes 'train_poi_data.txt' to the working directory.
dump_txt(full_text)

In [None]:
##### DUMPING IN A FOLDER INDIVIDUAL TXT FILEs #####
def dump_txt_individual(full_text):
    """Write one automatically tagged text file per sampled document.

    For 4230 random draws (fixed seed 2001, row indices 0..169221) the text is
    run through the spaCy 'xx_ent_wiki_sm' model and a file is written to
    'train_poi_data/', named '<draw index>_<document id>.txt', or
    '<draw index>_noid.txt' when the text does not start with a numeric id
    followed by '-' or '.'. Each file holds an empty line followed by one line
    '(<text>; <start>; <end>; <label>)' per entity.

    An entity whose text is a Portuguese month name is replaced by the full
    date matched by the 'YYYY, <month>, DD' regex, labelled DATE; when the
    regex finds no date, that entity is not written at all.

    Args:
        full_text: indexable collection of UTF-8 encoded document texts.
    """
    # Python 2 only: makes implicit str/unicode conversions use UTF-8.
    reload(sys)
    sys.setdefaultencoding('utf8')

    #Identified mixed labels
    #DATE as PER
    months = ['Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho', 'Julho', \
              'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro']

    # 'xx_ent_wiki_sm' is never imported in this notebook, so the former
    # xx_ent_wiki_sm.load() raised a NameError; load the installed model
    # package by name instead.
    nlp = spacy.load('xx_ent_wiki_sm')

    if not os.path.exists('train_poi_data'):
        os.makedirs('train_poi_data')

    random.seed(2001)
    for i in range(4230):  #sample of 2.5% of the whole corpus
        a = random.randint(0,169221) #random sampling
        text = full_text[a].decode('utf-8')  # decoded once, reused below
        doc = nlp(text)

        date_match = re.search(r'\d{4},(.*),\ \d{1,2}', text) #search for date: YYYY, month, DD
        id_match = re.match(r'(\d+)[-.]', text.lstrip(' '))

        if id_match is not None:
            file_name = str(i)+'_'+id_match.group(1)+'.txt'
        else:
            file_name = str(i)+'_noid.txt'

        # 'with' closes each output file even if a write fails.
        with open('train_poi_data/'+file_name,'w') as g:

            g.write('\n')

            for ent in doc.ents:

                #correcting DATE entity for the training file
                if ent.text in months:
                    if date_match is not None:
                        g.write('('+date_match.group(0)+'; '+str(date_match.span()[0])+'; '+str(date_match.span()[1])+'; DATE)')
                        g.write('\n')

                else:
                    g.write('('+ent.text+'; '+str(ent.start_char)+'; '+str(ent.end_char)+'; '+ent.label_.encode('utf-8')+')')
                    g.write('\n')

In [None]:
##### CREATING THE FOLDER AND WRITING INDIVIDUAL TXT FILES IN THERE #####
# Legacy cell: writes the per-document files into 'train_poi_data/'.
dump_txt_individual(full_text)

# Testing Area!