In [2]:
#We import the necessary Packages
import pandas as pd
import re
import numpy as np
import datetime

In [3]:
#Read the saved data frames that include file name and extracted text from the PDFs
#For python>=3.8 pd.read_pickle does the work. Otherwise, it is necessary to use pickle5. 

try:
    Files_tesseract=pd.read_pickle('../../pytesseract_results.dat')
except:
    #!pip3 install pickle5
    import pickle5 as pickle
    with open('../../pytesseract_results.dat','rb') as fh:
        Files_tesseract = pickle.load(fh,encoding='unicode')

try:
    Files_pdfminer=pd.read_pickle('../../pdfminer_ables.dat')
except:
    #!pip3 install pickle5
    import pickle5 as pickle
    with open('../../pdfminer_ables.dat','rb') as fh:
        Files_pdfminer = pickle.load(fh,encoding='unicode')

ModuleNotFoundError: No module named 'pickle5'

In [4]:
#Create a common column for the extracted text in both data frames.
Files_tesseract['extracted_text']=Files_tesseract['reparse']
Files_pdfminer['extracted_text']=Files_pdfminer['de_headed']

#Combine the data frames, substituting the rows on Files_pdfminer by those included in Files_tesseract.
All_Texts_DF=Files_tesseract.combine_first(Files_pdfminer)

#Keep just the useful columns
All_Texts_DF=All_Texts_DF[['filename','extracted_text']]

NameError: name 'Files_tesseract' is not defined

In [5]:
All_Texts_DF

NameError: name 'All_Texts_DF' is not defined

# Preprocessing

Below we print the extracted text from a random PDF. Note that to extract sensible information it makes sense to parse the document line by line. 

In [5]:
import random
random_text=random.choice(All_Texts_DF.extracted_text)
print(random_text[0:1000])

LOAN  NUMBER  3955-CO  (Floating  Rate)

Loan  Agreement

(Power Market  Development  Project)

INTERNATIONAL  BANK  FOR RECONSTRUCTION
AND  DEVELOPMENT

between

and

INTERCONEXION ELECTRICA S.A.  "E.S.P."

Dated 

, 1996

LOAN  NUMBER  3955-CO

LOAN  AGREEMENT

AGREEMENT,  dated  Po4t 

U 

BANK  FOR  RECONSTRUCTION  AND  DEVELOPMENT 
INTERCONEXION  ELECTRICA  S.A."E.S.P."  (the  Borrower).

,  1996  between  INTERNATIONAL
and

(the  Bank) 

WHEREAS  (A)  REPUBLIC OF COLOMBIA (the Guarantor)  and the Borrower,
having been satisfied as to the feasibility  and priority  of the Project  described in  Schedule
2 to this Agreement,  have requested  the  Bank to  assist in the  financing  of the Project;

(B) 

by  an  agreement  (the  Guarantee  Agreement)  of  even  date  herewith
between  the  Guarantor  and  the  Bank,  the  Guarantor  has  agreed  to  guarantee  the
obligations of the Borrower in respect of the Loan and to undertake  such other obligations
as  set forth  in the  Guar

To preprocess the text we do the following:
    - Replace non alpha numeric values (except for periods, parenthesis and possible currency characters.
    - Treat double newline and period, colon or semicolon followed by a newline as a new paragraph. 
    - Treat simple newlines as part of the same paragraph. 
    - Eliminate extra spaces. 

In [6]:
#The following function formats the text to be able to differentiate paragraphs and lines. 
#Declare currency characters, to identify amount of loan. 
currency_characters=u'$¢£¤¥֏؋৲৳৻૱௹฿៛\u20a0-\u20bd\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6'

def separate_lines(text,currency_characters=currency_characters,simple_line=' ',separator='\n'):  
    #Treat dot, colon and semicolon followed by newline as a new paragraph. 
    text=re.sub('[:;.](\n)','\n\n',text)
    
    #Eliminate lines of only space characters
    text=re.sub('\n\s+\n','\n\n',text)
    
    #Replace single newlines as an space (since we are still in the same paragraph)
    text=re.sub('\n(?!\n)',simple_line,text)
    
    #Replace multiple newlines with a single one. 
    text=re.sub('\n+[\n\s]*','\n',text)
    
    #Erase non alphanumeric, dots, parenthesis or currency characters
    re_string='[^0-9a-zA-Z\s.\(\)'+currency_characters+']+'
    text=re.sub(re_string, '',text)
    text=text.replace('"','')
    text=re.sub('( ){2,}',' ',text)
    
    #Separate into lines/paragraphs
    lines=re.split(separator,text)
    lines=[l.strip() for l in lines]
    return lines

In [7]:
#Function to work with parenthesis
#Taken from https://stackoverflow.com/a/38212660/3254178
def extract_parenthesis(string):
    flag = 0
    result, accum = [], []
    for c in string:
        if c == ')':
            flag -= 1
        if flag:
            accum.append(c)
        if c == '(':
            flag += 1
        if not flag and accum:
            result.append(''.join(accum))
            accum = []
    return result

# Data Extraction
## Project Name
We scan the document line by line until we find the first line with the words "loan" or "agreement" on them. 
We then interpret the first line with text in between parenthesis as the project name. 
We only look at the first 35 lines, if we dont find a line following this pattern, we return none. 
We also make sure the Project Name doesn't include any of the unwanted_words, that sometimes appear in a similar format, but are not the name. We also discard the possible name if it is too short (less than 12 characters). 

## Project Description
Many documents include a section describing the project. We look for the line with the words "project" and "description", followed by a paragrah with the word "objective(s)". We return the first 3 lines (paragraphs) after this point.  

## Loan Amount and Currency
We find the paragraph with the key words "bank", "agrees", "borrower" and "amount". Then we look for the loan amount in that line or the next with a regular expressioon, it usually looks like $ 2000000. 

## Country Code
Many documents include in the first page a Loan number that includes a country code of 2 or 3 letters. 
We find the first line following the pattern "number AAA" and return that code. 
As we describe elsewhere, these codes are not unique and dont follow any ISO standard, so it isn't enough to accurately identify the country. 

## Possible Country Name
Many documents include the name of the country in the first page. We look for the first line with the word "between" and the first line with the word "article", and consider just the lines close to them. We then extract all small sentences of all uppercase letters as they may be the country name. We clean them by eliminating stop words and words that could confuse our fuzzy matching to identify countries, and return a list of all these possible sentences. 

##Address
We find the word "addresses" in the document, and then extract everything in the address for the borrower. We identified 4 possible patterns for this. 

## Date
We extract the date from the file name. All file names follow the pattern YYY--MM--DD--numbers_and_text. It returns a datetime object with the date. 

In [8]:
unwanted_words=['borrower','lender','rate','guarantee','agreement','guarantor','amendment']
max_line_first_page=35 #This was decided after trying some values

#Returns: Project Name, line number where it was found.
def get_project_name(lines,line_num,unwanted_words=unwanted_words,max_line_first_page=max_line_first_page):
    found_start=False
    line_max=min([max_line_first_page,len(lines)]) 
    while line_num <line_max:
        l=lines[line_num]
        if found_start:
            #Look for a line of text in between two parenthesis
            m=re.search('\(\s*([a-z]+[(),:;a-z\s\d]+)(project)?\s*\)',l.lower())
            if m!=None and len(m.group(1))>12:
                #Sometimes Parenthesis can be messy, so we extract the text on the outermost parenthesis
                if len(re.findall('\(',m.group(0)))>1:
                    name=extract_parenthesis(m.group(0))
                    if len(name)>0: #This must have at least one string if the parenthesis were balanced
                        name=name[0]
                    else: #if the parenthesis were not balanced, return everything before the second (
                        m=re.search('(.*)\(',m.group(1))
                        name=m.group(1)
                else:
                    name=m.group(1)
                if len(name)>12 and not(any(elem in name.split(' ') for elem in unwanted_words)) and not ('general conditions' in name):
                    return name,line_num
        else:
            words=l.lower().split(' ')
            if ('loan' in words) or ('agreement' in words):
                #Once we found the sentence with loan or agreement in it, we start looking for the Project Name
                found_start=True
                line_num-=1
        line_num+=1
    #If couldnt find a Project Name following the format, returns None
    return None,0

In [9]:
#Returns: Project Description, line number where it was found.
def get_project_description(lines, line_num):
    line_max=len(lines)
    while line_num <line_max-2:
        l=lines[line_num]
        words=l.split(' ')
        if all([w in words for w in ['Project','Description']]):
            description=' '.join([lines[line_num], lines[line_num+1],lines[line_num+2]])
            if 'objective' in description.lower():
                return description, line_num
        line_num+=1
    return None,line_num

In [10]:
#Returns loan_amount, currency, number of line where it was found and the paragraph
def get_loan_amount_currency(lines, line_num0):
    #Extract dollar amount
    #Extract the values using Regular Expressions
    currency_characters=u"[$¢£¤¥֏؋৲৳৻૱௹฿៛\u20a0-\u20bd\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6]|"
    currency_abbs=u'[\s\(]\s?[a-z]{1,3}'
    regexp_amount='(amount|value).*?('+currency_characters+currency_abbs+')([ \d]{6,})'

    line_num=line_num0
    while line_num<len(lines)-3:
        #Combine with the following three lines
        combined_lines=' '.join([lines[line_num],lines[line_num+1],lines[line_num+2],lines[line_num+3]]) 
        combined_lines=combined_lines.lower()
        #Find the line where it talks about the loan amount
        if all(elem in combined_lines for elem in ['bank','agrees','borrower','amount']):  
            #Eliminate commas
            combined_lines=combined_lines.replace(',','')
            #Search the regular expression
            value=re.search(regexp_amount,combined_lines)
            if value!=None:
                amount=value.group(3)
                amount=amount.replace(' ','')
                amount=int(amount)
                currency=value.group(2)
                currency=currency.replace('(','')
                currency=currency.strip()
                #Make double sure that the amount makes sense. 
                if amount>100000:
                    return amount,currency,line_num, combined_lines
        line_num+=1
    #If we couldnt find an amount, return None
    return None,None,line_num0,None


In [11]:
#CHANGE HOW THIS WORKS

def get_loan_paragraph(lines,line_num0):
    line_num=line_num0
    while line_num<len(lines)-3:
        #Combine with the following three lines
        combined_lines=' '.join([lines[line_num],lines[line_num+1],lines[line_num+2],lines[line_num+3]]) 
        combined_lines=combined_lines.lower()
        #Find the line where it talks about the loan amount
        if all(elem in combined_lines for elem in ['bank','agrees','borrower','amount']):  
            #Eliminate commas and periods 
            #(sometimes periods are used as thousands separator, and we dont need to worry about cents)
            combined_lines=combined_lines.replace(',','')
            combined_lines=combined_lines.replace('.','')
            
            #Search for money with a regular expression
            value=re.search('[ \d]{6,}',combined_lines)
            if value!=None:
                    return combined_lines,line_num
        line_num+=1
    return '',line_num0


#Returns loan_amount, currency, number of line where it was found and the paragraph
def get_loan_amount_currency(loan_paragraph):
    #Extract dollar amount
    #Extract the values using Regular Expressions
    currency_characters=u"[$¢£¤¥֏؋৲৳৻૱௹฿៛\u20a0-\u20bd\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6]|"
    currency_abbs=u'[\s\(]\s?[a-z]{1,3}'
    regexp_amount='(amount|value).*?('+currency_characters+currency_abbs+')([ \d]{6,})'

    value=re.search(regexp_amount,paragraph)
    if value!=None:
        amount=value.group(3)
        amount=amount.replace(' ','')
        amount=int(amount)
        currency=value.group(2)
        currency=currency.replace('(','')
        currency=currency.strip()
        #Make double sure that the amount makes sense. 
        if amount>100000:
            return amount,currency

    #If we couldnt find an amount, return None
    return None,None


In [12]:
def get_country_code(lines,max_line_first_page=max_line_first_page):
    max_line=min(max_line_first_page,len(lines))
    line_num=0
    while line_num<max_line:
        l=lines[line_num].lower()
        m=re.search('numbers? .*?([a-z]{2,3})[ ]?$',l)
        if m!=None:
            code=m.group(1)
            code=code.upper()
            return code,line_num
        line_num+=1
    return None,0

In [13]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

#extra common stop_words
stop_words=stop_words.union(['bank', 'state','agreement', 'loan', 'development','number', 'international', 'whereas', 'therefore'])

In [14]:
def clean(text, stop_words=stop_words):
    words=text.lower().split(' ')
    words=[w for w in words if not w in stop_words]
    text=' '.join(words).strip()
    return text

In [15]:
def get_possible_country_names(lines, stop_words=stop_words):
    for ind, line in enumerate(lines):
        start=0
        if start==0 and 'between' in line.lower():
            start=max(ind-2,0)
        if 'article' in line.lower():
            #Combine all lines between start and the line before the word "article"
            text='\n'.join(lines[start:ind-1])
            #Find all short sentences of all caps, of 4 or more letters. 
            possible_countries=re.findall('[A-Z]{4,}[A-Z ]+',text)
            
            #Clean all possible countries
            possible_countries=[clean(possible,stop_words) for possible in possible_countries]
            possible_countries=[possible for possible in possible_countries if not possible=='' and len(possible)>3]  
            return '\n'.join(possible_countries)
        #Otherwise return None
    return None

In [16]:
def spacing_and_lower(docu):
    docu = docu.replace('  ',' ')
    docu = docu.replace('   ',' ')
    docu = docu.lower()
    return docu

def read_address(docu, pin0= 'addresses', pin1 = 'start', pin2 = 'end'):
    start = docu.find(pin0)
    borrower = docu[start:].find(pin1)
    colon = docu[start+borrower:].find(':')
    bank = docu[start+borrower+colon:].find(pin2)
    add = docu[start+borrower+colon+1:start+borrower+colon+bank]
    if 'in witness' in add:
        end2 = docu[start+borrower+colon+1:].find('in witness')
        add = add[:end2]
    return add

def get_address(text):
    text=spacing_and_lower(text)
    address = read_address(text, pin0 = 'addresses', pin1 = 'for the borrower', pin2 = 'for the bank')
    if len(address) < 8:
        address = read_address(text, pin0 = 'addresses', pin1 = 'for the borrower', pin2 = 'international bank')
    if len(address) < 8 or len(address) >500:
        address = read_address(text, pin0 = 'addresses', pin1 = 'address is', pin2 = 'the bank')
    if len(address) < 8:
        address = read_address(text, pin0 = 'addresses', pin1 = 'the borrower', pin2 = 'the world bank')
    if len(address) < 8:
        return np.nan
    else:
        return address

In [17]:
#The following Function extracts the date from the file_name and returns it as a datetime.datetime object
def get_date_file_name(file_name):
    file_split=file_name.split('--')
    date=datetime.datetime.strptime(' '.join(file_split[0].split('_')[0:3]),'%Y %B %d')
    return date

Finally we create empty columns for all the attributes and we extract them. 

In [18]:
All_Texts_DF['Project_name']=None
All_Texts_DF['Project_desc']=None
All_Texts_DF['Date']=None
All_Texts_DF['Currency']=''
All_Texts_DF['Amount_loan']=None
All_Texts_DF['Loan_paragraph']=None
All_Texts_DF['Country_code_pdf']=None
All_Texts_DF['Possible_country_name']=None
All_Texts_DF['Address']=None

In [19]:
for i in All_Texts_DF.index:
    text=All_Texts_DF['extracted_text'][i]
    file_name=All_Texts_DF['filename'][i]
    
    All_Texts_DF['Date'][i]=get_date_file_name(file_name)
    
    lines=separate_lines(text)
    All_Texts_DF['Country_code_pdf'][i],line_num=get_country_code(lines)
    All_Texts_DF['Project_name'][i],line_num=get_project_name(lines,line_num)
    
    paragraph,line_num=get_loan_paragraph(lines,0)
    All_Texts_DF['Loan_paragraph'][i]=paragraph
    
    amount,currency=get_loan_amount_currency(paragraph)
    All_Texts_DF['Currency'][i]=currency
    All_Texts_DF['Amount_loan'][i]=amount
    
    
    description, _=get_project_description(lines,line_num)
    All_Texts_DF['Project_desc'][i]=description
    
    All_Texts_DF['Possible_country_name'][i]=get_possible_country_names(lines)
    
    All_Texts_DF['Address'][i]=get_address(text)

In [20]:
#How many NONEs?
print('No Name ',All_Texts_DF.Project_name.isnull().sum())
print('No Currency ',All_Texts_DF.Currency.isnull().sum())
print('No Amount ',All_Texts_DF.Amount_loan.isnull().sum())
print('No Country Code ',All_Texts_DF.Country_code_pdf.isnull().sum())
print('No Project Description ',All_Texts_DF.Project_desc.isnull().sum())
print('No Address ',All_Texts_DF.Address.isnull().sum())

No Name  78
No Currency  222
No Amount  222
No Country Code  111
No Project Description  838
No Address  118


In [1]:
All_Texts_DF[All_Texts_DF.Address==All_Texts_DF.Address_2].shape

NameError: name 'All_Texts_DF' is not defined

In [21]:
#Dont Want to count empty description as a NONE PDF (maybe the title has enough information)
All_Texts_DF.loc[All_Texts_DF.Project_desc.isnull(),'Project_desc']=''
print('We couldnt extract all the info for: ',All_Texts_DF.shape[0]-All_Texts_DF.dropna().shape[0],' PDFs.')

We couldnt extract all the info for:  428  PDFs.


In [22]:
#Export
DF_to_export=All_Texts_DF.drop('extracted_text',axis=1)
DF_to_export.to_csv('../data/Extracted_Attributes.csv',index=False)

In [23]:
All_Texts_DF.to_pickle('../../All_Texts.dat')