In [4]:
import pandas as pd
import re
import numpy as np
import datetime

In [5]:
#Read the saved data frames that include file name and extracted text. 
#For python>=3.8 pd.read_pickle does the work. Otherwise, it is necessary to use pickle5. 

try:
    Files_tesseract=pd.read_pickle('../pytesseract_results.dat')
except:
    #!pip3 install pickle5
    import pickle5 as pickle
    with open('../pytesseract_results.dat','rb') as fh:
        Files_tesseract = pickle.load(fh,encoding='unicode')

try:
    Files_pdfminer=pd.read_pickle('../pdfminer_ables.dat')
except:
    #!pip3 install pickle5
    import pickle5 as pickle
    with open('../pdfminer_ables.dat','rb') as fh:
        Files_pdfminer = pickle.load(fh,encoding='unicode')

In [6]:
#Create a common column for the extracted text in both data frames.
Files_tesseract['extracted_text']=Files_tesseract['reparse']
Files_pdfminer['extracted_text']=Files_pdfminer['de_headed']

#Combine the data frames, substituting the rows on Files_pdfminer by those included in Files_tesseract.
Combined_DF=Files_tesseract.combine_first(Files_pdfminer)

#Keep just the useful columns
Combined_DF=Combined_DF[['filename','extracted_text']]

In [7]:
Combined_DF

Unnamed: 0,filename,extracted_text
0,1990_april_24_587321468019152780_conformed-cop...,CONFORMED COPY\n\nLOAN NUMBER 3186 IVC\n\nLoan...
1,1990_april_24_668811468165272290_conformed-cop...,CONFORMED COPY\n\n ...
2,1990_april_25_904191468298750561_conformed-cop...,CONFORMED COPY\n\n ...
3,1990_april_30_410811468040573756_conformed-cop...,CONFORMED COPY\n\n ...
4,1990_april_30_725911468042268845_conformed-cop...,CONFORMED COPY\n\n ...
...,...,...
3200,2019_september_13_300871570120923592_official-...,OFFICIAL\nDOCUMENTS\n\n=＝ ニニ ニニニ ニニ ニニ にニ ニニニ ...
3201,2019_september_13_710891569417913880_official-...,Public Disclosure Authorized\n\nPublic Disclos...
3202,2019_september_23_867961572361092133_official-...,Public Disclosure Authorized\n\nPublic Disclos...
3203,2019_september_25_810411569965213165_official-...,Public Disclosure Authorized\n\nPublic Disclos...


In [8]:
#The following function formats the text to be able to differentiate paragraphs and titles. 

#Declare currency characters, to identify amount of loan. 
currency_characters=u'$¢£¤¥֏؋৲৳৻૱௹฿៛\u20a0-\u20bd\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6'
def separate_lines(text,currency_characters=currency_characters):  
    #Treat dot, colon and semicolon followed by newline as a new paragraph. 
    text=re.sub('[:;.](\n)','\n\n',text)
    
    #Eliminate lines of only space characters
    text=re.sub('\n\s+\n','\n\n',text)
    
    #Replace single newlines as an space (since we are still in the same paragraph)
    text=re.sub('\n(?!\n)',' ',text)
    
    #Replace multiple newlines with a single one. 
    text=re.sub('\n+[\n\s]*','\n',text)
    
    #Erase non alphanumeric, dots, parenthesis or currency charachters
    re_string='[^0-9a-zA-Z\s.\(\)'+currency_characters+']+'
    text=re.sub(re_string, '',text)
    text=text.replace('"','')
    text=re.sub('( ){2,}',' ',text)
    
    #Separate into lines/paragraphs
    lines=text.split('\n')
    lines=[l.strip() for l in lines]
    return lines

In [9]:
#Function to work with parenthesis
#Taken from https://stackoverflow.com/a/38212660/3254178
def extract_parenthesis(string):
    flag = 0
    result, accum = [], []
    for c in string:
        if c == ')':
            flag -= 1
        if flag:
            accum.append(c)
        if c == '(':
            flag += 1
        if not flag and accum:
            result.append(''.join(accum))
            accum = []
    return result

In [10]:
#Returns: Project Name, line number where it was found. 
unwanted_words=['borrower','lender','rate','guarantee','agreement','guarantor','amendment']
def get_project_name(lines,line_num,unwanted_words=unwanted_words):
    found_start=False
    line_max=min([35,len(lines)]) #This was decided after changing many values
    while line_num <line_max:
        l=lines[line_num]
        if found_start:
            m=re.search('\(\s*(\w+[(),:;\w\s\d]+)(project)?\s*\)',l.lower())
            if m!=None and len(m.group(1))>12:
                #Sometimes Parenthesis can be messy
                if len(re.findall('\(',m.group(0)))>1:
                    name=extract_parenthesis(m.group(0))
                    if len(name)>0: #This must have at least one string if the parenthesis were balanced
                        name=name[0]
                    else: #if the parenthesis were not balanced, return everything before the second (
                        m=re.search('(.*)\(',m.group(1))
                        name=m.group(1)
                else:
                    name=m.group(1)
                if len(name)>12 and not(any(elem in name.split(' ') for elem in unwanted_words)) and not ('general conditions' in name):
                    return name,line_num
        else:
            words=l.lower().split(' ')
            if ('loan' in words) or ('agreement' in words):
                found_start=True
                line_num-=1
        line_num+=1
    return None,0

In [11]:
#The following Function extracts the date from the file_name and returns it as a datetime.datetime object
def get_date_file_name(file_name):
    file_split=file_name.split('--')
    date=datetime.datetime.strptime(' '.join(file_split[0].split('_')[0:3]),'%Y %B %d')
    return date

In [12]:
#Returns loan_amount, currency, line where it was found
def get_loan_amount_currency(lines, line_num0):
    #Extract dollar amount
    #Extract the values using Regular Expressions
    currency_characters=u"[$¢£¤¥֏؋৲৳৻૱௹฿៛\u20a0-\u20bd\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6]|"
    currency_abbs=u'[\s\(]\s?[a-z]{1,3}'
    regexp_amount='(amount|value).*?('+currency_characters+currency_abbs+')([ \d]{6,})'

    line_num=line_num0
    while line_num<len(lines)-3:
        #Combine with the following three lines
        combined_lines=' '.join([lines[line_num],lines[line_num+1],lines[line_num+2],lines[line_num+3]]) 
        combined_lines=combined_lines.lower()
        #Find the line where it talks about the loan amount
        if all(elem in combined_lines for elem in ['bank','agrees','borrower','amount']):    
            value=re.search(regexp_amount,combined_lines)
            combined_lines=combined_lines.replace(',','')
            if value!=None:
                amount=value.group(3)
                amount=amount.replace(' ','')
                amount=int(amount)
                currency=value.group(2)
                currency=currency.replace('(','')
                currency=currency.strip()
                if amount>100000:
                    return amount,currency,line_num
        line_num+=1
    return None,None,line_num0


In [68]:
def get_country_code(lines):
    max_line=min(35,len(lines))
    line_num=0
    while line_num<max_line:
        l=lines[line_num].lower()
        m=re.search('numbers? .*?([a-z]{2,3})[ ]?$',l)
        if m!=None:
            code=m.group(1)
            code=code.upper()
            return code,line_num
        line_num+=1
    return None,0

In [69]:
Combined_DF['Project_name']=None
Combined_DF['Date']=None
Combined_DF['Currency']=''
Combined_DF['Amount_loan']=None
Combined_DF['Country_code']=None

In [70]:
for i in Combined_DF.index:
    text=Combined_DF['extracted_text'][i]
    file_name=Combined_DF['filename'][i]
    
    Combined_DF['Date'][i]=get_date_file_name(file_name)
    
    lines=separate_lines(text)
    Combined_DF['Country_code'][i],line_num0=get_country_code(lines)
    Combined_DF['Project_name'][i],line_num0=get_project_name(lines,line_num0)
    
    amount,currency, line_num=get_loan_amount_currency(lines,line_num0)
    Combined_DF['Currency'][i]=currency
    Combined_DF['Amount_loan'][i]=amount

In [71]:
#How many NONEs?
print('No Name ',Combined_DF.Project_name.isnull().sum())
print('No Currency ',Combined_DF.Currency.isnull().sum())
print('No Amount ',Combined_DF.Amount_loan.isnull().sum())
print('No Country Code ',Combined_DF.Country_code.isnull().sum())

No Name  76
No Currency  228
No Amount  228
No Country Code  111


In [72]:
#Drop nulls
print('We couldnt extract all the info for: ',Combined_DF.shape[0]-Combined_DF.dropna().shape[0],' PDFs.')

We couldnt extract all the info for:  359  PDFs.


In [73]:
#Export
DF_to_export=Combined_DF[['filename','Date','Project_name','Currency','Amount_loan','Country_code']]
DF_to_export.to_pickle('Extracted_Information')
DF_to_export.to_csv('Extracted_Information')

In [74]:
#Export All to local
Combined_DF.to_pickle('../All_Texts.dat')

In [75]:
import random
ind=random.randint(0,2000)
text=Combined_DF.extracted_text[ind]
lines=separate_lines(text)
_,_,line_num=get_loan_amount_currency()
found_start=False
    line_max=min([35,len(lines)]) #This was decided after changing many values
    while line_num <line_max:
        l=lines[line_num]
        if found_start:
            m=re.search('\(\s*(\w+[(),:;\w\s\d]+)(project)?\s*\)',l.lower())
            if m!=None and len(m.group(1))>12:
                #Sometimes Parenthesis can be messy
                if len(re.findall('\(',m.group(0)))>1:
                    name=extract_parenthesis(m.group(0))
                    if len(name)>0: #This must have at least one string if the parenthesis were balanced
                        name=name[0]
                    else: #if the parenthesis were not balanced, return everything before the second (
                        m=re.search('(.*)\(',m.group(1))
                        name=m.group(1)
                else:
                    name=m.group(1)
                if len(name)>12 and not(any(elem in name.split(' ') for elem in unwanted_words)) and not ('general conditions' in name):
                    return name,line_num
        else:
            words=l.lower().split(' ')
            if ('loan' in words) or ('agreement' in words):
                found_start=True
                line_num-=1
        line_num+=1


IndentationError: unexpected indent (<ipython-input-75-530c5fc761d5>, line 7)

In [None]:
Combined_DF.Country_code.value_counts(dropna=False).to_frame()[15:30]

In [None]:
Codes=list(Combined_DF.Country_code.dropna().unique())

In [None]:
Codes.sort()

In [None]:
Codes

In [None]:
Codes0=['AG',
 'AIC',
 'AL',
 'ALB',
 'ALK',
 'AM',
 'AO',
 'AR',
 'AZ',
 'BA',
 'BAR',
 'BB',
 'BEL',
 'BF',
 'BG',
 'BO',
 'BOT',
 'BR',
 'BUL',
 'BW',
 'BY',
 'BZ',
 'CG',
 'CH',
 'CHA',
 'CI',
 'CL',
 'CM',
 'CN',
 'CO',
 'COB',
 'CR',
 'CRG',
 'CS',
 'CV',
 'CY',
 'CZ',
 'DM',
 'DO',
 'DOM',
 'DRO',
 'DZ',
 'EC',
 'EE',
 'EG',
 'EGT',
 'ENT',
 'ERO',
 'ES',
 'EST',
 'ET',
 'FIJ',
 'FJ',
 'FRO',
 'GA',
 'GD',
 'GE',
 'GLY',
 'GR',
 'GRD',
 'GRO',
 'GT',
 'GU',
 'HE',
 'HO',
 'HR',
 'HRO',
 'HU',
 'IA',
 'ID',
 'IN',
 'IND',
 'IQ',
 'IRN',
 'IVC',
 'JM',
 'JO',
 'JOR',
 'KE',
 'KO',
 'KZ',
 'LB',
 'LC',
 'LD',
 'LE',
 'LK',
 'LPE',
 'LSO',
 'LT',
 'LV',
 'MA',
 'MAS',
 'MD',
 'ME',
 'MK',
 'MN',
 'MOR',
 'MU',
 'MX',
 'MZ',
 'NA',
 'NE',
 'ONT',
 'OWS',
 'PA',
 'PAK',
 'PAN',
 'PE',
 'PH',
 'PK',
 'PL',
 'PNG',
 'POL',
 'PY',
 'RO',
 'ROM',
 'RU',
 'RUS',
 'SA',
 'SC',
 'SEY',
 'SIA',
 'SK',
 'SLO',
 'SLU',
 'SR',
 'SV',
 'SW',
 'TH',
 'TL',
 'TM',
 'TN',
 'TND',
 'TR',
 'TU',
 'TUN',
 'UA',
 'UNI',
 'UR',
 'UY',
 'UZ',
 'VCC',
 'VE',
 'VN',
 'WAF',
 'XLC',
 'YF',
 'YU',
 'ZA',
 'ZIM',
 'ZM']

In [None]:
Codes=set(Codes)

In [None]:
Codes.difference(Codes0)

In [None]:
Combined_DF[Combined_DF.Country_code=='AND']

In [None]:
num=515
text=Combined_DF.extracted_text[num]
print(text)

In [None]:
import os
from pdf2image import convert_from_path
def winapi_path(dos_path, encoding=None):
    if (not isinstance(dos_path, str) and encoding is not None): 
        dos_path = dos_path.decode(encoding)
    path = os.path.abspath(dos_path)
    if path.startswith(u"\\\\"):
        return u"\\\\?\\UNC\\" + path[2:]
    return u"\\\\?\\" + path

ROOT_DIR = '../world_bank_loans_full_upload/'
def get_pdf_pages(file,ROOT_DIR=ROOT_DIR):
    path=os.path.abspath(ROOT_DIR+file)
    path=winapi_path(path)
    images=convert_from_path(path)
    return images

In [None]:
file=Combined_DF.filename[num]
get_pdf_pages(file)[0]