In [None]:
%reset -f

In [None]:
# Import Packages & Path
# pip install pdfplumber
import os
import pdfplumber
import re
import pandas as pd
from IPython.display import display

# Folder containing all PDFs to process, resolved against the current working directory.
# NOTE(review): this name shadows the built-in dir(); it is kept as-is because the
# extraction function and the execution cell both read it by this name.
dir = os.path.join(os.getcwd(), "SADC All")

In [None]:
# Define Indexing function using regex
def substringindex(inputlist, inputsubstring):
    '''
    Find the first string in a list that matches a regex pattern.

    Input: inputlist - list of strings to scan
           inputsubstring - regex pattern handed to re.search
    Returns: (index, string) of the first matching element,
             or the string 'Unidentified' when nothing matches
    '''
    # Every element is tested (no early exit), then the first hit is reported
    hits = [
        (position, text)
        for position, text in enumerate(inputlist)
        if re.search(inputsubstring, text)
    ]

    if not hits:
        return 'Unidentified'
    return hits[0]

def substringindex2(inputlist, inputsubstring):
    '''
    Find the second string in a list that matches a regex pattern.

    Input: inputlist - list of strings to scan
           inputsubstring - regex pattern handed to re.search
    Returns: (index, string) of the second matching element,
             or the string 'Unidentified' when fewer than two elements match
    '''
    # Track positions while scanning: looking the string up again with
    # list.index() would report the first equal string, which is the wrong
    # position whenever the second match is a duplicate of the first.
    hits = [
        (position, text)
        for position, text in enumerate(inputlist)
        if re.search(inputsubstring, text)
    ]

    # A single match is not enough - there is no "second" element to return
    if len(hits) >= 2:
        return hits[1]
    return 'Unidentified'


In [None]:
# Define Extraction Function
def _first_match(lines, pattern):
    '''
    Return (index, line) for the first line matching regex `pattern`.
    Raises LookupError when nothing matches, so callers never index into
    the 'Unidentified' sentinel string returned by substringindex.
    '''
    hit = substringindex(lines, pattern)
    if hit == 'Unidentified':
        raise LookupError('no line matches ' + repr(pattern))
    return hit


def _first_match_text(lines, pattern):
    '''Return the first line matching regex `pattern`, or '' when none does.'''
    hit = substringindex(lines, pattern)
    return '' if hit == 'Unidentified' else hit[1]


def _extract_contact(lines, heading):
    '''
    Return (name, position, email) for the contact listed below `heading`.
    Returns ('', '', '') when the heading, the name or the email is missing;
    position alone is '' when no line between name and email matches.
    '''
    try:
        heading_idx, _ = _first_match(lines, heading)
        section = lines[heading_idx + 1:]  # Lines below the heading
        name_idx, name = _first_match(section, r'[A-Z][a-z]+\s[A-Z][a-z]+')  # Full name pattern: Azzzz Zaa
        email_idx, email = _first_match(section, '@globalratings.net')
        # The position should sit between the name line and the email line.
        # All positions seen so far contain 'Analyst' or 'Head' - needs further cleaning.
        position = _first_match_text(section[name_idx + 1:email_idx], 'Analyst|Head')
    except Exception:
        return ('', '', '')
    return (name, position, email)


def pdfextract_report(inputfilename):
    '''
    Extracts info from the first page of a PDF report.

    Input: File name (without directory info); it is joined onto the
           module-level `dir` folder, which should be specified in advance
    Returns: A 25-item tuple of
             (filename, Company_Name, Country, Sector, ReportDate, RatingInfo,
              Outlook, ExpirydateMonth, ExpirydateYear, FY1, FY2,
              TotalAssets1, TotalAssets2, InitialDate, InitialRating,
              LastDate, LastRating, PrimaryAnalyst, PrimaryAnalystPosition,
              PrimaryAnalystEmail, CommitteeChairperson,
              CommitteeChairpersonPosition, CommitteeChairpersonEmail,
              Tel, ALoc)
             Fields that cannot be located are returned as ''.
    Raises: Errors from opening/reading the PDF, or from a first page with
            no text (the company name lookup is not guarded), propagate
            to the caller.
    '''
    inputfilepath = os.path.join(dir, inputfilename)
    with pdfplumber.open(inputfilepath) as pdf:
        page1 = pdf.pages[0].extract_text()  # Only page 1 will be used

    # Split page1 into lines and drop the blank ones (kept lines are not stripped)
    lines_clean = [line for line in page1.split('\n') if line.strip()]

    # Extract Company Name
    Company_Name = lines_clean[0].strip()  # Company name is always the first non-empty line

    # Extract Country, Sector, ReportDate from the first line containing 'Analysis',
    # expected to look like '<Country> <Sector> Analysis <ReportDate>'
    try:
        Country_Sector_Date = _first_match(lines_clean, 'Analysis')[1].split(" Analysis ")  # [Country+Sector, Date]
        Country_Sector = Country_Sector_Date[0].strip()
        country_sector_parts = Country_Sector.rsplit(' ', 1)
        Country = country_sector_parts[0].strip()  # Everything before the last word
        Sector = country_sector_parts[1].strip()   # The last word
        ReportDate = Country_Sector_Date[1].strip()
    except Exception:
        Country = ''
        Sector = ''
        ReportDate = ''

    # Extract Long-term Rating Info: the first line containing 'Long'
    RatingInfo = _first_match_text(lines_clean, 'Long')

    # Extract Outlook & Expiry Date
    # They might not be in the long-term rating row, so they are extracted separately
    # from the 2nd line that contains a 4-digit number
    try:
        second_year_line = substringindex2(lines_clean, r'(\d){4}')
        if second_year_line == 'Unidentified':
            raise LookupError('fewer than two lines contain a 4-digit number')
        list_of_outlook_expirydate_raw = second_year_line[1].strip().split(' ')
        # Keep the last 3 non-empty tokens: outlook word, expiry month, expiry year.
        # NOTE(review): an earlier comment said "last 4", but the code has always
        # sliced 3, so a multi-word outlook is truncated to its last word - confirm intent.
        Outlook_Expirydate = [x for x in list_of_outlook_expirydate_raw if x][-3:]
        Outlook = ' '.join(Outlook_Expirydate[:-2])
        ExpirydateMonth = Outlook_Expirydate[-2]
        ExpirydateYear = Outlook_Expirydate[-1]
    except Exception:
        Outlook = ''
        ExpirydateMonth = ''
        ExpirydateYear = ''

    # Extract Financial Year Dates: first line containing the pattern 12/34/56
    try:
        FYs_raw = _first_match(lines_clean, r'(\d){2}/(\d){2}/(\d){2}')[1].strip().split(' ')
        FYs = [x for x in FYs_raw if x]  # Remove empty strings ('') from list
        FY1 = FYs[0]
        FY2 = FYs[1]
    except Exception:
        FY1 = ''
        FY2 = ''

    # Extract Total Assets
    try:
        TotalAssets_Raw = _first_match(lines_clean, 'Total assets')[1].strip()
        # Join digit groups separated by a single space - assets larger than 999
        # can be written '1 234' instead of '1,234'
        TotalAssets_Clean = re.sub(r"(?<=\d) (?=\d)", ",", TotalAssets_Raw).split(' ')
        TotalAssets = [x for x in TotalAssets_Clean if x]  # Remove empty strings ('') from list
        TotalAssets1 = TotalAssets[2]  # Tokens 0 and 1 are 'Total' and 'assets'
        TotalAssets2 = TotalAssets[3]
    except Exception:
        TotalAssets1 = ''
        TotalAssets2 = ''

    # Extract Rating Histories
    try:
        initial_idx, InitialDate = _first_match(lines_clean, 'Initial')
        lines_below = lines_clean[initial_idx + 1:]  # All lines below 'Initial Date'
        InitialRating = _first_match_text(lines_below, 'Long-term|Long term')  # Initial long-term rating
        try:
            last_idx, LastDate = _first_match(lines_below, 'Last')  # First 'Last' below 'Initial Date'
            lines_below2 = lines_below[last_idx + 1:]  # All lines below 'Last Date'
            LastRating = _first_match_text(lines_below2, 'Long-term|Long term')  # Last long-term rating
        except Exception:
            LastDate = ''
            LastRating = ''
    except Exception:
        InitialDate = ''
        InitialRating = ''
        LastDate = ''
        LastRating = ''

    # Extract Primary Analyst and Committee Chairperson (same layout, different heading)
    PrimaryAnalyst, PrimaryAnalystPosition, PrimaryAnalystEmail = _extract_contact(lines_clean, 'Primary Analyst')
    CommitteeChairperson, CommitteeChairpersonPosition, CommitteeChairpersonEmail = _extract_contact(lines_clean, 'Committee Chairperson')

    # Extract Tel: keep only digits, '+' and '-'
    Tel = re.sub(r'[^0-9+\-]', '', _first_match_text(lines_clean, 'Tel:'))

    # Extract Analyst location: keep only letters, ',' and whitespace.
    # The class is spelled A-Za-z because the range A-z also spans [ \ ] ^ _ `
    ALoc_raw = _first_match_text(lines_clean, 'Analyst location').split('Analyst location')[-1]
    ALoc = re.sub(r'[^A-Za-z,\s]', '', ALoc_raw).strip()

    return (inputfilename, Company_Name, Country, Sector, ReportDate, RatingInfo, Outlook, ExpirydateMonth, ExpirydateYear, FY1, FY2, TotalAssets1, TotalAssets2, InitialDate, InitialRating, LastDate, LastRating, PrimaryAnalyst, PrimaryAnalystPosition, PrimaryAnalystEmail, CommitteeChairperson, CommitteeChairpersonPosition, CommitteeChairpersonEmail, Tel, ALoc)




In [None]:
# Execution

# Output columns - the order must match the tuple returned by pdfextract_report
columns = ['Filename', 'Company_Name', 'Country', 'Sector', 'Report_Date', 'RatingInfo', 'Outlook', 'ExpirydateMonth', 'ExpirydateYear', 'FY1', 'FY2', 'TotalAssets1', 'TotalAssets2', 'InitialDate', 'InitialRating', 'LastDate', 'LastRating', 'PrimaryAnalyst', 'PrimaryAnalystPosition', 'PrimaryAnalystEmail', 'CommitteeChairperson', 'CommitteeChairpersonPosition', 'CommitteeChairpersonEmail', 'Tel', 'Analyst_Location']

# Sort the listing so the row order of the output is reproducible
# (os.listdir returns entries in arbitrary order)
entries = sorted(os.listdir(dir))
total_N = len(entries)  # Total number of entries in the directory
total_pdfs = sum(1 for name in entries if name.lower().endswith('.pdf'))  # Denominator for the progress message

records = []  # One list per PDF; the DataFrame is built once after the loop
row = 0       # Number of PDF rows recorded so far
success = 0   # Number of successful extracts
skipped = 0   # Number of entries skipped because the format is not supported

for pdf_name in entries:
    # Only extract PDF files (extension check is case-insensitive)
    if not pdf_name.lower().endswith('.pdf'):
        print('*Warning: file skipped - format not supported', pdf_name)
        skipped += 1
        continue

    print('Extracting', pdf_name, ' ; ', row + 1, '/', total_pdfs)  # Signal current file and progress

    try:
        record = list(pdfextract_report(pdf_name))
        if len(record) != len(columns):
            raise ValueError('expected %d fields, got %d' % (len(columns), len(record)))
        success += 1
    except Exception as exc:
        # Keep going on a bad file, but show why it failed and record a row with only the filename
        print("***Error: file", pdf_name, '-', repr(exc))
        record = [pdf_name] + [''] * (len(columns) - 1)

    records.append(record)
    row = row + 1

# Create Dataframe in one step instead of growing it row by row
df = pd.DataFrame(records, columns=columns)

print('Extraction Complete')
print('Success:', success, '; total: ', row,  '; skipped: ', skipped)

# Output Dataframe
df.to_csv(dir+' output.csv', index=False)