In [1]:
import os
import pdfplumber
import pandas as pd
import re
from bs4 import BeautifulSoup

## Download the file from EDGAR and load the pdf

In [None]:
from sec_edgar_downloader import Downloader

dl = Downloader('EDGARfiles')

ciks = ['0000910631', '0001577639', '0000100122', '0001370946']

for cik in ciks:
    dl.get('424B2', cik, after="2016-01-01", before="2021-06-25", query = 'green', include_amends = True)

In [4]:
def loadPDF(path):
    with open(path) as fp:
        soup = BeautifulSoup(fp)
        text = soup.get_text()
    
    return text

In [5]:
path1 = '/Users/longmini/Kyeongho/Internship/EDGARfiles/sec-edgar-filings/0000910631/424B2/0001193125-20-234875/filing-details.html'
text1 = loadPDF(path1)

In [8]:
path2 = '/Users/longmini/Kyeongho/Internship/EDGARfiles/sec-edgar-filings/0001577639/424B2/0001193125-20-212136/filing-details.html'
text2 = loadPDF(path2)
lines2 = cleaning(text2)

## Splitting and cleaning

In [7]:
def cleaning(text):
    elements = text.split('\n')
    
    lines = []
    for sent in elements:
        sent = sent.replace('\xa0', ' ')
        lines.append(sent)

    while('' in lines):
        lines.remove('')
    while(' ' in lines):
        lines.remove(' ')
    while('  ' in lines):
        lines.remove('  ')
    while('   ' in lines):
        lines.remove('   ')
    
    return lines

In [15]:
lines1 = cleaning(text1)

### Find issuer

##### Load CIK number data from EDGAR

In [9]:
import pandas as pd

companyInfo = pd.read_csv('sec__edgar_company_info.csv')
print(len(companyInfo))
companyInfo.head()

663000


Unnamed: 0,Line Number,Company Name,Company CIK Key
0,1,!J INC,1438823
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512
3,4,#1 PAINTBALL CORP,1433777
4,5,$ LLC,1427189


In [10]:
del companyInfo['Line Number']
companyInfo.head()

Unnamed: 0,Company Name,Company CIK Key
0,!J INC,1438823
1,"#1 A LIFESAFER HOLDINGS, INC.",1509607
2,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512
3,#1 PAINTBALL CORP,1433777
4,$ LLC,1427189


In [11]:
def findIssuer(ciknum):
    return companyInfo[companyInfo['Company CIK Key'] == ciknum].values.tolist()[0][0]

In [12]:
findIssuer(100122)

'TUCSON ELECTRIC POWER CO'

### Find amount issued

In [13]:
import re
from re import search

def findAmount(lines):
    dollarList = []
    # dollars = [x[0] for x in re.findall('(\$[0-9]+(\.[0-9]+)?)', report)]
    for i in range(len(lines)):
        dollars = re.compile(r"[$|€]\d+(?:,\d{3})*(?:\.\d{2})?")
        if search(dollars, lines[i]):
            dollarList.append(lines[i])
    
    return dollarList[0]

In [16]:
findAmount(lines1)

'U.S.$705,000,000'

### Find Interest Rate

In [17]:
def findRate(lines):
    rateList = []
    for i in range(len(lines)):
        rates = re.compile(r'(\d+(\.\d+)?%)')
        if search(rates, lines[i]):
            rateList.append(lines[i])
    
    return rateList[0]

In [18]:
findRate(lines1)

' 1.850% Senior Notes due 2032'

### Use of Proceeds

In [19]:
proceedList = ['Biofuels', 'Solar', 'offshore wind', 'Water treatment', 'Low emission vehicles', 
               'Other energy related', 'Waste prevention', 'Waste to energy', 'Water efficiency', 
               'Landfill, energy capture', 'Industry: components', 'Transport logistics', 'parks', 
               'Bus rapid transit', 'Pollution control', 'Onshore wind', 'Tidal', 'Sustainable agriculture', 
               'Energy storage/meters', 'Water storage', 'Coach / public bus', 'Afforestation', 
               'Adaptation & resilience', 'Storm water mgmt', 'Passenger trains', 'HVAC systems', 
               'FSC Forestry', 'Water performance', 'Recycling', 'Electricity grid', 'District heating', 
               'Desalinisation plants', 'Infrastructure', 'Energy', 'Energy storage', 'Bioenergy', 
               'Certified Buildings', 'Energy performance', 'Bicycle infrastructure', 'Hydro', 'Geothermal', 
               'Water distribution', 'Wastewater treatment', 'Freight rolling stock', 'Flood protection', 
               'Erosion control', 'FSC Cellulose & paper', 'Electric vehicles', 'Land remediation', 'Urban rail']

In [20]:
def findProceeds(lines, proceedList):
    lst = []
    for i in range(len(lines)):
        for word in proceedList:
            if word.lower() in lines[i].lower():
                lst.append(word)
    return list(set(lst))

In [21]:
findProceeds(lines1, proceedList)

['Infrastructure',
 'Recycling',
 'Solar',
 'Energy',
 'Water efficiency',
 'Hydro',
 'Geothermal',
 'Water treatment']

### Find dates

In [22]:
from dateparser.search import search_dates

In [23]:
'''def findDates(lines):
    sublst = ['issue date', 'maturity date', 'maturity', 'mature']
    datesList = []
    for i in range(len(lines)):
        dates = search_dates(lines[i])
        if dates != None:
            if any(word in lines[i].lower() for word in sublst):
                datesList.append(lines[i])
    return datesList
'''

def findDates(lines):
    sublst = ['issue date', 'maturity date', 'maturity', 'mature']
    datesList = []
    for i in range(len(lines)):
        if any(word in lines[i].lower() for word in sublst):
            dates = search_dates(lines[i])
            if dates != None:
                datesList.append(lines[i])
    return datesList


### Find underwriters

In [25]:
def findUnderwriters(lines):
    underwriters = []
    for i in range(len(lines)):
        if 'UNDERWRITING' in lines[i]:
            for sent in lines[i : i + 35]:
                underwriters.append(sent)
                #numbers = re.compile(r'\d{1,3}(,\d{3})*')
                #if search(numbers, sent):
                #    underwriters.append(sent)
    sublst = [' Underwriter', ' Underwriters']
    # finalUnderwriters = []
    #print(underwriters)
    for j in range(len(underwriters)):
        if underwriters[j] in sublst:
            num1 = j
            break
            #print(num1)
    for k in range(len(underwriters)):
        if underwriters[k] == ' Total':
            num2 = k
            break
           # print(num2)
            
    finalUnderwriters = underwriters[num1:num2]
            
    return finalUnderwriters

In [26]:
findUnderwriters(lines1)

[' Underwriter',
 'PrincipalAmount ofNotes',
 ' BofA Securities, Inc.',
 'U.S.$',
 '235,000,000',
 ' J.P. Morgan Securities LLC',
 '235,000,000',
 ' Morgan Stanley & Co. LLC',
 '235,000,000']

### Find proceed description

In [27]:
from transformers import pipeline
summarizer = pipeline("summarization")

In [58]:
def findProDesc(lines):
    for i in range(len(lines)):
        if lines[i] == 'USE OF PROCEEDS ':
            parag = ' '.join(lines[i+1:i+50])
            break
    
    summary = []
    for j in range(0,len(parag),1024):
        summary = summary + list(summarizer(parag[j:j+1024], max_length=75, min_length=25)[0].values())
    
    final_summary = " ".join(summary)
    
    return final_summary
    

In [59]:
summarization = findProDesc(lines1)
print(summarization)

 The net proceeds from the sale of the notes, after payment of the underwriting discount and transaction expenses, are expected to be approximately U.S.$699.0 million . We intend to use the proceeds to finance or refinance, in whole or in part, one or more new or existing Eligible Green Projects . See Risk Factors  The following are examples of Eligible Green Projects:   Climate Change Adaptation, Clean and Mass Transportation, Energy Efficiency and Waste Management .  Expenditures related to the construction, development, acquisition, maintenance, and operation of renewable energy . Refrigeration system optimization: upgrading refrigeration equipment to improve energy efficiency and electricity consumption of cooling and vending equipment . Renewable energy projects such as solar rooftop panels .  Rcing expenditures pursuant to long-term (³ 5 years), project-tied power purchase agreements that were entered into prior to the issuance of the notes . Expenditures related to preservatio

In [60]:
summarization2 = findProDesc(lines2)
print(summarization2)

 We expect the net proceeds from this offering will be approximately $294.9 million, after deducting the underwriters' discount and commissions and the estimated expenses of this offering payable by us . We intend to use net proceeds to repay borrowings outstanding under our $300 million unsecured 2020 term loan .  Our $300 million unsecured 2020 term loan has a maturity date of March 12, 2021, subject to extension at our option, and currently bears interest at LIBOR plus 140 basis points . See Description of other indebtedness for further information .  The definition of Eligible Green Projects includes the acquisition of Galleria Office Towers, which has received LEED Certified certification . We expect to allocate all of the net proceeds from this offering to previously incurred acquisition costs relating to that building .  The real estate operations team will select Eligible Green Projects for the review and approval of our corporate finance team . We intend to allocate an amoun

In [44]:
useofpro = findProDesc(lines2)
print(useofpro)

We expect the net proceeds from this offering will be approximately $294.9 million, after deducting the underwriters discount and commissions and the estimated expenses of this offering payable by us.  We intend to allocate an amount equal to the net proceeds from the sale of the notes to the previous acquisition of the Galleria Office Towers in February 2020 for $396.2 million, which has received LEED Certified certification. We may also allocate or re-allocate net proceeds from this offering to other Eligible Green Projects.  We intend to use the net proceeds from this offering to repay borrowings outstanding under our $300 million unsecured 2020 term loan, with any remaining amounts being used to repay borrowings outstanding on our revolving credit facility or under one of our other term loans or for other business purposes. We borrowed the full amount under our $300 million unsecured 2020 term loan in March 2020 to repay, in part, indebtedness we incurred in order to complete the 

In [56]:
print("Size of description =", len(useofpro)," | #Chunks =",int(len(useofpro)/1024))
lst = []
for j in range(0,len(useofpro),1024):
    lst = lst + list(summarizer(useofpro[j:j+1024], max_length=75, min_length=25)[0].values())

print(lst)

Size of description = 5871  | #Chunks = 5
[" We expect the net proceeds from this offering will be approximately $294.9 million, after deducting the underwriters' discount and commissions and the estimated expenses of this offering payable by us . We intend to use net proceeds to repay borrowings outstanding under our $300 million unsecured 2020 term loan .", ' Our $300 million unsecured 2020 term loan has a maturity date of March 12, 2021, subject to extension at our option, and currently bears interest at LIBOR plus 140 basis points . See \x93Description of other indebtedness\x93 for further information .', ' The definition of Eligible Green Projects includes the acquisition of Galleria Office Towers, which has received LEED Certified certification . We expect to allocate all of the net proceeds from this offering to previously incurred acquisition costs relating to that building .', ' The real estate operations team will select Eligible Green Projects for the review and approval of 

In [57]:
print(' '.join(lst))

 We expect the net proceeds from this offering will be approximately $294.9 million, after deducting the underwriters' discount and commissions and the estimated expenses of this offering payable by us . We intend to use net proceeds to repay borrowings outstanding under our $300 million unsecured 2020 term loan .  Our $300 million unsecured 2020 term loan has a maturity date of March 12, 2021, subject to extension at our option, and currently bears interest at LIBOR plus 140 basis points . See Description of other indebtedness for further information .  The definition of Eligible Green Projects includes the acquisition of Galleria Office Towers, which has received LEED Certified certification . We expect to allocate all of the net proceeds from this offering to previously incurred acquisition costs relating to that building .  The real estate operations team will select Eligible Green Projects for the review and approval of our corporate finance team . We intend to allocate an amoun

In [29]:
for i in range(len(lines1)):
    if 'USE OF PROCEEDS ' == lines1[i]:
        parag = ' '.join(lines1[i+1:i+50])
        break

print(parag)

The net proceeds from the sale of the notes, after payment of the underwriting discount and transaction expenses, are expected to be approximately U.S.$699.0 million. We intend to allocate an amount equal to the net proceeds from the sale of the notes to finance or refinance, in whole or in part, one or more new or existing Eligible Green Projects. See Risk FactorsRisks Relating to the Notes in this prospectus supplement.  Eligible Green Projects are investments and expenditures to be made by us after the issuance date of the notes or made by us in the 24 months prior to such date, in eligible Green Projects as defined in and aligned with the four core components of the Green Bond Principles, which recommend transparency and disclosure and promote integrity with respect to sustainable bonds. We expect that each of our Eligible Green Projects meets or will meet one or more of the eligibility criteria described below.  We have identified Eligible Green Projects in three main categ

In [30]:
for i in range(len(lines2)):
    if 'USE OF PROCEEDS ' in lines2[i]:
        print(i, lines2[i:i+10])

591 ['USE OF PROCEEDS ', 'We expect the net proceeds from this offering will be approximately $294.9 million, after deducting the underwriters\x92 discount and commissions and', 'the estimated expenses of this offering payable by us.  We intend to allocate an amount equal to the net proceeds from the sale of the notes to the', 'previous acquisition of the Galleria Office Towers in February 2020 for $396.2 million, which has received LEED Certified certification. We may also allocate or re-allocate net proceeds from this offering', 'to other Eligible Green Projects.  We intend to use the net proceeds from this offering to repay borrowings outstanding under our $300 million', 'unsecured 2020 term loan, with any remaining amounts being used to repay borrowings outstanding on our revolving credit facility or under one of our other term loans or for other business purposes. We borrowed the full amount under our', '$300 million unsecured 2020 term loan in March 2020 to repay, in part, indebt

In [31]:
len(lines1)

2238

In [32]:
lines1[1209:1220]

[' USE OF PROCEEDS',
 '7',
 ' DESCRIPTION OF DEBT SECURITIES',
 '8',
 ' DESCRIPTION OF GUARANTEES',
 '23',
 ' FORM OF SECURITIES, CLEARING AND SETTLEMENT',
 '24',
 ' TAXATION',
 '30',
 ' PLAN OF DISTRIBUTION']

In [32]:
lines1

['424B2',
 '1',
 'd83065d424b2.htm',
 '424B2',
 '424B2',
 'Table of Contents',
 'Filed Pursuant to Rule 424(b)(2) ',
 'Registration No. 333-235558 ',
 'CALCULATION OF REGISTRATION FEE   ',
 ' Title of Each Class of ',
 'Securities Offered',
 ' Aggregate ',
 'Offering Price',
 'Amount ofRegistration Fee(1)',
 ' Debt securities',
 ' 1.850% Senior Notes due 2032',
 'U.S.$705,000,000',
 'U.S.$91,509',
 ' Guarantees',
 '\x97(2)',
 '(1)',
 ' Calculated in accordance with Rule 457(r) of the Securities Act of 1933. ',
 '(2)',
 ' Pursuant to Rule 457(n) under the Securities Act of 1933, no separate fee is payable with respect to the',
 'guarantees. ',
 'Table of Contents',
 'PROSPECTUS SUPPLEMENT  (To Prospectus Dated',
 'December 17, 2019)      ',
 ' Coca-Cola FEMSA, S.A.B. de C.V. ',
 'U.S.$705,000,000 1.850% Senior Notes due 2032 ',
 'Guaranteed by ',
 'Propimex, S. de R.L. de C.V. ',
 'Comercializadora La Pureza de Bebidas, S. de R.L. de C.V. ',
 'Grupo Embotellador Cimsa, S. de R.L. de C.V

## Create dateframe

In [33]:
import pandas as pd

In [61]:
columns = ['Issuer', 'IssueAmount', 'InterestRate', 'UseOfProceeds', 'Dates',
           'Underwriters_Amount', 'Proceed_Description']
factsDF = pd.DataFrame(columns = columns)
factsDF

Unnamed: 0,Issuer,IssueAmount,InterestRate,UseOfProceeds,Dates,Underwriters_Amount,Proceed_Description


## Main

In [62]:
def main(path, CIKnumber):
    text = loadPDF(path)
    lines = cleaning(text)
    issuer = findIssuer(CIKnumber)
    dollars = findAmount(lines)
    rates = findRate(lines)
    proceed_list = findProceeds(lines, proceedList)
    dates = findDates(lines)
    underwriters = findUnderwriters(lines)
    proceed_desc = findProDesc(lines)
    
    finalList = [issuer, dollars, rates, proceed_list, dates, underwriters, proceed_desc]
    factsDF.loc[len(factsDF.index)] = finalList
    
    return finalList

In [63]:
path1 = '/Users/longmini/Kyeongho/Internship/EDGARfiles/sec-edgar-filings/0000910631/424B2/0001193125-20-234875/filing-details.html'
path2 = '/Users/longmini/Kyeongho/Internship/EDGARfiles/sec-edgar-filings/0001577639/424B2/0001193125-20-212136/filing-details.html'
path3 = '/Users/longmini/Kyeongho/Internship/EDGARfiles/sec-edgar-filings/0000100122/424B2/0001193125-12-389271/filing-details.html'
path4 = '/Users/longmini/Kyeongho/Internship/EDGARfiles/sec-edgar-filings/0001370946/424B5/0001193125-19-207274/filing-details.html'

In [64]:
print('start')
main(path1, 910631)
main(path2, 1577639)
main(path3, 100122)
main(path4, 1370946)
print('end')

start


Your max_length is set to 75, but you input_length is only 58. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 75, but you input_length is only 40. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


end


In [65]:
factsDF

Unnamed: 0,Issuer,IssueAmount,InterestRate,UseOfProceeds,Dates,Underwriters_Amount,Proceed_Description
0,COCA COLA FEMSA SA DE CV,"U.S.$705,000,000",1.850% Senior Notes due 2032,"[Infrastructure, Recycling, Solar, Energy, Wat...",[We will pay interest on the notes on March 1 ...,"[ Underwriter, PrincipalAmount ofNotes, BofA ...","The net proceeds from the sale of the notes, ..."
1,"PIEDMONT OPERATING PARTNERSHIP, LP","$300,000,000",3.150% Senior Notes due 2030,"[Energy, Solar, Recycling, HVAC systems]","[will mature on August 15, 2030, unless earlie...","[ Underwriter, Principalamount, J.P. Morgan S...",We expect the net proceeds from this offering...
2,TUCSON ELECTRIC POWER CO,"$150,000,000",Tucson Electric Power Company 3.85% Notes due...,"[Energy, Pollution control]","[amount of our 3.85% notes due 2023, which we ...","[ Underwriters, Principal Amount, J.P. Morga...",We estimate that our net proceeds from the is...
3,OWENS CORNING (REORGANIZED) INC.,"$450,000,000",3.950% Senior Notes due 2029,"[Energy, Infrastructure, Solar]","[mature on August 15, 2029. We may redeem the ...","[ Underwriter, Principal Amountof Notes, BofA...",We estimate the net proceeds to us from the s...


In [66]:
factsDF.to_excel('factsheet_generic.xlsx')