# Reporting NLP

In [1]:
from IPython.display import HTML, display
import os
import time
import requests
from requests_html import HTMLSession
import pandas as pd
import fitz # this ist pymupdf
from nltk import FreqDist
from nltk.tokenize import word_tokenize, MWETokenizer

## Downloading Annual-Reports
Annual Reports of DAX-Companies are gathered from an Overview page maintained at `boersengefluester.de`.

In [2]:
dax_annuals_overview_url = 'https://boersengefluester.de/download-der-dax-geschaftsberichte-von-2007-bis-2018/'
download_folder = './downloads'
session = HTMLSession()
r = session.get(dax_annuals_overview_url)
dax_annuals_links = r.html.xpath('//table[@class="ath_table"] // a[img][starts-with(@title, "Geschäftsbericht")]')
print('Links to {} dax annual reports found'.format(len(dax_annuals_links)))

# list of tuples like (company, year, report_url)
dax_annuals_information = [
    ("_".join(r.attrs['title'].split()[1:-3]),
     r.attrs['title'].split()[-3],
     r.absolute_links.pop(),
    ) for r in dax_annuals_links
]

#manually add beiersdorf as not any more in overview (kicked from dax as of 2021)
dax_annuals_information += [
    ("Beiersdorf",
     year,
     'https://boersengefluester.de/wp-content/uploads/assets/annuals/{}/520000.pdf'.format(year)
    ) for year in range(2015, 2021)
]

dax_annuals_information = [
    (company,
     year,
     url,
     '{}/{}-{}.pdf'.format(download_folder, company, year)) for company, year, url in dax_annuals_information
]

dax_annuals_information[-3:]

Links to 393 dax annual reports found


[('Beiersdorf',
  2018,
  'https://boersengefluester.de/wp-content/uploads/assets/annuals/2018/520000.pdf',
  './downloads/Beiersdorf-2018.pdf'),
 ('Beiersdorf',
  2019,
  'https://boersengefluester.de/wp-content/uploads/assets/annuals/2019/520000.pdf',
  './downloads/Beiersdorf-2019.pdf'),
 ('Beiersdorf',
  2020,
  'https://boersengefluester.de/wp-content/uploads/assets/annuals/2020/520000.pdf',
  './downloads/Beiersdorf-2020.pdf')]

Links to annual Reports for MDAX-Companies are read from manually supplied csv

In [3]:
mdax_df = pd.read_csv('./mdax_annuals.csv', ';')

mdax_annuals_information = []

for index, row in mdax_df.iterrows():
    company = row['Unternehmen']
    for year in range(2012, 2021):
        url = row[str(year)]
        if url and not str(url) == 'nan':
            mdax_annuals_information.append(
                (company, year, url, '{}/{}-{}.pdf'.format(download_folder, company, year))
            )

mdax_annuals_information[-3:]

[('Zalando SE',
  2018,
  'https://boersengefluester.de/wp-content/uploads/assets/annuals/2018/ZAL111.pdf',
  './downloads/Zalando SE-2018.pdf'),
 ('Zalando SE',
  2019,
  'https://boersengefluester.de/wp-content/uploads/assets/annuals/2019/ZAL111.pdf',
  './downloads/Zalando SE-2019.pdf'),
 ('Zalando SE',
  2020,
  'https://boersengefluester.de/wp-content/uploads/assets/annuals/2020/ZAL111.pdf',
  './downloads/Zalando SE-2020.pdf')]

In [4]:
try:
    os.makedirs(download_folder)
    print('Created Download directory, proceed downloading reports')
except:
    print('Download directory already exists, proceed to download missing annuals')
    
i = dax_annuals_information + mdax_annuals_information
    
counter = 0
for company, year, url, download_path in i:
    if(not os.path.exists(download_path)):
        print('Downloading: ', url)

        try:
            response = requests.get(url)
        except requests.exceptions.SSLError:
            response = requests.get(url, verify=False)
            
        with open(download_path, 'wb') as fp:
            fp.write(response.content)
            print("Saved: ", '{}-{}.pdf'.format(company, year))
            counter += 1

        print("Waiting for 3 seconds")
        time.sleep(1) # wait to not overrun server...

print('Succesfully downloaded ', counter, ' reports.')


Download directory already exists, proceed to download missing annuals
Succesfully downloaded  0  reports.


## Reading Sustainability Reporting Dax and MDax
Sustainability Reports are supplied in a manually created folder

In [5]:
dax_sustainability_information = []
mdax_sustainability_information = []

try:
    dax_files = os.listdir('./sustainability-reports/Dax_Nachhaltigkeitsberichte')

    dax_sustainability_information = [
        (f.split('_')[0],
         ",".join(f.split('_')[1:])[:-4],
         None,
         './sustainability-reports/Dax_Nachhaltigkeitsberichte/' + f
        ) for f in dax_files if f.endswith('.pdf')
    ]
except FileNotFoundError:
    print('Could not find Dax sustainability reports')

    
try:
    mdax_files = os.listdir('./sustainability-reports/MDax_Nachhaltigkeitsberichte')

    mdax_sustainability_information = [
        (f.split('_')[0],
         ",".join(f.split('_')[1:])[:-4],
         None,
         './sustainability-reports/MDax_Nachhaltigkeitsberichte/' + f
        ) for f in mdax_files if f.endswith('.pdf')
    ]
except FileNotFoundError:
    print('Could not find Dax sustainability reports')

mdax_sustainability_information[:3]

[('AarealBank',
  '2017',
  None,
  './sustainability-reports/MDax_Nachhaltigkeitsberichte/AarealBank_2017.pdf'),
 ('AarealBank',
  '2018',
  None,
  './sustainability-reports/MDax_Nachhaltigkeitsberichte/AarealBank_2018.pdf'),
 ('AarealBank',
  '2019',
  None,
  './sustainability-reports/MDax_Nachhaltigkeitsberichte/AarealBank_2019.pdf')]

## Analyze Reports
Counting the occurence of several words and its synonyms

### Keywords related to digitalization and CDOs

In [6]:
digital_keywords = [
    ('digitalisierung'),
    ('digitalization'),
    ('digitalisation'),
    ('digital', 'transformation'),
    ('digitale', 'transformation'),
    ('digital', 'innovation'),
    ('digitale', 'innovation'),
    ('digital', 'business', 'strategy')
]
cdo_keywords = [
    ('cdo'),
    ('chief', 'digital', 'officier'),
    ('chief', 'digital', 'officer')
]


### Keywords related to Artificial Intelligence

In [7]:
ai_keywords_en = [
    ('artificial', 'intelligence'),
    ('ai'), ('machine', 'learning'),
    ('intelligent'),
    ('deep', 'learning'),
    ('neural', 'networks')
]

ai_keywords_de = [
    ('künstliche', 'intelligenz'),
    ('ki'),
    ('maschinelles', 'lernen'),
    ('intelligent'),
    ('deep', 'learning'),
    ('neuronale', 'netze')
]

ai_keywords = ai_keywords_en + ai_keywords_de

### Function to Count ocurences of given keywords

In [8]:
keyword_mwe_tokenizer = MWETokenizer(digital_keywords + cdo_keywords + ai_keywords)

def count_occurences(reports_information, *keyword_lists):
    occurences = []
    for company, year, _, path in reports_information:
        try:
            with fitz.open(path) as doc:
                text = ""
                for page in doc:
                    text += page.getText().lower()

                tokens = [];
                for w in word_tokenize(text):
                    for t in w.split('-'):
                        if t.isalpha():
                            tokens.append(t)

                tokens = keyword_mwe_tokenizer.tokenize(tokens) # re-tokenize to with given multi word token.
                freqDist = FreqDist(tokens)

                keyword_occurences = [
                    sum(freqDist['_'.join(k)] for k in k_list) for k_list in keyword_lists
                ]

                words_total = len(tokens)
                occurences.append(
                    (company, year, *keyword_occurences, words_total)
                )
        except:
            print('failed to open ', path)
    return occurences

In [9]:
def render_table(occurences, *col_names):
    head = '<thead>' + ( '<th>{}</th>'*len(col_names) ).format(*col_names) + '</thead>'
    row = '<tr>' + '<td>{}</td>'*len(col_names) + '</tr>'
    return HTML('''
            <table>
                <thead>
                    {}
                </thead>
                <tbody>
                    {}
                </tbody>
            </table>    
            '''.format(
                head,
                "".join(row.format(*o) for o in occurences)
                )
            )

### Occurences of keywords related to digitalization, CDOs AI in DAX Annual Reporting

In [10]:
occurences = count_occurences(dax_annuals_information, digital_keywords, cdo_keywords, ai_keywords_en, ai_keywords_de, ai_keywords)

render_table(
    occurences,
    'Company',
    'Year',
    'Occurences Digital',
    'Occurences CDO',
    'Occurences AI (en)',
    'Occurences AI (de)',
    'Occurences AI (combined)',
    'Words Total'
)

mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object


Company,Year,Occurences Digital,Occurences CDO,Occurences AI (en),Occurences AI (de),Occurences AI (combined),Words Total
Adidas,2007,0,0,0,0,0,86281
Adidas,2008,0,0,0,0,0,89077
Adidas,2009,0,0,0,0,0,102075
Adidas,2010,0,0,0,0,0,102766
Adidas,2011,0,0,1,0,1,102265
Adidas,2012,0,0,2,0,2,117044
Adidas,2013,0,0,1,0,1,110617
Adidas,2014,0,0,0,0,0,93325
Adidas,2015,0,0,0,0,0,96688
Adidas,2016,2,0,0,0,0,94996


### Occurences of keywords related to digitalization, CDOs AI in MDAX Annual Reporting

In [11]:
occurences = count_occurences(mdax_annuals_information, digital_keywords, cdo_keywords, ai_keywords_en, ai_keywords_de, ai_keywords)

render_table(
    occurences,
    'Company',
    'Year',
    'Occurences Digital',
    'Occurences CDO',
    'Occurences AI (en)',
    'Occurences AI (de)',
    'Occurences AI (combined)',
    'Words Total'
)

mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
m

Company,Year,Occurences Digital,Occurences CDO,Occurences AI (en),Occurences AI (de),Occurences AI (combined),Words Total
Aareal Bank,2012,0,0,0,0,0,78154
Aareal Bank,2013,0,0,0,0,0,77171
Aareal Bank,2014,0,0,0,0,0,84161
Aareal Bank,2015,0,0,0,0,0,84531
Aareal Bank,2016,5,0,0,2,2,86715
Aareal Bank,2017,0,0,0,0,0,74289
Aareal Bank,2018,0,0,0,0,0,85702
Aareal Bank,2019,2,0,0,0,0,82420
Aareal Bank,2020,1,0,1,3,4,98874
Airbus SE,2012,0,0,0,0,0,53232


### Occurences of keywords related to AI in DAX-Sustainability Reporting

In [12]:
occurences = count_occurences(dax_sustainability_information, ai_keywords_en, ai_keywords_de, ai_keywords)

render_table(
    occurences,
    'Company',
    'Year',
    'Occurences AI (en)',
    'Occurences AI (de)',
    'Occurences AI (combined)',
    'Words Total'
)

mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object


Company,Year,Occurences AI (en),Occurences AI (de),Occurences AI (combined),Words Total
Adidas,2017,0,2,2,116549
Adidas,2018,0,1,1,118445
Adidas,2019,0,0,0,111680
Adidas,2020,0,0,0,106409
Allianz,2017,2,0,2,35405
Allianz,2018,0,0,0,47148
Allianz,2019,1,0,1,50474
BASF,2017,0,0,0,104535
BASF,2018,0,1,1,126505
BASF,2019,0,0,0,129261


### Occurences of keywords related to AI in MDAX-Sustainability Reporting

In [13]:
occurences = count_occurences(mdax_sustainability_information, ai_keywords_en, ai_keywords_de, ai_keywords)

render_table(
    occurences,
    'Company',
    'Year',
    'Occurences AI (en)',
    'Occurences AI (de)',
    'Occurences AI (combined)',
    'Words Total'
)

mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
m

Company,Year,Occurences AI (en),Occurences AI (de),Occurences AI (combined),Words Total
AarealBank,2017,0,0,0,5829
AarealBank,2018,0,0,0,6653
AarealBank,2019,0,0,0,8227
Aixtron,2018,0,0,0,12487
Aixtron,2019,0,0,0,14743
Aixtron,2020,0,0,0,22102
Alstria,2017-2018,0,0,0,22584
Alstria,2018-2019,4,0,4,23249
Alstria,2019-2020,0,0,0,26972
AroundTown,2019,0,0,0,23129
