# DAX Reporting NLP

In [1]:
import html
import os
import time

import fitz # this ist pymupdf
import requests
from IPython.display import HTML, display
from nltk import FreqDist
from nltk.tokenize import word_tokenize, MWETokenizer
from requests_html import HTMLSession

## Downloading Reports
Reports are gathered from an overview page maintained at `boersengefluester.de`.

In [2]:
# Overview page that links to the annual reports, and the local folder the
# downloaded PDFs are stored in.
reporting_overview_url = 'https://boersengefluester.de/download-der-dax-geschaftsberichte-von-2007-bis-2018/'
download_folder = './downloads'

# Fetch the overview page once; the report links are extracted from `r.html`.
session = HTMLSession()
r = session.get(reporting_overview_url)
# Fail loudly on an HTTP error status instead of silently scraping an error
# page, which would simply yield zero report links.
r.raise_for_status()

In [3]:
def _report_from_link(link):
    """Turn one report anchor into a (company, year, report_url) tuple.

    The anchor title is split on whitespace. The first word and the last
    three words are not part of the company name; the third-from-last word
    is taken as the year, and the words in between are joined with '_'.
    """
    title_words = link.attrs['title'].split()
    company = "_".join(title_words[1:-3])
    year = title_words[-3]
    return (company, year, link.absolute_links.pop())


# Report anchors: links inside the "ath_table" table that wrap an image and
# whose title starts with "Geschäftsbericht".
report_links = r.html.xpath('//table[@class="ath_table"] // a[img][starts-with(@title, "Geschäftsbericht")]')
print('Links to {} reports found'.format(len(report_links)))

# One (company, year, report_url) tuple per report link.
reports_information = [_report_from_link(link) for link in report_links]

# Beiersdorf is no longer listed on the overview page (dropped from the DAX
# as of 2021), so its reports are added by hand.
beiersdorf_url_template = 'https://boersengefluester.de/wp-content/uploads/assets/annuals/{}/520000.pdf'
reports_information.extend(
    ("Beiersdorf", year, beiersdorf_url_template.format(year))
    for year in range(2015, 2021)
)

reports_information[-6:]

Links to 391 reports found


[('Beiersdorf',
  2015,
  'https://boersengefluester.de/wp-content/uploads/assets/annuals/2015/520000.pdf'),
 ('Beiersdorf',
  2016,
  'https://boersengefluester.de/wp-content/uploads/assets/annuals/2016/520000.pdf'),
 ('Beiersdorf',
  2017,
  'https://boersengefluester.de/wp-content/uploads/assets/annuals/2017/520000.pdf'),
 ('Beiersdorf',
  2018,
  'https://boersengefluester.de/wp-content/uploads/assets/annuals/2018/520000.pdf'),
 ('Beiersdorf',
  2019,
  'https://boersengefluester.de/wp-content/uploads/assets/annuals/2019/520000.pdf'),
 ('Beiersdorf',
  2020,
  'https://boersengefluester.de/wp-content/uploads/assets/annuals/2020/520000.pdf')]

In [4]:
# Download every report exactly once. The existence of the download folder is
# the "already downloaded" marker: if it exists, nothing is fetched.
try:
    os.makedirs(download_folder)
except FileExistsError:
    # Only this specific error means "already downloaded"; anything else
    # (permissions, network problems, ...) must not be hidden behind this message.
    print('Download directory already exists, delete to re-download reports')
else:
    print('Created Download directory, proceed downloading reports')

    counter = 0
    for company, year, url in reports_information:
        print('Downloading: ', url)

        try:
            # The timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(url, timeout=60)
            # Do not store an HTTP error page under a .pdf name.
            response.raise_for_status()
        except requests.RequestException as error:
            # Report the failed report and carry on with the remaining ones.
            print('Failed to download: ', url, error)
        else:
            with open('{}/{}-{}.pdf'.format(download_folder, company, year), 'wb') as fp:
                fp.write(response.content)
            print("Saved: ", '{}-{}.pdf'.format(company, year))
            counter += 1

        print("Waiting for 3 seconds")
        time.sleep(3) # wait to not overrun server...

    print('Succedfully downloaded ', counter, ' reports.')

Download directory already exists, delete to re-download reports


## Analyze Reports
Counting the occurrences of several keywords and their synonyms.

In [5]:
# Keyword groups to count. Every entry is a tuple of lowercase words, including
# the single-word entries: `('word')` is just the string 'word', and joining a
# string with '_' yields 'w_o_r_d', which never matches a token. The trailing
# comma makes each single-word entry a real one-element tuple.
digital_keywords = [
    ('digitalisierung',),
    ('digitalization',),
    ('digitalisation',),
    ('digital', 'transformation'),
    ('digitale', 'transformation'),
    ('digital', 'innovation'),
    ('digitale', 'innovation'),
    ('digital', 'business', 'strategy')
]
cdo_keywords = [
    ('cdo',),
    ('chief', 'digital', 'officier'),  # presumably a deliberate misspelling variant
    ('chief', 'digital', 'officer')
]

# Multi-word-expression tokenizer covering both keyword groups; matched word
# sequences are merged into one token joined by '_' (the tokenizer's default
# separator, spelled out here because the counting step relies on it).
keyword_mwe_tokenizer = MWETokenizer([*digital_keywords, *cdo_keywords], separator='_')

In [7]:
def _keyword_token(keyword):
    """Return the token under which `keyword` appears after MWE tokenization.

    A keyword entry is either a plain word (str) or a sequence of words.
    A plain word is its own token; a sequence is joined with '_'. Joining a
    plain string with '_' would put '_' between its characters and the
    keyword would never be found.
    """
    if isinstance(keyword, str):
        return keyword
    return '_'.join(keyword)


# One (company, year, digital count, cdo count) tuple per report.
occurences = []
for company, year, _url in reports_information:
    with fitz.open('{}/{}-{}.pdf'.format(download_folder, company, year)) as doc:
        # Newer PyMuPDF releases name the method get_text(); older ones only
        # offer getText(). Use whichever this installation provides.
        page_texts = [
            (page.get_text() if hasattr(page, 'get_text') else page.getText()).lower()
            for page in doc
        ]

    # Join once instead of growing a string page by page.
    text = "".join(page_texts)

    # Keep purely alphabetic tokens, then merge multi-word keywords into single tokens.
    tokens = [w for w in word_tokenize(text) if w.isalpha()]
    tokens = keyword_mwe_tokenizer.tokenize(tokens)
    freqDist = FreqDist(tokens)

    digital_occurences = sum(freqDist[_keyword_token(k)] for k in digital_keywords)
    cdo_occurences = sum(freqDist[_keyword_token(k)] for k in cdo_keywords)
    occurences.append(
        (company, year, digital_occurences, cdo_occurences)
    )

mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object


In [8]:
# Render the per-report counts as an HTML table. Every cell value is
# HTML-escaped because the company names originate from scraped link titles.
table_rows = "".join(
    '<tr><td>{}</td><td>{}</td><td>{}</td><td>{}</td></tr>'.format(
        *(html.escape(str(value)) for value in row)
    )
    for row in occurences
)

# Header cells are wrapped in a <tr>, as <thead> may only contain rows.
HTML('''
<table>
    <thead><tr><th>Company</th><th>Year</th><th>Occurences Digital</th><th>Occurences CDO</th></tr></thead>
    <tbody>
        {}
    </tbody>
</table>
'''.format(table_rows))

Company,Year,Occurences Digital,Occurences CDO
Adidas,2007,0,0
Adidas,2008,0,0
Adidas,2009,0,0
Adidas,2010,0,0
Adidas,2011,0,0
Adidas,2012,0,0
Adidas,2013,0,0
Adidas,2014,0,0
Adidas,2015,0,0
Adidas,2016,2,0
