In [199]:
import spacy
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.de.stop_words import STOP_WORDS
import pandas as pd
from sklearn.cluster import KMeans
import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np
nlp = spacy.load("de")

In [3]:
%run src/file_utils.py
%run src/configuration.py

In [74]:
documents = [
    'BMW-AnnualReport-2010.json', 'BMW-AnnualReport-2011.json', 
    'BMW-AnnualReport-2012.json','BMW-AnnualReport-2013.json', 
    'BMW-AnnualReport-2014.json','BMW-AnnualReport-2015.json', 
    'BMW-AnnualReport-2016.json', 'BMW-AnnualReport-2017.json', 
 
    'CarlZeissMeditec-AnnualReport-2011.json',
    'CarlZeissMeditec-AnnualReport-2012.json', 'CarlZeissMeditec-AnnualReport-2013.json', 
    'CarlZeissMeditec-AnnualReport-2014.json','CarlZeissMeditec-AnnualReport-2015.json', 
    'CarlZeissMeditec-AnnualReport-2016.json', 'CarlZeissMeditec-AnnualReport-2017.json',
 
    'BVB-AnnualReport-2010.json', 'BVB-AnnualReport-2011.json', 
    'BVB-AnnualReport-2012.json', 'BVB-AnnualReport-2013.json', 
    'BVB-AnnualReport-2014.json', 'BVB-AnnualReport-2015.json',
    'BVB-AnnualReport-2016.json', 'BVB-AnnualReport-2017.json']

In [5]:
TYPE = 'type'
PARAGRAPH = 'paragraph'
CONTENT = 'content'

In [6]:
def readContentOfFile(file_name):
    content = ''
    try:
        with open(file_name) as f:
            data = json.load(f)
            for item in data:
                typeDoc = item[TYPE]
                if typeDoc == PARAGRAPH:
                    content += item[CONTENT]
    except:
        FileUtils.fix_json(file_name)
        with open(file_name) as f:
            data = json.load(f)
            for item in data:
                typeDoc = item[TYPE]
                if typeDoc == PARAGRAPH:
                    content += item[CONTENT]
    return content

In [8]:
extra_stop = ['million', 'tausend', 'eur', 'teur', '*', '+', '&','%']
def perform_lemmatization(document):
    content_of_document = readContentOfFile(document)
    sentence = nlp(content_of_document)
    filtered_words = [word for word in sentence if word.lower_ not in STOP_WORDS]
    filtered_words_withoutdigits = [word for word in filtered_words if not word.is_digit]
    filtered_words_withoutcurrency = [word for word in filtered_words_withoutdigits if not word.is_currency]
    filtered_words_withoutverbs = [word for word in filtered_words_withoutcurrency if word.pos_ != 'VERB']
    filtered_words_withoutnum = [word for word in filtered_words_withoutverbs if word.pos_ != 'NUM']
    filtered_words_withoutsym = [word for word in filtered_words_withoutnum if word.pos_ != 'SYM']
    filtered_words_withoutpunc = [word for word in filtered_words_withoutsym if word.pos_ != 'PUNCT']
    filtered_lemmas = [word.lemma_ for word in filtered_words_withoutpunc]
    filtered_lemmas = [word for word in filtered_lemmas if not word in extra_stop ]
    lemmatized_content = " ".join(item for item in filtered_lemmas)
    return lemmatized_content.lower()

In [240]:
from os import listdir
from os.path import isfile, join
onlyfiles = [f for f in listdir(FILE_PATH) if isfile(join(FILE_PATH, f))]
onlyfiles = onlyfiles[0:200]

# Lemmatization of documents

In [174]:
start_time = time.time()
lemm_docs_prep = [ perform_lemmatization(FILE_PATH + document) for document in onlyfiles]
print (time.time() - start_time)

313.35024881362915


# Construct vocabulary for BMW

In [244]:
bmw_lemm_docs_prep = [doc.replace("\n", "") for doc in bmw_lemm_docs_prep]

In [146]:
bmw_lemm_docs_prep = [
     perform_lemmatization(FILE_PATH + 'BMW-AnnualReport-2010.json'), 
     perform_lemmatization(FILE_PATH + 'BMW-AnnualReport-2011.json'), 
     perform_lemmatization(FILE_PATH + 'BMW-AnnualReport-2012.json'),
     perform_lemmatization(FILE_PATH + 'BMW-AnnualReport-2013.json'), 
     perform_lemmatization(FILE_PATH + 'BMW-AnnualReport-2014.json'), 
     perform_lemmatization(FILE_PATH + 'BMW-AnnualReport-2015.json'),
     perform_lemmatization(FILE_PATH + 'BMW-AnnualReport-2016.json'), 
     perform_lemmatization(FILE_PATH + 'BMW-AnnualReport-2017.json')]

In [303]:
def clean_text(text):
    lines = []
    for line in text.splitlines():
        if line.endswith('-\n'):
            line.replace('-\n', '')
        else:
            line.replace('\n', ' \n')
        lines.append(line)
    return ''.join(lines)

In [280]:
def print_with_rep_top_words(model, feature_names, n_top_words):
    matr = model.components_ / model.components_.sum(axis=1)[:, np.newaxis]
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        
        message += " ".join([str(feature_names[i]) + ": " + "{:.5f}".format(matr[topic_idx, i])
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [304]:
bmw_files = [f for f in listdir(FILE_PATH) if isfile(join(FILE_PATH, f)) and 'BMW' in f]
bmw_files

['BMW-AnnualReport-2017.json',
 'BMW-QuarterlyReport-2011-Q3.json',
 'BMW-QuarterlyReport-2016-Q1.json',
 'BMW-QuarterlyReport-2014-Q3.json',
 'BMW-QuarterlyReport-2015-Q1.json',
 'BMW-QuarterlyReport-2011-Q1.json',
 'BMW-QuarterlyReport-2017-Q1.json',
 'BMW-AnnualReport-2013.json',
 'BMW-QuarterlyReport-2012-Q2.json',
 'BMW-QuarterlyReport-2016-Q3.json',
 'BMW-AnnualReport-2011.json',
 'BMW-QuarterlyReport-2013-Q2.json',
 'BMW-AnnualReport-2014.json',
 'BMW-QuarterlyReport-2010-Q3.json',
 'BMW-QuarterlyReport-2014-Q1.json',
 'BMW-QuarterlyReport-2015-Q2.json',
 'BMW-QuarterlyReport-2013-Q1.json',
 'BMW-QuarterlyReport-2012-Q1.json',
 'BMW-QuarterlyReport-2015-Q3.json',
 'BMW-QuarterlyReport-2012-Q3.json',
 'BMW-AnnualReport-2016.json',
 'BMW-QuarterlyReport-2016-Q2.json',
 'BMW-QuarterlyReport-2013-Q3.json',
 'BMW-QuarterlyReport-2010-Q1.json',
 'BMW-QuarterlyReport-2010-Q2.json',
 'BMW-AnnualReport-2012.json',
 'BMW-AnnualReport-2015.json',
 'BMW-QuarterlyReport-2011-Q2.json',
 'BMW-

In [306]:
bmw_lemm_docs_prep = [clean_text(perform_lemmatization(FILE_PATH + file)) for file in bmw_files if 'Annual' in file]

In [307]:
vectorizer_bmw = TfidfVectorizer()
start_time = time.time()
tfidf_matrix_bmw = vectorizer_bmw.fit_transform(bmw_lemm_docs_prep)
print (time.time() - start_time)

0.2548213005065918


In [308]:
bmw_feature_names = vectorizer_bmw.get_feature_names()
bmw_corpus_index = [n for n in [
    'BMW-2010', 'BMW-2011', 'BMW-2012', 
    'BMW-2013', 'BMW-2014', 'BMW-2015',
    'BMW-2016', 'BMW-2017']]
idf = vectorizer_bmw.idf_
df = pd.DataFrame(tfidf_matrix_bmw.T.todense(), index=bmw_feature_names)
df['idf'] = idf

In [309]:
#df = df.sort_values(by=['BMW-2010'], ascending=False)
bmw_df = df[(df['idf'] != 1)]

In [310]:
bmw_vocab = set()
bmw_vocab.update(bmw_df.index.tolist())

In [319]:
tf_vectorizer = CountVectorizer(vocabulary=bmw_vocab)
tf = tf_vectorizer.fit_transform(bmw_lemm_docs_prep)

lda = LatentDirichletAllocation(n_components=7,
                                learning_method='batch')
topic_model = lda.fit_transform(tf)

tf_feature_names = tf_vectorizer.get_feature_names()
print_with_rep_top_words(lda, tf_feature_names, 10)

Topic #0: an: 0.00332 vehicle: 0.00321 megacity: 0.00310 cfk: 0.00299 husqvarna: 0.00178 mcv: 0.00134 bmw5er: 0.00123 unterwegs: 0.00112 bmwgroup: 0.00112 ab: 0.00112
Topic #1: eine: 0.00006 ergebnisgrößen: 0.00006 entwicklerteam: 0.00006 mitarbeiterbezügen: 0.00006 mindestver: 0.00006 millimeter: 0.00006 xdrive23d: 0.00006 miete: 0.00006 anlagenauslastung: 0.00006 maßgebend: 0.00006
Topic #2: eine: 0.00006 ergebnisgrößen: 0.00006 entwicklerteam: 0.00006 mitarbeiterbezügen: 0.00006 mindestver: 0.00006 millimeter: 0.00006 xdrive23d: 0.00006 miete: 0.00006 anlagenauslastung: 0.00006 maßgebend: 0.00006
Topic #3: eine: 0.00006 ergebnisgrößen: 0.00006 entwicklerteam: 0.00006 mitarbeiterbezügen: 0.00006 mindestver: 0.00006 millimeter: 0.00006 xdrive23d: 0.00006 miete: 0.00006 anlagenauslastung: 0.00006 maßgebend: 0.00006
Topic #4: textziffer: 0.00360 an: 0.00283 i3: 0.00277 drivenow: 0.00206 aktienbasierte: 0.00190 betrachtungszeitraum: 0.00145 plug: 0.00139 i8: 0.00128 risikohöhe: 0.00128 c

In [317]:
tf

<8x15515 sparse matrix of type '<class 'numpy.int64'>'
	with 38640 stored elements in Compressed Sparse Row format>

In [316]:
print()
print(' Dominant topics per document ')
print('------------------------------')


for doc, document_name in enumerate([file for file in bmw_files if 'Annual' in file]):
    
    print('\n{:40.40}: '.format(document_name), end ='')
    most_probable = np.argsort(topic_model[doc, :])[:-6:-1]

    cummulated = 0
    for topic in most_probable:
        probability = topic_model[doc, topic]
        print('{:6.2%} {:3} '.format(probability, topic), end = '')
        cummulated = cummulated + probability
        if cummulated > 0.95: break


 Dominant topics per document 
------------------------------

BMW-AnnualReport-2017.json              : 99.99%   2 
BMW-AnnualReport-2013.json              : 99.99%   2 
BMW-AnnualReport-2011.json              : 99.99%   5 
BMW-AnnualReport-2014.json              : 99.99%   2 
BMW-AnnualReport-2016.json              : 99.99%   2 
BMW-AnnualReport-2012.json              : 99.99%   5 
BMW-AnnualReport-2015.json              : 66.88%   2 33.11%   5 
BMW-AnnualReport-2010.json              : 99.99%   5 

# LDA for one document (analyze paragraphs)

In [321]:
def readContentOfParagraphs(file_name):
    contents = []
    try:
        with open(file_name) as f:
            data = json.load(f)
            for item in data:
                typeDoc = item[TYPE]
                if typeDoc == PARAGRAPH:
                    contents.append(item[CONTENT])
    except:
        FileUtils.fix_json(file_name)
        with open(file_name) as f:
            data = json.load(f)
            for item in data:
                typeDoc = item[TYPE]
                if typeDoc == PARAGRAPH:
                    contents.append(item[CONTENT])
    return contents

In [323]:
nlp = spacy.load("de")
def perform_lemmatization(document):
    content_of_document = readContentOfFile(document)
    
    #remove minus sign and next-line sign
    content_of_document = content_of_document.replace('-\n','')
    content_of_document = content_of_document.replace('\n',' ')
    
    #replace all gco2 with co2
    content_of_document = content_of_document.replace('gCO2','CO2')
    
    #remove the character we don't need
    remove_char = content_of_document.maketrans('-',' ','+*<>%/&$')
    content_of_document = content_of_document.translate(remove_char)
    
    sentence = nlp(content_of_document)
    filtered_words = [word for word in sentence if word.lower_ not in STOP_WORDS]
    filtered_words_withoutdigits = [word for word in filtered_words if not word.is_digit]
    filtered_words_withoutpunc = [word for word in filtered_words_withoutdigits if word.pos_ != 'PUNCT']
    filtered_lemmas = [word.lemma_ for word in filtered_words_withoutpunc]
    
    final = []  
    for item in filtered_lemmas:
        #remove the words contain digit except of co2
        if(any(c.isdigit() for c in item)):
            if 'CO2' in item:
                final.append(item)
        else:
            #remove the words contain dot
            if '.' not in item:
                final.append(item)
    
    lemmatized_content = " ".join(item for item in final)
    
    #output the result into file 
    #if document.startswith(FILE_PATH):
    #    filename = "filtered_" + document[len(FILE_PATH):]
    
    #with open(filename, 'w') as outfile:
    #    json.dump(lemmatized_content.lower(), outfile)
    
    return lemmatized_content.lower()

In [422]:
def lemmatize_paragraphs(paragraphs):
    lemmatized_paragraphs = []
    for paragraph in paragraphs:
        content_of_document = paragraph.replace('-\n','')
        content_of_document = content_of_document.replace('\n',' ')
    
        #replace all gco2 with co2
        content_of_document = content_of_document.replace('gCO2','CO2')
    
        #remove the character we don't need
        remove_char = content_of_document.maketrans('-',' ','+*<>%/&$')
        content_of_document = content_of_document.translate(remove_char)
    
        sentence = nlp(content_of_document)
        filtered_words = [word for word in sentence if word.lower_ not in STOP_WORDS]
        filtered_words_withoutdigits = [word for word in filtered_words if not word.is_digit]
        filtered_words_withoutpunc = [word for word in filtered_words_withoutdigits if word.pos_ != 'PUNCT']
        filtered_lemmas = [word.lemma_ for word in filtered_words_withoutpunc]
    
        final = []  
        for item in filtered_lemmas:
            #remove the words contain digit except of co2
            if 'bmw' in item.lower():
                continue
                
            if 'group' in item.lower():
                continue
            
            if(any(c.isdigit() for c in item)):
                if 'CO2' in item:
                    final.append(item)
            else:
                #remove the words contain dot
                if '.' not in item:
                    final.append(item)
    
        lemmatized_content = " ".join(item for item in final)
        lemmatized_paragraphs.append(lemmatized_content.lower())
    return lemmatized_paragraphs

In [431]:
lem_pars = lemmatize_paragraphs(readContentOfParagraphs(FILE_PATH + 'BMW-AnnualReport-2016.json'))

In [432]:
readContentOfParagraphs(FILE_PATH + 'BMW-AnnualReport-2016.json')[7]

'In den Sitzungen berichtete uns der Vorstand regelmäßig und ausführlich über die Lage des\nUnternehmens. Dabei ging der Vorstand auf die Absatzentwicklung und die Wettbewerbssituation\nin den Segmenten Automobile und Motorräder sowie die Entwicklung der Personalzahlen ein. Er\nunterrichtete uns über die konjunkturelle Entwicklung in wichtigen Regionen der Welt und die\njeweiligen wirtschaftlichen Prognosen. Der Vorstand zeigte uns regelmäßig die Entwicklung des\nNeugeschäfts mit Endkunden und das Geschäftsvolumen im Segment Finanzdienstleistungen\nauf. Dabei erläuterte der Vorstand jeweils auch Planungsabweichungen.\n'

In [433]:
vectorizer_bmw = TfidfVectorizer()
start_time = time.time()
tfidf_matrix_bmw = vectorizer_bmw.fit_transform(lem_pars)
print (time.time() - start_time)

0.057863712310791016


In [434]:
bmw_feature_names = vectorizer_bmw.get_feature_names()
bmw_corpus_index = [n for n in [
    'BMW-2010', 'BMW-2011', 'BMW-2012', 
    'BMW-2013', 'BMW-2014', 'BMW-2015',
    'BMW-2016', 'BMW-2017']]
idf = vectorizer_bmw.idf_
df = pd.DataFrame(tfidf_matrix_bmw.T.todense(), index=bmw_feature_names)
df['idf'] = idf

In [435]:
bmw_df = df[(df['idf'] != 1)]

In [436]:
bmw_vocab = set()
bmw_vocab.update(bmw_df.index.tolist())

In [437]:
tf_vectorizer = CountVectorizer()
tf = tf_vectorizer.fit_transform(lem_pars)

lda = LatentDirichletAllocation(n_components=20,
                                learning_method='batch',
                                max_iter = 200)
topic_model = lda.fit_transform(tf)

tf_feature_names = tf_vectorizer.get_feature_names()
print_with_rep_top_words(lda, tf_feature_names, 5)

Topic #0: joint: 0.01994 unternehmen: 0.01624 konzernabschluss: 0.01092 finanz: 0.01013 mitglied: 0.00895
Topic #1: mitglied: 0.00653 risiko: 0.00627 aufsichtsrats: 0.00557 fahrzeug: 0.00536 insbesondere: 0.00451
Topic #2: ergebnis: 0.00701 ebit: 0.00642 marge: 0.00552 negativ: 0.00531 steuerung: 0.00463
Topic #3: compliance: 0.02959 vorstehen: 0.00792 mitarbeiter: 0.00792 system: 0.00619 management: 0.00591
Topic #4: million: 0.11366 höhe: 0.02825 vorjahr: 0.01830 wesentliche: 0.01278 betragen: 0.01180
Topic #5: fahrzeug: 0.00601 unternehmen: 0.00542 modell: 0.00505 kunde: 0.00465 ander: 0.00445
Topic #6: next: 0.01360 unternehmen: 0.01028 strategie: 0.00838 one: 0.00829 number: 0.00829
Topic #7: it: 0.01081 unternehmen: 0.00872 hoch: 0.00709 datenschutz: 0.00656 sgl: 0.00647
Topic #8: vermögenswerte: 0.01235 finanziell: 0.00950 bewerten: 0.00754 zeitwert: 0.00747 beizulegenden: 0.00742
Topic #9: vorstehen: 0.01665 aufsichtsrat: 0.00963 unternehmen: 0.00760 insbesondere: 0.00658 vorst

In [439]:
print()
print(' Dominant topics per document ')
print('------------------------------')


for doc, document_name in enumerate(range(0,len(lem_pars),1)):
    
    print('\n{:40.40}: '.format(str(document_name)), end ='')
    most_probable = np.argsort(topic_model[doc, :])[:-6:-1]

    cummulated = 0
    for topic in most_probable:
        probability = topic_model[doc, topic]
        print('{:6.2%} {:3} '.format(probability, topic), end = '')
        cummulated = cummulated + probability
        if cummulated > 0.95: break


 Dominant topics per document 
------------------------------

0                                       : 48.23%  11 44.85%   0  0.38%  16  0.38%   2  0.38%  18 
1                                       : 95.87%   6 
2                                       : 95.48%  13 
3                                       : 95.48%   3 
4                                       : 96.35%  13 
5                                       : 32.24%  17 26.27%  14 22.87%  19 14.61%   0 
6                                       : 88.12%   7  0.63%   9  0.63%  12  0.63%   5  0.63%  19 
7                                       : 50.00%   9 30.79%  16 16.97%  19 
8                                       : 46.60%   4 36.99%   9 14.75%   0 
9                                       : 48.26%   4 29.97%  13 17.90%  11 
10                                      : 77.90%   3 19.45%   6 
11                                      : 46.71%  16 24.90%  19 19.45%  11  7.34%   1 
12                                      : 94.72%   6  0.2

639                                     : 94.06%  12  0.31%   4  0.31%   0  0.31%   2  0.31%   5 
640                                     : 97.68%  18 
641                                     : 53.45%   8 43.74%  18 
642                                     : 97.43%  10 
643                                     : 91.36%  19  0.45%   8  0.45%  13  0.45%  15  0.45%   1 
644                                     : 92.08%  12  0.42%  15  0.42%  13  0.42%   9  0.42%  19 
645                                     : 93.21%   1  0.36%   4  0.36%  11  0.36%   8  0.36%   3 
646                                     : 98.06%   7 
647                                     : 98.30%   4 
648                                     : 96.35%   7 
649                                     : 95.87%   8 
650                                     : 94.06%  16  0.31%   4  0.31%   8  0.31%   7  0.31%  14 
651                                     : 96.94%   1 
652                                     : 92.08%   1  0.42%  13  0.

# Construct vocabulary

In [175]:
start_time = time.time()
common_vocabularly_lem = set()
for document in onlyfiles:
    content_of_document = readContentOfFile(FILE_PATH + document)
    sentence = nlp(content_of_document)
    # problems with non-lower case stop words
    filtered_words = [word for word in sentence if word.lower_ not in STOP_WORDS]
    filtered_words_withoutdigits = [word for word in filtered_words if not word.is_digit]
    filtered_words_withoutcurrency = [word for word in filtered_words_withoutdigits if not word.is_currency]
    filtered_words_withoutverbs = [word for word in filtered_words_withoutcurrency if word.pos_ != 'VERB']
    filtered_words_withoutnum = [word for word in filtered_words_withoutverbs if word.pos_ != 'NUM']
    filtered_words_withoutsym = [word for word in filtered_words_withoutnum if word.pos_ != 'SYM']
    filtered_words_withoutpunc = [word for word in filtered_words_withoutsym if word.pos_ != 'PUNCT']
    filtered_lemmas = [word.lemma_ for word in filtered_words_withoutpunc]
    vocabularly = set()
    for word in filtered_lemmas:
        vocabularly.add(word.replace('\n', '').strip().lower())
    new_vocab = set()
    for u in vocabularly:
        if u != '':
            new_vocab.add(u)

#     lemmatized_content = " ".join(item for item in filtered_lemmas)
#     vectorizer = TfidfVectorizer(vocabulary=new_vocab)
#     tfidf_matrix = vectorizer.fit_transform([lemmatized_content])
#     feature_names = vectorizer.get_feature_names()
#     corpus_index = [n for n in ['Values']]
#     df = pd.DataFrame(tfidf_matrix.T.todense(), index=feature_names, columns=corpus_index)
#     df = df.sort_values(by=['Values'], ascending=False)
#     print (df.head(5).index.values.tolist())
#     common_vocabularly_lem.update(df.head(1000).index.values.tolist())
    common_vocabularly_lem.update(new_vocab)
# not removed by spacy.    
common_vocabularly_lem.remove('million')
common_vocabularly_lem.remove('tausend')
common_vocabularly_lem.remove('eur')
common_vocabularly_lem.remove('teur')
common_vocabularly_lem.remove('*')
common_vocabularly_lem.remove('+')
common_vocabularly_lem.remove('&')
common_vocabularly_lem.remove('%')
print (time.time() - start_time)

313.852187871933


In [176]:
def print_top_words(model, feature_names, n_top_words):
    glob_set = set()
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        l = []
        #print(glob_set)
        for i in topic.argsort()[:-100 - 1:-1]:
            if len(l) == n_top_words:
                break
            if feature_names[i] not in glob_set:
                glob_set.add(feature_names[i])
                l.append(feature_names[i])
        message += " ".join(l)
        print(message)
    print()

In [77]:
#common_vocabularly_lem.remove('bmw')
#common_vocabularly_lem.remove('group')
#common_vocabularly_lem.remove('carl')
#common_vocabularly_lem.remove('zeiss')
#common_vocabularly_lem.remove('borussia')
#common_vocabularly_lem.remove('dortmund')

In [192]:
tf_vectorizer = CountVectorizer(vocabulary=common_vocabularly_lem)
tf = tf_vectorizer.fit_transform(lemm_docs_prep)

In [231]:
lda = LatentDirichletAllocation(n_components=25,
                                learning_method='batch',
                                learning_offset=50.,
                                random_state=0,
                                   max_iter=50)
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=50, mean_change_tol=0.001,
             n_components=25, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [232]:
tf_feature_names = tf_vectorizer.get_feature_names()
print (print_top_words(lda, tf_feature_names, 10))

Topic #0: � fraport-spezifischer fremd- fremdanteile fremdanteilen fremdbesitz fremdbezug fremdbezugsteile fremddarlehen fremddruck-
Topic #1: evt bionamics innovate euprotec execute earn janssen gegenleistung hyperion vergleichsperiode
Topic #2: fremddruckleistungen fremdeinwirkungen fremdfertigungsquote fremdfi fremdfinan- fremdfinanzie- fremdfinanzierung fremdfinanzierungen fremdfinanzierungs- fremdfinanzierungsangebot
Topic #3: bayer vorjahr sondereinflüssen umsatz cropscience usa ebitda healthcare ebit wpb
Topic #4: fremdfinanzierungsanteil fremdfinanzierungsbasis fremdfinanzierungsbedarf fremd frem- freizügigkeitsleis- freital freistaat freiste- freistehend
Topic #5: aktie euro unternehmen bilfinger gesellschaft höhe konzern aufsichtsrat prozent vorstehen
Topic #6: freistehende freistellung freistellungs- freistellungserklärungen freistellungsphase freitag freiverkehr freizeitphase freiwerdende freiwillig
Topic #7: evonik aktivität bereinigen fortgeführt ergebnis operativ vivawes

In [234]:
tf_feature_names = tf_vectorizer.get_feature_names()
print_with_rep_top_words(lda, tf_feature_names, 10)

Topic #0: �: 0.04000 fraport-spezifischer: 0.04000 fremd-: 0.04000 fremdanteile: 0.04000 fremdanteilen: 0.04000 fremdbesitz: 0.04000 fremdbezug: 0.04000 fremdbezugsteile: 0.04000 fremddarlehen: 0.04000 fremddruck-: 0.04000
Topic #1: evt: 30.58348 bionamics: 21.96573 innovate: 21.10578 euprotec: 20.04000 execute: 19.33106 earn: 16.09364 janssen: 11.01888 gegenleistung: 10.94376 hyperion: 9.04000 vergleichsperiode: 9.03042
Topic #2: �: 0.04000 fraport-spezifischer: 0.04000 fremd-: 0.04000 fremdanteile: 0.04000 fremdanteilen: 0.04000 fremdbesitz: 0.04000 fremdbezug: 0.04000 fremdbezugsteile: 0.04000 fremddarlehen: 0.04000 fremddruck-: 0.04000
Topic #3: bayer: 242.47646 vorjahr: 136.19577 sondereinflüssen: 82.15943 umsatz: 70.88890 cropscience: 61.03360 usa: 60.59338 ebitda: 58.25339 healthcare: 52.58574 ebit: 51.87215 wpb: 49.04000
Topic #4: �: 0.04000 fraport-spezifischer: 0.04000 fremd-: 0.04000 fremdanteile: 0.04000 fremdanteilen: 0.04000 fremdbesitz: 0.04000 fremdbezug: 0.04000 fremdb