In [199]:
import spacy
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.de.stop_words import STOP_WORDS
import pandas as pd
from sklearn.cluster import KMeans
import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np
# Load spaCy's "de" pipeline once; the lemmatization and vocabulary cells below
# call `nlp(...)` on the text of every document.
nlp = spacy.load("de")

In [3]:
# Execute the project's helper scripts inside this notebook's namespace.
# NOTE(review): `FileUtils` and `FILE_PATH`, which later cells rely on, are not
# defined in any visible cell — presumably they come from these scripts; confirm.
%run src/file_utils.py
%run src/configuration.py

In [74]:
# Annual-report JSON file names, listed company by company in ascending year
# order. Each entry below is (file-name prefix, first report year); every
# series runs through 2017.
_REPORT_SERIES = (
    ('BMW', 2010),
    ('CarlZeissMeditec', 2011),
    ('BVB', 2010),
)
documents = [
    '{}-AnnualReport-{}.json'.format(company, year)
    for company, first_year in _REPORT_SERIES
    for year in range(first_year, 2018)
]

In [5]:
# Field names / values used when reading a report JSON: every item is looked up
# by its 'type' field, and items whose type is 'paragraph' contribute the text
# stored under 'content'.
TYPE, PARAGRAPH, CONTENT = 'type', 'paragraph', 'content'

In [6]:
def _read_paragraphs(file_name):
    """Parse one report JSON file and return the text of its paragraph items, concatenated."""
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # encoding (the reports contain German umlauts).
    with open(file_name, encoding='utf-8') as f:
        data = json.load(f)
    return ''.join(item[CONTENT] for item in data if item[TYPE] == PARAGRAPH)


def readContentOfFile(file_name):
    """Return the concatenated text of all paragraph items in a report JSON file.

    The file is expected to hold a JSON list of items; only items whose
    ``TYPE`` field equals ``PARAGRAPH`` contribute their ``CONTENT`` text.

    If the file cannot be decoded or parsed (``ValueError``, which covers
    ``json.JSONDecodeError`` and ``UnicodeDecodeError``), it is repaired in
    place with ``FileUtils.fix_json`` and read a second time.

    Parameters:
        file_name: path of the JSON file to read.

    Returns:
        str: paragraph contents joined without a separator ('' if there are none).

    Raises:
        OSError: the file is missing or unreadable (no repair is attempted).
        KeyError: an item lacks the type or content field.
        ValueError: the file is still not valid JSON after the repair.
    """
    try:
        return _read_paragraphs(file_name)
    except ValueError:
        # Only malformed files are worth repairing; other errors (missing file,
        # unexpected schema) would not be cured by fix_json and must surface.
        FileUtils.fix_json(file_name)
        return _read_paragraphs(file_name)

In [8]:
# Lower-case tokens that the spaCy stop-word list does not remove
# (unit/currency words and stray symbols).
extra_stop = ['million', 'tausend', 'eur', 'teur', '*', '+', '&','%']

# Part-of-speech tags whose tokens are dropped entirely.
_EXCLUDED_POS = ('VERB', 'NUM', 'SYM', 'PUNCT')


def _is_content_token(token):
    """Return True if a spaCy token survives the stop-word, digit, currency and POS filters."""
    return (token.lower_ not in STOP_WORDS
            and not token.is_digit
            and not token.is_currency
            and token.pos_ not in _EXCLUDED_POS)


def perform_lemmatization(document):
    """Read one report file and return its filtered, lemmatized text.

    The paragraph text of ``document`` is run through the spaCy pipeline;
    stop words, digits, currency tokens and tokens tagged VERB, NUM, SYM or
    PUNCT are dropped, the remaining tokens are replaced by their lemmas, and
    lemmas listed in ``extra_stop`` are removed.

    Parameters:
        document: path of the report JSON file (passed to readContentOfFile).

    Returns:
        str: the surviving lemmas, lower-cased and joined by single spaces.
    """
    tokens = nlp(readContentOfFile(document))
    lemmas = [token.lemma_ for token in tokens if _is_content_token(token)]
    # extra_stop holds lower-case entries, so compare the lower-cased lemma;
    # otherwise capitalised lemmas such as 'Million' or 'EUR' slip through and
    # only get lower-cased after the filter has already run.
    kept = [lemma for lemma in lemmas if lemma.lower() not in extra_stop]
    return " ".join(kept).lower()

In [240]:
from os import listdir
from os.path import isfile, join

# Upper bound on the number of report files processed by the cells below.
MAX_DOCUMENTS = 200

# os.listdir returns entries in arbitrary order, so sort before truncating:
# otherwise a different subset of files could be selected on every machine/run.
onlyfiles = sorted(f for f in listdir(FILE_PATH) if isfile(join(FILE_PATH, f)))[:MAX_DOCUMENTS]

# Lemmatization of documents

In [174]:
# Lemmatize every selected report and print the elapsed wall-clock seconds.
start_time = time.time()
# Build paths with join (as the file-listing cell does) rather than string
# concatenation, so FILE_PATH works with or without a trailing separator.
lemm_docs_prep = [perform_lemmatization(join(FILE_PATH, document)) for document in onlyfiles]
print (time.time() - start_time)

313.35024881362915


# Construct vocabulary for BMW

In [146]:
# Lemmatized text of the BMW annual reports 2010-2017, in ascending year order.
# Generated from the year range instead of eight copy-pasted calls; paths are
# built with join so FILE_PATH works with or without a trailing separator.
bmw_lemm_docs_prep = [
    perform_lemmatization(join(FILE_PATH, 'BMW-AnnualReport-{}.json'.format(year)))
    for year in range(2010, 2018)
]

In [147]:
# Fit a TF-IDF model on the BMW reports only and print how long it took.
# max_df=0.7: per scikit-learn's documentation a float max_df is a document
# proportion, i.e. terms occurring in more than 70% of the reports are dropped.
vectorizer_bmw = TfidfVectorizer(max_df=0.7)
start_time = time.time()
tfidf_matrix_bmw = vectorizer_bmw.fit_transform(bmw_lemm_docs_prep)
print (time.time() - start_time)

0.16842913627624512


In [148]:
# Terms-by-report TF-IDF table for BMW: one row per vocabulary term, one column
# per report year, plus an 'idf' column holding each term's IDF weight.
bmw_feature_names = vectorizer_bmw.get_feature_names()
bmw_corpus_index = ['BMW-{}'.format(year) for year in range(2010, 2018)]
idf = vectorizer_bmw.idf_
df = pd.DataFrame(
    tfidf_matrix_bmw.T.todense(),
    index=bmw_feature_names,
    columns=bmw_corpus_index,
)
df['idf'] = idf

In [149]:
# Keep the terms whose IDF is not exactly 1, then preview the first ten
# surviving terms (last expression = cell output).
# Disabled alternative ordering:
#df = df.sort_values(by=['BMW-2010'], ascending=False)
bmw_df = df.loc[df['idf'].ne(1)]
bmw_df.head(10).index.tolist()

['000',
 '000er',
 '000quadratme',
 '000ste',
 '000sten',
 '030',
 '032',
 '036',
 '04',
 '043']

In [150]:
# Vocabulary for the BMW topic model: the index (terms) of the filtered table.
bmw_vocab = set(bmw_df.index.tolist())

In [241]:
# Count the BMW vocabulary terms per report, fit a 3-topic LDA on the counts
# and print each topic's ten highest-weighted terms.
# NOTE(review): print_with_rep_top_words is defined in a later cell, so this
# cell fails on a fresh kernel run top to bottom.
# NOTE(review): per the scikit-learn docs, max_df is ignored when a fixed
# vocabulary is supplied — confirm whether it is intended here.
tf_vectorizer = CountVectorizer(vocabulary=bmw_vocab, max_df=0.7)
tf = tf_vectorizer.fit_transform(bmw_lemm_docs_prep)

# fit() returns the estimator itself, so construction and fitting are chained.
lda = LatentDirichletAllocation(
    n_components=3,
    max_iter=50,
    learning_method='batch',
    learning_offset=50.,
    random_state=0,
).fit(tf)
tf_feature_names = tf_vectorizer.get_feature_names()
print_with_rep_top_words(lda, tf_feature_names, 10)

Topic #0: bewertungsmethode: 24.33352 vermietete: 24.33344 concept: 23.33371 icl: 23.33333 amsterdam: 22.33362 citroën: 21.33358 iasb: 21.33353 operate: 19.33348 there: 19.33274 em: 17.33354
Topic #1: vehicle: 31.33375 megacity: 28.33372 leistungsindikatoren: 21.33334 nettoschuld: 20.33369 dem: 18.33357 vermietete: 18.33323 ergebnisauswirkungen: 18.33297 zweijährig: 17.33291 überarbeitet: 16.33342 bilanzsumme: 15.33347
Topic #2: next: 46.33456 co2: 36.33408 ergebnisauswirkungen: 28.33439 zweijährig: 28.33439 there: 28.33392 here: 24.33384 brexit: 24.33333 digitalisierung: 20.33417 iperformance: 20.33389 chancenbericht: 18.33398



# Construct vocabulary

In [175]:
# Part-of-speech tags whose tokens never enter the vocabulary.
_VOCAB_EXCLUDED_POS = ('VERB', 'NUM', 'SYM', 'PUNCT')


def _document_vocabulary(path):
    """Return the set of normalized (newline-free, stripped, lower-cased) lemmas of one report."""
    tokens = nlp(readContentOfFile(path))
    lemmas = (
        token.lemma_
        for token in tokens
        # Compare the lower-cased token text: non-lower-case stop words are
        # otherwise not recognised.
        if token.lower_ not in STOP_WORDS
        and not token.is_digit
        and not token.is_currency
        and token.pos_ not in _VOCAB_EXCLUDED_POS
    )
    normalized = {lemma.replace('\n', '').strip().lower() for lemma in lemmas}
    # Lemmas that were only whitespace normalize to the empty string.
    normalized.discard('')
    return normalized


# Union of the per-document vocabularies over all selected reports; prints the
# elapsed wall-clock seconds at the end.
start_time = time.time()
common_vocabularly_lem = set()
for document in onlyfiles:
    # join (not string concatenation) so FILE_PATH works with or without a
    # trailing separator.
    common_vocabularly_lem.update(_document_vocabulary(join(FILE_PATH, document)))

# Tokens not removed by spaCy. difference_update, unlike set.remove, does not
# raise KeyError when one of them happens to be absent from the corpus.
common_vocabularly_lem.difference_update(extra_stop)
print (time.time() - start_time)

313.852187871933


In [176]:
def print_top_words(model, feature_names, n_top_words, max_candidates=100):
    """Print, for every topic, its top terms that no earlier topic has already printed.

    Topics are processed in order. For each one, terms are visited from the
    highest to the lowest weight in ``model.components_``; a term is printed
    only if it was not printed for a previous topic, until ``n_top_words``
    terms have been collected or the candidate window is exhausted.

    Parameters:
        model: fitted model exposing ``components_`` (one row of term weights
            per topic).
        feature_names: sequence mapping a column index to its term.
        n_top_words: maximum number of terms printed per topic.
        max_candidates: how many of a topic's highest-weighted terms are
            considered; ``None`` considers all terms. The default of 100
            reproduces the previously hard-coded window, so a topic may print
            fewer than ``n_top_words`` terms if most of its top 100 were taken.

    Returns:
        None. Output is written with print(), followed by one blank line.
    """
    seen = set()
    for topic_idx, topic in enumerate(model.components_):
        ranked = topic.argsort()[::-1]  # indices by descending weight
        if max_candidates is not None:
            ranked = ranked[:max_candidates]
        words = []
        for i in ranked:
            if len(words) == n_top_words:
                break
            name = feature_names[i]
            if name not in seen:
                seen.add(name)
                words.append(name)
        print("Topic #%d: " % topic_idx + " ".join(words))
    print()

In [77]:
# Disabled: optional removal of company-name tokens from the shared vocabulary.
# NOTE(review): presumably meant to keep company names from dominating the
# topics — confirm before re-enabling. set.remove raises KeyError when a token
# is absent, so prefer discard/difference_update if this is revived.
#common_vocabularly_lem.remove('bmw')
#common_vocabularly_lem.remove('group')
#common_vocabularly_lem.remove('carl')
#common_vocabularly_lem.remove('zeiss')
#common_vocabularly_lem.remove('borussia')
#common_vocabularly_lem.remove('dortmund')

In [192]:
# Term counts for all lemmatized reports, restricted to the shared vocabulary.
# This rebinds `tf_vectorizer` and `tf`, which the BMW cell above also assigns.
# NOTE(review): CountVectorizer's default token_pattern only produces tokens of
# two or more word characters, so vocabulary entries such as 'fremd-' or
# 'fraport-spezifischer' (visible in the topic output below) can presumably
# never be counted — confirm whether a whitespace tokenizer was intended.
tf_vectorizer = CountVectorizer(vocabulary=common_vocabularly_lem)
tf = tf_vectorizer.fit_transform(lemm_docs_prep)

In [231]:
# 25-topic LDA over the full-corpus term counts, batch variational updates,
# fixed seed for repeatable results.
# NOTE(review): learning_offset is documented by scikit-learn as an
# online-learning parameter; it presumably has no effect with
# learning_method='batch' — confirm.
lda = LatentDirichletAllocation(
    n_components=25,
    learning_method='batch',
    learning_offset=50.,
    random_state=0,
    max_iter=50,
)
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=50, mean_change_tol=0.001,
             n_components=25, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [232]:
tf_feature_names = tf_vectorizer.get_feature_names()
# print_top_words writes the topics itself and returns None, so call it
# directly; wrapping it in print() emitted a spurious trailing "None" line.
print_top_words(lda, tf_feature_names, 10)

Topic #0: � fraport-spezifischer fremd- fremdanteile fremdanteilen fremdbesitz fremdbezug fremdbezugsteile fremddarlehen fremddruck-
Topic #1: evt bionamics innovate euprotec execute earn janssen gegenleistung hyperion vergleichsperiode
Topic #2: fremddruckleistungen fremdeinwirkungen fremdfertigungsquote fremdfi fremdfinan- fremdfinanzie- fremdfinanzierung fremdfinanzierungen fremdfinanzierungs- fremdfinanzierungsangebot
Topic #3: bayer vorjahr sondereinflüssen umsatz cropscience usa ebitda healthcare ebit wpb
Topic #4: fremdfinanzierungsanteil fremdfinanzierungsbasis fremdfinanzierungsbedarf fremd frem- freizügigkeitsleis- freital freistaat freiste- freistehend
Topic #5: aktie euro unternehmen bilfinger gesellschaft höhe konzern aufsichtsrat prozent vorstehen
Topic #6: freistehende freistellung freistellungs- freistellungserklärungen freistellungsphase freitag freiverkehr freizeitphase freiwerdende freiwillig
Topic #7: evonik aktivität bereinigen fortgeführt ergebnis operativ vivawes

In [233]:
def print_with_rep_top_words(model, feature_names, n_top_words):
    """Print every topic's highest-weighted terms together with their weights.

    Unlike a de-duplicating listing, the same term may appear under several
    topics. Weights are the raw values from ``model.components_`` (not
    normalized per topic), formatted with five decimals.

    Parameters:
        model: fitted model exposing ``components_`` (one row of term weights
            per topic).
        feature_names: sequence mapping a column index to its term.
        n_top_words: number of terms printed per topic.

    Returns:
        None. Output is written with print(), followed by one blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Indices of the n_top_words largest weights, in descending order.
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        entries = []
        for i in top_indices:
            entries.append("{}: {:.5f}".format(str(feature_names[i]), weights[i]))
        print("Topic #%d: " % topic_idx + " ".join(entries))
    print()

In [234]:
# Print each topic's ten highest-weighted terms together with their raw weights
# for the current `lda` model; here a term may repeat across topics.
tf_feature_names = tf_vectorizer.get_feature_names()
print_with_rep_top_words(lda, tf_feature_names, 10)

Topic #0: �: 0.04000 fraport-spezifischer: 0.04000 fremd-: 0.04000 fremdanteile: 0.04000 fremdanteilen: 0.04000 fremdbesitz: 0.04000 fremdbezug: 0.04000 fremdbezugsteile: 0.04000 fremddarlehen: 0.04000 fremddruck-: 0.04000
Topic #1: evt: 30.58348 bionamics: 21.96573 innovate: 21.10578 euprotec: 20.04000 execute: 19.33106 earn: 16.09364 janssen: 11.01888 gegenleistung: 10.94376 hyperion: 9.04000 vergleichsperiode: 9.03042
Topic #2: �: 0.04000 fraport-spezifischer: 0.04000 fremd-: 0.04000 fremdanteile: 0.04000 fremdanteilen: 0.04000 fremdbesitz: 0.04000 fremdbezug: 0.04000 fremdbezugsteile: 0.04000 fremddarlehen: 0.04000 fremddruck-: 0.04000
Topic #3: bayer: 242.47646 vorjahr: 136.19577 sondereinflüssen: 82.15943 umsatz: 70.88890 cropscience: 61.03360 usa: 60.59338 ebitda: 58.25339 healthcare: 52.58574 ebit: 51.87215 wpb: 49.04000
Topic #4: �: 0.04000 fraport-spezifischer: 0.04000 fremd-: 0.04000 fremdanteile: 0.04000 fremdanteilen: 0.04000 fremdbesitz: 0.04000 fremdbezug: 0.04000 fremdb