# Day2

In [113]:
import spacy
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.de.stop_words import STOP_WORDS
import pandas as pd
from sklearn.cluster import KMeans
import time

In [3]:
%run src/file_utils.py
%run src/configuration.py

In [49]:
# Single-report sample list; it is not referenced by any other cell reviewed here.
document_test = ['BMW-AnnualReport-2016.json']

In [24]:
# The 2016 report of each of the three companies (BMW, Carl Zeiss Meditec, BVB).
vocab_documents = [
    'BMW-AnnualReport-2016.json',
    'CarlZeissMeditec-AnnualReport-2016.json',
    'BVB-AnnualReport-2016.json',
]

In [4]:
# Annual reports 2015-2017 of the three companies, grouped by company in the
# order BMW, Carl Zeiss Meditec, BVB. The shared vocabularies are built from these files.
documents = [
    '{}-AnnualReport-{}.json'.format(company, year)
    for company in ('BMW', 'CarlZeissMeditec', 'BVB')
    for year in (2015, 2016, 2017)
]

In [5]:
# Keys / marker value used when reading the report JSON: each item is looked up
# under TYPE, and items whose TYPE equals PARAGRAPH provide their text under CONTENT.
TYPE, PARAGRAPH, CONTENT = 'type', 'paragraph', 'content'

In [6]:
def readContentOfFile(file_name):
    """Return the text of all paragraph items of a report, concatenated.

    The file is expected to hold a JSON list of items; the CONTENT of every
    item whose TYPE equals PARAGRAPH is appended, in file order, without any
    separator.

    If the file is not valid JSON, ``FileUtils.fix_json`` is asked to repair
    it and the file is read a second time. Any other error (missing file,
    missing key, ...) is raised to the caller instead of triggering a repair.

    :param file_name: path of the JSON report file
    :return: one string with all paragraph contents joined together
    :raises json.JSONDecodeError: if the file is still invalid after the repair
    """
    def _paragraph_texts():
        # Read and parse the file, then pick the paragraph contents.
        with open(file_name) as f:
            data = json.load(f)
        return [item[CONTENT] for item in data if item[TYPE] == PARAGRAPH]

    try:
        paragraphs = _paragraph_texts()
    except json.JSONDecodeError:
        # Only malformed JSON is something fix_json can plausibly repair; a
        # bare ``except`` would also rewrite the file after unrelated errors.
        FileUtils.fix_json(file_name)
        paragraphs = _paragraph_texts()
    return ''.join(paragraphs)

# Clustering without lemmatization

In [40]:
# Build a vocabulary shared by all reports: every report contributes the
# TOP_TERMS_PER_DOCUMENT terms with the highest TF-IDF weight within that report.
TOP_TERMS_PER_DOCUMENT = 1000

# Loading the spaCy model is expensive and does not depend on the document,
# so it happens once here instead of once per loop iteration.
nlp = spacy.load("de")

common_vocabularly = set()
for document in documents:
    content_of_document = readContentOfFile(FILE_PATH + document)
    sentence = nlp(content_of_document)
    # Drop stop words, pure-digit tokens and punctuation in a single pass.
    filtered_words_withoutpunc = [
        word for word in sentence
        if word.lower_ not in STOP_WORDS
        and not word.is_digit
        and word.pos_ != 'PUNCT'
    ]
    # Normalise the token text (no line breaks, trimmed, lower case) and
    # discard tokens that are empty after normalisation.
    vocabularly = {
        word.text.replace('\n', '').strip().lower()
        for word in filtered_words_withoutpunc
    }
    new_vocab = vocabularly - {''}
    # NOTE(review): the vectorizer is fitted on one document at a time, so the
    # IDF factor cannot distinguish terms and the ranking is effectively by
    # term frequency - confirm this is intended.
    vectorizer = TfidfVectorizer(vocabulary=new_vocab)
    tfidf_matrix = vectorizer.fit_transform([content_of_document])
    # NOTE: get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); kept for the version this notebook was written with.
    feature_names = vectorizer.get_feature_names()
    corpus_index = ['Values']
    df = pd.DataFrame(tfidf_matrix.T.todense(), index=feature_names, columns=corpus_index)
    df = df.sort_values(by=['Values'], ascending=False)
    common_vocabularly.update(df.head(TOP_TERMS_PER_DOCUMENT).index.values.tolist())


In [42]:
# TF-IDF vectorizer restricted to the shared vocabulary collected from the top terms of each report.
vectorizer = TfidfVectorizer(vocabulary=common_vocabularly)

In [46]:
# TF-IDF matrix with one row per report. The 2011-2013 reports used here are
# not among the 2015-2017 files listed in `documents`.
tfidf_matrix = vectorizer.fit_transform([
    readContentOfFile(FILE_PATH + report_file)
    for report_file in (
        'BVB-AnnualReport-2011.json',
        'BVB-AnnualReport-2012.json',
        'BMW-AnnualReport-2012.json',
        'BMW-AnnualReport-2011.json',
        'CarlZeissMeditec-AnnualReport-2013.json',
        'CarlZeissMeditec-AnnualReport-2012.json',
        'BVB-AnnualReport-2013.json',
        'BMW-AnnualReport-2013.json',
    )
])

In [47]:
# Cluster the reports into three groups (one per company is the expectation).
# random_state pins the centroid initialisation so that the labels shown below
# are the same on every run; without it the cluster numbering can change.
km = KMeans(n_clusters=3, init='k-means++', random_state=0)
km.fit(tfidf_matrix)
km.labels_

array([1, 1, 2, 2, 0, 0, 1, 2], dtype=int32)

# Clustering with lemmatization

In [159]:
nlp = spacy.load("de")
def perform_lemmatization(document):
    """Return the lemmatized text of a report as one space-separated string.

    The paragraph text of ``document`` (a file path) is read with
    readContentOfFile and run through the module-level spaCy pipeline ``nlp``.
    Stop words, pure-digit tokens and punctuation are dropped; the lemmas of
    the remaining tokens are joined with single spaces.
    """
    parsed = nlp(readContentOfFile(document))
    lemmas = [
        token.lemma_
        for token in parsed
        if token.lower_ not in STOP_WORDS
        and not token.is_digit
        and token.pos_ != 'PUNCT'
    ]
    return " ".join(lemmas)

In [160]:
start_time = time.time()
# The spaCy model does not depend on the document, so it is loaded once
# instead of once per report (the load stays inside the timed section).
nlp = spacy.load("de")

# Vocabulary of lemmas shared by all reports. Every normalised lemma is kept;
# a per-document top-1000 TF-IDF cut-off (as used for the non-lemmatized
# vocabulary) was tried here earlier and is disabled.
common_vocabularly_lem = set()
for document in documents:
    content_of_document = readContentOfFile(FILE_PATH + document)
    sentence = nlp(content_of_document)
    # Lemmas of all tokens that are neither stop words, pure digits nor punctuation.
    filtered_lemmas = [
        word.lemma_ for word in sentence
        if word.lower_ not in STOP_WORDS
        and not word.is_digit
        and word.pos_ != 'PUNCT'
    ]
    # Normalise (no line breaks, trimmed, lower case) and drop empty results.
    vocabularly = {lemma.replace('\n', '').strip().lower() for lemma in filtered_lemmas}
    new_vocab = vocabularly - {''}
    common_vocabularly_lem.update(new_vocab)
print (time.time() - start_time)

60.13927459716797


In [93]:
#common_vocabularly_lem.remove("borussia")
#common_vocabularly_lem.remove("dortmund")
#common_vocabularly_lem.remove("bmw")
#common_vocabularly_lem.remove("group")
#common_vocabularly_lem.remove("zeiss")
#common_vocabularly_lem.remove("carl")

In [None]:
# TF-IDF vectorizer restricted to the lemma vocabulary collected from all reports.
vectorizer = TfidfVectorizer(vocabulary=common_vocabularly_lem)

In [95]:
# TF-IDF matrix (one row per report) over the lemmatized text of the
# 2011-2013 reports.
tfidf_matrix = vectorizer.fit_transform([
    perform_lemmatization(FILE_PATH + report_file)
    for report_file in (
        'BVB-AnnualReport-2011.json',
        'BVB-AnnualReport-2012.json',
        'BMW-AnnualReport-2012.json',
        'BMW-AnnualReport-2011.json',
        'CarlZeissMeditec-AnnualReport-2013.json',
        'CarlZeissMeditec-AnnualReport-2012.json',
        'BVB-AnnualReport-2013.json',
        'BMW-AnnualReport-2013.json',
    )
])

In [161]:
start_time = time.time()
# Lemmatized text of the 2015-2017 reports, grouped by company in the order
# BVB, BMW, Carl Zeiss Meditec (three consecutive years each).
lemm_docs_prep = [
    perform_lemmatization(FILE_PATH + '{}-AnnualReport-{}.json'.format(company, year))
    for company in ('BVB', 'BMW', 'CarlZeissMeditec')
    for year in (2015, 2016, 2017)
]
print (time.time() - start_time)

56.57222080230713


In [162]:
# Fit TF-IDF over the lemmatized 2015-2017 reports using the shared lemma
# vocabulary, and print how long the fit takes (seconds).
vectorizer = TfidfVectorizer(vocabulary=common_vocabularly_lem)
start_time = time.time()
tfidf_matrix = vectorizer.fit_transform(lemm_docs_prep)
print (time.time() - start_time)

0.27321958541870117


In [163]:
# Term-by-report table: rows are the vocabulary terms, columns are the reports
# labelled <company>-<year>, in the same order as the documents in lemm_docs_prep.
feature_names = vectorizer.get_feature_names()
corpus_index = [
    '{}-{}'.format(company, year)
    for company in ('BVB', 'BMW', 'CZM')
    for year in (2015, 2016, 2017)
]
df = pd.DataFrame(tfidf_matrix.T.todense(), index=feature_names, columns=corpus_index)

In [167]:
# Number of terms in the fitted vocabulary (= number of rows of df).
print(len(feature_names))

23292


In [172]:
# Terms with the highest TF-IDF weight in the BVB 2016 report.
# Note: this re-sorts (reassigns) df itself.
df = df.sort_values(by=['BVB-2016'], ascending=False)
df.head(7)

Unnamed: 0,BVB-2015,BVB-2016,BVB-2017,BMW-2015,BMW-2016,BMW-2017,CZM-2015,CZM-2016,CZM-2017
dortmund,0.512785,0.547217,0.524926,0.0,0.0,0.0,0.0,0.0,0.0
borussia,0.468832,0.512288,0.480867,0.0,0.0,0.0,0.0,0.0,0.0
teur,0.486414,0.457955,0.487161,0.0,0.0,0.0,0.0,0.0,0.0
gmbh,0.12768,0.124891,0.121527,0.009907,0.007879,0.007879,0.01534,0.013049,0.010231
risiko,0.088688,0.099237,0.094594,0.066991,0.071966,0.071966,0.064069,0.055456,0.057733
bvb,0.064464,0.078913,0.076788,0.0,0.0,0.0,0.0,0.0,0.0
uefa,0.079115,0.076326,0.059164,0.0,0.0,0.0,0.0,0.0,0.0


In [173]:
# Terms with the highest TF-IDF weight in the BMW 2016 report.
# NOTE(review): in the recorded output the BMW-2016 and BMW-2017 columns are
# identical in every row shown - check whether the two input files are duplicates.
df = df.sort_values(by=['BMW-2016'], ascending=False)
df.head(7)

Unnamed: 0,BVB-2015,BVB-2016,BVB-2017,BMW-2015,BMW-2016,BMW-2017,CZM-2015,CZM-2016,CZM-2017
bmw,0.0,0.0,0.0,0.827197,0.830462,0.830462,0.0,0.0,0.0
group,0.0,0.000746,0.000726,0.261257,0.272321,0.272321,0.001995,0.003606,0.001616
million,0.003823,0.020253,0.013795,0.226919,0.167044,0.167044,0.079409,0.07666,0.099388
fahrzeug,0.0,0.0,0.0,0.093116,0.110728,0.110728,0.0,0.0,0.0
unternehmen,0.043579,0.048606,0.052552,0.075011,0.081946,0.081946,0.102871,0.11499,0.103773
automobile,0.0,0.0,0.005561,0.061505,0.079157,0.079157,0.0,0.0,0.0
risiko,0.088688,0.099237,0.094594,0.066991,0.071966,0.071966,0.064069,0.055456,0.057733


In [174]:
# Terms with the highest TF-IDF weight in the Carl Zeiss Meditec 2016 report.
df = df.sort_values(by=['CZM-2016'], ascending=False)
df.head(7)

Unnamed: 0,BVB-2015,BVB-2016,BVB-2017,BMW-2015,BMW-2016,BMW-2017,CZM-2015,CZM-2016,CZM-2017
zeiss,0.0,0.0,0.0,0.0,0.0,0.0,0.556807,0.576673,0.57137
carl,0.0,0.0,0.0,0.0,0.0,0.0,0.477263,0.468839,0.483144
meditec,0.0,0.0,0.0,0.0,0.0,0.0,0.406365,0.41883,0.401919
konzern,0.036698,0.035779,0.0381,0.029721,0.029417,0.029417,0.196718,0.195728,0.233855
unternehmen,0.043579,0.048606,0.052552,0.075011,0.081946,0.081946,0.102871,0.11499,0.103773
vj,0.0,0.0,0.0,0.0,0.0,0.0,0.109385,0.09706,0.0
tausend,0.001037,0.001832,0.001782,0.0,0.0,0.0,0.09549,0.088513,0.071385


In [96]:
# Cluster the lemmatized reports into three groups.
# random_state pins the centroid initialisation so the labels are reproducible.
# NOTE(review): the recorded output has 8 labels, i.e. it was produced from the
# 8-report 2011-2013 matrix; when the notebook is run top to bottom,
# tfidf_matrix is the 9-report 2015-2017 matrix at this point.
km = KMeans(n_clusters=3, init='k-means++', random_state=0)
km.fit(tfidf_matrix)
km.labels_

array([1, 1, 0, 0, 2, 2, 1, 0], dtype=int32)

# Picking the topic of a paragraph

In [127]:
def readContentOfParagraphs(file_name):
    """Return the paragraph texts of a report as a list, in file order.

    The file is expected to hold a JSON list of items; the CONTENT of every
    item whose TYPE equals PARAGRAPH becomes one list element.

    If the file is not valid JSON, ``FileUtils.fix_json`` is asked to repair
    it and the file is read a second time. Any other error (missing file,
    missing key, ...) is raised to the caller instead of triggering a repair.

    :param file_name: path of the JSON report file
    :return: list of paragraph content strings
    :raises json.JSONDecodeError: if the file is still invalid after the repair
    """
    def _paragraph_list():
        # Read and parse the file, then pick the paragraph contents.
        with open(file_name) as f:
            data = json.load(f)
        return [item[CONTENT] for item in data if item[TYPE] == PARAGRAPH]

    try:
        return _paragraph_list()
    except json.JSONDecodeError:
        # Only malformed JSON is something fix_json can plausibly repair; a
        # bare ``except`` would also rewrite the file after unrelated errors.
        FileUtils.fix_json(file_name)
        return _paragraph_list()

In [128]:
def lemmatize_paragraphs(paragraphs, nlp=None):
    """Lemmatize each paragraph and return the results in the same order.

    For every paragraph, stop words, pure-digit tokens and punctuation are
    dropped and the lemmas of the remaining tokens are joined with single
    spaces.

    :param paragraphs: iterable of paragraph strings
    :param nlp: optional callable turning a string into spaCy tokens; when
        omitted, the German model is loaded once for the whole call (it used
        to be reloaded for every single paragraph)
    :return: list with one lemmatized string per input paragraph
    """
    paragraphs = list(paragraphs)
    if not paragraphs:
        # Nothing to do - avoid loading the model at all.
        return []
    if nlp is None:
        nlp = spacy.load("de")
    lemmatized_paragraphs = []
    for paragraph in paragraphs:
        sentence = nlp(paragraph)
        filtered_lemmas = [
            word.lemma_ for word in sentence
            if word.lower_ not in STOP_WORDS
            and not word.is_digit
            and word.pos_ != 'PUNCT'
        ]
        lemmatized_paragraphs.append(" ".join(filtered_lemmas))
    return lemmatized_paragraphs

In [176]:
# Lemmatized paragraphs of the BVB 2017 report, one string per paragraph.
lem_pars = lemmatize_paragraphs(readContentOfParagraphs(FILE_PATH + 'BVB-AnnualReport-2017.json'))

In [177]:
# TF-IDF over the paragraphs of one report: every paragraph is treated as its
# own document, using the shared lemma vocabulary.
vectorizer_par = TfidfVectorizer(vocabulary=common_vocabularly_lem)

tfidf_matrix_par = vectorizer_par.fit_transform(lem_pars)

In [178]:
# Term-by-paragraph table: rows are vocabulary terms, columns are the
# paragraph positions 0 .. len(lem_pars) - 1.
feature_names = vectorizer_par.get_feature_names()
corpus_index = list(range(len(lem_pars)))
df_par = pd.DataFrame(tfidf_matrix_par.T.todense(), index=feature_names, columns=corpus_index)

In [218]:
# Print "<paragraph index> <weight>" for every paragraph in which the lemma
# 'prämie' has a non-zero TF-IDF weight.
row = df_par.loc[['prämie']]
for i, weight in row.loc['prämie'].items():
    if weight != 0:
        print ((str(i) + " " + str(weight)))

161 0.23991215404275826
167 0.16720434803083561
313 0.4214550630189431
449 0.16996541854098215
450 0.24473899536512775


In [221]:
# Show the original text of paragraph 167, one of the paragraphs listed above
# with a non-zero weight for 'prämie'.
readContentOfParagraphs(FILE_PATH + 'BVB-AnnualReport-2017.json')[167]

'Die Erlöse aus der internationalen TV-Vermarktung\nverzeichneten im abgelaufenen Geschäftsjahr\n2016/2017 den größten Anstieg. Der Umsatz be-\ntrug im Berichtszeitraum vom 01. Juli 2016 bis\nzum 30. Juni 2017 TEUR 50.993 (Vorjahr TEUR\n17.233). Die Erlöse enthalten die Ausschüttungen\nfür die Teilnahme an den Spielen der UEFA Cham-\npions League in 2016/2017 sowie die das Vorjahr\nbetreffenden Überschussbeteiligungen.\nDie Ausschüttung der UEFA für die UEFA Champions\nLeague erfolgte analog der letzten Jahre. Sie be-\ninhaltet den Marketpool, Start- und Spielprämien\nsowie leistungsbezogene Prämien.\n'

In [197]:
# Parse the combined text of all documents with the German spaCy model.
# NOTE(review): content_of_all_documents is not defined in any cell reviewed
# here, so this cell fails on a fresh kernel unless it is defined elsewhere -
# TODO restore the cell that builds it.
nlp = spacy.load("de")
sentence = nlp(content_of_all_documents)

In [198]:
# Drop stop words, then punctuation. Unlike the vocabulary loops earlier in the
# notebook, pure-digit tokens are NOT removed here.
filtered_words = [word for word in sentence if word.lower_ not in STOP_WORDS]
filtered_words_withoutpunc = [word for word in filtered_words if word.pos_ != 'PUNCT']

In [199]:
# Lower-cased token texts with line breaks removed (duplicates are kept).
vocabularly = [
    word.text.replace('\n', '').lower()
    for word in filtered_words_withoutpunc
]

In [200]:
# Unique, non-empty vocabulary entries.
new_vocab = {u for u in vocabularly if u != ''}

In [202]:
# TF-IDF vectorizer restricted to the vocabulary extracted from the combined text.
vectorizer = TfidfVectorizer(vocabulary=new_vocab)

In [232]:
# Two-document TF-IDF matrix: row 0 is the BVB 2016 report, row 1 the BMW 2016 report.
tfidf_matrix = vectorizer.fit_transform([
    readContentOfFile(FILE_PATH + report_file)
    for report_file in ('BVB-AnnualReport-2016.json', 'BMW-AnnualReport-2016.json')
])

In [233]:
# Number of distinct vocabulary entries.
print (len(new_vocab))

14420


In [234]:
# Term-by-report table for the two-document comparison: rows are vocabulary
# terms, column 'A' is the first fitted document (BVB 2016) and column 'B' the
# second (BMW 2016). pandas is already imported in the notebook's import cell,
# so it is not re-imported here.
feature_names = vectorizer.get_feature_names()
corpus_index = ['A', 'B']
df = pd.DataFrame(tfidf_matrix.T.todense(), index=feature_names, columns=corpus_index)

In [235]:
# Term with the highest weight in column 'A'. idxmax() returns the index
# label; Series.argmax() is being changed to return the integer position
# (see the FutureWarning in the recorded output), which would print a number
# instead of the term.
print (df['A'].idxmax())

dortmund


will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  """Entry point for launching an IPython kernel.


In [238]:
# Highest-weighted terms of column 'B'. Only the top rows are shown instead of
# printing the whole table, which has one row per vocabulary entry.
df.sort_values(by=['B'], ascending=False).head(25)

                               A         B
bmw                     0.000000  0.778350
group                   0.000871  0.314828
2016                    0.160191  0.173860
2015                    0.147132  0.169833
automobile              0.000000  0.083967
unternehmen             0.040048  0.079211
höhe                    0.079225  0.079211
risiken                 0.060072  0.078539
vorstand                0.001741  0.073169
vorstands               0.000871  0.071826
aufsichtsrat            0.053107  0.066456
segment                 0.002612  0.062429
vorjahr                 0.084449  0.061086
geschäftsjahr           0.075743  0.061086
aufsichtsrats           0.010447  0.053702
beziehungsweise         0.000000  0.051890
entwicklung             0.017412  0.051017
fahrzeuge               0.000000  0.050947
mini                    0.000000  0.050947
wesentlichen            0.047883  0.050346
compliance              0.001741  0.049674
2017                    0.023506  0.047661
rahmen     