# Day1

In [113]:
import spacy
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.de.stop_words import STOP_WORDS
import pandas as pd
from sklearn.cluster import KMeans
import time

In [3]:
%run src/file_utils.py
%run src/configuration.py

In [49]:
document_test = ['BMW-AnnualReport-2016.json']

In [24]:
vocab_documents = ['BMW-AnnualReport-2016.json', 'CarlZeissMeditec-AnnualReport-2016.json', 'BVB-AnnualReport-2016.json']

In [4]:
documents = ['BMW-AnnualReport-2015.json', 'BMW-AnnualReport-2016.json', 'BMW-AnnualReport-2017.json', 
 'CarlZeissMeditec-AnnualReport-2015.json', 'CarlZeissMeditec-AnnualReport-2016.json', 'CarlZeissMeditec-AnnualReport-2017.json',
 'BVB-AnnualReport-2015.json', 'BVB-AnnualReport-2016.json', 'BVB-AnnualReport-2017.json']

In [5]:
TYPE = 'type'
PARAGRAPH = 'paragraph'
CONTENT = 'content'

In [6]:
def readContentOfFile(file_name):
    content = ''
    try:
        with open(file_name) as f:
            data = json.load(f)
            for item in data:
                typeDoc = item[TYPE]
                if typeDoc == PARAGRAPH:
                    content += item[CONTENT]
    except:
        FileUtils.fix_json(file_name)
        with open(file_name) as f:
            data = json.load(f)
            for item in data:
                typeDoc = item[TYPE]
                if typeDoc == PARAGRAPH:
                    content += item[CONTENT]
    return content

# Clustering without lemmatization

In [40]:
common_vocabularly = set()
for document in documents:
    content_of_document = readContentOfFile(FILE_PATH + document)
    nlp = spacy.load("de")
    sentence = nlp(content_of_document)
    filtered_words = [word for word in sentence if word.lower_ not in STOP_WORDS]
    filtered_words_withoutdigits = [word for word in filtered_words if not word.is_digit]
    filtered_words_withoutpunc = [word for word in filtered_words_withoutdigits if word.pos_ != 'PUNCT']
    vocabularly = set()
    for word in filtered_words_withoutpunc:
        vocabularly.add(word.text.replace('\n', '').strip().lower())
    new_vocab = set()
    for u in vocabularly:
        if u != '':
            new_vocab.add(u)
    vectorizer = TfidfVectorizer(vocabulary=new_vocab)
    tfidf_matrix = vectorizer.fit_transform([content_of_document])
    feature_names = vectorizer.get_feature_names()
    corpus_index = [n for n in ['Values']]
    df = pd.DataFrame(tfidf_matrix.T.todense(), index=feature_names, columns=corpus_index)
    df = df.sort_values(by=['Values'], ascending=False)
    common_vocabularly.update(df.head(1000).index.values.tolist())


In [42]:
vectorizer = TfidfVectorizer(vocabulary=common_vocabularly)

In [46]:
tfidf_matrix = vectorizer.fit_transform(
    [readContentOfFile(FILE_PATH + 'BVB-AnnualReport-2011.json'), 
     readContentOfFile(FILE_PATH + 'BVB-AnnualReport-2012.json'), 
     readContentOfFile(FILE_PATH + 'BMW-AnnualReport-2012.json'),
     readContentOfFile(FILE_PATH + 'BMW-AnnualReport-2011.json'),
     readContentOfFile(FILE_PATH + 'CarlZeissMeditec-AnnualReport-2013.json'),
     readContentOfFile(FILE_PATH + 'CarlZeissMeditec-AnnualReport-2012.json'),
     readContentOfFile(FILE_PATH + 'BVB-AnnualReport-2013.json'), 
     readContentOfFile(FILE_PATH + 'BMW-AnnualReport-2013.json'),])

In [47]:
km = KMeans(n_clusters=3, init='k-means++')
km.fit(tfidf_matrix)
km.labels_

array([1, 1, 2, 2, 0, 0, 1, 2], dtype=int32)

# Clustering with lemmatization

In [63]:
def perform_lemmatization(document):
    content_of_document = readContentOfFile(document)
    nlp = spacy.load("de")
    sentence = nlp(content_of_document)
    filtered_words = [word for word in sentence if word.lower_ not in STOP_WORDS]
    filtered_words_withoutdigits = [word for word in filtered_words if not word.is_digit]
    filtered_words_withoutpunc = [word for word in filtered_words_withoutdigits if word.pos_ != 'PUNCT']
    filtered_lemmas = [word.lemma_ for word in filtered_words_withoutpunc]
    lemmatized_content = " ".join(item for item in filtered_lemmas)
    return lemmatized_content

In [97]:
common_vocabularly_lem = set()
for document in documents:
    content_of_document = readContentOfFile(FILE_PATH + document)
    nlp = spacy.load("de")
    sentence = nlp(content_of_document)
    filtered_words = [word for word in sentence if word.lower_ not in STOP_WORDS]
    filtered_words_withoutdigits = [word for word in filtered_words if not word.is_digit]
    filtered_words_withoutpunc = [word for word in filtered_words_withoutdigits if word.pos_ != 'PUNCT']
    filtered_lemmas = [word.lemma_ for word in filtered_words_withoutpunc]
    vocabularly = set()
    for word in filtered_lemmas:
        vocabularly.add(word.replace('\n', '').strip().lower())
    new_vocab = set()
    for u in vocabularly:
        if u != '':
            new_vocab.add(u)

    lemmatized_content = " ".join(item for item in filtered_lemmas)
    vectorizer = TfidfVectorizer(vocabulary=new_vocab)
    tfidf_matrix = vectorizer.fit_transform([lemmatized_content])
    feature_names = vectorizer.get_feature_names()
    corpus_index = [n for n in ['Values']]
    df = pd.DataFrame(tfidf_matrix.T.todense(), index=feature_names, columns=corpus_index)
    df = df.sort_values(by=['Values'], ascending=False)
    print (df.head(5).index.values.tolist())
    common_vocabularly_lem.update(df.head(1000).index.values.tolist())


['bmw', 'group', 'million', 'unternehmen', 'höhe']
['bmw', 'group', 'million', 'unternehmen', 'segment']
['bmw', 'group', 'million', 'unternehmen', 'segment']
['zeiss', 'carl', 'meditec', 'konzern', 'unternehmen']
['zeiss', 'carl', 'meditec', 'konzern', 'unternehmen']
['zeiss', 'carl', 'konzern', 'meditec', 'unternehmen']
['dortmund', 'teur', 'borussia', 'gmbh', 'risiko']
['dortmund', 'borussia', 'teur', 'gmbh', 'risiko']
['dortmund', 'teur', 'borussia', 'gmbh', 'risiko']


In [93]:
#common_vocabularly_lem.remove("borussia")
#common_vocabularly_lem.remove("dortmund")
#common_vocabularly_lem.remove("bmw")
#common_vocabularly_lem.remove("group")
#common_vocabularly_lem.remove("zeiss")
#common_vocabularly_lem.remove("carl")

In [94]:
vectorizer = TfidfVectorizer(vocabulary=common_vocabularly_lem)

In [95]:
tfidf_matrix = vectorizer.fit_transform(
    [perform_lemmatization(FILE_PATH + 'BVB-AnnualReport-2011.json'), 
     perform_lemmatization(FILE_PATH + 'BVB-AnnualReport-2012.json'), 
     perform_lemmatization(FILE_PATH + 'BMW-AnnualReport-2012.json'),
     perform_lemmatization(FILE_PATH + 'BMW-AnnualReport-2011.json'),
     perform_lemmatization(FILE_PATH + 'CarlZeissMeditec-AnnualReport-2013.json'),
     perform_lemmatization(FILE_PATH + 'CarlZeissMeditec-AnnualReport-2012.json'),
     perform_lemmatization(FILE_PATH + 'BVB-AnnualReport-2013.json'), 
     perform_lemmatization(FILE_PATH + 'BMW-AnnualReport-2013.json'),])

In [116]:
start_time = time.time()
lemm_docs_prep = [perform_lemmatization(FILE_PATH + 'BVB-AnnualReport-2015.json'), 
     perform_lemmatization(FILE_PATH + 'BVB-AnnualReport-2016.json'), 
     perform_lemmatization(FILE_PATH + 'BVB-AnnualReport-2017.json'),
     perform_lemmatization(FILE_PATH + 'BMW-AnnualReport-2015.json'),
     perform_lemmatization(FILE_PATH + 'BMW-AnnualReport-2016.json'),
     perform_lemmatization(FILE_PATH + 'BMW-AnnualReport-2017.json'),
     perform_lemmatization(FILE_PATH + 'CarlZeissMeditec-AnnualReport-2015.json'),
     perform_lemmatization(FILE_PATH + 'CarlZeissMeditec-AnnualReport-2016.json'),
     perform_lemmatization(FILE_PATH + 'CarlZeissMeditec-AnnualReport-2017.json')]
print (time.time() - start_time)

58.02603197097778


In [117]:
vectorizer = TfidfVectorizer(vocabulary=common_vocabularly_lem)
start_time = time.time()
tfidf_matrix = vectorizer.fit_transform(lemm_docs_prep)
print (time.time() - start_time)

0.13860106468200684


In [105]:
feature_names = vectorizer.get_feature_names()
corpus_index = [n for n in [
    'BVB-2015', 'BVB-2016', 'BVB-2017', 
    'BMW-2015', 'BMW-2016', 'BMW-2017',
    'CZM-2015', 'CZM-2016', 'CZM-2017']]
df = pd.DataFrame(tfidf_matrix.T.todense(), index=feature_names, columns=corpus_index)
df.head(5)

Unnamed: 0,BVB-2015,BVB-2016,BVB-2017,BMW-2015,BMW-2016,BMW-2017,CZM-2015,CZM-2016,CZM-2017
1aktg,0.006278,0.002767,0.004496,0.000646,0.000718,0.000718,0.0,0.0,0.0
1er,0.0,0.0,0.0,0.009118,0.004057,0.004057,0.0,0.0,0.0
1hgb,0.002314,0.00204,0.00729,0.000952,0.001588,0.001588,0.001819,0.001642,0.00515
1mio,0.0,0.0,0.0,0.005032,0.0008,0.0008,0.002749,0.002481,0.0
1satz,0.003085,0.002719,0.003314,0.000476,0.000529,0.000529,0.001819,0.001642,0.001471


In [107]:
df = df.sort_values(by=['BVB-2016'], ascending=False)
df.head(10)

Unnamed: 0,BVB-2015,BVB-2016,BVB-2017,BMW-2015,BMW-2016,BMW-2017,CZM-2015,CZM-2016,CZM-2017
dortmund,0.517293,0.551092,0.529605,0.0,0.0,0.0,0.0,0.0,0.0
borussia,0.472954,0.515916,0.485153,0.0,0.0,0.0,0.0,0.0,0.0
teur,0.490689,0.461198,0.491504,0.0,0.0,0.0,0.0,0.0,0.0
gmbh,0.128802,0.125775,0.12261,0.009992,0.007939,0.007939,0.015466,0.013136,0.0103
risiko,0.089467,0.09994,0.095437,0.067568,0.072513,0.072513,0.064591,0.055829,0.058119
bvb,0.065031,0.079472,0.077472,0.0,0.0,0.0,0.0,0.0,0.0
uefa,0.079811,0.076866,0.059692,0.0,0.0,0.0,0.0,0.0,0.0
sportlich,0.085064,0.074982,0.074097,0.0,0.0008,0.0008,0.0,0.0,0.0
vorjahr,0.087154,0.074785,0.080856,0.039494,0.053459,0.053459,0.079147,0.075533,0.103731
kgaa,0.072246,0.071901,0.057075,0.0,0.0008,0.0008,0.0,0.0,0.0


In [108]:
df = df.sort_values(by=['BMW-2016'], ascending=False)
df.head(10)

Unnamed: 0,BVB-2015,BVB-2016,BVB-2017,BMW-2015,BMW-2016,BMW-2017,CZM-2015,CZM-2016,CZM-2017
bmw,0.0,0.0,0.0,0.834323,0.836783,0.836783,0.0,0.0,0.0
group,0.0,0.000751,0.000733,0.263508,0.274394,0.274394,0.002011,0.00363,0.001626
million,0.003856,0.020396,0.013918,0.228874,0.168316,0.168316,0.080057,0.077175,0.100053
fahrzeug,0.0,0.0,0.0,0.093918,0.111571,0.111571,0.0,0.0,0.0
unternehmen,0.043962,0.04895,0.053021,0.075657,0.08257,0.08257,0.10371,0.115762,0.104467
automobile,0.0,0.0,0.005611,0.062035,0.07976,0.07976,0.0,0.0,0.0
risiko,0.089467,0.09994,0.095437,0.067568,0.072513,0.072513,0.064591,0.055829,0.058119
segment,0.005399,0.003399,0.003977,0.065665,0.072513,0.072513,0.010007,0.00821,0.008828
motorrad,0.0,0.0,0.0,0.051974,0.0639,0.0639,0.0,0.0,0.0
mitglied,0.016197,0.014957,0.015906,0.0571,0.062986,0.062986,0.024563,0.018062,0.013978


In [112]:
df = df.sort_values(by=['CZM-2016'], ascending=False)
df.head(25)

Unnamed: 0,BVB-2015,BVB-2016,BVB-2017,BMW-2015,BMW-2016,BMW-2017,CZM-2015,CZM-2016,CZM-2017
zeiss,0.0,0.0,0.0,0.0,0.0,0.0,0.561348,0.580545,0.575191
carl,0.0,0.0,0.0,0.0,0.0,0.0,0.481156,0.471988,0.486375
meditec,0.0,0.0,0.0,0.0,0.0,0.0,0.40968,0.421642,0.404608
konzern,0.037021,0.036033,0.03844,0.029977,0.029641,0.029641,0.198322,0.197042,0.235419
unternehmen,0.043962,0.04895,0.053021,0.075657,0.08257,0.08257,0.10371,0.115762,0.104467
vj,0.0,0.0,0.0,0.0,0.0,0.0,0.110277,0.097712,0.0
tausend,0.001046,0.001845,0.001798,0.0,0.0,0.0,0.096269,0.089107,0.071862
geschäftsjahr,0.057074,0.059148,0.064287,0.046156,0.048166,0.048166,0.098251,0.088669,0.092696
million,0.003856,0.020396,0.013918,0.228874,0.168316,0.168316,0.080057,0.077175,0.100053
vorjahr,0.087154,0.074785,0.080856,0.039494,0.053459,0.053459,0.079147,0.075533,0.103731


In [96]:
km = KMeans(n_clusters=3, init='k-means++')
km.fit(tfidf_matrix)
km.labels_

array([1, 1, 0, 0, 2, 2, 1, 0], dtype=int32)

# Picking topic of a paragraph

In [71]:
def readContentOfParagraphs(file_name):
    contents = []
    try:
        with open(file_name) as f:
            data = json.load(f)
            for item in data:
                typeDoc = item[TYPE]
                if typeDoc == PARAGRAPH:
                    contents.append(item[CONTENT])
    except:
        FileUtils.fix_json(file_name)
        with open(file_name) as f:
            data = json.load(f)
            for item in data:
                typeDoc = item[TYPE]
                if typeDoc == PARAGRAPH:
                    contents.append(item[CONTENT])
    return contents

In [74]:
def lemmatize_paragraphs(paragraphs):
    lemmatized_paragraphs = []
    for paragraph in paragraphs:
        nlp = spacy.load("de")
        sentence = nlp(paragraph)
        filtered_words = [word for word in sentence if word.lower_ not in STOP_WORDS]
        filtered_words_withoutdigits = [word for word in filtered_words if not word.is_digit]
        filtered_words_withoutpunc = [word for word in filtered_words_withoutdigits if word.pos_ != 'PUNCT']
        filtered_lemmas = [word.lemma_ for word in filtered_words_withoutpunc]
        lemmatized_content = " ".join(item for item in filtered_lemmas)
        lemmatized_paragraphs.append(lemmatized_content)
    return lemmatized_paragraphs

In [76]:
lem_pars = lemmatize_paragraphs(readContentOfParagraphs(FILE_PATH + 'BVB-AnnualReport-2011.json'))

In [75]:
vectorizer = TfidfVectorizer(vocabulary=common_vocabularly_lem)

tfidf_matrix = vectorizer.fit_transform(lemmatize_paragraphs(readContentOfParagraphs(FILE_PATH + 'BVB-AnnualReport-2011.json')))

In [None]:
feature_names = vectorizer.get_feature_names()
corpus_index = [n for n in xrange()]
df = pd.DataFrame(tfidf_matrix.T.todense(), index=feature_names, columns=corpus_index)

In [53]:
df.head(10)

Unnamed: 0,Values
bmw,0.655367
group,0.372566
million,0.252614
unternehmen,0.123924
segment,0.108831
risiko,0.108831
mitglied,0.094532
höhe,0.093737
fahrzeug,0.087382
vorstehen,0.086588


In [197]:
nlp = spacy.load("de")
sentence = nlp(content_of_all_documents)

In [198]:
filtered_words = [word for word in sentence if word.lower_ not in STOP_WORDS]
filtered_words_withoutpunc = [word for word in filtered_words if word.pos_ != 'PUNCT']

In [199]:
vocabularly = []
for word in filtered_words_withoutpunc:
    vocabularly.append(word.text.replace('\n', '').lower())

In [200]:
new_vocab = set()
for u in vocabularly:
    if u != '':
        new_vocab.add(u)

In [202]:
vectorizer = TfidfVectorizer(vocabulary=new_vocab)

In [232]:
tfidf_matrix = vectorizer.fit_transform([readContentOfFile(FILE_PATH + 'BVB-AnnualReport-2016.json'), readContentOfFile(FILE_PATH + 'BMW-AnnualReport-2016.json')])

In [233]:
print (len(new_vocab))

14420


In [234]:
feature_names = vectorizer.get_feature_names()
corpus_index = [n for n in ['A', 'B']]
import pandas as pd
df = pd.DataFrame(tfidf_matrix.T.todense(), index=feature_names, columns=corpus_index)
#df['1'].argmax()
# print(df.)

In [235]:
print (df['A'].argmax())

dortmund


will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  """Entry point for launching an IPython kernel.


In [238]:
print (df.sort_values(by=['B'], ascending=False))

                               A         B
bmw                     0.000000  0.778350
group                   0.000871  0.314828
2016                    0.160191  0.173860
2015                    0.147132  0.169833
automobile              0.000000  0.083967
unternehmen             0.040048  0.079211
höhe                    0.079225  0.079211
risiken                 0.060072  0.078539
vorstand                0.001741  0.073169
vorstands               0.000871  0.071826
aufsichtsrat            0.053107  0.066456
segment                 0.002612  0.062429
vorjahr                 0.084449  0.061086
geschäftsjahr           0.075743  0.061086
aufsichtsrats           0.010447  0.053702
beziehungsweise         0.000000  0.051890
entwicklung             0.017412  0.051017
fahrzeuge               0.000000  0.050947
mini                    0.000000  0.050947
wesentlichen            0.047883  0.050346
compliance              0.001741  0.049674
2017                    0.023506  0.047661
rahmen     