# TF/IDF keywords trend

In [43]:
import spacy
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.de.stop_words import STOP_WORDS
import pandas as pd
from sklearn.cluster import KMeans
import time

In [44]:
# Run the project's helper scripts and the preprocessing notebook inside this
# kernel so the names they define become available to the cells below.
# NOTE(review): get_clean_data, called further down, is not defined in this
# notebook - presumably it comes from one of these three files; confirm.
# The paths are relative, so the kernel's working directory must be the
# directory that contains src/ and load_and_prepro_document.ipynb.
%run src/file_utils.py
%run src/configuration.py
%run "load_and_prepro_document.ipynb"

# BMW 2010-2017

In [45]:
# File names of the BMW annual reports, one per year from 2010 through 2017
# (range() excludes its upper bound, hence 2018).
bmw_lemm_docs_prep = [
    'BMW-AnnualReport-%d.json' % report_year
    for report_year in range(2010, 2018)
]

In [46]:
# Pass-through hook meant to stop the TF-IDF vectorizer from applying its own
# preprocessing / word splitting to text that has already been prepared.
def preProcess(s):
    """Return the input unchanged.

    The function performs no transformation at all; whatever object is
    passed in is handed straight back.

    NOTE(review): neither TfidfVectorizer() call in this notebook passes
    this function, so as written it has no effect on the results.
    """
    return s

In [47]:
# Load the prepared BMW reports. get_clean_data is defined outside this
# notebook; it returns a pair, of which only the first element (the documents)
# is used below - presumably the second holds the matching names.
bmw_doc, bmw_name = get_clean_data(bmw_lemm_docs_prep)

# Fit a TF-IDF model with scikit-learn's default settings on the BMW documents
# and print how many seconds the fit + transform took.
vectorizer_bmw = TfidfVectorizer()
start_time = time.time()
tfidf_matrix_bmw = vectorizer_bmw.fit_transform(bmw_doc)
print(time.time() - start_time)

filtered_BMW-AnnualReport-2010.json has already done preprocess
filtered_BMW-AnnualReport-2011.json has already done preprocess
filtered_BMW-AnnualReport-2012.json has already done preprocess
filtered_BMW-AnnualReport-2013.json has already done preprocess
filtered_BMW-AnnualReport-2014.json has already done preprocess
filtered_BMW-AnnualReport-2015.json has already done preprocess
filtered_BMW-AnnualReport-2016.json has already done preprocess
filtered_BMW-AnnualReport-2017.json has already done preprocess
0.2481086254119873


In [48]:
# Arrange the BMW TF-IDF matrix as a terms x reports table (one row per term,
# one column per report) so single keywords can be looked up by label.

# Vocabulary in the column order of the fitted matrix. get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer_bmw, 'get_feature_names_out'):
    bmw_feature_names = list(vectorizer_bmw.get_feature_names_out())
else:
    bmw_feature_names = vectorizer_bmw.get_feature_names()

# Column labels for the eight reports; assumed to follow the order of the
# files listed in bmw_lemm_docs_prep.
bmw_corpus_index = [
    'BMW-2010', 'BMW-2011', 'BMW-2012',
    'BMW-2013', 'BMW-2014', 'BMW-2015',
    'BMW-2016', 'BMW-2017']

# Inverse document frequency of every term, aligned with bmw_feature_names.
idf = vectorizer_bmw.idf_

# Transpose so that terms become rows. toarray() gives a plain ndarray instead
# of the legacy np.matrix that todense() returns.
df = pd.DataFrame(tfidf_matrix_bmw.T.toarray(), index=bmw_feature_names, columns=bmw_corpus_index)

# Extra column with the per-term idf; used by the next cell to filter terms.
df['idf'] = idf

In [50]:
# Order the terms by their weight in the 2016 report, highest first.
df = df.sort_values(by='BMW-2016', ascending=False)

# Show the ten strongest terms whose idf differs from 1. With the default
# smoothed idf used by TfidfVectorizer(), an idf of exactly 1 is assigned to
# terms that occur in every document, so those are left out of the display.
df.loc[df['idf'].ne(1)].head(10)

Unnamed: 0,BMW-2010,BMW-2011,BMW-2012,BMW-2013,BMW-2014,BMW-2015,BMW-2016,BMW-2017,idf
next,0.0,0.0,0.0,0.0,0.0,0.011058,0.048473,0.048473,1.81093
co2,0.0,0.0,0.0,0.001703,0.001647,0.0,0.040652,0.040652,1.587787
ergebnisauswirkungen,0.0,0.0,0.0,0.003016,0.023322,0.027178,0.032713,0.032713,1.405465
zweijährig,0.0,0.0,0.0,0.0,0.027994,0.027471,0.031413,0.031413,1.587787
betrachtungszeitraum,0.001216,0.0,0.001261,0.003598,0.013911,0.025028,0.02992,0.02992,1.117783
there,0.0,0.0,0.0,0.0,0.0,0.035018,0.029505,0.029505,1.81093
brexit,0.0,0.0,0.0,0.0,0.0,0.0,0.026866,0.026866,2.098612
here,0.0,0.0,0.0,0.0,0.0,0.027646,0.02529,0.02529,1.81093
plug,0.0,0.005046,0.001261,0.002398,0.013911,0.010239,0.024716,0.024716,1.117783
aktienbasierte,0.0,0.018924,0.023953,0.023985,0.020867,0.02389,0.024716,0.024716,1.117783


# Deutsche Bank 2010-2016

In [51]:
# File names of the Deutsche Bank annual reports, one per year from 2010
# through 2016 (range() excludes its upper bound, hence 2017).
db_lemm_docs_prep = [
    'DeutscheBank-AnnualReport-%d.json' % report_year
    for report_year in range(2010, 2017)
]

In [52]:
# Load the prepared Deutsche Bank reports. get_clean_data is defined outside
# this notebook; it returns a pair, of which only the first element (the
# documents) is used below - presumably the second holds the matching names.
db_doc, db_name = get_clean_data(db_lemm_docs_prep)

# Fit a TF-IDF model with scikit-learn's default settings on the Deutsche Bank
# documents and print how many seconds the fit + transform took.
vectorizer_db = TfidfVectorizer()
start_time = time.time()
tfidf_matrix_db = vectorizer_db.fit_transform(db_doc)
print(time.time() - start_time)

filtered_DeutscheBank-AnnualReport-2010.json has already done preprocess
filtered_DeutscheBank-AnnualReport-2011.json has already done preprocess
filtered_DeutscheBank-AnnualReport-2012.json has already done preprocess
filtered_DeutscheBank-AnnualReport-2013.json has already done preprocess
filtered_DeutscheBank-AnnualReport-2014.json has already done preprocess
filtered_DeutscheBank-AnnualReport-2015.json has already done preprocess
filtered_DeutscheBank-AnnualReport-2016.json has already done preprocess
0.44272732734680176


In [53]:
# Arrange the Deutsche Bank TF-IDF matrix as a terms x reports table (one row
# per term, one column per report) so keywords can be looked up by label.

# Vocabulary in the column order of the fitted matrix. get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer_db, 'get_feature_names_out'):
    db_feature_names = list(vectorizer_db.get_feature_names_out())
else:
    db_feature_names = vectorizer_db.get_feature_names()

# Column labels for the seven reports; assumed to follow the order of the
# files listed in db_lemm_docs_prep.
db_corpus_index = [
    'DB-2010', 'DB-2011', 'DB-2012',
    'DB-2013', 'DB-2014', 'DB-2015',
    'DB-2016']

# Inverse document frequency of every term, aligned with db_feature_names.
idf = vectorizer_db.idf_

# Transpose so that terms become rows. toarray() gives a plain ndarray instead
# of the legacy np.matrix that todense() returns.
df_db = pd.DataFrame(tfidf_matrix_db.T.toarray(), index=db_feature_names, columns=db_corpus_index)

# Extra column with the per-term idf; used by the next cell to filter terms.
df_db['idf'] = idf

In [54]:
# Order the terms by their weight in the 2016 report, highest first.
df_db = df_db.sort_values(by='DB-2016', ascending=False)

# Show the ten strongest terms whose idf differs from 1. With the default
# smoothed idf used by TfidfVectorizer(), an idf of exactly 1 is assigned to
# terms that occur in every document, so those are left out of the display.
df_db.loc[df_db['idf'].ne(1)].head(10)

Unnamed: 0,DB-2010,DB-2011,DB-2012,DB-2013,DB-2014,DB-2015,DB-2016,idf
beizulegenden,0.0,0.203085,0.166018,0.131401,0.128794,0.131993,0.126773,1.133531
crrcrd,0.0,0.0,0.0,0.040167,0.075463,0.03819,0.04622,1.470004
zeitwerts,0.0,0.067357,0.050803,0.042236,0.040345,0.044699,0.042408,1.133531
pwcc,0.0,0.0,0.0,0.0,0.0,0.006433,0.037842,1.980829
ncou,0.0,0.0,0.042769,0.058109,0.041425,0.032856,0.035363,1.287682
gm,0.0,0.0,0.0,0.0,0.0,0.004595,0.030747,1.980829
vollumsetzung,0.0,0.0,0.0,0.003652,0.017608,0.018413,0.028668,1.470004
kernkapital,0.000517,0.0,0.004536,0.006101,0.020173,0.025242,0.025264,1.133531
cib,0.022902,0.021287,0.001546,0.0,0.0,0.004182,0.025113,1.287682
harte,0.0,0.0,0.0,0.0,0.0,0.024812,0.022863,1.980829


# Visualization

In [55]:
# Plotting dependencies for the visualization section. The aliases matter:
# the cells below call py.offline.iplot, go.Scatter and np.linspace.
import plotly as py
import plotly.graph_objs as go
import numpy as np

# Enable offline plotly rendering inside the notebook. Per plotly's
# documentation, connected=True loads plotly.js from a CDN instead of
# embedding the library in the notebook file.
py.offline.init_notebook_mode(connected=True)

## BMW

In [58]:
# Legend labels for the BMW keyword traces. The trace definitions pick labels
# by position, so the order of the entries must not change.
key = [
    'Husqvarna',         # 0
    'aktienbasierte',    # 1
    'Citroën',           # 2
    'electrification',   # 3
    'amsterdam',         # 4
    'Drivenow',          # 5
    'co2',               # 6
    'brexit',            # 7
    'HERE / Amsterdam',  # 8
    'there',             # 9
]

In [64]:
# TF-IDF weight per report year for each selected keyword (the y-values of the
# trend charts). The row is restricted to the year columns on purpose: `df`
# also carries the appended 'idf' column, which would otherwise end up as a
# ninth value and no longer line up with the eight x-axis years.
# Rows are addressed by the lower-cased term as it appears in the vocabulary.
y1 = df.loc['husqvarna', bmw_corpus_index].tolist()
y3 = df.loc['citroën', bmw_corpus_index].tolist()
y5 = df.loc['amsterdam', bmw_corpus_index].tolist()
y6 = df.loc['drivenow', bmw_corpus_index].tolist()
y7 = df.loc['co2', bmw_corpus_index].tolist()
y8 = df.loc['brexit', bmw_corpus_index].tolist()
y9 = df.loc['here', bmw_corpus_index].tolist()
y10 = df.loc['there', bmw_corpus_index].tolist()

In [61]:
# x-axis: the eight report years 2010..2017 as evenly spaced values.
years = np.linspace(2010, 2017, 8)

# One line+marker trace per plotted keyword; the legend text is taken from
# the matching position in `key`.
line1 = go.Scatter(name=key[0], x=years, y=y1, mode='lines+markers')
line3 = go.Scatter(name=key[2], x=years, y=y3, mode='lines+markers')

line6 = go.Scatter(name=key[5], x=years, y=y6, mode='lines+markers')
line7 = go.Scatter(name=key[6], x=years, y=y7, mode='lines+markers')
line8 = go.Scatter(name=key[7], x=years, y=y8, mode='lines+markers')
line9 = go.Scatter(name=key[8], x=years, y=y9, mode='lines+markers')

In [62]:
# Build and draw the three BMW charts: keywords charted as decreasing,
# keywords charted as increasing, and all of them together.
decay = [line1, line3]
increase = [line6, line7, line8, line9]
# Same six traces in the same order as listing them one by one.
total = decay + increase

# The plotted values are TF-IDF weights, not correlations, so the y-axis is
# labelled accordingly (the previous label read 'corelation').
layout_decay = dict(title='BMW:TF-IDF keywords trend - decrease',
                    xaxis=dict(title='years'),
                    yaxis=dict(title='TF-IDF weight'),
                    )
fig_decay = dict(data=decay, layout=layout_decay)

layout_increase = dict(title='BMW:TF-IDF keywords trend - increase',
                       xaxis=dict(title='years'),
                       yaxis=dict(title='TF-IDF weight'),
                       )
fig_increase = dict(data=increase, layout=layout_increase)

layout_total = dict(title='all topics trend of BMW',
                    xaxis=dict(title='years'),
                    yaxis=dict(title='TF-IDF weight'),
                    )
fig_total = dict(data=total, layout=layout_total)

# Render the three line charts inline in the notebook.
py.offline.iplot(fig_decay, filename='topic trend of BMW - decay')
py.offline.iplot(fig_increase, filename='topic trend of BMW - increase')
py.offline.iplot(fig_total, filename='all topics trend of BMW')

## Deutsche Bank

In [63]:
# Legend labels for the Deutsche Bank keywords, in two groups. Labels are
# picked by position, so the order of the entries must not change.

# Company / legal-matter related keywords.
key_co = [
    'Goodwill',
    'ABN AMRO',
    'Deutsche Bank National Trust Co.',
    'klage (lawsuit)',
    'gericht (judgement)',
    'court',
]

# Business-unit / risk-measure related keywords.
key_ser = [
    'PCAM',
    'special purpose entities',
    'CIB',
    'Incremental Risk Charge',
    'non-core operations unit',
    'exposure at defaut',
    'PWCC',
    'global market',
]

In [65]:
# TF-IDF weight per report year for each selected keyword (the y-values of the
# trend chart). The row is restricted to the year columns on purpose: `df_db`
# also carries the appended 'idf' column, which would otherwise end up as an
# eighth value and no longer line up with the seven x-axis years.
# Rows are addressed by the lower-cased term as it appears in the vocabulary.
y1 = df_db.loc['goodwill', db_corpus_index].tolist()
y2 = df_db.loc['amro', db_corpus_index].tolist()
y3 = df_db.loc['dbntc', db_corpus_index].tolist()
y4 = df_db.loc['klage', db_corpus_index].tolist()
y5 = df_db.loc['gericht', db_corpus_index].tolist()
y6 = df_db.loc['court', db_corpus_index].tolist()

In [66]:
# x-axis: the seven report years 2010..2016 as evenly spaced values.
years = np.linspace(2010, 2016, 7)

# One line+marker trace per plotted keyword; the legend text is taken from
# the matching position in `key_co`.
line1 = go.Scatter(name=key_co[0], x=years, y=y1, mode='lines+markers')
line2 = go.Scatter(name=key_co[1], x=years, y=y2, mode='lines+markers')
line3 = go.Scatter(name=key_co[2], x=years, y=y3, mode='lines+markers')

In [68]:
# Build and draw the Deutsche Bank keyword chart.
company = [line1, line2, line3]

# TODO: the traces for the key_ser keywords were referenced here as
# line7..line14, but line10..line14 are never created in this notebook (a
# NameError on a fresh kernel) and line7..line9 are left-over BMW traces.
# The group is kept as an empty list until those traces are actually built.
service = []

# The plotted values are TF-IDF weights, not correlations, so the y-axis is
# labelled accordingly (the previous label read 'corelation').
layout_company = dict(title='Deutsche Bank:TF-IDF keywords trend',
                      xaxis=dict(title='years'),
                      yaxis=dict(title='TF-IDF weight'),
                      )
fig_company = dict(data=company, layout=layout_company)

# Render the line chart inline in the notebook.
py.offline.iplot(fig_company, filename='Deutsche Bank:TF-IDF keywords trend')