# TF/IDF keywords trend

In [22]:
import spacy
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.de.stop_words import STOP_WORDS
import pandas as pd
from sklearn.cluster import KMeans
import time

In [23]:
# Execute the project's helper scripts and the preprocessing notebook from
# inside this notebook session.
# NOTE(review): get_clean_data(), called in later cells, is presumably
# defined by one of these three files -- confirm which one.
%run src/file_utils.py
%run src/configuration.py
%run "load_and_prepro_document.ipynb"

# BMW 2010-2017

In [24]:
# File names of the BMW annual reports, one per year from 2010 through 2017
bmw_lemm_docs_prep = [
    'BMW-AnnualReport-{}.json'.format(year) for year in range(2010, 2018)
]

In [25]:
# Identity hook intended to be handed to the TF-IDF vectorizer so that it
# leaves the input alone instead of preprocessing and splitting it itself
def preProcess(s):
    """Return ``s`` exactly as it was passed in.

    NOTE(review): none of the TfidfVectorizer() calls visible in this
    notebook pass this function -- confirm whether it is still needed.
    """
    unchanged = s
    return unchanged

In [26]:
# Load the BMW report texts via get_clean_data; per the original note this
# step removes stop words and other meaningless characters. The call is
# unpacked into two values: the documents and their names.
bmw_doc, bmw_name = get_clean_data(bmw_lemm_docs_prep)

# Fit TF-IDF on the BMW documents and produce the tfidf matrix
vectorizer_bmw = TfidfVectorizer()
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments the way
# time.time() can.
start_time = time.perf_counter()
tfidf_matrix_bmw = vectorizer_bmw.fit_transform(bmw_doc)
# Elapsed seconds spent in fit_transform
print(time.perf_counter() - start_time)

filtered_BMW-AnnualReport-2010.json has already done preprocess
filtered_BMW-AnnualReport-2011.json has already done preprocess
filtered_BMW-AnnualReport-2012.json has already done preprocess
filtered_BMW-AnnualReport-2013.json has already done preprocess
filtered_BMW-AnnualReport-2014.json has already done preprocess
filtered_BMW-AnnualReport-2015.json has already done preprocess
filtered_BMW-AnnualReport-2016.json has already done preprocess
filtered_BMW-AnnualReport-2017.json has already done preprocess
0.2378082275390625


In [27]:
# Show the TF-IDF result as a pandas table: one row per term, one column
# per BMW report, plus a trailing 'idf' column.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer_bmw, 'get_feature_names_out'):
    bmw_feature_names = list(vectorizer_bmw.get_feature_names_out())
else:
    bmw_feature_names = vectorizer_bmw.get_feature_names()

# Column labels, one per report.
# NOTE(review): assumes the rows of tfidf_matrix_bmw follow the order of
# bmw_lemm_docs_prep (2010..2017) -- confirm get_clean_data keeps that order.
bmw_corpus_index = [
    'BMW-2010', 'BMW-2011', 'BMW-2012',
    'BMW-2013', 'BMW-2014', 'BMW-2015',
    'BMW-2016', 'BMW-2017',
]
idf = vectorizer_bmw.idf_

# Transpose so terms become rows; toarray() yields a plain ndarray instead
# of the np.matrix that todense() returns.
df = pd.DataFrame(
    tfidf_matrix_bmw.T.toarray(),
    index=bmw_feature_names,
    columns=bmw_corpus_index,
)
# Append each term's inverse document frequency as an extra column
df['idf'] = idf

# Deutsche Bank 2010-2016

In [28]:
# File names of the Deutsche Bank annual reports, one per year from 2010
# through 2016
db_lemm_docs_prep = [
    'DeutscheBank-AnnualReport-{}.json'.format(year)
    for year in range(2010, 2017)
]

In [29]:
# Load the Deutsche Bank report texts via get_clean_data; per the original
# note this step removes stop words and other meaningless characters. The
# call is unpacked into two values: the documents and their names.
db_doc, db_name = get_clean_data(db_lemm_docs_prep)

# Fit TF-IDF on the Deutsche Bank documents and produce the tfidf matrix
vectorizer_db = TfidfVectorizer()
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments the way
# time.time() can.
start_time = time.perf_counter()
tfidf_matrix_db = vectorizer_db.fit_transform(db_doc)
# Elapsed seconds spent in fit_transform
print(time.perf_counter() - start_time)

filtered_DeutscheBank-AnnualReport-2010.json has already done preprocess
filtered_DeutscheBank-AnnualReport-2011.json has already done preprocess
filtered_DeutscheBank-AnnualReport-2012.json has already done preprocess
filtered_DeutscheBank-AnnualReport-2013.json has already done preprocess
filtered_DeutscheBank-AnnualReport-2014.json has already done preprocess
filtered_DeutscheBank-AnnualReport-2015.json has already done preprocess
filtered_DeutscheBank-AnnualReport-2016.json has already done preprocess
0.4333820343017578


In [30]:
# Show the TF-IDF result as a pandas table: one row per term, one column
# per Deutsche Bank report, plus a trailing 'idf' column.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer_db, 'get_feature_names_out'):
    db_feature_names = list(vectorizer_db.get_feature_names_out())
else:
    db_feature_names = vectorizer_db.get_feature_names()

# Column labels, one per report.
# NOTE(review): assumes the rows of tfidf_matrix_db follow the order of
# db_lemm_docs_prep (2010..2016) -- confirm get_clean_data keeps that order.
db_corpus_index = [
    'DB-2010', 'DB-2011', 'DB-2012',
    'DB-2013', 'DB-2014', 'DB-2015',
    'DB-2016',
]
idf = vectorizer_db.idf_

# Transpose so terms become rows; toarray() yields a plain ndarray instead
# of the np.matrix that todense() returns.
df_db = pd.DataFrame(
    tfidf_matrix_db.T.toarray(),
    index=db_feature_names,
    columns=db_corpus_index,
)
# Append each term's inverse document frequency as an extra column
df_db['idf'] = idf

# Visualization

In [31]:
import plotly as py
import plotly.graph_objs as go
import numpy as np

# Put plotly into offline notebook mode before the py.offline.iplot calls
# further down.
# NOTE(review): connected=True presumably loads plotly.js from the CDN
# instead of embedding it in the notebook -- confirm against the plotly docs.
py.offline.init_notebook_mode(connected=True)

## BMW

In [32]:
# Display labels for the keywords picked from the BMW annual reports.
# Later cells select entries by position, so the order must not change.
key = [
    'Husqvarna',         # 0 - label of line1
    'aktienbasierte',    # 1
    'Citroën',           # 2 - label of line3
    'electrification',   # 3
    'amsterdam',         # 4
    'Drivenow',          # 5 - label of line6
    'co2',               # 6 - label of line7
    'brexit',            # 7 - label of line8
    'HERE / Amsterdam',  # 8 - label of line9
    'there',             # 9
]

In [33]:
# TF-IDF weight of each chosen term per BMW report (y-values of the charts).
# df also carries an extra 'idf' column after the per-report columns, so
# select only the columns listed in bmw_corpus_index; otherwise every list
# would end with the term's idf value and be one entry longer than the
# x-axis years.
y1 = df.loc['husqvarna', bmw_corpus_index].tolist()
y3 = df.loc['citroën', bmw_corpus_index].tolist()
# NOTE(review): y5 and y10 are not plotted in the cells visible here
y5 = df.loc['amsterdam', bmw_corpus_index].tolist()
y6 = df.loc['drivenow', bmw_corpus_index].tolist()
y7 = df.loc['co2', bmw_corpus_index].tolist()
y8 = df.loc['brexit', bmw_corpus_index].tolist()
y9 = df.loc['here', bmw_corpus_index].tolist()
y10 = df.loc['there', bmw_corpus_index].tolist()

In [34]:
# x-axis: 8 evenly spaced values covering the years 2010 through 2017
years = np.linspace(start=2010, stop=2017, num=8)

# One lines+markers trace per plotted keyword; each pair is the TF-IDF
# series and the label shown in the legend
line1, line3, line6, line7, line8, line9 = (
    go.Scatter(x=years, y=values, mode='lines+markers', name=label)
    for values, label in (
        (y1, key[0]),
        (y3, key[2]),
        (y6, key[5]),
        (y7, key[6]),
        (y8, key[7]),
        (y9, key[8]),
    )
)

In [35]:
# Split the BMW traces into the two charts they are shown in
decay = [line1, line3]
increase = [line6, line7, line8, line9]

# Chart titled "decrease": main title and x-axis title
layout_decay = {
    'title': 'BMW:TF-IDF keywords trend - decrease',
    'xaxis': {'title': 'years'},
}
fig_decay = {'data': decay, 'layout': layout_decay}

# Chart titled "increase": main title and x-axis title
layout_increase = {
    'title': 'BMW:TF-IDF keywords trend - increase',
    'xaxis': {'title': 'years'},
}
fig_increase = {'data': increase, 'layout': layout_increase}

# Render both line charts inline, "decrease" first
py.offline.iplot(fig_decay, filename='BMW:TF-IDF keywords trend - decrease')
py.offline.iplot(fig_increase, filename='BMW:TF-IDF keywords trend - increase')

## Deutsche Bank

In [36]:
# Display labels for the keywords picked from the Deutsche Bank annual
# reports; later cells select entries by position
key_co = [
    'Goodwill',
    'ABN AMRO',
    'Deutsche Bank National Trust Co.',
]

In [37]:
# TF-IDF weight of each chosen term per Deutsche Bank report (y-values of
# the chart). df_db also carries an extra 'idf' column after the per-report
# columns, so select only the columns listed in db_corpus_index; otherwise
# every list would end with the term's idf value and be one entry longer
# than the x-axis years.
y1 = df_db.loc['goodwill', db_corpus_index].tolist()
y2 = df_db.loc['amro', db_corpus_index].tolist()
y3 = df_db.loc['dbntc', db_corpus_index].tolist()

In [38]:
# x-axis: 7 evenly spaced values covering the years 2010 through 2016
years = np.linspace(start=2010, stop=2016, num=7)

# One lines+markers trace per keyword; each pair is the TF-IDF series and
# the label shown in the legend
line1, line2, line3 = (
    go.Scatter(x=years, y=values, mode='lines+markers', name=label)
    for values, label in (
        (y1, key_co[0]),
        (y2, key_co[1]),
        (y3, key_co[2]),
    )
)

In [39]:
# All Deutsche Bank traces go into a single chart
company = [line1, line2, line3]

# Main title and x-axis title of the Deutsche Bank chart
layout_company = {
    'title': 'Deutsche Bank:TF-IDF keywords trend',
    'xaxis': {'title': 'years'},
}
fig_company = {'data': company, 'layout': layout_company}

# Render the line chart inline
py.offline.iplot(fig_company, filename='Deutsche Bank:TF-IDF keywords trend')