# Sõnade arvutamine

Võtan tekstidest kõik sõnad ja võrdlen neid etteantud sõnadega, mis on seotud ESG'ga. Defineeritud sõnapaketi võtsin internetist.

In [145]:
import pandas as pd
import nltk
datacorp = pd.read_pickle('annualpickles/annualcorpuses.pkl')
datacorp["unigrams"] = datacorp["text"].apply(nltk.word_tokenize)
datacorp['quarters'] = datacorp.index
datacorp

Unnamed: 0,text,unigrams,quarters
annual_report_2019_luminor.txt,annual report luminor holding as consolidated...,"[annual, report, luminor, holding, as, consoli...",annual_report_2019_luminor.txt
lhv_annual_2019.txt,group annual indd as lhv group consolidated an...,"[group, annual, indd, as, lhv, group, consolid...",lhv_annual_2019.txt
seb_annual_report_2019.txt,annual reportannua l r e p o rt contents in br...,"[annual, reportannua, l, r, e, p, o, rt, conte...",seb_annual_report_2019.txt
swedannual2019.txt,annual and sustainability report financial in...,"[annual, and, sustainability, report, financia...",swedannual2019.txt


In [146]:
import json
import logging
from re import sub
from multiprocessing import cpu_count

import numpy as np

import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity

In [147]:
import logging

# logimine
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARNING)  # DEBUG # INFO

In [148]:
import nltk

# stoppsõnad
nltk.download('stopwords') 
stopwords = set(nltk.corpus.stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marek.keskull\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [149]:
#andmetest võtan andmepealkirjad ja tekstid nimetan dokumentideks

titles = [item for item in datacorp['quarters']]
documents = [item for item in datacorp['text']]


In [150]:
def preprocess(doc):
    # puhastamine ja tokeniseerimine
    doc = sub(r'<img[^<>]+(>|$)', " image_token ", doc)
    doc = sub(r'<[^<>]+(>|$)', " ", doc)
    doc = sub(r'\[img_assist[^]]*?\]', " ", doc)
    doc = sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token ", doc)
    return [token for token in simple_preprocess(doc, min_len=0, max_len=float("inf")) if token not in stopwords]

In [151]:
query_string = 'biodiversity'
query_Environment = 'Biodiversity Carbon Cleantech Clean Climate Coal Conservation Ecosystem Emission Energy Fuel Green Land Natural Pollution Renewable Resources Sustainability Sustainable Toxic Waste Water Accident Alcohol Anti-personnel Behavior Charity Community Controversial Controversy Discrimination Gambling Health Human capital Human rights Inclusion Injury Lab Munitions Opposition Pay Philanthropic Quality Responsible Advocacy Bribery Compensation Competitive Corruption Divestment Fraud GRI Independent Justice Stability Stewardship Transparency'
query_Social = 'Accident Alcohol Anti-personnel Behavior Charity Community Controversial Controversy Discrimination Gambling Health Human capital Human rights Inclusion Injury Lab Munitions Opposition Pay Philanthropic Quality Responsible'
query_Government = 'Advocacy Bribery Compensation Competitive Corruption Divestment Fraud GRI Independent Justice Stability Stewardship Transparency'


# Preprocess meetod
corpus = [preprocess(document) for document in documents]
query = preprocess(query_Environment)


In [152]:
%%time

# glove vektoripakk, siin on 400000 vektorit sees 
#https://nlp.stanford.edu/projects/glove/

if 'glove' not in locals():  
    glove = api.load("glove-wiki-gigaword-50")


#arvutame välja koosinuse sarnasused välja sõnavektorides
similarity_index = WordEmbeddingSimilarityIndex(glove)

Wall time: 0 ns


In [153]:
#ehitame TF-idf mudeli
def build_term(corpus, query):
    # The search query must be in the dictionary as well, in case the terms do not overlap with the documents (we still want similarity)
    dictionary = Dictionary(corpus+[query])
    tfidf = TfidfModel(dictionary=dictionary)
    # Create the term similarity matrix. 
    # The nonzero_limit enforces sparsity by limiting the number of non-zero terms in each column. 
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)  # , nonzero_limit=None)
    return similarity_matrix

In [154]:
tfidf_model = build_term(corpus, query)

In [155]:
def doc_similarity_scores(query,similarity_matrix):
    dictionary = Dictionary(corpus+[query])
    tfidf = TfidfModel(dictionary=dictionary)
    query_tf = tfidf[dictionary.doc2bow(query)]
    index = SoftCosineSimilarity(tfidf[[dictionary.doc2bow(document) for document in corpus]],similarity_matrix)
    doc_similarity_scores = index[query_tf]
    return doc_similarity_scores

In [156]:

document_sim_scores = doc_similarity_scores(query,tfidf_model)


In [157]:
#meetod sarnasusskooride sortimiseks
def sort_similarity_scores_by_document(doc_similarity_scores):
    sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
    #for idx in sorted_indexes[:15]:
        #print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {titles[idx]}')
        
    return sorted_indexes

In [158]:
sorted_sim_scores = sort_similarity_scores_by_document(document_sim_scores)


In [161]:
doc_similar_terms = []
max_results_per_doc = 30
for term in query:
    dictionary = Dictionary(corpus+[query])
    idx1 = dictionary.token2id[term]
    for document in corpus:
        results_this_doc = []
        for word in set(document):
            idx2 = dictionary.token2id[word]
            score = tfidf_model.matrix[idx1, idx2]
            if score > 0.0:
                results_this_doc.append((word, score))
        results_this_doc = sorted(results_this_doc, reverse=True, key=lambda x: x[1])  # sort results by score
        results_this_doc = results_this_doc[:min(len(results_this_doc), max_results_per_doc)]  # take the top results
        doc_similar_terms.append(results_this_doc)

In [163]:
#esimese 15 dokumendi tulemused
for idx in s[:15]:
    similar_terms_string = ', '.join([result[0] for result in doc_similar_terms[idx]])
    print(f'{idx} \t {document_sim_scores[idx]:0.3f} \t {titles[idx]}')

1 	 0.432 	 lhv_annual_2019.txt
2 	 0.487 	 seb_annual_report_2019.txt
3 	 0.518 	 swedannual2019.txt
0 	 0.462 	 annual_report_2019_luminor.txt


In [164]:
#esimese 15 dokumendi tulemused koos sarnaste sõnadega
results = []
for idx in s[:15]:
    similar_terms_string = ', '.join([result[0] for result in doc_similar_terms[idx]])
    results.append([idx,document_sim_scores[idx],titles[idx],similar_terms_string])
    print(f'{idx} \t {document_sim_scores[idx]:0.3f} \t {titles[idx]}  :  {similar_terms_string}')

1 	 0.432 	 lhv_annual_2019.txt  :  biodiversity, conservation, ecological, sustainable, sustainability, preservation, diversity, environmental, resource, climate, impacts, environment, depletion, resources, natural, stewardship, development, footprint, awareness, mitigation, global, nature, agricultural, prevention, impact
2 	 0.487 	 seb_annual_report_2019.txt  :  ecosystem, conservation, ecosystems, sustainable, sustainability, diversity, environmental, resource, climate, impacts, environment, aquatic, resources, natural, forest, development, footprint, conducive, awareness, mitigation, global, nature, prosperity, prevention, pollution, impact, safeguarding
3 	 0.518 	 swedannual2019.txt  :  conservation, sustainable, sustainability, diversity, environmental, resource, climate, impacts, environment, resources, deforestation, natural, forest, preparedness, development, footprint, awareness, mitigation, global, nature, environments, agroforestry, prosperity, prevention, impact, safegu

In [169]:
df = pd.DataFrame(results, columns=["Index of document", "Similarity score","Document name","Most similar words"])

In [168]:
df

Unnamed: 0,Index,Similarity score,Document name,Most similar words
0,1,0.431862,lhv_annual_2019.txt,"biodiversity, conservation, ecological, sustai..."
1,2,0.486714,seb_annual_report_2019.txt,"ecosystem, conservation, ecosystems, sustainab..."
2,3,0.518059,swedannual2019.txt,"conservation, sustainable, sustainability, div..."
3,0,0.461859,annual_report_2019_luminor.txt,"ecological, sustainable, sustainability, diver..."


In [170]:
import pickle
df.to_pickle('annualresults/2019banksimilarityscores.pkl')


In [143]:
datacorpswed = pd.read_pickle('resultdata/swedsimilarityscores.pkl')
datacorpswed

Unnamed: 0,Order of similarity,Similarity score,Document name,Most similar words
0,1,0.264879,swedbankQ2_20_eng.txt,"critical, fax, bank"
1,2,0.267326,swedbankQ3_20_eng.txt,"critical, fax, bank"


In [144]:
datacorplhv = pd.read_pickle('resultdata/lhvsimilarityscores.pkl')
datacorplhv

Unnamed: 0,Order of similarity,Similarity score,Document name,Most similar words
0,1,0.261196,lhv2020_q1_en_eur_con_00.txt,"note, collection, operation"
1,2,0.040926,lhv2020_q2_en_eur_00_00.txt,
2,3,0.267063,lhv2020_q2_en_eur_con_00.txt,"note, collection"
3,0,0.040926,lhv2020_q1_en_eur_00_00.txt,
