# Sõnade arvutamine

Võtan tekstidest kõik sõnad ja võrdlen neid etteantud sõnadega, mis on seotud ESG'ga. Defineeritud sõnapaketi võtsin internetist.

In [936]:
import pandas as pd
import nltk
datacorp = pd.read_pickle('pickles/lhv2016t2021.pkl')
datacorp['document'] = datacorp.index

query_G = 'Audit and control, Board structure, Remuneration, Shareholder rights, Transparency and Performance'
query_S = 'Access to medicines, HIV, AIDs, Nutrition, Product safety, Community relations, Privacy and free expression, Security, Weak, governance zones, Diversity, Health and safety, ILO core conventions, Supply chain labor standards, Bribery and corruption, Political influence, Responsible marketing, Whistle-blowing systems, disclosure and reporting, Governance of sustainability issues, Stakeholder engagement, UNGC compliance'
query_E = 'Biofuels, Climate ,Emissions ,land, Biodiversity, Water, Environmental, standards, Pollution, Supply, Waste, recycling'


In [937]:
import json
import logging
from re import sub
from multiprocessing import cpu_count

import numpy as np

import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity

In [938]:
import logging

# logimine
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARNING)  # DEBUG # INFO

In [939]:
import nltk

# stoppsõnad
nltk.download('stopwords') 


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marek.keskull\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [988]:
#andmetest võtan andmepealkirjad ja tekstid nimetan dokumentideks

titles = [item for item in datacorp['document']]
documents = [item for item in datacorp['text']]


print(f'{len(documents)} documents')

25 documents


In [989]:
def preprocess(doc):
    doc = doc.lower().split()
    stop_words = stopwords.words('english')
    doc = [w for w in doc if w not in stop_words]
    return doc

In [990]:
import re 
import string
def preprocess_query(doc):
    doc = doc.lower().split()
    doc = [remove_punc(i) for i in doc]
    stop_words = stopwords.words('english')
    doc = [w for w in doc if w not in stop_words]
    return doc

In [991]:
def remove_punc(string):
    punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~'''
    for ele in string:  
        if ele in punc:  
            string = string.replace(ele, "") 
    return string

In [992]:
corpus = [preprocess(document) for document in documents]

query = preprocess_query(query_G)
query

['audit',
 'control',
 'board',
 'structure',
 'remuneration',
 'shareholder',
 'rights',
 'transparency',
 'performance']

In [993]:
%%time
#pre-trained embeddings
# glove vektoripakk, siin on 400000 vektorit sees 
#https://nlp.stanford.edu/projects/glove/

if 'glove' not in locals():  
    glove = api.load("glove-wiki-gigaword-50")

#print(glove.most_similar("remuneration"))

#A term similarity index that computes cosine similarities between word embeddings.
#1) Compute cosine similarities between word embeddings.
#2) Retrieve the closest word embeddings (by cosine similarity) to a given word embedding.

similarity_index = WordEmbeddingSimilarityIndex(glove)

Wall time: 0 ns


In [994]:
dictionary = Dictionary(corpus+[query])
print(dictionary.get(8099))

None


In [995]:
#ehitame TF-idf mudeli
    
#kõigepealt ehitame valmis andmesõnastiku, kus on sees kõik dokumendi sõnad ja otsingupäringu sõnad vormis: 'võti':'sõna'
dictionary = Dictionary(corpus+[query])
print(dictionary.get(2979))

#This module implements functionality related to the 
#Term Frequency - Inverse Document Frequency vector space bag-of-words models.
tfidf = TfidfModel(dictionary=dictionary)
    
    
#Builds a sparse term similarity matrix using a term similarity index. 
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf, nonzero_limit=100) 


shareholder


In [996]:
dictionary = Dictionary(corpus+[query])
tfidf = TfidfModel(dictionary=dictionary)
print(dictionary.doc2bow(query))
print()
print(tfidf.__getitem__(dictionary.doc2bow(query),eps=0))




[(261, 1), (351, 1), (719, 1), (2376, 1), (2756, 1), (2859, 1), (2979, 1), (3132, 1), (3351, 1)]

[(261, 0.7095646935780332), (719, 0.4084655003777961), (2859, 0.574172344012343)]


In [997]:

query_tf = tfidf[dictionary.doc2bow(query)]
print(query_tf)

#Compute soft cosine similarity against a corpus of documents by storing the index matrix in memory.
index = SoftCosineSimilarity(tfidf[[dictionary.doc2bow(document) for document in corpus]],similarity_matrix)

doc_similarity_scores = index[query_tf]
print(doc_similarity_scores)


[(261, 0.7095646935780332), (719, 0.4084655003777961), (2859, 0.574172344012343)]
[1.         0.94946295 0.91611385 0.87647414 0.7799178  1.
 0.77677476 0.7951516  0.96929705 0.7595645  1.         1.
 0.91914254 0.944202   1.         1.         0.69289446 0.9065589
 0.9115491  1.         1.         0.8632685  0.9852923  0.7073752
 0.8370332 ]


In [998]:

sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
for idx in sorted_indexes[:30]:
    print(f'{idx} \t {doc_similarity_scores[idx]:0.6f} \t {titles[idx]}')
        


0 	 1.000000 	 2016_ar_en_eur_con_00.txt
11 	 1.000000 	 2018_q1_en_eur_con_00.txt
20 	 1.000000 	 2020_ar_en_eur_con_00.txt
19 	 1.000000 	 2019_q4_en_eur_con_00.pdf.txt
5 	 1.000000 	 2017_ar_en_eur_con_00.txt
10 	 1.000000 	 2018_ar_en_eur_con_00.pdf.txt
15 	 1.000000 	 2019_ar_en_eur_con_00.txt
14 	 1.000000 	 2018_q4_en_eur_con_00.txt
22 	 0.985292 	 2020_q2_en_eur_con_00.txt
8 	 0.969297 	 2017_q3_en_eur_con_00.txt
1 	 0.949463 	 2016_q1_en_eur_con_00.txt
13 	 0.944202 	 2018_q3_en_eur_con_00.txt
12 	 0.919143 	 2018_q2_en_eur_con_00.txt
2 	 0.916114 	 2016_q2_en_eur_con_00.txt
18 	 0.911549 	 2019_q3_en_eur_con_00.txt
17 	 0.906559 	 2019_q2_en_eur_con_00.txt
3 	 0.876474 	 2016_q3_en_eur_con_00.txt
21 	 0.863268 	 2020_q1_en_eur_con_00.txt
24 	 0.837033 	 2020_q4_en_eur_con_00.txt
7 	 0.795152 	 2017_q2_en_eur_con_00.txt
4 	 0.779918 	 2016_q4_en_eur_con_00.txt
6 	 0.776775 	 2017_q1_en_eur_con_00.txt
9 	 0.759565 	 2017_q4_en_eur_con_00.txt
23 	 0.707375 	 2020_q3_en_eur_con_0

In [983]:

doc_similar_terms = []
max_results_per_doc = 30
#query = ['audit', 'control', 'board', 'structure', 'remuneration', 'shareholder', 'rights', 'transparency', 'performance']
for term in query:
    #dictionary = Dictionary(corpus+[query])
    #dictionary is query + my corpus(which has 25 documents)

    idx1 = dictionary.token2id[term]
    for document in corpus:
        #print(document.name)
        results_this_doc = []
        for word in set(document):
            idx2 = dictionary.token2id[word]
            score = similarity_matrix.matrix[idx1, idx2]
            if score > 0.0:
                results_this_doc.append((word, score))
                
        results_this_doc = sorted(results_this_doc, reverse=True, key=lambda x: x[1])  
        results_this_doc = results_this_doc[:min(len(results_this_doc), max_results_per_doc)] 
        doc_similar_terms.append(results_this_doc)
     


In [984]:
#esimese 15 dokumendi tulemused koos sarnaste sõnadega
results = []
for idx in sorted_indexes[:45]:
    
    similar_terms_string = ', '.join([result[0] for result in doc_similar_terms[idx]])
    results.append([idx,doc_similarity_scores[idx],titles[idx],similar_terms_string])
    print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {titles[idx]}  :  {similar_terms_string}')

20 	 0.568 	 2020_ar_en_eur_con_00.txt  :  access, users, secure, enabling, enable, link, services, internet, accessible, travel, enables, service, sharing, permit, maintains, operate, free, connect, applications, facilities, application, network, communication, restrict, provider, offers, monitor, via, controls, supports
10 	 0.564 	 2018_ar_en_eur_con_00.pdf.txt  :  access, providers, users, secure, enabling, enable, link, services, internet, accessible, connections, sites, travel, enables, service, sharing, operate, free, user, facilities, web, application, network, communication, restrict, provider, search, facilitate, offers, monitor
15 	 0.553 	 2019_ar_en_eur_con_00.txt  :  access, providers, users, secure, enabling, enable, link, services, internet, accessible, enables, service, sharing, maintains, operate, free, connect, enabled, applications, facilities, web, application, network, communication, restrict, provider, search, offers, monitor, via
24 	 0.549 	 2020_q4_en_eur_con_

In [985]:
df = pd.DataFrame(results, columns=["Index of document", "Similarity score in social topic","Document name","Most similar words in social topic"])

In [986]:
df

Unnamed: 0,Index of document,Similarity score in social topic,Document name,Most similar words in social topic
0,20,0.568225,2020_ar_en_eur_con_00.txt,"access, users, secure, enabling, enable, link,..."
1,10,0.563884,2018_ar_en_eur_con_00.pdf.txt,"access, providers, users, secure, enabling, en..."
2,15,0.552659,2019_ar_en_eur_con_00.txt,"access, providers, users, secure, enabling, en..."
3,24,0.549176,2020_q4_en_eur_con_00.txt,"access, services, enables, service, operate, f..."
4,5,0.513502,2017_ar_en_eur_con_00.txt,"access, providers, users, secure, enabling, en..."
5,18,0.509382,2019_q3_en_eur_con_00.txt,"access, providers, services, enables, service,..."
6,23,0.504362,2020_q3_en_eur_con_00.txt,"access, users, services, travel, service, free..."
7,13,0.503856,2018_q3_en_eur_con_00.txt,"access, providers, services, internet, enables..."
8,14,0.501616,2018_q4_en_eur_con_00.txt,"access, services, enables, service, free, comm..."
9,22,0.483542,2020_q2_en_eur_con_00.txt,"access, secure, services, internet, service, f..."


In [987]:
import pickle
df.to_pickle('Socialresults/LHV.pkl')


In [143]:
datacorpswed = pd.read_pickle('resultdata/swedsimilarityscores.pkl')
datacorpswed

Unnamed: 0,Order of similarity,Similarity score,Document name,Most similar words
0,1,0.264879,swedbankQ2_20_eng.txt,"critical, fax, bank"
1,2,0.267326,swedbankQ3_20_eng.txt,"critical, fax, bank"


In [144]:
datacorplhv = pd.read_pickle('resultdata/lhvsimilarityscores.pkl')
datacorplhv

Unnamed: 0,Order of similarity,Similarity score,Document name,Most similar words
0,1,0.261196,lhv2020_q1_en_eur_con_00.txt,"note, collection, operation"
1,2,0.040926,lhv2020_q2_en_eur_00_00.txt,
2,3,0.267063,lhv2020_q2_en_eur_con_00.txt,"note, collection"
3,0,0.040926,lhv2020_q1_en_eur_00_00.txt,


In [6]:
import pickle
databank = pd.read_pickle('annualresults/2019banksimilarityscores.pkl')
databank

Unnamed: 0,Index of document,Similarity score,Document name,Most similar words
0,1,0.431862,lhv_annual_2019.txt,"biodiversity, conservation, ecological, sustai..."
1,2,0.486714,seb_annual_report_2019.txt,"ecosystem, conservation, ecosystems, sustainab..."
2,3,0.518059,swedannual2019.txt,"conservation, sustainable, sustainability, div..."
3,0,0.461859,annual_report_2019_luminor.txt,"ecological, sustainable, sustainability, diver..."


In [27]:
#valideerimine näiteks balti pankade uuring: https://www.estwatch.ee/wp-content/uploads/2020/02/Vastutustundlikkus-Eesti-panganduses-Estwatch.pdf

In [195]:
from docsim import DocSim
import docsim

In [196]:
%%time

docsim_obj = docsim.DocSim(verbose=True)
# docsim_obj = docsim.DocSim_threaded(verbose=True)

Loading default GloVe word vector model: glove-wiki-gigaword-50
Model loaded
Wall time: 28.1 s


In [197]:
print(f'Model ready: {docsim_obj.model_ready}')

Model ready: True


In [198]:

titles = [item for item in datacorp['document']]
documents = [item for item in datacorp['text']]

print(f'{len(documents)} documents')

query_string = 'Audit and control, Board structure, Remuneration, Shareholder rights, Transparency and Performance'

6 documents


In [199]:
%%time

similarities = docsim_obj.similarity_query(query_string, documents)


6 documents loaded into corpus
Wall time: 36.3 s


  Y = np.multiply(Y, 1 / np.sqrt(Y_norm))
  Y = np.multiply(Y, 1 / np.sqrt(Y_norm))


In [200]:
for idx, score in (sorted(enumerate(similarities), reverse=True, key=lambda x: x[1])[:15]):
    print(f'{idx} \t {score:0.3f} \t {titles[idx]}')

3 	 0.547 	 lhv2020_q2_en_eur_con_00.txt
5 	 0.535 	 lhv2020_q3_en_eur_con_00.txt
1 	 0.524 	 lhv2020_q1_en_eur_con_00.txt
4 	 0.355 	 lhv2020_q3_en_eur_00_00.txt
0 	 0.000 	 lhv2020_q1_en_eur_00_00.txt
2 	 0.000 	 lhv2020_q2_en_eur_00_00.txt


[0.0, 0.5235986709594727, 0.0, 0.5465497970581055, 0.3552752733230591, 0.5345579385757446]
