# Sõnade arvutamine

Võtan tekstidest kõik sõnad ja võrdlen neid etteantud sõnadega, mis on seotud ESG-ga. Defineeritud sõnapaketi võtsin internetist.

In [None]:
import pandas as pd
import nltk
# Load the pre-built corpus DataFrame.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
datacorp = pd.read_pickle('pickles/Seb2016t2020.pkl')
# Copy the index into a regular column; it is used further down as the document title.
datacorp['document'] = datacorp.index

# Comma-separated keyword lists used as search queries
# (presumably G = Governance, S = Social, E = Environmental — confirm).
query_G = 'Audit and control, Board structure, Remuneration, Shareholder rights, Transparency and Performance'
query_S = 'Access to medicines, HIV, AIDs, Nutrition, Product safety, Community relations, Privacy and free expression, Security, Weak, governance zones, Diversity, Health and safety, ILO core conventions, Supply chain labor standards, Bribery and corruption, Political influence, Responsible marketing, Whistle-blowing systems, disclosure and reporting, Governance of sustainability issues, Stakeholder engagement, UNGC compliance'
query_E = 'Biofuels, Climate ,Emissions ,land, Biodiversity, Water, Environmental, standards, Pollution, Supply, Waste, recycling'
# Display the loaded DataFrame as the cell output.
datacorp

In [None]:
import json
import logging
from re import sub
from multiprocessing import cpu_count

import numpy as np

import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity

In [None]:
import logging

# Logging setup. With level=WARNING, logging.info(...) messages are not shown;
# switch the level to INFO or DEBUG to see them.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARNING)  # alternatives: DEBUG, INFO

In [None]:
import nltk

# Stop words: fetch NLTK's stop-word corpus and load the English word list.
nltk.download('stopwords') 
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
# Take the document titles from the data; the texts are referred to as documents.

titles = list(datacorp['document'])
documents = list(datacorp['text'])


print(f'{len(documents)} documents')

In [None]:
def preprocess(doc):
    """Tokenize a document: lower-case, split on whitespace, drop stop words.

    Args:
        doc: Document text as a string.

    Returns:
        List of lower-cased tokens, in their original order, that are not in
        the module-level ``stopwords`` collection.
    """
    logging.info( 'tokenizing' )
    # Build a set once per call: membership tests on the stop-word list would
    # otherwise scan the whole list for every token of the document.
    stop_set = set(stopwords)
    # NOTE(review): unlike preprocess_query, punctuation is not stripped here,
    # so 'audit,' and 'audit' stay different tokens — confirm this is intended.
    return [token for token in doc.lower().split() if token not in stop_set]

In [None]:
import re 
import string
def preprocess_query(doc):
    """Tokenize a query string into clean, lower-cased search terms.

    The text is lower-cased and split on whitespace, punctuation is removed
    from every token with ``remove_punc``, and stop words are dropped.

    Args:
        doc: Query text as a string (e.g. a comma-separated keyword list).

    Returns:
        List of non-empty tokens, in their original order, that are not in
        the module-level ``stopwords`` collection.
    """
    logging.info( 'tokenizing' )
    stop_set = set(stopwords)
    stripped = (remove_punc(raw) for raw in doc.lower().split())
    # A token made only of punctuation (e.g. a lone '-') becomes '' after
    # stripping; drop it so empty strings never end up as query terms.
    return [token for token in stripped if token and token not in stop_set]

In [None]:
# Translation table deleting every character that counts as punctuation here
# (note that the blank space is part of the set). The raw string keeps the
# backslash literal without relying on an invalid '\,' escape sequence.
_PUNC_TABLE = str.maketrans('', '', r'''!()-[]{};:'"\, <>./?@#$%^&*_~''')


def remove_punc(string):
    """Return ``string`` with all punctuation characters (and spaces) removed.

    Args:
        string: Text to clean.

    Returns:
        A copy of ``string`` without any of the characters
        ``!()-[]{};:'"\\, <>./?@#$%^&*_~``; all other characters are kept.
    """
    # One pass over the text instead of one .replace() call per character.
    return string.translate(_PUNC_TABLE)

In [None]:
# Tokenize every document, then the governance query.
corpus = list(map(preprocess, documents))

query = preprocess_query(query_G)
print(query)

In [None]:
%%time
# Pre-trained embeddings:
# GloVe vector package; it contains 400,000 vectors.
# https://nlp.stanford.edu/projects/glove/

# Load the vectors only when `glove` is not already defined, so re-running the
# cell does not load them again.
if 'glove' not in locals():  
    glove = api.load("glove-wiki-gigaword-50")

#print(glove.most_similar("remuneration"))

# A term similarity index that computes cosine similarities between word embeddings:
# 1) Compute cosine similarities between word embeddings.
# 2) Retrieve the closest word embeddings (by cosine similarity) to a given word embedding.

similarity_index = WordEmbeddingSimilarityIndex(glove)


In [None]:
#vecs = similarity_index.keyedvectors


In [None]:
#from vec2graph import visualize

#visualize(r'C:\Users\marek.keskull\Documents\GitHub\NLP\Vizualization', vecs, 'audit')

In [None]:
#glove vektorite koosinussarnasus indeksi näide
#most_similar = similarity_index.keyedvectors.most_similar(positive=['water'], topn=10)
#most_similar

In [None]:
# Build the TF-IDF model.

# First build the dictionary that holds every word of the documents and of the
# search query, in the form 'key': 'word'.
logging.info( 'building dictionary' )
token_lists = corpus + [query]
dictionary = Dictionary(token_lists)

# Term Frequency - Inverse Document Frequency weighting for the
# bag-of-words vector space, fitted on the dictionary statistics.
tfidf = TfidfModel(dictionary=dictionary)


# Build a sparse term similarity matrix from the term similarity index.
similarity_matrix = SparseTermSimilarityMatrix(
    similarity_index,
    dictionary=dictionary,
    tfidf=tfidf,
    nonzero_limit=100,
)


In [None]:
#dictionary = Dictionary(corpus+[query])
#print(dictionary[429])
#basedir =r"C:\Users\marek.keskull\Documents\GitHub\NLP\Vizualization"
#logging.info( 'saving dictionary' )
#dictFile = basedir + '.dict'
#dictionary.save_as_text(dictFile, sort_by_word=True)

In [None]:
#sparse maatriksi kasutamise näide
#similarity_matrix.inner_product(dictionary.doc2bow(query),dictionary.doc2bow(corpus[7]))
#similarity_matrix.matrix.todense()

In [None]:
# Number of entries actually stored in the sparse term-similarity matrix.
similarity_matrix.matrix.nnz

In [None]:
# Entry count of a full term-by-term matrix, for comparison with similarity_matrix.matrix.nnz.
len(dictionary)**2

In [None]:

#for document in tfidf[[dictionary.doc2bow(document) for document in corpus]]:
 #   print([[dictionary[id], np.around(freq, decimals=2)] for id, freq in document])

In [None]:
#print(dictionary.token2id)

In [None]:
# Compute soft cosine similarity against the corpus; the index keeps the
# TF-IDF-weighted documents in memory.
index = SoftCosineSimilarity(tfidf[[dictionary.doc2bow(document) for document in corpus]],similarity_matrix)

# Query through index[...]: gensim documents get_similarities() as a method
# that should not be called directly.
# NOTE(review): the documents are TF-IDF weighted but the query is a raw
# bag-of-words vector — confirm whether the query should go through tfidf too.
doc_similarity_scores = index[dictionary.doc2bow(query)]
doc_similarity_scores


In [None]:
# Document numbers ordered from the highest to the lowest similarity score.
sorted_indexes = np.argsort(doc_similarity_scores)[::-1]

# One table row per document, in ranking order.
d = [
    {
        'Document no': idx,
        'Similarity score with query': doc_similarity_scores[idx],
        'Document name': titles[idx],
    }
    for idx in sorted_indexes
]

final = pd.DataFrame(d)
final

In [None]:

# For every document, collect the words that are most similar to the query.
#
# doc_similar_terms holds exactly one entry per document, in corpus order, so
# it can be indexed by document number. Each entry is a list of (word, score)
# pairs sorted by descending score, where score is the highest term-similarity
# between the word and any of the query terms.
doc_similar_terms = []
max_results_per_doc = 30
#query = ['audit', 'control', 'board', 'structure', 'remuneration', 'shareholder', 'rights', 'transparency', 'performance']

# The dictionary was built from the corpus plus the query, so every query term
# and every document word has an id.
query_ids = [dictionary.token2id[term] for term in query]
term_similarities = similarity_matrix.matrix

for document in corpus:
    best_scores = {}
    for word in set(document):
        word_id = dictionary.token2id[word]
        # Best match of this word against all query terms (0.0 for an empty query).
        score = max(
            (term_similarities[query_id, word_id] for query_id in query_ids),
            default=0.0,
        )
        if score > 0.0:
            best_scores[word] = score

    results_this_doc = sorted(best_scores.items(), key=lambda x: x[1], reverse=True)
    doc_similar_terms.append(results_this_doc[:max_results_per_doc])
        


In [None]:

# Result table for the 30 best-ranked documents, including the words stored
# for each of them in doc_similar_terms.
results = []
top_document_ids = sorted_indexes[:30]
for idx in top_document_ids:
    matched_words = [word for word, _score in doc_similar_terms[idx]]
    row = {
        'Document no': idx,
        'Similarity score with query': doc_similarity_scores[idx],
        'Document name': titles[idx],
        "Most similar words": ', '.join(matched_words),
    }
    results.append(row)

similar_words = pd.DataFrame(results)
similar_words

In [None]:
import pickle
from pathlib import Path

# Save the result table. Create the output folder first: to_pickle does not
# create missing directories, and failing here would lose the computed results.
output_file = 'Governanceresults/SEB.pkl'
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
similar_words.to_pickle(output_file)
