# Sõnade arvutamine

Võtan tekstidest kõik sõnad ja võrdlen neid etteantud sõnadega, mis on seotud ESG'ga. Defineeritud sõnapaketi võtsin internetist.

In [None]:
import pandas as pd
import nltk
datacorp = pd.read_pickle('pickles/corpus.pkl')
datacorp["unigrams"] = datacorp["text"].apply(nltk.word_tokenize)
datacorp['quarters'] = datacorp.index
datacorp

In [None]:
import json
import logging
from re import sub
from multiprocessing import cpu_count

import numpy as np

import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity

In [None]:
import logging

# logimine
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARNING)  # DEBUG # INFO

In [None]:
import nltk

# stoppsõnad
nltk.download('stopwords') 
stopwords = set(nltk.corpus.stopwords.words("english"))

In [None]:
#andmetest võtan andmepealkirjad ja tekstid nimetan dokumentideks

titles = [item for item in datacorp['quarters']]
documents = [item for item in datacorp['text']]


In [343]:
def preprocess(doc):
    # puhastamine ja tokeniseerimine
    doc = sub(r'<img[^<>]+(>|$)', " image_token ", doc)
    doc = sub(r'<[^<>]+(>|$)', " ", doc)
    doc = sub(r'\[img_assist[^]]*?\]', " ", doc)
    doc = sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token ", doc)
    return [token for token in simple_preprocess(doc, min_len=0, max_len=float("inf")) if token not in stopwords]

In [None]:
query_string = 'biodiversity'
query_Environment = 'Biodiversity Carbon Cleantech Clean Climate Coal Conservation Ecosystem Emission Energy Fuel Green Land Natural Pollution Renewable Resources Sustainability Sustainable Toxic Waste Water'
query_Social = 'Accident Alcohol Anti-personnel Behavior Charity Community Controversial Controversy Discrimination Gambling Health Human capital Human rights Inclusion Injury Lab Munitions Opposition Pay Philanthropic Quality Responsible'
query_Government = 'Advocacy Bribery Compensation Competitive Corruption Divestment Fraud GRI Independent Justice Stability Stewardship Transparency'


# Preprocess meetod
corpus = [preprocess(document) for document in documents]
query = preprocess(query_Environment)
query

In [346]:
%%time

# glove vektoripakk, siin on 400000 vektorit sees 
#https://nlp.stanford.edu/projects/glove/

if 'glove' not in locals():  
    glove = api.load("glove-wiki-gigaword-50")


#arvutame välja koosinuse sarnasused välja sõnavektorides
similarity_index = WordEmbeddingSimilarityIndex(glove)

Wall time: 0 ns


In [None]:
#ehitame TF-idf mudeli
def build_term(corpus, query):
    # Build the term dictionary, TF-idf model
    # The search query must be in the dictionary as well, in case the terms do not overlap with the documents (we still want similarity)
    dictionary = Dictionary(corpus+[query])
    tfidf = TfidfModel(dictionary=dictionary)
    # Create the term similarity matrix. 
    # The nonzero_limit enforces sparsity by limiting the number of non-zero terms in each column. 
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)  # , nonzero_limit=None)
    return similarity_matrix

In [None]:
a = build_term(corpus, query)

In [355]:
def doc_similarity_scores(query,similarity_matrix):
    query_tf = tfidf[dictionary.doc2bow(query)]
    index = SoftCosineSimilarity(tfidf[[dictionary.doc2bow(document) for document in corpus]],similarity_matrix)
    doc_similarity_scores = index[query_tf]
    return doc_similarity_scores

In [357]:

b = doc_similarity_scores(query,a)
print(b)

[0.37446806 0.43782774 0.4113688  0.3945238 ]


In [370]:
#meetod sarnasusskooride sortimiseks
def sort_similarity_scores_by_document(doc_similarity_scores):
    sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
    #for idx in sorted_indexes[:15]:
        #print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {titles[idx]}')
        
    return sorted_indexes

In [371]:
s = sort_similarity_scores_by_document(b)
s

array([1, 2, 3, 0], dtype=int64)

In [365]:
doc_similar_terms = []
max_results_per_doc = 20
for term in query:
    idx1 = dictionary.token2id[term]
    for document in corpus:
        results_this_doc = []
        for word in set(document):
            idx2 = dictionary.token2id[word]
            score = similarity_matrix.matrix[idx1, idx2]
            if score > 0.0:
                results_this_doc.append((word, score))
        results_this_doc = sorted(results_this_doc, reverse=True, key=lambda x: x[1])  # sort results by score
        results_this_doc = results_this_doc[:min(len(results_this_doc), max_results_per_doc)]  # take the top results
        doc_similar_terms.append(results_this_doc)

In [373]:
for idx in s[:15]:
    similar_terms_string = ', '.join([result[0] for result in doc_similar_terms[idx]])
    print(f'{idx} \t {b[idx]:0.3f} \t {titles[idx]}')

1 	 0.438 	 ignitis2020_q2_en_eur_con_ias.txt
2 	 0.411 	 ignitis2020_q2_en_eur_con_ias_00.txt
3 	 0.395 	 ignitis2020_q3_en_eur_con_ias.txt
0 	 0.374 	 ignitis2020_q1_en_eur_con_ias.txt


In [374]:
for idx in s[:15]:
    similar_terms_string = ', '.join([result[0] for result in doc_similar_terms[idx]])
    print(f'{idx} \t {b[idx]:0.3f} \t {titles[idx]}  :  {similar_terms_string}')

1 	 0.438 	 ignitis2020_q2_en_eur_con_ias.txt  :  sustainable, sustainability, preservation, diversity, environmental, resource, climate, impacts, environment, resources, natural, development, mitigation, global, nature, impact, implications
2 	 0.411 	 ignitis2020_q2_en_eur_con_ias_00.txt  :  sustainable, sustainability, preservation, diversity, environmental, resource, climate, impacts, environment, resources, natural, preserving, development, mitigation, global, nature, impact, implications
3 	 0.395 	 ignitis2020_q3_en_eur_con_ias.txt  :  sustainable, sustainability, preservation, diversity, environmental, climate, impacts, environment, resources, oceans, natural, preserving, development, global, nature, impact, implications
0 	 0.374 	 ignitis2020_q1_en_eur_con_ias.txt  :  sustainable, diversity, environmental, resource, environment, resources, natural, preserving, development, awareness, global, nature, prevention, pollution, impact, implications
