# Similarity Analysis using Bag of Words and tf-idf

In [1]:
import json
import os
import pandas as pd

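# On a first run, the required NLTK resources have to be downloaded
# import nltk
# # nltk.download('all')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt')    # used by word_tokenize ('punkt_tab' on newer NLTK releases)
# nltk.download('wordnet')  # used by WordNetLemmatizer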

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [2]:
# Shared WordNet lemmatizer instance, reused by every call to lemmatize()
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    """Return *text* lower-cased, tokenized and lemmatized.

    The lemmatized tokens are joined back into one string, separated by
    single spaces.
    """
    tokens = word_tokenize(text.lower())
    lemmas = map(lemmatizer.lemmatize, tokens)
    return ' '.join(lemmas)

## Extract search terms using tf-idf

In [3]:
RESULTS_DIRECTORY = '00_data/keyword_search_results/'

# ngram_min specifies the minimum number of words a search term can have
ngram_min = 2

# ngram_max specifies the maximum number of words a search term can have
ngram_max = 2

# Load topical sentences (a mapping of topic -> sentences)
with open('00_data/queries/query_sentences.json') as f:
    queries = json.load(f)

# search_terms will contain all search terms by topic
search_terms = {}

for topic, sentences in queries.items():

    # Lemmatize the topical sentences
    docs = [lemmatize(text) for text in sentences]

    # A topic without sentences cannot yield search terms; the vectorizer
    # would raise on an empty corpus, so record an empty list instead
    if not docs:
        search_terms[topic] = []
        continue

    # Count n-grams for each sentence in the topical sentences
    cv = CountVectorizer(stop_words='english', ngram_range=(ngram_min, ngram_max))
    word_count_vector = cv.fit_transform(docs)

    # Calculate tf-idf scores. The count matrix from fit_transform is reused
    # directly instead of vectorizing the same documents a second time.
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tf_idf_vector = tfidf_transformer.fit_transform(word_count_vector)

    # Use all terms as search terms that have a tf-idf score bigger than zero
    # NOTE(review): only the tf-idf vector of the FIRST sentence of each topic
    # is evaluated, so n-grams that occur solely in later sentences never
    # become search terms -- confirm that this is intended.
    feature_names = cv.get_feature_names_out()
    first_document_vector = tf_idf_vector[0]

    df = pd.DataFrame({
        'feature': feature_names,
        'tfidf': first_document_vector.toarray().ravel(),
    })
    df = df.sort_values(by='tfidf', ascending=False)
    df = df[df['tfidf'] > 0]

    # Terms are stored in descending order of their tf-idf score
    search_terms[topic] = df['feature'].tolist()

# Store search terms in a json file (create the results directory if needed)
os.makedirs(RESULTS_DIRECTORY, exist_ok=True)
with open(os.path.join(RESULTS_DIRECTORY, '{}.json'.format('search_terms')), 'w') as f:
    json.dump(search_terms, f, indent=4)

## Find sentences containing search terms

In [4]:
PARSED_DIRECTORY = '00_data/parsing_results'

# Names of all regular files in the parsing results directory. Kept as a
# sorted list (not a one-shot generator) so it can be iterated repeatedly and
# the reports are processed in a deterministic order.
PARSING_RESULTS = sorted(
    file for file in os.listdir(PARSED_DIRECTORY)
    if os.path.isfile(os.path.join(PARSED_DIRECTORY, file))
)

# parsing_method specifies which parsing method's results are searched (pdfminer or easyocr)
parsing_method = 'pdfminer'

# min_matches specifies the minimum number of different terms that have to match for a sentence to be deemed relevant
min_matches = 1

# min_length specifies the minimum number of characters a paragraph should have to be searched
min_length = 30

# result will contain all relevant sentences by company and topic
result = {}

# Search all reports
for file in PARSING_RESULTS:

    # The file name is split on '_': the first part is compared with the
    # parsing method, the second part is used as the company. Skip files of
    # other parsing methods and files without a company part.
    name_parts = file.split('_')
    if len(name_parts) < 2 or name_parts[0] != parsing_method:
        continue

    company = name_parts[1]

    # Add empty sub-dictionary for each company
    company_result = result.setdefault(company, {})

    # Load the parsed report (paragraph id -> {sentence id -> sentence});
    # the file is closed again before the search starts
    with open(os.path.join(PARSED_DIRECTORY, file)) as f:
        parsed_text = json.load(f)

    # Go through each sentence in each paragraph
    for paragraph_id, sentences in parsed_text.items():

        # Skip paragraphs that do not reach the minimum number of characters
        whole_text = ' '.join(sentences.values())
        if len(whole_text) < min_length:
            continue

        # Lemmatize each sentence and search for search terms
        for sentence_id, sentence in sentences.items():

            lemmatized_sentence = lemmatize(sentence)

            # Search by topic
            for topic, terms in search_terms.items():

                topic_hits = company_result.setdefault(topic, [])

                # Collect every search term contained in the sentence
                # (plain substring test on the lemmatized text)
                matched_terms = [term for term in terms if term in lemmatized_sentence]

                if len(matched_terms) >= min_matches:
                    # Record the first matching term in the order of `terms`
                    # rather than whatever term the loop happened to end on.
                    # None is only possible if min_matches is set to 0 or below.
                    best_term = matched_terms[0] if matched_terms else None
                    topic_hits.append((paragraph_id, sentence_id, best_term))

# Store result in a json file
with open(os.path.join(RESULTS_DIRECTORY, '{}_{}.json'.format(parsing_method, 'found_sentences')), 'w') as f:
    json.dump(result, f, indent=4)