In [1]:
import json
import os
import pandas as pd

# import nltk
# # nltk.download('all')
# nltk.download('averaged_perceptron_tagger')

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


In [2]:
# Single WordNet lemmatizer instance shared by every call to `lemmatize`.
lemmatizer = WordNetLemmatizer()


def lemmatize(text):
    """Return `text` lower-cased, tokenized and lemmatized.

    The text is lower-cased, split into tokens with `word_tokenize`, each
    token is passed through the shared `lemmatizer` (using its default
    settings), and the results are joined back together with single spaces.
    """
    tokens = word_tokenize(text.lower())
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmas)

Extract search terms using TF-IDF

In [3]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

RESULTS_DIRECTORY = '00_data/keyword_search_results/'

# Both bounds are 2, so only bigrams are extracted as candidate search terms.
ngram_min = 2
ngram_max = 2

with open('00_data/queries/query_sentences.json') as f:
    queries = json.load(f)

# Maps each query key to its search terms, ordered by descending TF-IDF weight.
search_terms = {}

for key, value in queries.items():
    docs = [lemmatize(text) for text in value]

    if not docs:
        # CountVectorizer cannot build a vocabulary from zero documents;
        # record an empty term list instead of aborting the whole run.
        search_terms[key] = []
        continue

    # Count n-grams once; fit_transform already returns the counts for `docs`,
    # so no second transform pass over the same documents is needed.
    cv = CountVectorizer(stop_words='english', ngram_range=(ngram_min, ngram_max))
    word_count_vector = cv.fit_transform(docs)

    # Learn the IDF weights and apply them to the same count matrix in one step.
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tf_idf_vector = tfidf_transformer.fit_transform(word_count_vector)

    feature_names = cv.get_feature_names_out()

    # NOTE(review): only row 0 (the first sentence of this query) is scored, so
    # n-grams that occur only in later sentences get weight 0 and are dropped
    # below. Confirm this is intended rather than aggregating over all rows.
    first_document_vector = tf_idf_vector[0]

    df = pd.DataFrame({
        'feature': feature_names,
        'tfidf': first_document_vector.toarray().ravel(),
    })
    df = df.sort_values(by='tfidf', ascending=False)
    # Keep only n-grams that actually occur in the scored sentence.
    df = df[df['tfidf'] > 0]

    search_terms[key] = df['feature'].tolist()

# Create the output directory if needed so the run does not fail at the very
# end, and build the path with os.path.join to avoid a doubled separator
# (RESULTS_DIRECTORY already ends with '/').
os.makedirs(RESULTS_DIRECTORY, exist_ok=True)
with open(os.path.join(RESULTS_DIRECTORY, 'search_terms.json'), 'w') as f:
    json.dump(search_terms, f, indent=4)

Find sentences containing search terms

In [4]:
PARSED_DIRECTORY = '00_data/parsing_results'
# Materialized and sorted: os.listdir returns entries in arbitrary order, and a
# generator could only be iterated once. Sorting makes the output reproducible.
PARSING_RESULTS = sorted(
    file for file in os.listdir(PARSED_DIRECTORY)
    if os.path.isfile(os.path.join(PARSED_DIRECTORY, file))
)
parsing_method = 'pdfminer'
min_matches = 1   # minimum number of distinct search terms a sentence must contain
min_length = 30   # minimum character length of a paragraph for it to be searched

# result[company][topic] -> list of (paragraph_id, sentence_id, matched_term)
result = {}

for file in PARSING_RESULTS:
    # File names are expected to look like '<parsing_method>_<company>_...'.
    name_parts = file.split('_')
    if len(name_parts) < 2 or name_parts[0] != parsing_method:
        continue

    company = name_parts[1]
    company_hits = result.setdefault(company, {})

    with open(os.path.join(PARSED_DIRECTORY, file)) as f:
        parsed_text = json.load(f)

    for paragraph_id, sentences in parsed_text.items():

        # Skip paragraphs whose combined text is too short.
        whole_text = ' '.join(sentences.values())
        if len(whole_text) < min_length:
            continue

        for sentence_id, sentence in sentences.items():

            lemmatized_sentence = lemmatize(sentence)

            for topic, terms in search_terms.items():

                topic_hits = company_hits.setdefault(topic, [])

                # NOTE(review): this is a plain substring test against the
                # lemmatized sentence, so a term can also match inside longer
                # words; confirm that is acceptable.
                matched_terms = [term for term in terms if term in lemmatized_sentence]

                if len(matched_terms) >= min_matches:
                    # Record a term that actually matched. Previously the loop
                    # variable left over from iterating `terms` was stored, i.e.
                    # always the last term of the topic whether or not it
                    # occurred in the sentence. `terms` is ordered by descending
                    # TF-IDF weight, so the first match is the strongest one.
                    best_term = matched_terms[0] if matched_terms else None
                    topic_hits.append((paragraph_id, sentence_id, best_term))


# os.path.join avoids a doubled separator when RESULTS_DIRECTORY ends with '/'.
output_name = '{}_{}.json'.format(parsing_method, 'found_sentences')
with open(os.path.join(RESULTS_DIRECTORY, output_name), 'w') as f:
    json.dump(result, f, indent=4)