# Semantic search demo

In [None]:
# import packages and functions
from utils.data_setup import text_to_processed_file, load_json_text_objects, text_object_to_dict
from utils.models import nlp, use_model, cross_model, summarizer
from utils.semantic_search import bi_semantic_search, bi_cross_semantic_search

## Import and process data
This assumes the data is in a raw text file with one row per free-text entry.

We first read in the data and process it into custom Text and Sentence objects.

This processing involves generating vector embeddings for each sentence, so we save these processed objects as a serialised JSON file that can be easily re-loaded to save time.


In [None]:
import os

# Input/output locations for this demo; change these to point at other data.
RAW_TEXT_PATH = 'text1.txt'          # raw text, one row per free-text entry
PROCESSED_JSON_PATH = 'text1.json'   # serialised, processed text objects

if not os.path.isfile(PROCESSED_JSON_PATH):
    # The processed file doesn't exist yet, so build it from the raw text.
    # The status message names the file that is actually being created.
    print(f'Creating {PROCESSED_JSON_PATH}')
    source_texts = text_to_processed_file(
        RAW_TEXT_PATH, PROCESSED_JSON_PATH, nlp, use_model,
        replace_list=None, remove_list=None,
    )
else:
    # The processed file already exists, so just load it to save time.
    print(f'Loading {PROCESSED_JSON_PATH}')
    source_texts = load_json_text_objects(PROCESSED_JSON_PATH)  # load processed text data

In [None]:
# Index the loaded Text objects as a dictionary so they can be looked up by key.
# The guard keeps this cell safe to re-run: without it, a second run would pass
# the already-indexed object back into text_object_to_dict.
# NOTE(review): assumes text_object_to_dict returns a dict and the loaders above
# do not - confirm against utils.data_setup.
if not isinstance(source_texts, dict):
    source_texts = text_object_to_dict(source_texts)
source_texts[5]  # Example - Text object stored at key 5

### Example of Text and Sentence class objects
This is how our data is stored for easy processing.

In [None]:
# Pick the Text object stored at key 0 and the second entry of its sentences.
example_text = source_texts[0]
example_sentence = example_text.sentences[1]

# Print each attribute of the Text object as "example_text.<name>: <value>".
print('- Text object -')
for field in ('content', 'id', 'sentences'):
    print(f'example_text.{field}: {getattr(example_text, field)}')

# Same layout for the Sentence object, preceded by a blank line.
print('\n- Sentence object -')
for field in ('index', 'sentence', 'cleaned_sentence'):
    print(f'example_sentence.{field}: {getattr(example_sentence, field)}')
# Only the first five entries of the embedding are printed to keep the output short.
print(f'example_sentence.vector_embedding[:5]: {example_sentence.vector_embedding[:5]}')
print(f'example_sentence.text_id: {example_sentence.text_id}')

## Semantic search

In [None]:
# Search the indexed texts for the query; the call returns three values.
query = 'information about animals'
top_sentences, loc, output_text = bi_semantic_search(
    query,
    source_texts,
    num_results=5,
    embed_model=use_model,
    redact=True,
    paragraph=True,
    p_n=4,
    verbose=True,
)

In [None]:
# Same query, this time through the search variant that also takes the nlp and
# cross models; the call returns four values, including diagnostics.
query = 'information about animals'
top_sentences, loc, all_output_text, diagnostics = bi_cross_semantic_search(
    query,
    source_texts,
    num_results=5,
    nlp_model=nlp,
    embed_model=use_model,
    cross_model=cross_model,
    redact=True,
    paragraph=True,
    p_n=4,
    verbose=True,
)

## Summariser

In [None]:
# Sample multi-sentence passage used as input for the summariser demo below.
long_text = '''In the early 20th century, technological advancements began reshaping the global landscape. 
The introduction of automobiles, initially seen as a luxury, started becoming more accessible to the masses. 
This was largely due to the innovative assembly line techniques developed by Henry Ford. 
Meanwhile, aviation took its first tentative steps, with pioneers like the Wright brothers changing our perception of travel. 
Telecommunication breakthroughs, especially the invention of the telephone by Alexander Graham Bell, revolutionized communication, 
bridging vast distances instantly. These innovations, among others, set the stage for the rapid modernization and globalization of the world, 
influencing industries, economies, and cultures.'''

In [None]:
# Summarise the passage, then display the 'summary_text' entry of the first result.
summary_results = summarizer(long_text)
summary_results[0]['summary_text']