# Basic search pipelines

In [None]:
import boto3
import os
# NOTE(review): `config` is imported before the chdir below and is not referenced
# in the visible cells - confirm whether it is still needed.
import config
# Switch the working directory to the repository root (hard-coded absolute path)
# before the project imports that follow.
# NOTE(review): presumably `datapipeline` and `api` are resolved relative to the
# working directory - confirm before reordering these statements.
os.chdir('/home/jovyan/rd-hr-smart-knowledge-management')
from datapipeline.utils.read_data_functions import read_docs
from datapipeline.utils.pipeline_functions import (
    setup_bm25_pipeline,
    run_semantic_indexing_pipeline,
    setup_semantic_pipeline,
    setup_hybrid_pipeline
)
from datapipeline.utils.search_functions import (
    bm25_search,
    semantic_search,
    hybrid_search,
    pretty_print_results
)

from api.src.lib import get_config
# SERVICES is only referenced by the commented-out S3 listing cell further down.
from api.src.lib.services import SERVICES
# Project configuration; later cells read cfg['S3_BUCKET'] and cfg['S3_KEY_PREFIX'].
cfg = get_config()

## Read and preprocess data

In [None]:
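# FIXME: this SERVICES-based approach isn't working at the moment - it returns an empty list.
# It is kept commented out for reference; boto3 is used directly instead.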

# s3client = SERVICES["s3client"]

# # Get a list of documents to be read in
# objs, _ = s3client.list()
# file_list = [obj["Key"] for obj in objs]
# print(file_list)

# dataset = read_docs(s3client, file_list)

In [None]:
# Get a list of documents to be read in
s3client = boto3.client('s3')

# A single list_objects_v2 call returns at most 1,000 keys, so walk every page
# of results rather than reading only the first response.
paginator = s3client.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=cfg['S3_BUCKET'], Prefix=cfg['S3_KEY_PREFIX'])

# "Contents" is omitted from a page when no keys match the prefix; treat that
# as an empty page instead of failing with KeyError.
objs = []
for page in pages:
    objs.extend(page.get("Contents", []))

file_list = [obj["Key"] for obj in objs]

# Bare expression so the notebook displays the keys that were found.
file_list

In [None]:
# Build `dataset` from the keys in `file_list` using the boto3 client created above.
# NOTE(review): `docker=False` presumably selects a non-container code path in
# read_docs - confirm against its definition.
dataset = read_docs(s3client, file_list, docker=False, bucket_name=cfg['S3_BUCKET'])

In [None]:
# Spot-check a single entry of the dataset.
# NOTE(review): the index 31 is hard-coded; this cell fails if the dataset has
# no such entry (e.g. when fewer files were read).
dataset[31]

In [None]:
# Number of entries in the dataset.
len(dataset)

In [None]:
# Dump the dataset to dataset.json (in the current working directory) so it
# can be inspected by hand.

import json
with open("dataset.json", "w") as dump_file:
    # Stream the 4-space-indented JSON straight into the file.
    json.dump(dataset, dump_file, indent=4)

In [None]:
# Query passed to the BM25, semantic and hybrid searches below.
# Uncomment one of the alternatives to try a different query.
# search_query = "maternity pay"
# search_query = "When will my GDD allowance expire?"
search_query = "Can I complete an assessment in probation?"

## BM25 pipeline

In [None]:
# Build the BM25 pipeline from `dataset`.
pipeline = setup_bm25_pipeline(dataset)

In [None]:
# Run the query through the BM25 pipeline with no filters and top_k=3.
# `results` is reassigned by the semantic and hybrid searches further down.
results = bm25_search(search_query, pipeline, filters=None, top_k=3)

In [None]:
# Print the documents stored under the "bm25_retriever" entry of the results.
pretty_print_results(results["bm25_retriever"]['documents'])

In [None]:
# results["bm25_retriever"]['documents']

## Semantic search pipeline

In [None]:
# Run this if it's not been run already, to set up a locally stored vector database
# run_semantic_indexing_pipeline(dataset)

In [None]:
# Build the semantic search pipeline from `dataset`.
# NOTE(review): presumably relies on the vector database created by
# run_semantic_indexing_pipeline (see the commented-out cell above) - confirm.
semantic_pipeline = setup_semantic_pipeline(dataset)

In [None]:
# Run the same query through the semantic pipeline with no filters and top_k=3.
# This overwrites the `results` from the BM25 search.
results = semantic_search(search_query, semantic_pipeline, filters=None, top_k=3)

In [None]:
# Print the documents stored under the "ranker" entry of the semantic results.
pretty_print_results(results["ranker"]['documents'])

## Hybrid search pipeline

In [None]:
# Run this if it's not been run already, to set up a locally stored vector database
# run_semantic_indexing_pipeline(dataset)

In [None]:
# Build the hybrid search pipeline from `dataset`.
# NOTE(review): presumably also relies on the vector database created by
# run_semantic_indexing_pipeline (see the commented-out cell above) - confirm.
hybrid_pipeline = setup_hybrid_pipeline(dataset)

In [None]:
# Run the same query through the hybrid pipeline with no filters.
# NOTE(review): top_k is 5 here but 3 for the BM25 and semantic searches -
# confirm the difference is intentional before comparing outputs.
results = hybrid_search(search_query, hybrid_pipeline, filters=None, top_k=5)

In [None]:
# Print the documents stored under the "ranker" entry of the hybrid results.
pretty_print_results(results["ranker"]['documents'])