In [15]:
import weaviate
from weaviate.classes.config import Configure, Property, DataType
import weaviate.classes as wvc

additional_config = wvc.init.AdditionalConfig(
        timeout=wvc.init.Timeout(init=4, query=600, insert=60) # Query timeout set to 120 seconds (2 minutes)
)
client = weaviate.connect_to_local(
        additional_config=additional_config,
        skip_init_checks=True
)

            Please make sure to close the connection using `client.close()`.
  client = weaviate.connect_to_local(


In [16]:
# client.close()

## Helper function

In [17]:
import json


def parse_query_return(query_return):
    """
    Parses a QueryReturn-like object into a JSON string.
    """
    parsed_objects = []

    for obj in query_return.objects:
        parsed_obj = {
            "uuid": str(obj.uuid),
            "collection": getattr(obj, "collection", None),
            "properties": getattr(obj, "properties", {}),
            "metadata": {
                "creation_time": getattr(obj.metadata, "creation_time", None),
                "last_update_time": getattr(obj.metadata, "last_update_time", None),
                "distance": getattr(obj.metadata, "distance", None),
                "certainty": getattr(obj.metadata, "certainty", None),
                "score": getattr(obj.metadata, "score", None),
                "explain_score": getattr(obj.metadata, "explain_score", None),
                "is_consistent": getattr(obj.metadata, "is_consistent", None),
                "rerank_score": getattr(obj.metadata, "rerank_score", None),
            }
        }
        parsed_objects.append(parsed_obj)

    return json.dumps(parsed_objects, indent=2)

In [18]:
def get_citations(query_return_json):
    citations = set()
    try:
        # Parse the JSON string into a Python list of dictionaries
        query_results = json.loads(query_return_json)

        if not isinstance(query_results, list):
            print("Error: Input is not a valid list of results.")
            return citations

        for obj in query_results:
            try:
                # Extract required fields
                doc_id = obj.get('properties', {}).get('doc_id')
                paper_title = obj.get('properties', {}).get('paper_title')
                certainty = obj.get('metadata', {}).get('certainty')

                # Add to set only if all required fields are present
                if doc_id is not None and paper_title is not None and certainty is not None:
                    citations.add((doc_id, paper_title, certainty))
                else:
                    print(f"Warning: Skipping object due to missing data: {obj.get('uuid', 'UUID N/A')}")

            except KeyError as e:
                print(f"Warning: Skipping object due to missing key: {e}. Object: {obj.get('uuid', 'UUID N/A')}")
            except Exception as e:
                 print(f"Warning: An error occurred processing object {obj.get('uuid', 'UUID N/A')}: {e}")


    except json.JSONDecodeError:
        print("Error: Invalid JSON input string.")
    except Exception as e:
        print(f"An unexpected error occurred in get_citations: {e}")

    return citations    

## Loading collections

In [19]:
questions = client.collections.get('questions')
papers = client.collections.get("ResearchPapers")

In [None]:
user_prompt =  "What distinguishes GPT-3 from previous language models in terms of few-shot learning?" # ENTER THE CUSTOM PROMPT HERE

## Question Similarity Score

In [36]:
docs_que_similarity = questions.query.near_text(
    query=user_prompt,
    distance=0.8,
    limit=5,
    return_metadata=wvc.query.MetadataQuery(certainty=True, distance=True)
)


docs_que_similarity = parse_query_return(docs_que_similarity)


In [37]:
citations = get_citations(docs_que_similarity)
for citation in citations:
    print(citation)

('16', 'Language Models are Few-Shot Learners', 0.9278692007064819)
('16', 'Language Models are Few-Shot Learners', 0.9159530401229858)
('16', 'Language Models are Few-Shot Learners', 0.9314121007919312)
('16', 'Language Models are Few-Shot Learners', 0.9199385046958923)
('16', 'Language Models are Few-Shot Learners', 0.9943397641181946)


## Paper Similarity Score

- Hybrid Search

In [31]:
doc_search = papers.generate.hybrid(
    query=user_prompt,
    limit=1,
    # distance=0.8,
    # fusion_type="relativeScoreFusion",
    alpha=0.7,
    return_metadata=wvc.query.MetadataQuery(distance=True), 
    single_prompt=f"Answer this question using the given context: {user_prompt}\nContext: {{chunk_text}}",
    grouped_task=f"Based on the following context, answer the question: {user_prompt}"
)

print(doc_search.generated)

INSTRUCTDIAL is a dataset used for training and testing dialogue models, specifically designed to improve zero- and few-shot generalization in dialogue. It consists of tasks created from existing open-access dialogue datasets, categorized into various types such as classification, generation, evaluation, edit, pretraining, safety, and miscellaneous tasks.


In [26]:
doc_search = papers.generate.hybrid(
    query=user_prompt,
    limit=5,
    # distance=0.8,
    # fusion_type="relativeScoreFusion",
    alpha=0,
    return_metadata=wvc.query.MetadataQuery(distance=True), 
    single_prompt=f"Answer this question using the given context: {user_prompt}\nContext: {{chunk_text}}",
    grouped_task=f"Based on the following context, answer the question: {user_prompt}"
)

print(doc_search.generated)

The purpose of speculative sampling is to accelerate large language model decoding by generating multiple tokens from each transformer call, allowing for a faster and more efficient decoding process. By leveraging the latency of parallel scoring of short continuations generated by a faster but less powerful draft model, speculative sampling can achieve significant speedups without compromising on sample quality or requiring modifications to the target model itself.

In summary, speculative sampling enables the generation of multiple tokens from each transformer call, which allows for:

1. Faster decoding: By generating multiple tokens simultaneously, the total decoding time is reduced.
2. Reduced communication overheads: Serving a powerful draft model on the same hardware as the target model reduces the need for expensive communication operations.
3. Improved efficiency: Speculative sampling enables the use of a faster but less powerful draft model to generate short continuations, whic

## Proposed Flow 

- User enters a prompt 
- This prompt get embedded [nomic-embed-text] 




- Search for similarity with question  

- We get a certainity score for the top 5 results

- If certanity > 0.85, then pull that paper into context, and generate

- Else run search on all the docs



In [None]:
user_query = ""
def run_rag(user_query):


    return ans

print(run_rag(user_query))

In [None]:
Sentence Transformer - 
    Tokens - 1000
    Overlap - 200


Evals 
1. Recursive Transformer for text splitting - did not use because files are not in MD format. 

2. Token Window / Overlap window for Sentence Transformer - [Need to do this in the future].

3. nomic-embed-text - OpenAI - better than openai opensource model
        Lightweight 137M params
        Used to embed - questions, research papers, and user prompt. 

4. Llama3.2 [1.3B param model] - light weight, opensource, and allowed us to run on local 
    - We would get a giant performance leap, if we use an API for a better model. 

5. How did we tune alpha - experimented with 0.5,0.6, 0.7. 0.7 gave us the best results. But we still need to experminet more. [We checked with only 100 retrievals]

6. We are currently only working on evaluating retrieval performance. We need to device a method for evaluating the answers. 


