# Importing Model and APIs

In [None]:
from utilities import fetch_data
from dotenv import dotenv_values
from groq import Groq
from tqdm import tqdm
from sklearn.cluster import MeanShift
import numpy
from markdown_pdf import MarkdownPdf,Section

import ollama
import time
# Key/value settings loaded from the local ".env" file.
variables = dotenv_values(".env")
# Model name passed as `model=` to the Groq chat-completion calls.
gen_model = "llama3-8b-8192"
# Model name passed as `model=` to `ollama.embeddings`.
embedding_model = "nomic-embed-text"

In [None]:

# Source to process and its kind; both are handed to fetch_data.fetch_input.
content = "https://wow.groq.com/retrieval-augmented-generation-with-groq-api"
content_type = "url"

# Groq API client, authenticated with the GROQ_API_KEY entry of the .env values.
client = Groq(api_key=variables["GROQ_API_KEY"])

# Prompts

In [None]:
# Prompt templates. Placeholders in curly braces are filled with str.replace
# (not str.format), so any other braces in the text are left untouched.

# Asks the model to list the core topics of an excerpt cluster and score them.
topic_prompt = '''
Given the following excerpts compiled from textbooks and lecture transcripts on a subject.

{content}

Identify core topics discussed and provide them an importance score.
'''

# Asks the model to turn an excerpt cluster into cleaned-up lecture notes.
content_prompt = '''
Given the following excerpts compiled from textbooks and lecture transcripts on a subject.

{content}

Clean the contents and make a comprehensive lecture notes on the topics being covered. Stick to the contents
'''

# Asks the model for exam questions, weighted by the topic importance scores.
# The topic-importance section is now closed with a proper end tag so the
# model can tell where that section stops and the instructions begin.
question_prompt = ''' 
Given the following lecture notes.

<lecture_notes>
{lecture_notes}
</lecture_notes>

Topic importance of each topic discussed in the lecture is given below.

<topic importance>
{topic_imp}
</topic importance>
You are a Teacher tasked with setting up a large number of questions for an upcoming examination. The number of questions per topic should depend upon the topic importance.
The questions should include conceptual, reasoning and application level questions. Do not generate answers. Generate questions and not a question distribution
'''

In [None]:
def get_embedding(text):
    """Return the 'embedding' entry of the Ollama embeddings response for ``text``.

    The request uses the module-level ``embedding_model`` name.
    """
    response = ollama.embeddings(prompt=text, model=embedding_model)
    return response['embedding']


In [None]:
def _ask_professor(user_message):
    """Send one user message to the chat model and return the reply text."""
    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are a helpful professor tasked with teaching and testing knowledge of students.",
            },
            {
                "role": "user",
                "content": user_message,
            },
        ],
        model=gen_model,
    )
    return response.choices[0].message.content


def syllabus(content_clusters):
    """Generate a topic summary and lecture notes for every content cluster.

    Args:
        content_clusters: Mapping of text excerpt -> cluster label.

    Returns:
        A list with one ``(topic_response, content_response)`` tuple per
        distinct label, where both items are the model's reply text for
        ``topic_prompt`` and ``content_prompt`` respectively.
    """
    # Group excerpts by label in a single pass instead of rescanning the
    # whole mapping once per label.
    excerpts_by_label = {}
    for excerpt, label in content_clusters.items():
        excerpts_by_label.setdefault(label, []).append(excerpt)

    syllabus_list = []
    # Iterate the label set (as before) so the order of results is unchanged.
    for label in tqdm(set(content_clusters.values())):
        cluster_text = "\n".join(excerpts_by_label[label])

        topic_response = _ask_professor(topic_prompt.replace("{content}", cluster_text))
        content_response = _ask_professor(content_prompt.replace("{content}", cluster_text))

        syllabus_list.append((topic_response, content_response))
        # Pause between clusters; presumably to stay under the API rate limit.
        time.sleep(7)

    return syllabus_list

In [None]:
def generate_questions(content_tuple):
    """Ask the chat model for exam questions covering one syllabus entry.

    Args:
        content_tuple: Indexable pair whose item 0 is the topic-importance
            text and item 1 is the lecture-notes text.

    Returns:
        A dict with the keys "Lecture Note", "Topic Importance" and
        "Question Paper" (the model's reply text), in that order.
    """
    lecture_notes = content_tuple[1]
    topic_importance = content_tuple[0]

    user_message = question_prompt.replace("{lecture_notes}", lecture_notes)
    user_message = user_message.replace("{topic_imp}", topic_importance)

    completion = client.chat.completions.create(
        model=gen_model,
        messages=[
            {
                "role": "system",
                "content": "You are a helpful professor tasked with teaching and testing knowledge of students.",
            },
            {"role": "user", "content": user_message},
        ],
    )
    question_paper = completion.choices[0].message.content

    # Fixed pause after every request, before handing the result back.
    time.sleep(30)
    return {
        "Lecture Note": lecture_notes,
        "Topic Importance": topic_importance,
        "Question Paper": question_paper,
    }

In [None]:
# Load the raw excerpts for the configured source.
content_text = fetch_data.fetch_input(content, content_type)

# Keep only excerpts that still have characters once newlines and spaces are removed.
content_text = [
    excerpt
    for excerpt in tqdm(content_text)
    if excerpt.replace("\n", "").replace(" ", "") != ""
]

# Embed every excerpt and stack the results into one array for clustering.
content_embedding = numpy.array(
    [get_embedding(excerpt) for excerpt in tqdm(content_text)]
)

# Cluster the embeddings and map each excerpt to its cluster label.
clusters = MeanShift().fit(content_embedding)
content_clusters = {
    excerpt: label for excerpt, label in zip(content_text, clusters.labels_)
}

# Per cluster: topics + lecture notes, then a question paper for each.
content_model = syllabus(content_clusters)
content_dict = list(map(generate_questions, content_model))

In [None]:
# Hard-coded replacement for the `content_dict` computed by the pipeline cell:
# a single entry carrying the same three keys that generate_questions() returns.
# NOTE(review): looks like a pasted snapshot of an earlier run, kept so the PDF
# step can be re-run without calling the APIs — confirm before relying on it.
content_dict = [{'Lecture Note': "Lecture Notes: Retrieval Augmented Generation with Groq API\n\nI. Introduction\n\n* The emergence of Large Language Models (LLMs) has transformed the way we interact with information\n* LLMs come with limitations, such as:\n\t+ Dated models and information\n\t+ Absence of domain-specific knowledge\n\t+ Inaccurate but plausible answers\n* Enter Retrieval Augmented Generation (RAG), an approach that addresses these limitations\n\nII. What is Retrieval Augmented Generation (RAG)?\n\n* RAG combines the strengths of information retrieval methods and LLMs\n* It harnesses pre-existing knowledge through a retrieval mechanism, allowing the model to pull in relevant information from a vast repository of data\n* This ensures that the generated content is not only contextually accurate but also grounded in real-world information\n* RAG aims to bridge the gap between traditional LLMs and human-like understanding\n\nIII. How does RAG help reduce the limitations of LLMs?\n\n* Dated Models and Information: RAG ensures the responsiveness of LLMs by consistently aligning generated responses with the latest, precise information sourced from an external database\n* Absence of domain-specific knowledge: RAG overcomes this hurdle by enriching the model's context with domain-specific data from an organization's knowledge base\n* Inaccurate but plausible answers: RAG combines generative capabilities with information retrieval, leveraging external knowledge to enhance the accuracy, contextuality, and reliability of the generated responses\n\nIV. Integrating RAG with Groq API\n\n* Connecting proprietary data to the Groq API is straightforward\n* Steps:\n\t1. Connect to your database\n\t2. Convert questions into a vector representation using an embedding model\n\t3. Query your database\n\t4. Add the retrieved information to the LLM system prompt\n\t5. Ask Groq API to answer your question\n\nV. 
Public Sector Applications\n\n* Despite unique challenges, leveraging LLMs remains feasible in the Public Sector\n* RAG can be a strategic approach for anchoring LLMs in the most current and verifiable information\n* RAG contributes to building user trust in the system, a crucial element in the Public Sector where transparency and precision are paramount\n\nVI. Potential Examples of how Public Sector Organizations can leverage LLMs with RAG\n\n* Customers can optimize their utilization of proprietary data in conjunction with open source LLMs running on the Groq hardware to extract the full power of LLMs\n* Customization is possible using own set of documents, other Vector Databases, other embedding models, and text generation LLMs available on Groq API",
  'Topic Importance': "Based on the provided excerpts, I've identified core topics and provided an importance score:\n\n1. **Overview of Retrieval Augmented Generation (RAG)** (Importance: 9/10)\n\t* Definition of RAG and its purpose\n\t* Overview of how RAG combines information retrieval and LLMs to enhance contextual understanding and content accuracy\n2. **Limitations of Large Language Models (LLMs)** and how RAG addresses them (Importance: 8.5/10)\n\t* Discussion of limitations, such as outdated models, lack of domain-specific knowledge, and inaccurate but plausible answers\n\t* How RAG mitigates these limitations to improve the reliability and utility of LLMs\n3. **Integrating RAG with Groq API** (Importance: 8.5/10)\n\t* Step-by-step instructions on how to connect proprietary data to the Groq API using Python\n\t* Overview of how to use RAG to enhance the accuracy and contextuality of generated responses\n4. **Benefits of RAG in Public Sector Organizations** (Importance: 8/10)\n\t* Discussion of how RAG can enhance the reliability and accuracy of responses in public sector organizations\n\t* Overview of the benefits of using RAG to build user trust and confidence in the system\n5. **Potential Examples of Public Sector Organizations leveraging LLMs with RAG** (Importance: 7.5/10)\n\t* Overview of how public sector organizations can leverage LLMs with RAG to extract the full power of LLMs\n\t* Examples of customization and usage of RAG with LLMs\n\nNote that the importance scores are subjective and based on my interpretation of the relevance and significance of each topic within the larger context of the text.",
  'Question Paper': "Based on the topic importance, I've created a set of questions for each topic. Since topic importance scores vary, I've allocated more questions to the more important topics.\n\n**Overview of Retrieval Augmented Generation (RAG)** (9/10)\n\n1. What is Retrieval Augmented Generation (RAG), and what problem does it solve in the context of Large Language Models (LLMs)?\n2. How does RAG combine information retrieval and LLMs to enhance contextual understanding and content accuracy?\n3. What are the primary goals of RAG, and how does it address the limitations of LLMs?\n\n**Limitations of Large Language Models (LLMs)** and how RAG addresses them (8.5/10)\n\n4. What are some of the limitations of Large Language Models (LLMs)?\n5. How do outdated models and information pose a challenge for LLMs, and how does RAG address this limitation?\n6. What is the role of domain-specific knowledge in LLMs, and how does RAG overcome the absence of such knowledge?\n\n**Integrating RAG with Groq API** (8.5/10)\n\n7. How do you connect proprietary data to the Groq API using Python, and what benefits does this integration provide?\n8. What are the key steps involved in using RAG with the Groq API, and how does this process enhance the accuracy and contextuality of generated responses?\n\n**Benefits of RAG in Public Sector Organizations** (8/10)\n\n9. How can RAG enhance the reliability and accuracy of responses in public sector organizations, and what benefits does this provide?\n10. What role does trust play in public sector applications, and how does RAG contribute to building user confidence in the system?\n\n**Potential Examples of Public Sector Organizations leveraging LLMs with RAG** (7.5/10)\n\n11. How can public sector organizations leverage LLMs with RAG to extract the full power of LLMs, and what benefits does this provide?\n12. 
What are some potential customization options for public sector organizations using RAG with LLMs, and how can they be implemented?\n\nAdditional questions:\n\n13. How does RAG ensure the relevance and accuracy of generated responses, and what role does information retrieval play in this process?\n14. What are some potential applications of RAG beyond public sector organizations, and how might it be used in other domains?\n15. How does RAG compare to other approaches to enhancing the accuracy and contextuality of LLMs, and what advantages does it offer?\n\nPlease note that these questions are meant to be a starting point and may require further refinement or modification to better align with the specific needs and goals of your examination."}]

In [None]:
# Echo the source URL being processed (shown as the notebook cell's output).
content

In [None]:
def write_chapters(content_dict: list, out_path: str) -> str:
    """Render the generated sections into a single PDF file.

    The document starts with a title section derived from the module-level
    ``content`` string (its last "/"-separated segment, cut at the first "."),
    followed by one numbered chapter per entry of ``content_dict``.

    Args:
        content_dict: Iterable of dicts, each providing the string entries
            "Topic Importance", "Lecture Note" and "Question Paper".
        out_path: Destination path handed to ``MarkdownPdf.save``.

    Returns:
        ``out_path``, matching the declared ``str`` return type.

    Raises:
        KeyError: If an entry lacks one of the three expected keys.
    """
    out_pdf = MarkdownPdf()
    out_pdf.add_section(Section("# " + content.split("/")[-1].split(".")[0] + "\n"))

    parts = []
    # Number the chapters; previously every chapter was titled "Section 01".
    for number, cd in enumerate(content_dict, start=1):
        parts.extend([
            f"## Section {number:02d}\n",
            "### Topics Discussed\n",
            cd["Topic Importance"] + "\n",
            "### Notes\n",
            cd["Lecture Note"] + "\n",
            "### Sample Questions\n",
            cd["Question Paper"] + "\n\n",
        ])

    # With no entries only the title section is written. The old debug
    # print of the loop variable raised on empty input and has been removed.
    if parts:
        out_pdf.add_section(Section("".join(parts)))

    out_pdf.save(out_path)
    return out_path


In [None]:
# Write every entry of content_dict to "notes.pdf" via write_chapters.
write_chapters(content_dict,"notes.pdf")

In [None]:
# Hand the YouTube link to the project's fetch_data.download_audio helper.
# NOTE(review): the helper is defined outside this file — presumably it saves
# the video's audio track locally; confirm where the output ends up.
fetch_data.download_audio("https://youtu.be/om7TfE7cUko")

In [None]:
from pytube import YouTube

In [None]:
# Download the first stream of type "video" to Data/temp.mp4.
# `Stream.download` takes the target directory (`output_path`) and the file
# name (`filename`) separately; passing "Data/temp.mp4" as the first argument
# made pytube treat it as a directory. `.first()` replaces the deprecated
# `.all()[0]`.
# NOTE(review): recent pytube releases expect `filename` to include the
# extension — confirm against the installed version.
YouTube("https://youtu.be/om7TfE7cUko").streams.filter(type="video").first().download(
    output_path="Data",
    filename="temp.mp4",
)

In [None]:
pip install ffmpeg

In [None]:
# Reference a previously downloaded clip as an ffmpeg input; the result is not
# assigned, so it is only displayed as the cell's output.
# NOTE(review): no `import ffmpeg` is visible in this file — confirm one runs
# before this cell. The absolute path is specific to one machine.
ffmpeg.input("/Users/picklehari/Desktop/Code/practicaly_llm/Data/Life Lesson from Smokers  Take 01.mp4")