# Imports

In [None]:
from IPython.display import clear_output

In [None]:
# Install the third-party packages this notebook uses (-q keeps pip quiet),
# then clear whatever output the cell produced.
!pip install -q chromadb google-generativeai python-pptx PyPDF2 pyTelegramBotAPI
clear_output()

In [None]:
import chromadb
from chromadb.utils import embedding_functions
from google.colab import userdata
import google.generativeai as genai
import csv
import json
import telebot
from telebot import types
import os
import getpass
import PyPDF2
import re
import time
from sentence_transformers import SentenceTransformer

In [None]:
# Read the Gemini API key from google.colab's `userdata` store under the name
# 'GOOGLE_API_KEY' (the key is never hard-coded in the notebook).
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

# Hand the key to the google.generativeai client library used by `model` below.
genai.configure(api_key=GOOGLE_API_KEY)

# Presentation creation

In [None]:
# Module-level ChromaDB client; `setup_database` and `handle_pdf` use it to
# create, fetch and delete the 'course' collection.
client = chromadb.Client()

# Use the embedding model from SentenceTransformers
# ('all-MiniLM-L6-v2' is the model name passed to SentenceTransformer).
embedder = SentenceTransformer('all-MiniLM-L6-v2')


def setup_database(embedding_function):
    """Fetch the 'course' collection from the shared client, creating it if absent.

    Args:
        embedding_function: Embedding function object passed through to
            ``client.get_or_create_collection``.

    Returns:
        The collection object returned by the ChromaDB client.
    """
    return client.get_or_create_collection(
        'course',
        embedding_function=embedding_function,
    )

# Create a custom embedding function class
class SentenceTransformerEmbeddingFunction(embedding_functions.EmbeddingFunction):
    """Adapter that exposes a SentenceTransformer model as a ChromaDB embedding function.

    ChromaDB invokes an embedding function by calling the object itself, so the
    class implements ``__call__``. The explicit ``call`` method is kept because
    the rest of this notebook embeds single strings through it.
    """

    def __init__(self, model):
        """Store the model whose ``encode`` method produces the embeddings."""
        self.model = model

    def __call__(self, input):
        """Embed a batch of documents for ChromaDB.

        Args:
            input: A list of document strings (a lone string is treated as a
                one-element batch).

        Returns:
            One embedding per document, as a list of lists of floats.
        """
        documents = [input] if isinstance(input, str) else list(input)
        return self.model.encode(documents).tolist()

    def call(self, input):
        """Return ``self.model.encode(input)`` unchanged.

        Args:
            input: A string, or a list of strings, to embed.

        Returns:
            Whatever ``encode`` returns for that input.
        """
        return self.model.encode(input)



# Initialize the custom embedding function
# Wrap the SentenceTransformer model so the database helpers can call
# `embedding_function.call(...)` and pass the object to `setup_database`.
embedding_function = SentenceTransformerEmbeddingFunction(embedder)

# Gemini model handle used by `generate_content` for every prompt.
model = genai.GenerativeModel('gemini-1.5-flash')
# Clear the cell output produced while the objects above were created.
clear_output()

In [None]:
def generate_content(prompt: str) -> str:
    """Send ``prompt`` to the module-level Gemini ``model`` and return its text.

    The function never raises: failures are reported through the return value,
    and callers detect them by searching for the marker sentences below.

    Args:
        prompt: The full prompt text to send.

    Returns:
        The response text on success;
        "Sorry, I couldn't generate a response." when the response object has
        no ``text`` attribute; or
        "There was an error generating the response: <details>" when any
        exception is raised while calling the model or reading the response.
    """
    try:
        response = model.generate_content(prompt)
        # Reading ``text`` stays inside the ``try`` so that an exception raised
        # by the attribute access is reported like any other failure.
        return response.text if hasattr(response, 'text') else "Sorry, I couldn't generate a response."
    except Exception as e:
        return f"There was an error generating the response: {str(e)}"


def generate_prompt_for_db(text):
    """Ask the model to clean up raw page text for storage in the database.

    The prompt requests a reply made of ``Title:: ...`` / ``Text:: ...`` pairs,
    which is the layout ``database_replenishment`` looks for.

    Args:
        text: Raw text to be corrected and shortened.

    Returns:
        Whatever ``generate_content`` returns for the assembled prompt.
    """
    request = f"""
You are an assistant with artificial intelligence who corrects this text, preprocesses it so that the related words are separated by a space, turns it into a coherent text, shortens it a little and outputs it convenient for explaining the topic in the presentation.
Text:
{text}
Please provide output only in this format without anything else.
Format example:
Title:: Some title
Text:: Some text from given text
"""
    return generate_content(request)


In [None]:
def database_replenishment(content, collection):
    """Store page content in the vector database.

    ``content`` is expected to hold one or more sections, each made of a
    ``Title:: <title>`` line followed by a ``Text:: <text>`` line. Every
    well-formed section becomes one document whose metadata carries the title
    and whose embedding is computed from ``title + ":: " + text``.

    If no well-formed section is found, the whole ``content`` is stored as a
    single document titled "Some text", so a page is never silently dropped.

    Args:
        content: Text to store (model output or raw page text).
        collection: ChromaDB collection that receives the documents.
    """

    def _store(title, text, embedded_text):
        # Ids follow the running document count: 'ids0', 'ids1', ...
        collection.add(
            documents=[text],
            metadatas=[{"title": title}],
            ids=['ids' + str(collection.count())],
            embeddings=embedding_function.call(embedded_text),
        )

    stored = 0
    # Splitting on the marker (instead of slicing a fixed eight characters off
    # the front) tolerates replies that start with whitespace or a preamble;
    # the first piece is whatever preceded the first marker, so it is skipped.
    for section in content.split('Title:: ')[1:]:
        title, marker, text = section.partition('\nText:: ')
        title = title.strip()
        text = text.strip()
        if not marker or not text:
            continue
        _store(title, text, title + ":: " + text)
        stored += 1

    if stored == 0:
        _store("Some text", content, content)


def extract_text_from_pdf(pdf_path, collection, message, msg):
    """Read a PDF page by page and write every non-empty page to the database.

    Each page's text has all non-word characters replaced by spaces and is then
    sent to ``generate_prompt_for_db`` (up to five attempts). A successful reply
    is stored via ``database_replenishment``; if every attempt fails, or an
    exception is raised, the cleaned raw text is stored instead.

    Progress is reported by editing the Telegram message ``msg`` on every tenth
    page and once more when the whole file is done.

    Args:
        pdf_path: Path of the PDF file to read.
        collection: ChromaDB collection that receives the documents.
        message: Incoming Telegram message; only ``message.chat.id`` is used.
        msg: Previously sent status message; only ``msg.message_id`` is used.
    """
    # Matches every character that is not a word character.
    non_word = re.compile(r"[^\w]")
    status_prefix = "Please wait while the file is being written to the database. It may take some time.\n"
    max_attempts = 5

    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        total_pages = len(pdf_reader.pages)

        for page_number, page in enumerate(pdf_reader.pages):
            text = page.extract_text()

            if page_number % 10 == 0:
                bot.edit_message_text(
                    chat_id=message.chat.id,
                    message_id=msg.message_id,
                    text=f"{status_prefix}Done...{round(page_number/total_pages*100, 2)}%",
                )

            if not text:
                continue

            text = non_word.sub(" ", text)

            try:
                for _ in range(max_attempts):
                    content = generate_prompt_for_db(text)

                    if 'error generating the response:' not in content and "couldn't generate a response." not in content:
                        database_replenishment(str(content), collection)
                        break
                else:
                    # Every attempt returned a failure message: keep the raw text.
                    database_replenishment(str(text), collection)
            except Exception as e:
                print('Error:', e)
                database_replenishment(str(text), collection)

    bot.edit_message_text(
        chat_id=message.chat.id,
        message_id=msg.message_id,
        text=f"{status_prefix}Done!",
    )
    print('Successful')


In [None]:
def generate_quastion_for_PP_code(topics, text, name):
    """Ask the model for python-pptx code that builds one presentation.

    Args:
        topics: Topics the presentation should cover (inserted into the prompt).
        text: Source texts for those topics (inserted into the prompt).
        name: File name the generated code is asked to save the presentation as.

    Returns:
        The code found inside the first Markdown code fence of the reply, with
        the fence lines removed. When the reply has no code fence (for example
        an error message from ``generate_content``), the reply is returned
        unchanged so callers can still inspect it.
    """
    prompt_input = f"""
You are an assistant with artificial intelligence who generates a code for creation a PowerPoint presentation using python-pptx library based on given text. Name the pptx file like this:{name}
Please provide output only in code format without anything else. Note that 'SlideShapes' object has no attribute 'subtitle'. Please note that the topics and texts must be spelled out explicitly (for example, text = 'Some text...')
List of topics:
{topics}
List of texts from book:
{text}
"""

    content = generate_content(prompt_input)

    # Locate the fence instead of slicing a fixed number of characters: the
    # reply may use a different language tag, carry a trailing newline, or
    # have no fence at all.
    fenced = re.search(r"```[^\n]*\n(.*?)(?:```|\Z)", content, re.DOTALL)
    if fenced is None:
        return content
    return fenced.group(1).strip("\n")


def generate_topics_for_section(text, titles):
    """Ask the model which topics belong to one course section.

    Args:
        text: The section (lecture) description supplied by the user.
        titles: Titles taken from the database (inserted into the prompt).

    Returns:
        A list of non-empty, stripped lines from the reply. A surrounding
        Markdown code fence is removed when present. When ``generate_content``
        reports a failure, its message is returned as the list's content, so
        callers must check the entries for the failure markers.
    """
    prompt_input = f"""
You are an artificial intelligence assistant who sets the topics that should be included in this section of the course. Create themes based on existing titles extracted from the database.
Please provide output only in code format without anything else.
Section:
{text}
Titles from database:
{titles}

Expected output format:
topic1
topic2
...
"""

    content = generate_content(prompt_input)

    # Locate the fence instead of slicing four characters off each end, which
    # mangles replies that are unfenced or carry a language tag.
    fenced = re.search(r"```[^\n]*\n(.*?)(?:```|\Z)", content, re.DOTALL)
    if fenced is not None:
        content = fenced.group(1)

    return [line.strip() for line in content.split('\n') if line.strip()]



def extract_section_topics_from_db(section, collection, titles):
    """Generate topics for one section and fetch the best document for each.

    Topic generation is attempted up to five times. If every attempt returns a
    failure message or no topics, the section text itself is used as the only
    topic so retrieval can still run.

    Args:
        section: One line of the lecture plan.
        collection: ChromaDB collection to query.
        titles: Titles from the database, forwarded to the topic prompt.

    Returns:
        A tuple ``(output, topics)``. ``topics`` is the list of topic strings
        and ``output`` has one entry per topic: the list of documents returned
        by the query, or the sentence 'There is no data on this topic in the
        book.' when nothing was found or the query failed.
    """
    print('Section:', section)

    # Markers are matched as substrings of the whole reply; the second one is
    # kept short so it still matches if the reply was trimmed at its ends.
    failure_markers = ('error generating the response', "couldn't generate a resp")
    max_attempts = 5

    for _ in range(max_attempts):
        candidate = [topic for topic in generate_topics_for_section(section, titles) if topic.strip()]
        reply = '\n'.join(candidate)
        if candidate and not any(marker in reply for marker in failure_markers):
            topics = candidate
            break
    else:
        topics = [section]

    print('Topics generated:', topics)

    no_data = 'There is no data on this topic in the book.'
    output = []
    # Query once per generated topic (iterating ``section`` would walk the
    # individual characters of the section string).
    for query in topics:
        # Generate embeddings for the query
        query_embedding = embedding_function.call(query)

        # Retrieve relevant documents from ChromaDB
        try:
            results = collection.query(
                query_embeddings=query_embedding,
                n_results=1,
            )
        except Exception as e:
            print(f"Error occured in data extraction from db: {e}")
            output.append(no_data)
            continue

        if results['ids'][0]:
            # Extract content from the results
            output.append(results['documents'][0])
        else:
            output.append(no_data)

    return output, topics


def topics2code(sections, docs, max_attempts=5):
    """Build one PowerPoint file per section by running model-generated code.

    For section number ``i`` (1-based) the model is asked for python-pptx code
    that saves ``Lecture<i>.pptx``. The code is executed, and the attempt
    counts as successful only if that file exists afterwards. A section that
    still has no file after ``max_attempts`` attempts is skipped.

    Args:
        sections: One list of topics per lecture.
        docs: Retrieved texts, aligned with ``sections`` by index.
        max_attempts: Maximum number of generate-and-run attempts per lecture.

    Returns:
        The file names of the presentations that were created, in order.
    """
    file_pptx = []

    for index, section in enumerate(sections):
        filename = f"Lecture{index+1}.pptx"
        print('Topics:', section)
        print('Docs:', docs[index])

        # Drop a stale file from an earlier run so the existence check below
        # only succeeds for a file written by this call.
        if os.path.exists(filename):
            os.remove(filename)

        for _ in range(max_attempts):
            code = generate_quastion_for_PP_code(section, docs[index], filename)

            if 'error generating the response' in code:
                continue

            try:
                # SECURITY: this executes model-generated code with the full
                # privileges of the bot process. Run the bot only in an
                # environment where that is acceptable.
                # A single fresh namespace is used so functions defined by the
                # generated code can see its own top-level imports, and so the
                # code cannot overwrite this module's globals.
                exec(code, {"__name__": "__main__"})
            except Exception as e:
                print('Error:', e)
                continue

            if os.path.exists(filename):
                file_pptx.append(filename)
                break
            print('Error:', f"{filename} was not created by the generated code")
        else:
            print('Error:', f"could not create {filename} after {max_attempts} attempts")

    return file_pptx



In [None]:
# pdf_path = '/content/DKR_proposal.pdf'
# message=0
# msg=0
# collection = setup_database(embedding_function)
# extract_text_from_pdf(pdf_path, collection, message, msg)

# titles = []
# for i in range(collection.count()):
#   id = 'ids'+str(i)
#   titles.append(collection.get(ids=[id]))

# sections = 'Project Idea\nApplication Domain\nDescription(Dataset, metrics, prediction tasks)\nGraph ML Technique and Model with equations'
# sections = sections.split('\n')

# topics=[]
# docs = []

# # Create a PPTX file
# for section in sections:
#   outputs, queries = extract_section_topics_from_db(section, collection, titles)
#   topics.append(queries)
#   docs.append(outputs)

# print(topics)

# presents = topics2code(topics, docs)

In [None]:
# Read the Telegram bot token from google.colab's `userdata` store under the
# name 'pptx_telegram_API' and create the bot object that the handlers below
# register on and reply through.
TELEGRAM_API = userdata.get('pptx_telegram_API')
bot = telebot.TeleBot(TELEGRAM_API)

@bot.message_handler(commands=['start', 'help'])
def send_welcome(message):
    """Greet the user on /start or /help.

    The greeting depends on whether the 'course' collection already holds
    documents. The global ``collection`` is (re)bound as a side effect.

    Args:
        message: Incoming Telegram message; the reply goes to ``message.chat.id``.
    """
    global collection
    collection = setup_database(embedding_function)

    has_book = collection.count() > 0
    if has_book:
        greeting = "Hi, this is a pptx_creator_bot for creating presentations based on the pdf version of the course book and lecture plan.\nYou already have a pdf file in the database.\nTo continue working with the file, send topics for creating presentations, and to replace the file with another one, send a new pdf file with course materials, but the data from the previous file will be deleted."
    else:
        greeting = "Hi, this is a pptx_creator_bot for creating presentations based on the pdf version of the course book and lecture plan.\nTo start creating presentations, send a pdf file with the course materials."

    bot.send_message(message.chat.id, greeting)


@bot.message_handler(content_types=['document'])
def handle_pdf(message):
    """Replace the database contents with the text of an uploaded PDF.

    The upload is saved to a temporary 'book.pdf', the existing 'course'
    collection is emptied, and every page is written to it through
    ``extract_text_from_pdf``. The stored data is only discarded after the
    upload has been identified as a PDF and downloaded successfully. The
    temporary file is removed even when processing fails.

    Args:
        message: Incoming Telegram message carrying the document.
    """
    global collection
    pdf_path = 'book.pdf'

    # Refuse non-PDF uploads before touching the stored data.
    # NOTE(review): relies on Telegram supplying ``mime_type`` or ``file_name``
    # for the document -- confirm against the pyTelegramBotAPI reference.
    document = message.document
    looks_like_pdf = (
        document.mime_type == 'application/pdf'
        or (document.file_name or '').lower().endswith('.pdf')
    )
    if not looks_like_pdf:
        bot.reply_to(message, "Please send the course materials as a pdf file.")
        return

    try:
        # Download the PDF file
        file_info = bot.get_file(document.file_id)
        downloaded_file = bot.download_file(file_info.file_path)

        # Save the PDF file temporarily
        with open(pdf_path, 'wb') as new_file:
            new_file.write(downloaded_file)

        # Start from an empty collection now that the new file is available.
        collection = setup_database(embedding_function)
        if collection.count() > 0:
            client.delete_collection(name="course")
            collection = setup_database(embedding_function)

        # Extract text from the PDF
        msg = bot.send_message(message.chat.id, "Please wait while the file is being written to the database. It may take some time.")

        extract_text_from_pdf(pdf_path, collection, message, msg)

        # Wait for the user to send text
        bot.reply_to(message, f"{collection.count()} documents have been successfully saved in the database.")
        bot.reply_to(message, "Please provide me the course content by highlighting each lecture in a separate line so that I can include it in pptx.")

    except Exception as e:
        bot.reply_to(message, f"Error: {e}")

    finally:
        # Remove the temporary copy whether or not processing succeeded.
        if os.path.exists(pdf_path):
            os.remove(pdf_path)


@bot.message_handler(content_types=['text'])
def handle_text(message):
    """Turn a lecture plan sent as text into PowerPoint files and send them back.

    Every non-blank line of the message is treated as one lecture. For each
    lecture, topics and matching documents are obtained through
    ``extract_section_topics_from_db``; ``topics2code`` then builds the files,
    which are sent to the chat. Generated files are removed afterwards, even
    when sending fails.

    Args:
        message: Incoming Telegram text message.
    """
    global collection
    collection = setup_database(embedding_function)

    if collection.count() == 0:
        bot.reply_to(message, "The database does not contain data about the book, so first send the pdf file, and then enter the topics of the lectures.")
        return

    presents = []
    try:
        bot.reply_to(message, "Please wait until the bot sends you the pptx files.")

        # One lecture per non-blank line of the user's message.
        sections = [line.strip() for line in message.text.split('\n') if line.strip()]

        # Fetch the metadata of all documents in one call and keep only the
        # titles, which is what the topic prompt asks for.
        stored = collection.get(include=['metadatas'])
        titles = [
            metadata['title']
            for metadata in (stored.get('metadatas') or [])
            if metadata and 'title' in metadata
        ]

        topics = []
        docs = []

        # Create a PPTX file
        for section in sections:
            outputs, queries = extract_section_topics_from_db(section, collection, titles)
            topics.append(queries)
            docs.append(outputs)

        presents = topics2code(topics, docs)

        # Send the PPTX file back to the user
        for path in presents:
            with open(path, 'rb') as pptx_file:
                bot.send_document(message.chat.id, pptx_file)

    except Exception as e:
        bot.reply_to(message, f"Error: {e}")

    finally:
        # Clean up the generated files whether or not they were delivered.
        for path in presents:
            if os.path.exists(path):
                os.remove(path)


# Run the bot
# keep_alive()
# Start the bot's polling loop so the handlers registered above receive
# incoming messages (called with non_stop=True and interval=0).
bot.polling(non_stop=True, interval=0)

ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 836.53ms


Successful
Section: Representation of images and videos (Computer representation, Rescaling/manipulating images)
Topics generated: ['Computer Vision: Introduction and Images', 'Histograms and Binary Vision', 'Geometric Transformations', 'Image Processing: Edges, Features and Recognition', 'Computer Vision', 'Images in Computer Vision', 'A Practical Introduction to Computer Vision with OpenCV', 'Image Quantization in OpenCV', 'Image Quantization and its Impact on Computer Vision', 'Grey Scale vs. Colour Images', 'RGB Colour Images', 'Color Images in OpenCV', 'Image Representation and Processing in OpenCV', 'Colour Models: CMY and YUV', 'YUV and HLS Color Spaces in Computer Vision', 'HLS Color Space', 'RGB to HLS Conversion', 'Other Color Spaces', 'Color Applications', 'Skin Detection', 'Red Eye Detection with OpenCV', 'Noise in Images', 'Adding Gaussian Noise to Images in OpenCV', 'Adding Salt and Pepper Noise to Images in OpenCV', 'Noise Generation and Evaluation', 'Noise Reduction in 