# UNICC Chatbot

In this notebook, we train the open-source Llama LLM on our dataset of UNECE documents, using a 4-bit quantized version from Unsloth to improve efficiency. To improve accuracy, we also introduced a RAG pipeline that identifies relevant parsed segments of the PDF database and passes them in as context to the Llama queries.

## Contents

1. Performing text splitting: parses the PDF database of UNECE policy documents and session resolutions into "chunks" of roughly 50-650 characters, using font size, boldness, etc. to identify section headers. Stores these chunks along with document metadata to later feed into the RAG pipeline.

  **1.1**. Uses llama-index (open-source indexing/retrieval library) with a Hugging Face embedding model (BAAI/bge-small-en-v1.5) to embed these chunks and store them in a vector index for later retrieval. Relevance is scored by the similarity between the query embedding and the chunk embeddings (dense vectors rather than tf-idf).

2. Llama 4-bit quantized: prepares the model itself, using unsloth to get a pre-trained 4-bit quantized version of Llama 3.1 8B Instruct. Re-loads the same PDFs from the text splitting phase but as entire documents to pass into the model for fine-tuning.

    **2.1.** Uses LoRA for fine-tuning
    **2.2.** Trains with SFTTrainer from hugging face

3. Front end: basic chatbot website set up for demo purposes -- allows user to input questions, view responses, and interact with relevant documents based on submitted queries (collected from the chunk embeddings metadata). Uses ngrok to simulate a mini-server on Colab.

In [None]:
# Text parsing packages
!pip install pdfplumber
!pip install fitz
!pip install PyMuPDF
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
# Imports for the RAG encodings
!pip install llama-index
!pip install llama-index-embeddings-huggingface

In [None]:
# Imports for the model
!pip install unsloth
!pip install -U bitsandbytes

In [None]:
# Installs for frontend
!pip install flask-ngrok pyngrok
!ngrok authtoken 2ojOiPQ59Oi8KsIkY8xByZxp3xp_GJ1GTXSJfSimSNUKquke
!pip install pdf2image

# Performing text splitting

Parsing PDF into chunks based on topic headers, which are then encoded to use in the RAG pipeline

In [None]:
import fitz
import os
from google.colab import drive
import glob
import re

def extract_chunks(pdf_path, title):
    """Split a PDF into header-delimited text chunks for the RAG index.

    Walks every text line of every page (via ``page.get_text("dict")``) and
    groups the lines into chunks. A new chunk starts when a line looks like a
    section header (font size >= 14 or a bold font, and not a bullet point),
    or when the running chunk has grown past 650 characters. Lines made only
    of digits, lines in a font smaller than 10 and empty lines are dropped.
    Lines matching the table heuristic are collected separately per block.

    Args:
        pdf_path: Path of the PDF file to parse.
        title: Document title stored on every chunk; it is also used as the
            chunk title until the first header is found.

    Returns:
        A list of dicts with the keys ``"title"``, ``"chunk_title"`` and
        ``"content"``. Text chunks come first, followed by one entry for each
        block that contained table-like lines.
    """
    max_chunk_chars = 650     # flush the running chunk once it grows past this
    header_font_size = 14     # lines at least this large are treated as headers
    footnote_font_size = 10   # smaller lines are treated as footnotes and skipped

    chunks = []
    tables = []
    current_chunk = ""
    current_title = title

    # Pattern for detecting rows in tables.
    # NOTE(review): this also matches ordinary prose that starts with "Word, ...";
    # tighten it if too many body lines end up classified as table rows.
    table_pattern = re.compile(r"([A-Za-z0-9]+(\s{2,}|,\s?))+")

    # bullet points pattern --> used to avoid tracking bullet itself as a section header
    bullet_pattern = re.compile(r"^[•●○‣▪■□–-]\s")

    def make_chunk(content):
        # Reads current_title at call time, i.e. the header in effect right now.
        return {"title": title, "chunk_title": current_title, "content": content}

    # The context manager closes the document even if parsing raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if "lines" not in block:
                    continue  # nothing to read in blocks without text lines

                table_content = []
                for line in block["lines"]:
                    # accounting for too big chunks
                    if len(current_chunk) > max_chunk_chars:
                        chunks.append(make_chunk(current_chunk.strip()))
                        current_chunk = ""

                    spans = line["spans"]
                    if not spans:
                        continue  # no span to read the font from

                    line_text = " ".join(span["text"] for span in spans).strip()
                    font_size = spans[0]["size"]
                    font_name = spans[0]["font"]

                    # heuristics for headers ("SemiBold" also contains "Bold")
                    is_bold = "Bold" in font_name
                    is_bullet = bullet_pattern.match(line_text)
                    line_text = bullet_pattern.sub("", line_text).strip()

                    # Skip empty lines (an empty bold line would otherwise become
                    # a blank header) and numbers-only lines (e.g. page numbers).
                    if not line_text or line_text.isdigit():
                        continue

                    # heuristics for footnotes (ignored for now because they are
                    # often not useful without context)
                    if font_size < footnote_font_size:
                        continue

                    # checking for tables
                    if table_pattern.match(line_text):
                        table_content.append(line_text)
                        continue

                    # checking for headers
                    if (font_size >= header_font_size or is_bold) and not is_bullet:
                        if current_chunk:
                            chunks.append(make_chunk(current_chunk.strip()))
                        current_title = line_text
                        # The header text also starts the chunk body, because
                        # chunk_title alone is often not specific enough.
                        current_chunk = line_text
                    else:
                        current_chunk += " " + line_text

                if table_content:
                    tables.append(make_chunk("\n".join(table_content)))

    if current_chunk:
        chunks.append(make_chunk(current_chunk.strip()))
    chunks.extend(tables)

    return chunks

# Mount Google Drive so the PDF dataset folder is reachable from Colab.
drive.mount('/content/drive')

# NOTE: you will have to create this folder yourself in your own Drive; it just
# contains all of the PDFs Jason gave us.
# Alternative folder that was used before: '/content/drive/My Drive/UNICC_db/'
folder_path = '/content/drive/My Drive/UNICC_dataset/'
file_pattern = os.path.join(folder_path, '*.pdf')

# Parse every PDF in the folder; the file name without its extension is the title.
chunks = []
for file_path in glob.glob(file_pattern):
    filename = os.path.basename(file_path)
    title = os.path.splitext(filename)[0]
    print("Extracting passages from document:", file_path)
    chunks += extract_chunks(file_path, title)

# Chunks shorter than 50 characters were not informative in testing: keep them
# aside in rem_chunks and index only the rest.
rem_chunks = [chunk for chunk in chunks if len(chunk["content"]) < 50]
chunks = [chunk for chunk in chunks if not len(chunk["content"]) < 50]

"""
for chunk in chunks:
    print(f"Title: {chunk['title']}")
    print(f"Chunk Title: {chunk['chunk_title']}")
    print(f"Content length: {len(chunk['content'])}")
    print(f"Content: {chunk['content'][:200]}...")  # Print the first 200 characters of content
    print("\n")
"""


## Getting embeddings from chunks

In [None]:
# using https://docs.llamaindex.ai/en/v0.9.48/examples/embeddings/huggingface.html
# https://docs.llamaindex.ai/en/stable/module_guides/models/embeddings/

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Document, Settings, VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.llms import MockLLM

# llama_index is used here only for embeddings and for collecting relevant
# context, not for its LLMs, so MockLLM is basically a placeholder.
llm = MockLLM()

# Embedding model used for the chunk index.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embed_model


def _chunk_to_document(chunk):
    """Wrap one parsed chunk dict in a Document, keeping both titles as metadata."""
    metadata = {
        'document_title': chunk['title'],
        'chunk_title': chunk['chunk_title'],
    }
    return Document(text=chunk['content'], metadata=metadata)


# reformatting the chunks found above into document form, then indexing them
documents = [_chunk_to_document(chunk) for chunk in chunks]
index = VectorStoreIndex.from_documents(documents)

In [None]:
# Sanity check: retrieve the chunks most similar to a sample question.
query_engine = index.as_query_engine(embed_model=embed_model, llm=llm, similarity_top_k=10)
response = query_engine.query("What is methane drainage?")

print("\nRelevant documents found:")
for i, node in enumerate(response.source_nodes, start=1):
    metadata = node.node.metadata
    document_title = metadata.get('document_title', 'No title available')
    chunk_title = metadata.get('chunk_title', 'No chunk title available')
    print(f"\nDocument {i}:")
    print(f"Document Title: {document_title}")
    print(f"Chunk Title: {chunk_title}")
    print(f"Content: {node.node.text}\n")

# Llama 4-bit quantized


In [None]:
# From https://huggingface.co/unsloth/Meta-Llama-3.1-8B-bnb-4bit

* We support Llama, Mistral, Phi-3, Gemma, Yi, DeepSeek, Qwen, TinyLlama, Vicuna, Open Hermes etc
* We support 16bit LoRA or 4bit QLoRA. Both 2x faster.
* `max_seq_length` can be set to anything, since we do automatic RoPE Scaling via [kaiokendev's](https://kaiokendev.github.io/til) method.
* [**NEW**] We make Gemma-2 9b / 27b **2x faster**! See our [Gemma-2 9b notebook](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing)
* [**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing)
* [**NEW**] We make Mistral NeMo 12B 2x faster and fit in under 12GB of VRAM! [Mistral NeMo notebook](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing)

In [None]:
from unsloth import FastLanguageModel
import torch

# from unsloth docs at https://huggingface.co/unsloth

# Sequence length handed to the model loader here and to the SFTTrainer later on.
max_seq_length = 2048
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


# Load the pre-quantized instruct model together with its tokenizer.
# The commented-out names after model_name are alternative checkpoints.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit", #"unsloth/Meta-Llama-3.1-70B-bnb-4bit", # "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
# Adding LoRA adapters to the projection modules listed in target_modules.
# lora_alpha is raised (64 instead of 16) to increase the impact of the UNECE
# dataset over pre-training; rank-stabilized LoRA is enabled for slightly
# improved performance.
# NOTE(review): with use_rslora=True the adapter scaling is presumably
# lora_alpha / sqrt(r) rather than lora_alpha / r -- confirm that the effective
# scale is the one intended.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64, # increased from 16 to increase the impact of our training dataset in comparison to the default training
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # rank stabilized LoRA
    loftq_config = None,
)

### Data Prep
Parsing UNECE PDFS into text -- doesn't do any other processing.

In [None]:
import fitz
import spacy
import glob
from google.colab import drive
import os
import pandas as pd
from datasets import load_dataset


def extract_text(file_path):
    """Return the full text of a PDF as one lower-cased line.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages, lower-cased, with every run of whitespace
        (including line breaks) collapsed to a single space.
    """
    # The context manager closes the document instead of leaking the handle.
    with fitz.open(file_path) as doc:
        text = "".join(page.get_text() for page in doc)

    # Line breaks must become spaces: deleting them outright glues the last
    # word of one line to the first word of the next.
    text = " ".join(text.split())
    # make it all lower case
    text = text.lower()
    print(text)
    return text

drive.mount('/content/drive')

# Every training example must end with the tokenizer's EOS token.
EOS_TOKEN = tokenizer.eos_token
folder_path = '/content/drive/My Drive/UNICC_dataset'  # UNICC_db for smaller sample
file_pattern = os.path.join(folder_path, '*.pdf')

# processing all PDFs: one training example per file (whole text followed by EOS)
docs = []
for file_path in glob.glob(file_pattern):
    print(f"Processing file: {file_path}")
    document_text = extract_text(file_path)
    docs.append(document_text + EOS_TOKEN)

# Persist as a one-column CSV; escapechar is set because the actual data contains commas.
df = pd.DataFrame({"text": docs})
df.to_csv('dataset.csv', index=False, escapechar='\\')

def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch with truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

# Reload dataset.csv as the training split, then tokenize it batch by batch.
# NOTE: the SFTTrainer further down is given `dataset`, not `tokenized_datasets`.
dataset = load_dataset(
    'csv',
    data_files={'train': 'dataset.csv'},
    split='train',
)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# add labels by copying input_ids
def add_labels(batch):
    """Add a ``labels`` entry that is a copy of the batch's ``input_ids``.

    The batch is modified in place and also returned.
    """
    labels = batch['input_ids'].copy()
    batch['labels'] = labels
    return batch

# Attach the labels column to every batch of the tokenized dataset.
tokenized_datasets = tokenized_datasets.map(add_labels, batched=True)

<a name="Train"></a>
### Train the model
SFT Docs (chosen based on unsloth docs): (https://huggingface.co/docs/trl/sft_trainer) -- train with max_steps for now to shorten the training process and reduce compute on Colab

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning on the raw "text" column of `dataset`.
# max_steps keeps the run short for Colab; switch to num_train_epochs for a
# full pass over the data.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        #num_train_epochs = 1, # set this for 1 full training run.
        max_steps = 10,
        learning_rate = 2e-4,
        # exactly one of fp16 / bf16 is enabled, depending on hardware support
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

In [None]:
# training!
import os
from google.colab import userdata

# The Weights & Biases API key is read from Colab secrets and exported through
# the environment before training starts.
os.environ.update({"WANDB_API_KEY": userdata.get('wandb-api-key')})
trainer_stats = trainer.train()

<a name="Inference"></a>
### Inference

In [None]:

# Example questions:  Why is it important to reduce gas emissions?
# From what type of mine do most coal mine emissions come from?
# When and where did the first occurance of methane drainage take place?

# Alternating question / response strings, oldest first (even index = user question).
conv_history = [] # list of question - response strings
# Retrieved document excerpts accumulated across questions; get_context() prepends to it.
context = ""
# Maximum length, in characters, of the newline-joined history before old turns are dropped.
max_conv_len = 1024 # used to trim conv history if it's getting too long

""" returns a formatted string of the conversation history to pass into a prompt """
def get_conv_hist():
    global conv_history
    formatted_conv = [("User: " + conv if i % 2 == 0 else "AI assistant: " + conv) for i, conv in enumerate(conv_history)]
    formatted_conv= "\n".join(formatted_conv)
    return formatted_conv

""" adds text to the existing conv history, and reduces len if it exceeds max_conv_len """
def set_conv_hist(text):
    global conv_history
    conv_history.append(text)

    # cutting off oldest parts of the conversation in pairs of 2 (question + answer)
    while len("\n".join(conv_history)) > max_conv_len:
      conv_history = conv_history[2:]


"""
Uses the query engine created during the "Getting embeddings from chunks" section
to identify the most similar document chunks
given a specific query.
"""
def get_context(question):
    # adding context from the identified similar chunks
    global context
    cur_cont = ""
    titles = []
    query_engine = index.as_query_engine(embed_model=embed_model, llm=llm, similarity_top_k=10)
    question_history = [conv if i%2 == 0 else "" for i, conv in enumerate(conv_history)]
    #print("CONTEXT QUERY: ", ("\n".join(question_history) + question))
    response = query_engine.query(("\n".join(question_history) + question))
    for i, doc in enumerate(response.source_nodes, start=1):
        #print(f"Document {i}: {doc.node.text[:200]}...")
        #print(f"Document Title: {doc.node.metadata.get('document_title', 'No title available')}")
        if i < 8: #taking top 8 results
          cur_cont += f"Document: {doc.node.metadata.get('document_title', 'No title available')} (Excerpt from text: {doc.node.text}) \n\n"
          titles.append(doc.node.metadata.get('document_title', 'No title available'))

    # adding this new round of docs to the FRONT of the context string
    context = cur_cont + context

    return context, titles

"""
Explores prompt engineering to get a prompt that takes in context and a question.
TODO: this doesn't really account for questions that CANNOT be answered in the dataset.
"""
def get_prompt(question):
    context, titles = get_context(question)

    formatted_conv = get_conv_hist()

    # this prompt helps keep a consistent complete sentance format
    # NOTE: the tags are specific to the instruct model, see https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_1/
    prompt = f"""
    <|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You are a helpful AI assistant. You will analyze the information in the provides context passages and conversation history, and answer questions based solely on that context.
    Answer the question based on the information in the passages.

    - Do NOT reference the context chunks directly
    - Respond in a complete sentence
    - if the question cannot be answered based on the information in the passages, say so explicitly

    Here is the history of your conversation with the user:
    {formatted_conv}
    <|eot_id|>

    <|start_header_id|>user<|end_header_id|>

    Here is the relevant context:
    {context}

    Question: {question}

    <|start_header_id|>assistant<|end_header_id|>
    """

    return prompt, titles

"""
Primary method called from the frontend.

Takes in a question, formats the prompt and context, then passes the output into the model.
"""
def get_response(question):
  global conv_history
  FastLanguageModel.for_inference(model) # Enable native 2x faster inference

  prompt, titles = get_prompt(question)

  #print(prompt)

  inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")
  #outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
  outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        min_new_tokens=5, #key to avoiding empty inputs
        temperature=0.1, # increase here = more likely to choose less probable tokens, increases creativity (we don't want that lol)
        use_cache=True,
        top_p=0.2, # similar to temp
        #num_beams=3,
        # turned this off to ensure consistency, cancells out temp and top_p
        do_sample=False, # random samples groups of likely tokens, also introduces randomness that increases creativity
        pad_token_id=tokenizer.eos_token_id,
        # Stop at the end of the answer
        eos_token_id=tokenizer.eos_token_id,
        # Prevent prompt repetition
        no_repeat_ngram_size=3
    )

  #response = tokenizer.batch_decode(outputs)
  response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
  if not response: #this shouldn't ever happen bc we've set min_new_tokens but is here in case something goes wrong
    return "Hm, I can't seem to find an answer to that question in this dataset."

  # adding both the question and response to the conversation history
  set_conv_hist(question)
  set_conv_hist(response.strip())

  return response.strip(), titles

In [None]:
# Testing inference

# Example questions:  Why is it important to reduce gas emissions?
# From what type of mine do most coal mine emissions come from?
# When and where did the first occurance of methane drainage take place?
# "When and where was methane drainage first recorded?"
# "Which documents should I reference to learn more about methane drainage?"
# "From what type of coal mine does the most ventilation air methane come from?" --> this one is good

question = "What is methane drainage?"
resp, titles = get_response(question)
print(resp)
print("\n")

question = "Can you explain in more detail?"
resp, titles = get_response(question)
print(resp)
print("\n")

question = "When and where did methane drainage first take place?"
resp, titles = get_response(question)
print(resp)
print("\n")

# Front end

We run a Flask app from the server in the "Main server" subsection. It serves a basic frontend that displays an input where the user can send a question, then passes that question to our inference functions, which return a response. We then display the response on the frontend and prompt the user again for input.

INSTRUCTIONS to run the app:


1.   Run all of the cells in the file until you reach the "Main server" section (shortcut: go to that cell, click on it, then go to Runtime->Run before to run every cell prior in the notebook)
2.   Run the main server cell
3.   After it starts, you'll see the following output (or similar):

* Public URL: NgrokTunnel: "https://8c7f-34-142-236-178.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off
INFO:werkzeug:WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
 * Running on http://127.0.0.1:5000

4.   Click on the NgrokTunnel URL (NOT localhost or 127.0.0.1), then select OK when asked about security. This will be the page where you can see/interact with the bot.



## HTML Templates
Run each cell to create the file, which will then be stored in the Colab file storage. This just prevents us from having to upload new files every time we run the Colab/change the HTML

In [None]:
# First, create necessary directories and files
!mkdir -p templates static/css
!mkdir -p content

In [None]:
%%writefile templates/base.html
{# Base layout template, written to disk by the %%writefile cell magic above (the magic must be the first line of the cell). #}
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{{ title }}</title>
    <link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
</head>
<body>
    <nav>
        <ul>
            <li><a href="{{ url_for('home') }}">Home</a></li>
        </ul>
    </nav>

    <main>
        {% block content %}
        {% endblock %}
    </main>
</body>
</html>

In [None]:
%%writefile templates/home.html
{% extends "base.html" %}

{% block content %}
<div class="container">
    <h1>Welcome to Flask</h1>
    <p>This is your homepage with styled content!</p>
</div>
{% endblock %}

In [None]:
%%writefile templates/index.html

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>GenAI-Bot</title>
    <link rel="stylesheet" href="{{ url_for('static', filename='css/bot-style.css') }}">
    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.2.1/jquery.min.js"></script>
</head>
<body>
    <header>
        <h1>Questions about Climate Change?</h1>
        <h4>Ask our interactive bot below to be directed to helpful resources</h4>
    </header>

    <main>
        <div class="chat-container">
            <div id="chatbox" class="chat-box">
                <p class="botText">
                    <span>Hello! How can I help you?</span>
                </p>
            </div>
            <div id="userInput" class="input-container">
                <input id="textInput" type="text" name="msg" placeholder="Type your message..." />
                <!-- could add submit button here if wanted -->
            </div>
        </div>


        <!-- what actually displays the relevant docs -->
        <div id="relevantDocs" class="relevant-docs">
            <h4>Relevant Documents:</h4>
            {% for doc in rel_docs %}
            <p>
                <a href="{{ doc.url }}" target="_blank">{{ doc.title }}</a>
            </p>
            {% endfor %}
        </div>

        <div class="actions">
            <a href="{{ url_for('dataset')}}" class="view-pdfs-button">View all PDFs</a>
        </div>

        <br/>
        <br/>
        <br/>

    </main>

    <script>
        // Sends the typed message to the /get endpoint, then renders the answer
        // and the list of relevant documents returned by the server.
        function getBotResponse() {
            // Renders one chat bubble. .text() escapes the message, so neither
            // user input nor model output is ever parsed as HTML.
            function appendBubble(cssClass, message) {
                var bubble = $("<p></p>").addClass(cssClass);
                bubble.append($("<span></span>").text(message));
                $("#chatbox").append(bubble);
            }

            function scrollToInput() {
                document
                    .getElementById("userInput")
                    .scrollIntoView({ block: "start", behavior: "smooth" });
            }

            var rawText = $("#textInput").val().trim();
            if (!rawText) {
                return; // nothing to send
            }

            $("#textInput").val("");
            appendBubble("userText", rawText);
            scrollToInput();

            $.get("/get", { msg: rawText }).done(function(data) {
                appendBubble("botText", data.response);

                var docsContainer = document.getElementById("relevantDocs");
                docsContainer.innerHTML = ""; // Clear existing docs

                // Clearing the container also removes its heading, so add it back.
                var heading = document.createElement("h4");
                heading.textContent = "Relevant Documents:";
                docsContainer.appendChild(heading);

                (data.rel_docs || []).forEach(function(doc) {
                    var p = document.createElement("p");
                    var a = document.createElement("a");
                    a.href = doc.url;
                    a.textContent = doc.title;
                    a.target = "_blank"; // Opens in new tab
                    p.appendChild(a);
                    docsContainer.appendChild(p);
                });

                scrollToInput();
            }).fail(function() {
                // Without this the chat would silently show nothing on a server error.
                appendBubble("botText", "Sorry, something went wrong. Please try again.");
                scrollToInput();
            });
        }

        $("#textInput").keypress(function(e) {
            if (e.which == 13) {
                getBotResponse();
            }
        });
    </script>
</body>
</html>


In [None]:
%%writefile templates/pdf_gallery.html

<!DOCTYPE html>
<html>
<head>
    <title>PDF Thumbnail Gallery</title>
     <link rel="stylesheet" href="{{ url_for('static', filename='css/bot-style.css') }}">
    <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.16/dist/tailwind.min.css" rel="stylesheet">
</head>
<body>
    <div class="back-button">
        <button> <a href="{{ url_for('home')}}"> Back to Bot </a> </button>
    </div>
    <div class="container mx-auto my-8">
        <h1 class="text-3xl font-bold mb-4">PDF Thumbnail Gallery</h1>
        <div class="grid grid-cols-3 gap-4">
            {% for pdf in pdf_data %}
            <div class="border rounded shadow p-4">
                <!-- <img src="{{ pdf.thumbnailUrl }}" alt="{{ pdf.title }}" class="w-full h-auto"> -->
                <a href="#" onclick="openPdfInNewTab('{{ pdf.pdfUrl }}'); return false;" class="block border rounded shadow p-4 hover:shadow-lg transition-shadow">
                  <img src="{{ url_for('static', filename=pdf.thumbnailUrl) }}" class="image" />
                  <h3 class="mt-2 text-lg font-medium">{{ pdf.title }}</h3>
                </a>
            </div>
            {% endfor %}
        </div>
    </div>
    <script>
        // Opens the given PDF link in a new browser tab.
        function openPdfInNewTab(pdfUrl) {
            var target = '_blank';
            window.open(pdfUrl, target);
        }
    </script>
</body>
</html>

## CSS

In [None]:
%%writefile static/css/bot-style.css


/* ---- Reset and page defaults ---- */
* {
    box-sizing: border-box;
    margin: 0;
    padding: 0;
}

body, html {
    height: 100%;
    font-family: 'Arial', sans-serif;
    background-color: #f4f4f9;
    color: #333;
    line-height: 1.6;
}

/* ---- Page header ---- */
header {
    text-align: center;
    padding: 20px;
    background-color: #f4f4f9;
    color: #4c87af;
}

header h1 {
    margin-bottom: 10px;
    font-size: 2rem;
}

header h4 {
    font-weight: normal;
}

/* ---- Chat window ---- */
.chat-container {
    max-width: 600px;
    margin: 20px auto;
    padding: 20px;
    background: white;
    border-radius: 10px;
    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
}

.chat-box {
    max-height: 400px;
    overflow-y: auto;
    margin-bottom: 20px;
    padding: 10px;
    border: 1px solid #ddd;
    border-radius: 5px;
    background-color: #f9f9f9;
}

.input-container {
    display: flex;
    justify-content: space-between;
}

#textInput {
    width: 100%;
    padding: 10px;
    border: 1px solid #ccc;
    border-radius: 5px;
    font-size: 16px;
    outline: none;
    transition: border-color 0.2s;
}

#textInput:focus {
    border-color: #4c87af; /* previously #4CAF50 ("//" is not a valid CSS comment) */
}

/* ---- Chat bubbles ---- */
.userText, .botText {
    margin: 10px 0;
    font-size: 16px;
}

.userText span {
    background-color: #444;
    color: white;
    padding: 10px;
    border-radius: 10px;
    display: inline-block;
}

.botText span {
    background-color: #4c87af;
    color: white;
    padding: 10px;
    border-radius: 10px;
    display: inline-block;
}

/* ---- "View all PDFs" action ---- */
.actions {
    text-align: center;
    margin: 20px 0;
}

.view-pdfs-button {
    display: inline-block;
    padding: 10px 20px;
    background-color: #4c87af;
    color: white;
    text-decoration: none;
    border-radius: 5px;
    transition: background-color 0.3s;
}

.view-pdfs-button:hover {
    background-color: #456ba0; /* previously #45a049 ("//" is not a valid CSS comment) */
}

/* ---- Relevant documents list ---- */
.relevant-docs {
    max-width: 600px;
    margin: 20px auto;
    padding: 10px;
    background: #f9f9f9;
    border-radius: 5px;
    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
}

.relevant-docs p a {
    color: #333;
    text-decoration: none;
}

.relevant-docs p a:hover {
    text-decoration: underline;
}

/* ---- Footer ---- */
footer {
    text-align: center;
    padding: 10px 0;
    background: #4c87af;
    color: white;
    position: fixed;
    bottom: 0;
    width: 100%;
}


## Main server

In [None]:
# from https://colab.research.google.com/drive/10doc9xwhFDpDGNferehBzkQ6M0Un-tYq#scrollTo=QDVm2QUrnJaF
!apt-get install poppler-utils

In [None]:
from flask import Flask, request, render_template, jsonify
from pyngrok import ngrok
import os
import shutil
from pdf2image import convert_from_path
from googleapiclient.discovery import build
from google.oauth2 import service_account
import json

""" uses GDrive API to find folder with a given name """
def find_folder(drive_service, folder_name):
    # getting all folders shared with the drive API
    all_folders = drive_service.files().list(
        q="mimeType='application/vnd.google-apps.folder' and trashed=false",
        fields='files(id, name)',
        spaces='drive'
    ).execute()

    folders = all_folders.get('files', [])
    for folder in folders:
        if folder['name'].lower() == folder_name.lower():
            return folder['id']

    return None

""" gets a folder with GDrive API, then pulls all PDFS from the folder """
def get_pdfs():
    colab_dir = '/content/static'
    os.makedirs(colab_dir, exist_ok=True)

    # initialize Google Drive API
    creds = service_account.Credentials.from_service_account_info(
        info=json.load(open('/content/service_account.json', 'r'))
    )
    drive_service = build('drive', 'v3', credentials=creds)

    folder_id = find_folder(drive_service, 'UNICC_dataset')

    if not folder_id:
        raise Exception("Could not find the UNICC_db folder. Please check folder sharing permissions.")

    # Search for PDF files in the folder
    file_list = drive_service.files().list(
        q=f"'{folder_id}' in parents and mimeType='application/pdf' and trashed=false",
        fields='files(id, name, webViewLink)',
        pageSize=100,
        spaces='drive'
    ).execute()

    pdf_files = file_list.get('files', [])
    print(f"\nFound {len(pdf_files)} PDF files:")
    for pdf in pdf_files:
        print(f"- {pdf['name']} (ID: {pdf['id']})")

    if not pdf_files:
        # If no PDFs found, check what files are actually in the folder
        all_files = drive_service.files().list(
            q=f"'{folder_id}' in parents and trashed=false",
            fields='files(id, name, mimeType)',
            pageSize=100
        ).execute()
        print("\nAll files in folder:")
        for file in all_files.get('files', []):
            print(f"- {file['name']} ({file['mimeType']})")

    # extracting text from found PDFs
    pdf_data = []
    for file in pdf_files:
        filename = file['name']
        print(f"\nProcessing {filename}...")
        local_path = os.path.join(colab_dir, filename)

        try:
            request = drive_service.files().get_media(fileId=file['id'])
            with open(local_path, 'wb') as f:
                f.write(request.execute())
            print(f"Downloaded {filename}")

            # generating thumbnail image
            first_page = convert_from_path(local_path, last_page=1)[0]
            thumbnail_path = os.path.join(colab_dir, f"{os.path.splitext(filename)[0]}.png")
            first_page.save(thumbnail_path, 'PNG')
            print(f"Generated thumbnail for {filename}")

            pdf_data.append({
                'pdfUrl': file['webViewLink'],
                'thumbnailUrl': f"{os.path.splitext(filename)[0]}.png",
                'title': os.path.splitext(filename)[0]
            })

        except Exception as e:
            print(f"Error processing file {filename}: {str(e)}")
            continue
        finally:
            if os.path.exists(local_path):
                os.remove(local_path)
                print(f"Cleaned up {filename}")

    return pdf_data

# Flask application serving the chatbot frontend.
app = Flask(__name__)


@app.route('/')
def home():
    """Serve the chat page rendered from ``index.html``."""
    page = render_template('index.html', title='Home')
    return page

@app.route("/get")
def get_bot_response():
    userText = request.args.get('msg')
    response, titles = get_response(userText)
    print("updated titles: ",titles)

    # getting URLs for all of the titles
    titles = set(titles) # removing duplicates
    rel_docs = []
    for title in titles:
        for pdf in pdf_data:
            if pdf['title'] == title:
                rel_docs.append({"title": title, "url": pdf['pdfUrl']})
                break

    return jsonify({
        'response': response,
        'rel_docs': rel_docs
    })

@app.route("/dataset")
def dataset():

    return render_template('pdf_gallery.html', pdf_data=pdf_data)

if __name__ == "__main__":

    # loading PDFs once bc it takes forever
    try:
        pdf_data = get_pdfs()
    except Exception as e:
        print(f"Error loading PDF data: {str(e)}")
        pdf_data = []

    # Get a tunnel from ngrok and run Flask
    public_url = ngrok.connect(5000)
    print(f' * Public URL: {public_url}')

    # Run the app
    app.run(port=5000)