In [1]:
from ollama import Client
from transformers import PreTrainedTokenizerFast
from dotenv import load_dotenv
import nest_asyncio

load_dotenv()
nest_asyncio.apply()

# This code cleans gets text from the samples folder and puts them in the output folder
target_transcription = [
    "samples/Lecture04/transcribe.txt"
]

target_material = [
    "samples/Lecture04/pdf_text.txt"
]

model = "llama3.1:8b-instruct-q4_0"
hf_model = "meta-llama/Llama-3.1-8B-Instruct"
embed_model = "mxbai-embed-large:latest"
ollama_endpoint = "http://127.0.0.1:11434"

c = Client(ollama_endpoint, timeout=60)
t = PreTrainedTokenizerFast.from_pretrained(hf_model)

In [2]:
# Prepare LlamaIndex
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.readers.file import CSVReader
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import Settings
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.node_parser import SentenceSplitter
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.chat_engine import CondensePlusContextChatEngine
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core import Document

text_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20, tokenizer=t)
Settings.llm = Ollama(model=model, base_url=ollama_endpoint)
Settings.embed_model = OllamaEmbedding(base_url=ollama_endpoint, model_name=embed_model)

resource module not available on Windows


In [3]:
transcription_text: list[str] = []
material_text: list[str] = []

documents: list[Document] = []

for file in target_transcription:
    with open(file, "r") as f:
        text = f.read()
        documents.append(
            Document(
                text=text,
                extra_info={"type": "raw transcription"}
            )
        )
        transcription_text.append(text)

for file in target_material:
    with open(file, "r") as f:
        text = f.read()
        documents.append(
            Document(
                text=text,
                extra_info={"type": "raw material"}
            )
        )
        material_text.append(text)

In [4]:
# Manually summarise transcription and material into points
summariser_splitter = SentenceSplitter(chunk_size=15*1024, chunk_overlap=512, tokenizer=t)
summaries: list[str] = []

for chunk in summariser_splitter.split_texts(transcription_text) + summariser_splitter.split_texts(material_text):
    result = c.generate(
        model=model,
        prompt=(
f"""Clean up the given document and restructure it using Markdown.
Clean up the document by structuring the information into points.
Fix and correct grammatical and language errors.
You must include all crucial information. You must not add any information that is not in the document.

<Document>
{chunk}

<Restructured Document>
"""
        ),
        options={
            "temperature": 0,
            "num_ctx": 16*1024,
            "num_predict": 16*1024,
        }
    )

    response = result["response"].strip()
    summaries.append(response)
    print("[Summary]\n" + response, end="\n\n")

[Summary]
**Multicast Communication**

* **Flooding-Based Multicasting**
	+ Each node sends a message M to its neighbors.
	+ Neighbors forward the message to their own neighbors, and so on.
	+ This process continues until all nodes receive the message.
* **Probability of Forwarding (Pf)**
	+ A parameter that determines the probability of a node forwarding a message to its neighbors.
	+ Example: If Pf = 0.01, then each node will forward the message to only 10% of its neighbors.
* **Connectivity Probability (Ps)**
	+ A parameter that determines the probability of a node being connected to other nodes in the network.
	+ Example: If Ps = 0.1, then each node is connected to 1000 other nodes.

**Hypercube-Based Multicasting**

* **4-Dimensional Hypercube**
	+ Each node has 4 dimensions (e.g., x, y, z, and w).
	+ Messages are forwarded along the dimensions of the hypercube.
	+ Example: Node 1001 sends a message to its neighbors in each dimension (x=0, y=1, z=0, w=1).

**Gossip-Based Data Diss

In [18]:
documents_with_summary = documents.copy()

for summary in summaries:
    documents_with_summary.append(
        Document(
            text=summary,
            extra_info={"source": "summary"}
        )
    )

# Prepare the documents to be indexed
def prepare_query_retriever():
    nodes = text_splitter.get_nodes_from_documents(documents_with_summary, show_progress=True)
    index = VectorStoreIndex.from_documents(documents_with_summary, show_progress=True, text_splitter=text_splitter)
    index_retriever = index.as_retriever(similarity_top_k=3)
    bm25_retriever = BM25Retriever.from_defaults(
        nodes=nodes,
        similarity_top_k=3,
    )

    return QueryFusionRetriever(
        [index_retriever, bm25_retriever],
        num_queries=1,
        similarity_top_k=4,
        verbose=True,
    )

qr = prepare_query_retriever()

Parsing nodes:   0%|          | 0/4 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/4 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/54 [00:00<?, ?it/s]

In [19]:
# Create questions out of the summaries
import re
questions = []
question_re = re.compile(r"\d{1,2}\.\s(.*)")

# Trim the first 2 lines
chunk = "\n".join(summaries)

result = c.generate(
    model=model,
    prompt=(
f"""Create 5 questions that covers this summary.
The questions should ask reasoning and not ask for facts.
You must answer in the following format:
<Questions>
1. Question 1
2. Question 2

<Summary>
{chunk}

<Questions>
"""
    ),
    options={
        "temperature": 1,
        "num_predict": 1*1024,
    }
)

response = result["response"]
print(response)
questions = question_re.findall(response)
questions = [q.strip() for q in questions]

questions

Here are 5 reasoning-based questions that cover the provided summary:

1. **Which method is more effective in disseminating information across a distributed system: a push-based approach or a pull-based approach? Justify your answer.**

2. **Consider a network with limited connectivity. How would you balance the need for efficient message dissemination with the constraint of sparse connections? Provide a solution and explain your reasoning.**

3. **Imagine a scenario where a critical message needs to be removed from circulation. What strategies could be employed to prevent further propagation, despite the presence of gossip-based data dissemination? Explain your approach.**

4. **Compare the efficiency of tree-based and mesh-based multicast approaches in terms of resource utilization and latency. Provide a detailed analysis of their trade-offs.**

5. **Suppose you are designing an epidemic behavior model for message dissemination. How would you incorporate features like difficulty in d

['**Which method is more effective in disseminating information across a distributed system: a push-based approach or a pull-based approach? Justify your answer.**',
 '**Consider a network with limited connectivity. How would you balance the need for efficient message dissemination with the constraint of sparse connections? Provide a solution and explain your reasoning.**',
 '**Imagine a scenario where a critical message needs to be removed from circulation. What strategies could be employed to prevent further propagation, despite the presence of gossip-based data dissemination? Explain your approach.**',
 '**Compare the efficiency of tree-based and mesh-based multicast approaches in terms of resource utilization and latency. Provide a detailed analysis of their trade-offs.**',
 '**Suppose you are designing an epidemic behavior model for message dissemination. How would you incorporate features like difficulty in deleting messages, use of death certificates, or other strategies to miti

In [20]:
# Augment each question with the relevant data and information
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import get_response_synthesizer

qe = RetrieverQueryEngine(
    retriever=qr,
)

In [21]:
questions_with_explanations = []

question = questions[0]
result = qe.query(question)

print(result)

**Rewrite**

A gossip-based data dissemination approach with anti-entropy is more effective in disseminating information across a distributed system. This is because it relies on epidemic behavior, where nodes spread updates to their neighbors using local information, and can quickly propagate updates among all nodes using either form of anti-entropy, although push-pull remains the best strategy. The use of pull-based approach works much better when many nodes are infected, making it an excellent way of rapidly spreading news across a distributed system.
