## Types

In [1]:
from typing import Literal
from pydantic import BaseModel, Field


###########
# SOURCES #
###########
class Source(BaseModel):
    id: str
    url: str
    name: str
    desc: str
    type: Literal["moodle", "file", "web", "tg"] = Field("file")

    def __hash__(self) -> int:
        return self.id.__hash__()


class MoodleSource(Source):
    course_id: str
    course_url: str
    course_name: str
    type: Literal["moodle"] = Field("moodle")


class FileSource(Source):
    type: Literal["file"] = Field("file")


class WebSource(Source):
    type: Literal["web"] = Field("web")


class TelegramSource(Source):
    type: Literal["tg"] = Field("tg")


#########
# UTILS #
#########
class Chunk(BaseModel):
    index: int = Field(ge=0)
    source_id: str
    text: str


##########
# SEARCH #
##########
class SearchQuery(BaseModel):
    text: str


class SearchResult(BaseModel):
    source: Source
    distance: float

    def __hash__(self) -> int:
        return self.source.id.__hash__()

## Paths

In [2]:
from pathlib import Path

TEXTS_PATH = Path("../texts")
DATA_PATH = Path("../data")
META_PATH = DATA_PATH / "meta.json"

## Chunk

In [3]:
import os
import json
import logging
import numpy as np
from tqdm import tqdm
from sentence_transformers import util, SentenceTransformer
from langchain_text_splitters import (
    TextSplitter,
    RecursiveCharacterTextSplitter,
    Language,
)

import torch
from collections import Counter

  from tqdm.autonotebook import tqdm, trange


In [4]:
def get_source_by_chunk(chunk_index: str, chunks: list[Chunk]) -> Source:
    for chunk in chunks:
        if chunk.index == chunk_index:
            source_id = chunk.source_id
            break
    else:
        raise ValueError(f"Chunk {chunk_index} not found")

    # load meta data
    with open(META_PATH, "r", encoding="utf-8") as meta_file:
        meta_data = json.load(meta_file)

    sources: list[Source] = []
    for data in meta_data:
        source: Source = Source.model_validate_json(json.dumps(data), strict=True)
        sources.append(source)

    for source in sources:
        if source.id == source_id:
            return source
    else:
        raise ValueError(f"Source {source_id} not found")

In [5]:
def chunk(text_splitter: TextSplitter) -> list[Chunk]:
    # log missing files
    logging.basicConfig(
        filename="missing.log",
        filemode="w",
        level=logging.INFO,
        # format='%',
        encoding="utf-8",
    )

    # get sources data
    with open(META_PATH, "r", encoding="utf-8") as meta_file:
        json_data = json.load(meta_file)

    sources: list[Source] = []
    for data in json_data:
        source: Source = Source.model_validate_json(json.dumps(data), strict=True)
        sources.append(source)

    index = 0
    chunks: list[Chunk] = []
    for source in tqdm(sources, total=len(sources), unit="source"):
        source_text_path = TEXTS_PATH / (source.name + ".txt")
        if not os.path.exists(source_text_path):
            logging.info(source.id)
            continue

        with open(source_text_path, "r", encoding="utf-8") as text_file:
            text = text_file.read()

        for chunk_text in text_splitter.split_text(text):
            chunk = Chunk(index=index, source_id=source.id, text=chunk_text)
            chunks.append(chunk)
            index += 1

    return chunks

In [6]:
def embed(
    texts: list[str],
    model: SentenceTransformer,
) -> np.ndarray:
    embeddings: np.ndarray = model.encode(texts)
    return embeddings

In [7]:
def search(query_embedding: np.ndarray, embeddings: np.ndarray, chunks: list[Chunk]) -> list[SearchResult]:
    results = util.semantic_search(query_embedding, embeddings, top_k=10)

    search_results: list[SearchResult] = []
    for result in results[0]:
        chunk_index = result["corpus_id"]
        source: Source = get_source_by_chunk(chunk_index, chunks)
        search_result = SearchResult(text="", source=source, distance=result["score"])
        search_results.append(search_result)

    # apply majority vote
    counter = Counter([search_result.source for search_result in search_results])
    most_common = counter.most_common(10)
    most_common

    # filter and leave unique documents (a bit of crutch O(n^2))
    new_results: list[SearchResult] = []
    for source, _ in most_common:
        for result in search_results:
            if source.id == result.source.id:
                new_results.append(result)
                break

    return new_results

## Code

In [8]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=20,
    length_function=lambda x: len(x.split()),  # 150 words, not characters
    add_start_index=True,
).from_language(language=Language.MARKDOWN)  # not to brute force all languages

chunks = chunk(text_splitter)
texts = [chunk.text for chunk in chunks]

100%|██████████| 949/949 [00:11<00:00, 80.14source/s] 


In [9]:
for text in texts[::1000]:
    print(text)

УТВЕРЖДЕНЫ
приказом АНО ВО «Университет Иннополис»
от 09.08.2023 №
Директор
___________________ К.В. Семенихин

<b>Правила размещения и проживания</b>
<b>в жилом комплексе</b>
<b>АНО ВО «Университет Иннополис»</b>

<b>г. Иннополис</b>
<b>2023</b>

-----

2

<b>1.</b> <b>Общие положения</b>

1.1. Настоящие Правила размещения и проживания в жилом комплексе АНО ВО
«Университет Иннополис» (далее – Правила) определяют порядок размещения и
проживания в жилом комплексе АНО ВО «Университет Иннополис», расположенном
по адресу: Республика Татарстан, Верхнеуслонский муниципальный район, г.
Иннополис, ул. Университетская, д. 1, корпуса 1, 2, 3, 4 (далее – Комплекс), и
обязательны для исполнения всеми категориями проживающих (далее по тексту –
Проживающие).
1.2. Размещение в Комплексе производится на основании договора о
предоставлении услуг по размещению или на основании договора присоединения о
предоставлении услуг по размещению путем подписания согласия на присоединение
– регистрационной карты (

In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# MODEL_NAME = 'all-mpnet-base-v2'  # SOTA
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME, device=device)
embeddings = embed(texts, model)

cuda


Batches:   0%|          | 0/279 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [12]:
embeddings.shape

(8918, 384)

In [13]:
query = SearchQuery(text="Burmykov Networks course lecture 11")
query_embedding = embed([query.text], model)

results: list[SearchResult] = search(query_embedding, embeddings, chunks)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
results

[SearchResult(source=Source(id='module-109689.pdf', url='https://moodle.innopolis.university/mod/resource/view.php?id=109689', name='module-109689.pdf', desc='Lecture Week 11 Part I (TCP Congestion Control)', type='moodle'), distance=0.4222642481327057),
 SearchResult(source=Source(id='module-79154.pdf', url='https://moodle.innopolis.university/mod/resource/view.php?id=79154', name='module-79154.pdf', desc='Graph Theory (main book)', type='moodle'), distance=0.40513643622398376),
 SearchResult(source=Source(id='module-109578.pdf', url='https://moodle.innopolis.university/mod/resource/view.php?id=109578', name='module-109578.pdf', desc='Lecture Week 10 Part II (Forwarding and Routing. Router Architecture)', type='moodle'), distance=0.537629246711731),
 SearchResult(source=Source(id='module-108403.pdf', url='https://moodle.innopolis.university/mod/resource/view.php?id=108403', name='module-108403.pdf', desc='Lecture Week 2 Part I (Network Characteristics)', type='moodle'), distance=0.447

In [15]:
def print_text_file(source_name: str):
    source_path = TEXTS_PATH / (source_name + ".txt")
    if not os.path.exists(source_path):
        print(f"File {source_path} not found")
        return

    with open(source_path, "r", encoding="utf-8") as file:
        print(file.read())


print_text_file(results[0].source.name)

<h2>Computer Networks</h2>
<h2>Lecture Week 11 (Part I)</h2>

<h1><b>TCP Congestion Control:</b></h1>
<h1><b>The Basic Principles</b></h1>

Artem Burmyakov

April 05, 2024

-----

Recap: Router Architecture

Forwarding Table

-----

Recap: Router Architecture

Forwarding Table

<b>Routers and other network core devices may cause various communication problems</b>

-----

Recap: Router Architecture

Forwarding Table

<b>Routers and other network core devices may cause various communication problems;</b>
<b>A packet loss is one of them, due to a buffer overflow at a router</b>

-----

Reference: TCP/IP Reference Model

-----

Reference: TCP/IP Reference Model <b>Question:</b>
<b>Which layer(s) is/are responsible for</b>
<b>detecting and fixing network-related problems?</b>

-----

Reference: TCP/IP Reference Model <b>Question:</b>
<b>Which layer(s) is/are responsible for</b>
<b>detecting and fixing network-related problems?</b>

<b>Examples of possible problems:</b>

• <b>A packet loss;<

In [16]:
class TestQuery(BaseModel):
    text: str
    relevant: bool
    sources: list[str] | None


queries: list[TestQuery] = []
with open("queries.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        query = TestQuery.model_validate_json(line, strict=True)
        if query.relevant:
            queries.append(query)

queries[:5]

[TestQuery(text='mathan assignments', relevant=True, sources=['module-78346.pdf', 'module-78349.pdf']),
 TestQuery(text='mathan assignments with answers', relevant=True, sources=['module-78363.pdf', 'module-78374.pdf', 'module-85037.pdf']),
 TestQuery(text='mathan midterm prep', relevant=True, sources=['module-78486.pdf']),
 TestQuery(text='agal syllabus', relevant=True, sources=['module-78487.pdf']),
 TestQuery(text='agla basis vector', relevant=True, sources=['module-78516.pdf', 'module-78509.pdf', 'module-89471.pdf'])]

In [17]:
def precision_k(recommended_ids: np.array, relevant_ids: np.array, k: int = 10) -> float:
    """precision@k = number of recommended relevant items among top k / number of recommended items k"""
    common_elements = np.intersect1d(recommended_ids[:k], relevant_ids)
    return len(common_elements) / k


def recall_k(recommended_ids: np.array, relevant_ids: np.array, k: int = 10) -> float:
    """recall@k = number of recommended relevant items among top k / number of all relevant items"""
    common_elements = np.intersect1d(recommended_ids[:k], relevant_ids)
    return len(common_elements) / len(relevant_ids)


def f_beta_score_k(recommended_ids: np.array, relevant_ids: np.array, k: int = 10, beta: float = 1) -> float:
    precision = precision_k(recommended_ids, relevant_ids, k)
    recall = recall_k(recommended_ids, relevant_ids, k)
    if precision == 0 and recall == 0:
        return 0
    return (1 + beta**2) * precision * recall / (beta**2 * precision + recall)


def average_precision_k(recommended_ids: np.array, relevant_ids: np.array, k: int = 10) -> float:
    ...


def mean_average_precision_k() -> float:
    ...

In [20]:
test_query_embeddings: np.ndarray = embed([query.text for query in queries], model)

precisions = []
recalls = []
f2_scores = []
for i, query in enumerate(queries):
    results: list[SearchResult] = search(test_query_embeddings[i].reshape(1, -1), embeddings, chunks)

    recommended_ids = np.array([result.source.id for result in results])
    relevant_ids = np.array(query.sources)

    precisions.append(precision_k(recommended_ids, relevant_ids, 10))
    recalls.append(recall_k(recommended_ids, relevant_ids, 10))
    f2_scores.append(f_beta_score_k(recommended_ids, relevant_ids, 10, 2))

    if i == 49:
        print(recommended_ids)
        print(relevant_ids)

    # print(f"{'precision:':<10}", precision_k(recommended_ids, relevant_ids, 10))
    # print(f"{'recall:':<10}", recall_k(recommended_ids, relevant_ids, 10))
    # print(f"{'f2_score: ':<10}", f_beta_score_k(recommended_ids, relevant_ids, 10, 2))  # help precision, give greater coef and got higher score

print()
print(i + 1)
print("average precisions: ", np.mean(precisions))
print("average recalls: ", np.mean(recalls))
print("average f2_scores: ", np.mean(recalls))

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

['module-82736.pdf' 'module-84621.pdf' 'module-84787.pdf'
 'module-84616.pdf' 'module-83601.pdf' 'module-90816.pdf'
 'module-87970.pdf' 'module-79152.pdf' 'module-90037.pdf']
['module-84616.pdf' 'module-84621.pdf' 'module-84787.pdf']

50
average precisions:  0.08
average recalls:  0.4633333333333334
average f2_scores:  0.4633333333333334


: 