In [1]:
import os
from getpass import getpass

# Do not hard-code the key in the notebook. Reuse a key that is already exported
# in the environment and only prompt (without echo) when none is set; an
# unconditional assignment would overwrite a valid exported key with a placeholder.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [32]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.indices.service_context import ServiceContext
from llama_index import StorageContext, load_index_from_storage
from llama_index.llms import OpenAI

from llama_index.schema import TextNode, NodeRelationship, RelatedNodeInfo

# Building a knowledge base from nodes

## Adding teaching staff and associates (Nastavnici i suradnici)

In [22]:
# Load the staff-list files. filename_as_id=True is requested so that each
# document's id carries its file name, which the node-building code reads back
# through os.path.basename(document.id_).
nastavnici_suradnici_osoblje = SimpleDirectoryReader(
    input_dir='data/nastavnici i suradnici',
    filename_as_id=True,
).load_data()

In [23]:
def dissolve_employees_into_nodes(document):
    """Split one staff-list document into a root "list" node plus one node per person.

    The document text is expected to hold one block per person, with blocks
    separated by two blank lines (three consecutive newline characters) and the
    person's name on the first line of each block.

    Blocks that contain only whitespace (produced by leading, trailing or
    repeated separators) are skipped, so no node with an empty name/id is created.

    Args:
        document: Loaded document exposing ``id_`` and ``text``. ``id_`` is
            expected to carry the source file name (the reader is created with
            ``filename_as_id=True``).

    Returns:
        list: The root node first (id = file name without extension, text = the
        person names, one per line), followed by one ``'Personal details'``
        node per person. Every person node's ``PREVIOUS`` relationship points
        at the root node.
    """
    filename = os.path.splitext(os.path.basename(document.id_))[0]

    person_nodes = []
    names = []
    for cn_text in document.text.split('\n\n\n'):
        stripped = cn_text.strip()
        if not stripped:
            # Blank chunk: nothing to index and no usable name for an id.
            continue

        # First non-empty line of the block is the person's name; it doubles as
        # the node id, so surrounding whitespace is removed to keep ids stable.
        name = stripped.split('\n')[0].strip()
        node = TextNode(
            text=cn_text,
            metadata={
                'name': name,
                'type': 'Personal details'
            },
            id_=name
        )
        # NOTE(review): PREVIOUS is used here to link a person back to the list
        # node; confirm this is preferred over a PARENT relationship.
        node.relationships[NodeRelationship.PREVIOUS] = RelatedNodeInfo(node_id=filename)
        person_nodes.append(node)
        names.append(name)

    # The root node lists every person name, each terminated by a newline.
    root_node = TextNode(
        text=''.join(name + '\n' for name in names),
        metadata={
            'filename': filename,
            'type': 'list'
        },
        id_=filename
    )
    return [root_node] + person_nodes

In [24]:
# Flatten the node lists produced for each staff document into a single list.
nodes = [
    employee_node
    for doc in nastavnici_suradnici_osoblje
    for employee_node in dissolve_employees_into_nodes(doc)
]

### Obrada profila nastavnika

In [25]:
# Load the per-teacher profile files. filename_as_id=True is requested so that
# each document's id carries its file name, from which the profile handler
# derives the teacher's name.
profili_nastavnika = SimpleDirectoryReader(
    input_dir='data/nastavnici i suradnici/profili',
    filename_as_id=True,
).load_data()

In [26]:
def handle_profile(document, existing_nodes):
    """Split one teacher-profile document into one node per profile section.

    The person's name is taken from the document's file name (without
    extension). The text is split into sections on two blank lines (three
    consecutive newline characters); the first line of a section is its header.
    Sections containing only whitespace are skipped.

    Side effect: every node in ``existing_nodes`` whose id equals the person's
    name gets its ``NEXT`` relationship set to the section node.

    Args:
        document: Loaded document exposing ``id_`` and ``text``.
        existing_nodes: Already-built nodes; mutated in place as described above.

    Returns:
        list: ``'Additional information'`` nodes, one per non-blank section, with
        id ``"<name> <section>"`` and a ``PREVIOUS`` relationship pointing at
        the node id ``<name>``.
    """
    name = os.path.splitext(os.path.basename(document.id_))[0]

    # The person's node(s) do not change while sections are processed, so look
    # them up once instead of rescanning existing_nodes for every section.
    owner_nodes = [elem for elem in existing_nodes if elem.id_ == name]

    nodes_section = []
    for cn_text in document.text.split('\n\n\n'):
        stripped = cn_text.strip()
        if not stripped:
            # Blank chunk from leading/trailing/repeated separators: no header,
            # and it would yield a '<name> ' id shared by every blank chunk.
            continue

        section = stripped.split('\n')[0].strip()
        new_id = name + ' ' + section
        node = TextNode(
            text=cn_text,
            metadata={
                'name': name,
                'type': 'Additional information',
                'section': section
            },
            id_=new_id
        )
        node.relationships[NodeRelationship.PREVIOUS] = RelatedNodeInfo(node_id=name)

        # NOTE(review): NEXT is assigned a single reference, so each section
        # overwrites the previous one and the person node ends up pointing only
        # at the last section. Confirm whether a chain or a CHILD list is intended.
        for owner in owner_nodes:
            owner.relationships[NodeRelationship.NEXT] = RelatedNodeInfo(node_id=new_id)

        nodes_section.append(node)
    return nodes_section

In [27]:
# Build the section nodes for every profile. handle_profile also updates the
# matching entries of `nodes` in place, so profiles are processed in order.
profile_nodes = [
    section_node
    for profile in profili_nastavnika
    for section_node in handle_profile(profile, nodes)
]

In [28]:
# Append the profile nodes to the knowledge base. Ids already present in `nodes`
# are skipped so that re-running this cell does not add the same nodes twice.
known_ids = {existing.id_ for existing in nodes}
nodes += [candidate for candidate in profile_nodes if candidate.id_ not in known_ids]

In [29]:
# Print each node's position and the length of its text as a quick size check.
for i, text_length in enumerate(len(node.text) for node in nodes):
    print(i, ', len: ', text_length)

0 , len:  176
1 , len:  172
2 , len:  114
3 , len:  150
4 , len:  129
5 , len:  216
6 , len:  133
7 , len:  147
8 , len:  337
9 , len:  130
10 , len:  183
11 , len:  160
12 , len:  81
13 , len:  93
14 , len:  197
15 , len:  193
16 , len:  177
17 , len:  815
18 , len:  811
19 , len:  344
20 , len:  352
21 , len:  322
22 , len:  395
23 , len:  324
24 , len:  325
25 , len:  332
26 , len:  318
27 , len:  323
28 , len:  371
29 , len:  314
30 , len:  325
31 , len:  319
32 , len:  330
33 , len:  329
34 , len:  321
35 , len:  326
36 , len:  333
37 , len:  310
38 , len:  314
39 , len:  337
40 , len:  327
41 , len:  381
42 , len:  568
43 , len:  181
44 , len:  53
45 , len:  77
46 , len:  98
47 , len:  82
48 , len:  81
49 , len:  50
50 , len:  26
51 , len:  75
52 , len:  49
53 , len:  109
54 , len:  76
55 , len:  205
56 , len:  83
57 , len:  85
58 , len:  79
59 , len:  112
60 , len:  78
61 , len:  38
62 , len:  79
63 , len:  80
64 , len:  300
65 , len:  237
66 , len:  286
67 , len:  293
68 , len:

## Ispitivanje modela

In [31]:
# Build the vector index over all nodes; show_progress=True displays the
# "Generating embeddings" progress bar while the index is built.
index = VectorStoreIndex(
    nodes=nodes,
    show_progress=True,
)


Generating embeddings:   0%|                                                                   | 0/321 [00:00<?, ?it/s][A
Generating embeddings:   3%|█▊                                                        | 10/321 [00:00<00:21, 14.47it/s][A
Generating embeddings:   6%|███▌                                                      | 20/321 [00:01<00:19, 15.63it/s][A
Generating embeddings:   9%|█████▍                                                    | 30/321 [00:01<00:17, 16.18it/s][A
Generating embeddings:  12%|███████▏                                                  | 40/321 [00:02<00:18, 15.51it/s][A
Generating embeddings:  16%|█████████                                                 | 50/321 [00:03<00:16, 16.32it/s][A
Generating embeddings:  19%|██████████▊                                               | 60/321 [00:03<00:16, 16.30it/s][A
Generating embeddings:  22%|████████████▋                                             | 70/321 [00:04<00:15, 16.30it/s][A
Generating embe

In [35]:
# Save the index's storage context to disk. No directory is passed, so the
# library's default persist location is used (presumably ./storage — confirm).
index.storage_context.persist()

In [39]:
# Chat model used to answer queries: temperature=0 to minimise sampling
# randomness, with completions capped at 2048 tokens.
llm = OpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    max_tokens=2048,
)

# Hand the model to the index through a ServiceContext and build the query engine.
service_context = ServiceContext.from_defaults(llm=llm)
query_engine = index.as_query_engine(service_context=service_context)

In [46]:
# Sample question against the indexed staff and profile nodes.
response = query_engine.query(
    "What are some of the publications that Domagoj Matijevic has taken part in?"
)
print(response)

Some of the publications that Domagoj Matijević has taken part in are:

1. "Efficient Implementations of Guarding 1.5D Terrains" in Croatian Operational Research Review 6/1 (2015), 79-89.
2. "Guarding 1.5D Terrains with Demands" in International Journal of Computer Mathematics 89/16 (2012), 2143-2151.
3. "Improved Approximations for Guarding 1.5-Dimensional Terrains" in Algorithmica 60/2 (2011), 451-463.
4. "Naive Matrix Multiplication versus Strassen Algorithm in Multi-thread Environment" in Tehnički vjesnik 18/3 (2011), 309-314.
5. "Compressing Sentence Representation with Maximum Coding Rate Reduction" in ICT and Electronics Convention (MIPRO), 2023 46th MIPRO, Opatija, Hrvatska, 2023.
6. "Fortuna Detects Novel Splicing in Drosophila scRNASeq Data" in ICT and Electronics Convention (MIPRO), 2023 46th MIPRO, Opatija, Hrvatska, 2023, 410-415.
7. "Uparena optimizacijska metoda" in Math.e : hrvatski matematički elektronski časopis 30 (2016).
8. "Grčko - kineski stil u teoriji brojeva" i

In [None]:
# NOTE(review): `documents` is not defined anywhere in the visible notebook, so
# this cell would raise NameError as written. It also rebinds `nodes`, replacing
# the hand-built node list with the parser's output. Looks like a leftover
# experiment with the default node parser — confirm whether it should be kept.
parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(documents)