In [211]:
import json
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
import chromadb
import streamlit as st
import pandas as pd
from tqdm import tqdm
import google.generativeai as genai
# Authenticate the Gemini client with the key stored in Streamlit secrets.
genai.configure(api_key=st.secrets["google_api_key"])

In [139]:
# Read the intents file and keep every intent except the first one.
# NOTE(review): why the first entry is dropped is not visible here — confirm.
with open("intentsFTMM.json", "r", encoding="utf-8") as f:
    data = json.loads(f.read())["intents"][1:]

In [132]:
# Open the on-disk Chroma store located in "knowledge_ftmm".
# NOTE(review): `client` is not passed to anything in this cell; the Chroma
# wrapper below is pointed at the same directory via persist_directory —
# confirm whether this handle is still needed.
client = chromadb.PersistentClient("knowledge_ftmm")
# Embedding function handed to the vector store; the key comes from Streamlit secrets.
embed = OpenAIEmbeddings(openai_api_key=st.secrets["openai_key"])
# Attach to the "ftmm" collection inside the persisted store.
docsearch = Chroma(persist_directory="knowledge_ftmm", collection_name="ftmm", embedding_function=embed)
# Sanity check: report how many chunks the collection holds (reads a private attribute).
print(f"There are {docsearch._collection.count()} chunks of documents in collection")

There are 217 chunks of documents in collection


In [35]:
def embed_fn(text, task_type="retrieval_document"):
    """Embed ``text`` with the Gemini ``models/embedding-001`` model.

    Args:
        text: Content to embed; forwarded unchanged as ``content``.
        task_type: Embedding task type forwarded to ``genai.embed_content``.
            Defaults to ``"retrieval_document"`` (the previously hard-coded
            value); pass ``"retrieval_query"`` when embedding a search query.

    Returns:
        The value stored under the ``"embedding"`` key of the API response.
    """
    response = genai.embed_content(
        model="models/embedding-001",
        content=text,
        task_type=task_type,
    )
    return response["embedding"]

In [164]:
# Index of the intent used as a spot-check example; later cells read `i`.
i = 55
sample_query = data[i]["patterns"][0]
print(sample_query)

Kelompok riset renewable and Sustainable Energy Tecnology bergerak di bidang apa? 


In [166]:
# Fetch the two nearest chunks for the sample query and display their metadata.
sample_query = data[i]["patterns"][0]
contexts = docsearch.similarity_search(sample_query, k=2)
[chunk.metadata for chunk in contexts]

[{'source': 'knowledges\\RENEWABLE AND SUSTAINABLE ENERGY TECHNOLOGY.txt'},
 {'source': 'knowledges\\Penerapan Nanoteknologi Dalam Pengembangan Energi Biomassa.txt'}]

In [75]:
# List every intent tag next to its position in `data`.
# The loop uses its own index name so it does not overwrite the global `i`
# that the sample-query cells read.
tags = [intent["tag"] for intent in data]
for tag_idx, tag in enumerate(tags):
    print(tag_idx, tag)

0 Pengenalan FTMM
1 Lokasi FTMM
2 Informasi Kontak
3 Sejarah FTMM
4 Prodi FTMM
5 Kepimpinan FTMM
6 Kontribusi FTMM
7 Gelar Sarjana FTMM
8 Identitas Bendera FTMM
9 Identitas Logo FTMM
10 Visi FTMM
11 Misi FTMM
12 Visi RN
13 Misi RN
14 Visi TE
15 Misi TE
16 Visi TI
17 Misi TI
18 Visi TRKB
19 Misi TRKB
20 Visi TSD
21 Misi TSD
22 Pendaftaran
23 Organisasi
24 BEM FTMM
25 BLM FTMM
26 Koordinator Prodi TSD
27 Koordinator Prodi TE
28 Koordinator Prodi TRKB
29 Koordinator Prodi TI
30 Koordinator Prodi RN
31 Tenaga kependidikan
32 Kelompok penelitian
33 Visi F&E Nanotechnology
34 Hasil F&E Nanotechnology
35 Publikasi F&E Nanotechnology
36 Sumber Dana F&E Nanotechnology
37 Visi Nanotechnology & Nanomedicine
38 Hasil Nanotechnology & Nanomedicine
39 Sumber Dana Nanotechnology & Nanomedicine
40 HFS Enginering
41 Ruang lingkup HFS Enginering
42 Sumber Dana HFS Enginering
43 AMC Research
44 Ruang lingkup AMC Research
45 Sumber dana AMC Research
46 DDSS Research
47 Topik DDSS Research
48 Robotics & Me

In [149]:
from langchain.document_loaders import DirectoryLoader

# Load the files found under the "knowledges" folder as documents.
loader = DirectoryLoader(path="knowledges")
docs = loader.load()

In [155]:
# Show each loaded document's index next to its file name.
# [11:] strips the leading "knowledges" folder name plus one path separator.
# A dedicated loop name keeps the global `i` used by other cells untouched.
for doc_idx, doc in enumerate(docs):
    print(doc_idx, doc.metadata["source"][11:])

0 ADVANCED MANUFACTURING AND COMPUTATIONAL APPROACH.txt
1 Aktivitas Mahasiswa FTMM.txt
2 Alur Pendaftaran Online Mahasiswa Baru.txt
3 Alur Pengajuan Legalisir atau Transkrip KHS.txt
4 Alur Pengajuan SKMA.txt
5 Alur Pengajuan Surat Keterangan.txt
6 Alur Pengajuan Surat Rekomendasi.txt
7 Aturan Berprilaku.txt
8 BANGGA EV Charging Station Ikuti ASSIE 2023.txt
9 Beasiswa.txt
10 BEM FTMM.txt
11 Bentuk Semangat Berwirausaha Melalui Dana Usaha.txt
12 bentuk-semangat-berwirausaha-melalui-dana-usaha.txt
13 Biaya Pendaftaran.txt
14 BLM FTMM.txt
15 Brosur FTMM.txt
16 Buku-Pedoman-Berperilaku-Civitas-Academica-UNAIR.pdf
17 CAMENO 2023 - Kehangatan Dalam Kekeluargaan Prodi Nanoteknologi.txt
18 DATA-DRIVEN DECISION SUPPORT SYSTEM.txt
19 Daya Tampung Doktor TAHUN AKADEMIK 20232024.csv
20 Daya Tampung Magister TAHUN AKADEMIK 20232024.csv
21 Daya Tampung Profesi TAHUN AKADEMIK 20232024.csv
22 Delegasi FTMM Ikuti Program WUACD di Malaysia.txt
23 Desa Binaan.txt
24 DIGITAL HEALTH, COMPUTATIONAL LEARNING,

In [173]:
# One row per loaded document: positional id, file name and full text.
# [11:] strips the leading "knowledges" folder name plus one path separator.
corpus = pd.DataFrame({
    "id": list(range(len(docs))),
    "file_name": [doc.metadata["source"][11:] for doc in docs],
    "content": [doc.page_content for doc in docs],
})

In [181]:
# Evaluation frame: one row per intent, using its first pattern as the query.
# `relevant_docs` starts out empty and is filled in by a later cell.
retrieval_data = pd.DataFrame({
    "id": list(range(len(data))),
    "tag": [intent["tag"] for intent in data],
    "query": [intent["patterns"][0] for intent in data],
    "relevant_docs": [pd.NA] * len(data),
})
# Save the unlabeled version, then preview the first rows.
retrieval_data.to_csv("retrieval_data_unlabeled.csv", index=False)
retrieval_data.head()

Unnamed: 0,id,tag,query,relevant_docs
0,0,Pengenalan FTMM,FTMM adalah,
1,1,Lokasi FTMM,Halo FTMM ada dimana?,
2,2,Informasi Kontak,Apakah FTMM mempunyai akun media sosial?,
3,3,Sejarah FTMM,Berapa jumlah mahasiswa FTMM?,
4,4,Prodi FTMM,Apakah ada jurusan yang hanya ada satu di Indo...,


In [216]:
# For every query, record the corpus ids of the two chunks returned by the
# vector store.
#
# The file-name -> id lookup is built once instead of filtering `corpus` for
# every retrieved chunk. Keeping the first row per file name matches what
# taking the first match of a filter would return. `.tolist()` yields plain
# Python ints, so the id lists serialise as "[85, 70]" regardless of the
# installed NumPy version.
unique_files = corpus.drop_duplicates("file_name")
file_name_to_id = dict(
    zip(unique_files["file_name"].tolist(), unique_files["id"].tolist())
)

relevant_docs_list = []
for query in tqdm(retrieval_data["query"]):
    retrieved = docsearch.similarity_search(query, k=2)
    # [11:] strips the leading "knowledges" folder name plus one path
    # separator so the name matches `corpus["file_name"]`. A file name that is
    # missing from `corpus` raises KeyError naming the offending file.
    relevant_docs_list.append(
        [file_name_to_id[doc.metadata["source"][11:]] for doc in retrieved]
    )

100%|██████████| 61/61 [00:22<00:00,  2.71it/s]


In [218]:
# Store the retrieved corpus ids as the label column (mutates the frame in place).
retrieval_data["relevant_docs"] = relevant_docs_list

In [221]:
# Write the labeled frame to disk, then preview the first rows.
retrieval_data.to_csv("retrieval_data_labeled.csv", index=False)
retrieval_data.head()

Unnamed: 0,id,tag,query,relevant_docs
0,0,Pengenalan FTMM,FTMM adalah,"[85, 70]"
1,1,Lokasi FTMM,Halo FTMM ada dimana?,"[85, 51]"
2,2,Informasi Kontak,Apakah FTMM mempunyai akun media sosial?,"[74, 34]"
3,3,Sejarah FTMM,Berapa jumlah mahasiswa FTMM?,"[70, 85]"
4,4,Prodi FTMM,Apakah ada jurusan yang hanya ada satu di Indo...,"[49, 79]"


In [233]:
# Translate the two retrieved corpus ids of every query back to file names.
#
# The id -> file-name lookup is built once instead of filtering `corpus`
# twice per row. Keeping the first row per id matches what taking the first
# match of a filter would return.
unique_ids = corpus.drop_duplicates("id")
id_to_file_name = dict(
    zip(unique_ids["id"].tolist(), unique_ids["file_name"].tolist())
)

# Each `relevant_docs` entry is expected to hold at least two ids.
title_1_list = [id_to_file_name[doc_ids[0]] for doc_ids in retrieval_data["relevant_docs"]]
title_2_list = [id_to_file_name[doc_ids[1]] for doc_ids in retrieval_data["relevant_docs"]]

In [239]:
# Attach the file names of the first and second retrieved document to each query.
retrieval_data["title_1"] = title_1_list
retrieval_data["title_2"] = title_2_list
# Writes to the same path as the earlier labeled export, replacing that file.
retrieval_data.to_csv("retrieval_data_labeled.csv", index=False)
retrieval_data.head()

Unnamed: 0,id,tag,query,relevant_docs,title_1,title_2
0,0,Pengenalan FTMM,FTMM adalah,"[85, 70]",Sejarah FTMM.txt,Prestasi Mahasiswa.txt
1,1,Lokasi FTMM,Halo FTMM ada dimana?,"[85, 51]",Sejarah FTMM.txt,"Lewat Synreach, Ini Pemenang Duta FTMM 2023.txt"
2,2,Informasi Kontak,Apakah FTMM mempunyai akun media sosial?,"[74, 34]",QnA.txt,ftmm-kupas-tuntas-pro-dan-kontra-ai-dalam-perg...
3,3,Sejarah FTMM,Berapa jumlah mahasiswa FTMM?,"[70, 85]",Prestasi Mahasiswa.txt,Sejarah FTMM.txt
4,4,Prodi FTMM,Apakah ada jurusan yang hanya ada satu di Indo...,"[49, 79]",Kuliah Tamu FTMM Gali Inspirasi MBKM ala Jerma...,Rencana Strategis.txt


In [237]:
# Persist the document table (id, file name, content) without the index column.
corpus.to_csv("corpus.csv", index=False)

In [240]:
from langchain.chat_models import ChatOpenAI

In [241]:
# Pin the chat model explicitly instead of relying on the library default, so
# re-running the notebook with a different langchain version uses the same
# model. "gpt-3.5-turbo" is the value this notebook reports for llm.model_name.
llm = ChatOpenAI(model_name="gpt-3.5-turbo")

In [242]:
# NOTE(review): exploratory attribute dump; its long output adds little to the
# notebook — consider removing this cell before sharing.
dir(llm)

['Config',
 'InputType',
 'OutputType',
 '__abstractmethods__',
 '__annotations__',
 '__call__',
 '__class__',
 '__class_getitem__',
 '__class_vars__',
 '__config__',
 '__custom_root_type__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__exclude_fields__',
 '__fields__',
 '__fields_set__',
 '__format__',
 '__ge__',
 '__get_validators__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__include_fields__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__json_encoder__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__or__',
 '__orig_bases__',
 '__parameters__',
 '__post_root_validators__',
 '__pre_root_validators__',
 '__pretty__',
 '__private_attributes__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__repr_args__',
 '__repr_name__',
 '__repr_str__',
 '__rich_repr__',
 '__ror__',
 '__schema_cache__',
 '__setattr__',
 '__setstate__',
 '__signature__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__try_updat

In [249]:
# Show which chat model the client is configured to use.
llm.model_name

'gpt-3.5-turbo'