In [1]:
import os
import openai
import sys
import json
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import Chroma, FAISS, LanceDB
sys.path.append("C:/Users/gka30/Desktop/langchain")


In [2]:
# OpenAI embeddings
from langchain.embeddings.openai import OpenAIEmbeddings

# Never hardcode the key in the notebook: use the OPENAI_API_KEY already present
# in the environment, and only prompt (without echoing) when it is missing.
# Unlike an unconditional assignment, this does not overwrite a real key.
if not os.environ.get("OPENAI_API_KEY"):
    import getpass
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
openai.api_key = os.environ["OPENAI_API_KEY"]

# `emb` is the embedding object the vector-store cells below consume.
emb = OpenAIEmbeddings()

In [128]:
# Hugging Face embeddings
from langchain.embeddings import HuggingFaceEmbeddings

# A dimension issue was observed when using this model with Chroma.
# NOTE(review): presumably the collection had already been created with
# embeddings of a different dimension - confirm.
# Bound to its own name (the name the commented-out store code already uses) so
# that running this cell does not replace the `emb` used by the vector-store cells.
hug_emb = HuggingFaceEmbeddings(model_name = 'BAAI/bge-large-en-v1.5')

Downloading (…)87a7d/.gitattributes: 100%|██████████| 1.52k/1.52k [00:00<?, ?B/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 191/191 [00:00<00:00, 149kB/s]
Downloading (…)625eb87a7d/README.md: 100%|██████████| 89.1k/89.1k [00:00<?, ?B/s]
Downloading (…)5eb87a7d/config.json: 100%|██████████| 779/779 [00:00<?, ?B/s] 
Downloading (…)ce_transformers.json: 100%|██████████| 124/124 [00:00<?, ?B/s] 
Downloading pytorch_model.bin: 100%|██████████| 1.34G/1.34G [00:25<00:00, 53.3MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████| 52.0/52.0 [00:00<?, ?B/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 125/125 [00:00<?, ?B/s] 
Downloading (…)87a7d/tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 53.8MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 366/366 [00:00<?, ?B/s] 
Downloading (…)625eb87a7d/vocab.txt: 100%|██████████| 232k/232k [00:00<?, ?B/s]
Downloading (…)eb87a7d/modules.json: 100%|██████████| 349/349 [00:00<?, ?B/s] 


In [121]:
# LLaMA (llama.cpp) embeddings - still waiting on authorization
from langchain.embeddings import LlamaCppEmbeddings

# Bound to its own name so that running this cell does not replace the `emb`
# used by the vector-store cells further down.
llama_emb = LlamaCppEmbeddings(model_path="C:/Users/gka30/Desktop/langchain/llama-2-7b-vietnamese-20k.Q5_K_M.gguf")

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [None]:
# Google PaLM / Vertex AI embeddings
from langchain.embeddings import VertexAIEmbeddings, GooglePalmEmbeddings
import google.generativeai as palm

# Never hardcode the key in the notebook: use PALM_API_KEY from the environment
# and only prompt (without echoing) when it is missing.
if not os.environ.get("PALM_API_KEY"):
    import getpass
    os.environ["PALM_API_KEY"] = getpass.getpass("PaLM API key: ")
palm_api_key = os.environ["PALM_API_KEY"]

# google.generativeai is configured through configure(); assigning to a
# module attribute does not register the key with the client.
palm.configure(api_key=palm_api_key)

# Pass the key explicitly so the embeddings do not depend on a differently
# named environment variable being set.
palm_emb = GooglePalmEmbeddings(google_api_key=palm_api_key)
vertex_emb = VertexAIEmbeddings()

In [3]:
vectordb_kor = None


def initialize_vectordb_kor(
    csv_path="C:/Users/gka30/Desktop/langchain/Korean_data.csv",
    embedding=None,
    collection_name="kau_korean",
):
    """Build the Korean vector store and bind it to the global ``vectordb_kor``.

    The CSV at ``csv_path`` is loaded with ``CSVLoader`` (UTF-8), split with
    ``RecursiveCharacterTextSplitter`` (chunk size 1500, overlap 150) and
    indexed with ``Chroma.from_documents``. No ``persist_directory`` is passed,
    so this call does not ask Chroma to persist the store to a directory.

    Args:
        csv_path: Path of the CSV file to index. Defaults to the Korean data
            file this notebook was written against.
        embedding: Embedding object handed to Chroma. When ``None`` the
            notebook-level ``emb`` is used, looked up at call time.
        collection_name: Chroma collection to create. Without an explicit name
            the Korean and English stores would both use LangChain's default
            collection name, which can make them share one collection
            (depending on the chromadb version).

    Returns:
        None. The resulting store is assigned to ``vectordb_kor``.
    """
    global vectordb_kor

    docs = CSVLoader(csv_path, encoding='UTF-8').load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=150,
    )
    chunks = text_splitter.split_documents(docs)

    # FAISS.from_documents / LanceDB.from_documents take the same
    # documents/embedding arguments and were left disabled in this notebook;
    # add persist_directory='docs/chroma/' below to keep the Chroma index on disk.
    vectordb_kor = Chroma.from_documents(
        documents=chunks,
        embedding=emb if embedding is None else embedding,
        collection_name=collection_name,
    )

In [4]:
vectordb_eng = None


def initialize_vectordb_eng(
    csv_path="C:/Users/gka30/Desktop/langchain/English_data.csv",
    embedding=None,
    collection_name="kau_english",
):
    """Build the English vector store and bind it to the global ``vectordb_eng``.

    The CSV at ``csv_path`` is loaded with ``CSVLoader`` (UTF-8), split with
    ``RecursiveCharacterTextSplitter`` (chunk size 1500, overlap 150) and
    indexed with ``Chroma.from_documents``. No ``persist_directory`` is passed,
    so this call does not ask Chroma to persist the store to a directory.

    Args:
        csv_path: Path of the CSV file to index. Defaults to the English data
            file this notebook was written against.
        embedding: Embedding object handed to Chroma. When ``None`` the
            notebook-level ``emb`` is used, looked up at call time.
        collection_name: Chroma collection to create. Without an explicit name
            the English and Korean stores would both use LangChain's default
            collection name, which can make them share one collection
            (depending on the chromadb version).

    Returns:
        None. The resulting store is assigned to ``vectordb_eng``.
    """
    global vectordb_eng

    docs = CSVLoader(csv_path, encoding='UTF-8').load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=150,
    )
    chunks = text_splitter.split_documents(docs)

    # Add persist_directory='docs/chroma/' below to keep the index on disk.
    vectordb_eng = Chroma.from_documents(
        documents=chunks,
        embedding=emb if embedding is None else embedding,
        collection_name=collection_name,
    )

In [5]:
# Build the Korean store only when it has not been created yet, so re-running
# this cell does not re-embed the CSV.
if vectordb_kor is None:
    initialize_vectordb_kor()

In [6]:
# Build the English store only when it has not been created yet, so re-running
# this cell does not re-embed the CSV.
if vectordb_eng is None:
    initialize_vectordb_eng()

In [7]:
def langchainTest_kor(request, k=3):
    """Answer a Korean question from the Korean vector store via gpt-3.5-turbo.

    Runs a similarity search for ``request`` against ``vectordb_kor``, puts the
    text of every retrieved chunk into a Korean prompt that tells the model to
    answer only from that information, and returns the model's reply.

    Args:
        request: The user's question as a plain string.
        k: Number of chunks to retrieve and include in the prompt (default 3).

    Returns:
        The message content of the first chat-completion choice.
    """
    # Build the store on demand so the function does not depend on the
    # initialization cell having been run first.
    if vectordb_kor is None:
        initialize_vectordb_kor()

    docs = vectordb_kor.similarity_search(request, k=k)
    # Use all retrieved chunks rather than only the first; an empty result
    # yields an empty context instead of an IndexError.
    context = "\n\n".join(doc.page_content for doc in docs)

    # Adjacent string literals are concatenated as-is, so each sentence carries
    # its own trailing separator.
    prompt = (
        "지금부터 주는 정보는 한국항공대학교에 대한 정보야. "
        "내가 준 기반 정보를 바탕으로만 대답해. "
        f"기반 정보: {context} / "
        f"내 질문: {request}"
    )

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
    )

    return response['choices'][0]['message']['content']

In [8]:
def langchainTest_eng(request, k=3):
    """Answer an English question from the English vector store via gpt-3.5-turbo.

    Runs a similarity search for ``request`` against ``vectordb_eng``, puts the
    text of every retrieved chunk into a prompt that tells the model to answer
    from that information, and returns the model's reply.

    Args:
        request: The user's question as a plain string.
        k: Number of chunks to retrieve and include in the prompt (default 3).

    Returns:
        The message content of the first chat-completion choice.
    """
    # Build the store on demand so the function does not depend on the
    # initialization cell having been run first.
    if vectordb_eng is None:
        initialize_vectordb_eng()

    docs = vectordb_eng.similarity_search(request, k=k)
    # Use all retrieved chunks rather than only the first; an empty result
    # yields an empty context instead of an IndexError.
    context = "\n\n".join(doc.page_content for doc in docs)

    # Adjacent string literals are concatenated as-is, so each sentence carries
    # its own trailing separator. The user's text is labelled as the question
    # (it was previously labelled "Answer"), matching the Korean prompt.
    prompt = (
        "This is the Korea AeroSpace University Information. "
        "Answer the question base on the information which I gave. "
        f"Based Information: {context} / "
        f"Question: {request}"
    )

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
    )

    return response['choices'][0]['message']['content']

In [9]:
# Korean demo query: "Who is eligible to apply for a double major?"
langchainTest_kor('복수전공 신청 대상자는 누구인가요?')

'복수전공 신청 대상자는 한국항공대학교의 1학년 과정 이상을 수료한 재학생입니다.'

In [10]:
# English demo query about who may apply for a double major.
langchainTest_eng('Who are the applicants for a double major?')

'The applicants for a double major are current students who have completed at least the first year of the program at Korea AeroSpace University.'

In [11]:
# Korean demo query: "When is the application period for a double major?"
print(langchainTest_kor('복수전공 신청 기간은 언제인가요?'))

복수전공 신청 기간은 매학기 소정 기간 내에 가능합니다. 1학기는 1월 중, 2학기는 7월 중에 신청하실 수 있습니다. 따라서 현재 학기에 복수전공을 신청하려면 1월 중에 신청하셔야 합니다. 자세한 일정은 학사공지의 개별 공지사항을 참고하시면 됩니다.


In [12]:
# English demo query about the double-major application period.
print(langchainTest_eng('When is the application period for a double major?'))

The application period for a double major is within the prescribed period each semester. Students can apply in January for the first semester and in July for the second semester.


In [17]:
# Korean demo query: "What documents must be submitted to apply for a change of major?"
langchainTest_kor('전과 신청을 위해 제출해야 하는 서류는 무엇인가요?')

'전과 신청을 위해 제출해야 하는 서류는 학기재수 신청원 양식입니다. 추가적인 정보나 문의 사항은 교무팀의 김민경님에게 전화번호 02-300-0457로 문의하거나 이메일 kyomu@kau.ac.kr로 문의하시면 됩니다.'

In [18]:
# English demo query about required application documents.
# NOTE(review): the Korean query in the preceding cell uses the word for a change of
# major, so "criminal history" here looks like a mistranslation of that word - confirm
# and rephrase as a change-of-major question before relying on this result.
langchainTest_eng('What documents do I need to submit for a criminal history application?')

"Based on the information provided, there is no specific mention of submitting documents for a criminal history application. Therefore, it is unclear what documents are required for a criminal history application at Korea AeroSpace University. It is advised to consult the university's academic system or contact the relevant department for more information on the required documents for a criminal history application."