In [2]:
!pip install chromadb

Collecting chromadb
  Obtaining dependency information for chromadb from https://files.pythonhosted.org/packages/5f/e4/82d9ab49a206a0f3030ca94b29305358a6ab61fab45e0756e23f4cc7d701/chromadb-1.0.8-cp39-abi3-win_amd64.whl.metadata
  Downloading chromadb-1.0.8-cp39-abi3-win_amd64.whl.metadata (7.0 kB)
Collecting build>=1.0.3 (from chromadb)
  Obtaining dependency information for build>=1.0.3 from https://files.pythonhosted.org/packages/84/c2/80633736cd183ee4a62107413def345f7e6e3c01563dbca1417363cf957e/build-1.2.2.post1-py3-none-any.whl.metadata
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting fastapi==0.115.9 (from chromadb)
  Obtaining dependency information for fastapi==0.115.9 from https://files.pythonhosted.org/packages/32/b6/7517af5234378518f27ad35a7b24af9591bc500b8c1780929c1295999eb6/fastapi-0.115.9-py3-none-any.whl.metadata
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn[standard]>=0.18.3 (from chromadb)
  Obtaining dep


[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# 기존 컬렉션 삭제
for task in ["lecture_search", "career_counsel", "academic_status", "lecture_rating"]:
    try:
        client.delete_collection(task)
        print(f"Deleted collection: {task}")
    except Exception as e:
        print(f"Error deleting collection {task}: {e}")

Error deleting collection lecture_search: name 'client' is not defined
Error deleting collection career_counsel: name 'client' is not defined
Error deleting collection academic_status: name 'client' is not defined
Error deleting collection lecture_rating: name 'client' is not defined


기능별 collection 생성

In [18]:
import os
import json
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import chromadb

#API Key 설정 
os.environ["OPENAI_API_KEY"] = "입력입력"
#임베딩 모델 초기화 
embeddings_model = OpenAIEmbeddings(model = 'text-embedding-3-small')

#json-> text 파일을 청크로 분할
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)

# Chroma 클라이언트
client = chromadb.PersistentClient(path="./chroma_db")

#json 파일 정리
def load_json_documents(json_files):
    docs = []
    for file in json_files:
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)

        #시나리오
        for item in data:
            if "Text" in item and "Completion" in item:
                docs.append(Document(
                    page_content=f"질문: {item['Text']}\n답변: {item['Completion']}",
                    metadata={
                        "카테고리": item.get("카테고리", "")
                    }
                ))

            # 수강이력
            if "lecture_name" in item and "student_id" in item:
                content = (
                    f"학번: {item['student_id']}\n"
                    f"강의명: {item['lecture_name']}\n"
                    f"학정번호: {item['lecture_id']}\n"
                    f"개설 학과: {item['department_offered']}\n"
                    f"이수 구분: {item['lecture_course_type']}\n"
                    f"학점: {item['lecture_credit']}학점\n"
                    f"성적: {item['lecutre_grade']}"
                )
                if "retake_or_delete_status" in item:
                    content += f"\n재수강/삭제 여부: {item['retake_or_delete_status']}"
                if "retake_status" in item:
                    content += f"\n재수강 여부: {item['retake_status']}"

                docs.append(Document(
                    page_content=content,
                    metadata={
                        "lecture_name": item["lecture_name"],
                        "course_type": item["lecture_course_type"]
                    }
                ))

            # 강의 탐색
            # 강의 탐색: lecture_id와 lecture_name이 있고, student_id가 없는 경우
            if "lecture_id" in item and "lecture_name" in item and "student_id" not in item:
                content = (
                    f"학정번호: {item.get('lecture_id', '')}\n"
                    f"강의명: {item.get('lecture_name', '')}\n"
                    f"강의평점: {item.get('lecture_ratings', '정보 없음')}\n"
                    f"과제: {item.get('lecture_homework', '정보 없음')}\n"
                    f"팀플: {item.get('lecture_team', '정보 없음')}\n"
                    f"성적평가정도: {item.get('lecutre_grade', '정보 없음')}\n"
                    f"출결 방식: {item.get('lecutre_attendance', '정보 없음')}\n"
                    f"시험 횟수: {item.get('lecutre_test', '정보 없음')}\n"
                    f"시험 방식: {item.get('lecture_testinform', '정보 없음')}\n"
                    f"전공 학점: {item.get('credits_major', '없음')}\n"
                    f"교양 학점: {item.get('credits_general', '없음')}\n"
                    f"총 학점: {item.get('credits_total', '없음')}\n"
                    f"교수명: {item.get('lecture_professorname', item.get('lecture_professor', '정보 없음'))}\n"
                    f"수업 시간: {item.get('lecture_time', '')}\n"
                    f"강의 유형: {item.get('lecture_course_type', '')}\n"
                    f"학점: {item.get('lecture_hours', '')}시간\n"
                    f"학기: {item.get('lecture_semester', '')}학기\n"
                    f"강의 설명: {item.get('lecture_inform', '')}"
                )
                docs.append(Document(
                    page_content=content,
                    metadata={
                        "lecture_id": item.get("lecture_id", ""),
                        "lecture_name": item.get("lecture_name", ""),
                        "lecture_course_type": item.get("lecture_course_type", ""),
                        "professor": item.get("lecture_professorname", item.get("lecture_professor", "")),
                    }
                ))
    return docs

#각 기능별 json 경로 리스트 
task_datasets = {
    "lecture_search": ["./data/강의탐색.json","./data/kw_chatbot_data - 강의 평점.json","./data/kw_chatbot_data - 강의계획서.json","./data/kw_chatbot_data - 수강신청자료집.json"],
    "career_counsel": ["./data/진로상담.json"],
    "academic_status": ["./data/학습현황.json"],
}

# 각 기능별 컬렉션 생성 및 임베딩 처리
for task, files in task_datasets.items():
    # JSON → Documents
    docs = load_json_documents(files)

    #청크 분할
    split_docs = text_splitter.split_documents(docs)

    # 청크 텍스트
    texts = [doc.page_content for doc in split_docs]

    # 임베딩
    embeddings = embeddings_model.embed_documents(texts)

    # 저장
    collection = client.get_or_create_collection(name=task)
    collection.add(
        documents=texts,
        embeddings=embeddings,
        ids=[f"{task}_{i}" for i in range(len(split_docs))],
        metadatas=[doc.metadata for doc in split_docs]
    )
    
    print("임베딩 개수:", len(embeddings))
    print("벡터 차원:", len(embeddings[0]))

임베딩 개수: 609
벡터 차원: 1536
임베딩 개수: 44
벡터 차원: 1536
임베딩 개수: 40
벡터 차원: 1536


사용자의 입력을 기능별로 분류
강의 탐색 -> 강의탐색기능 
학습 현황 -> 학습현황기능
진로 상담 -> 진로상담기능 

In [19]:
def classify_function(user_input: str) -> str:
    if "강의" in user_input or "탐색" in user_input:
        return "lecture_search"
    elif "진로" in user_input or "상담" in user_input:
        return "career_counsel"
    elif "학업" in user_input or "현황" in user_input:
        return "academic_status"
    else:
        return "lecture_search"  # fallback

In [26]:
#팀플 유뮤, 정도 함수
def filter_lectures_by_team(json_files, team_condition):
    result = []
    for file in json_files:
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
        for item in data:
            team_value = item.get("lecture_team", "").strip()
            lecture_id = item.get("lecture_id")
            lecture_name = item.get("lecture_name")

            # lecture_id나 lecture_name이 None이면 건너뛰기
            if not lecture_id or not lecture_name:
                continue

            if team_condition == "없음" and team_value in ["없음", "0"]:
                result.append({"lecture_id": lecture_id, "lecture_name": lecture_name})
            elif team_condition == "보통" and team_value == "보통":
                result.append({"lecture_id": lecture_id, "lecture_name": lecture_name})
            elif team_condition == "많은" and team_value == "많음":
                result.append({"lecture_id": lecture_id, "lecture_name": lecture_name})
    return result

In [50]:
# 교양, 전공 학점 맞게 추천 함수

import random

def get_lectures_by_credit_and_type(user_input: str):
    import re
    import json
    import random

    credit_match = re.search(r'(\d)\s*학점', user_input)
    if not credit_match:
        return "몇 학점짜리 수업을 원하시는지 알려주세요. 예: '2학점 교양 수업 알려줘'"
    credit = int(credit_match.group(1))

    if "전공" in user_input:
        course_types = ["전필", "전선"]
    elif "교양" in user_input:
        course_types = ["교필", "교선"]
    else:
        course_types = ["전필", "전선", "교필", "교선"]

    count_match = re.search(r'(\d+)\s*개', user_input)
    limit = int(count_match.group(1)) if count_match else None

    with open("./data/kw_chatbot_data - 수강신청자료집.json", "r", encoding="utf-8") as f:
        course_data = json.load(f)

    results = []
    seen = set()
    for item in course_data:
        course_type = item.get("lecture_course_type", "").strip()
        credit_val = item.get("lecture_hours") or item.get("lecture_credit")
        if not credit_val:
            continue

        try:
            credit_val = int(str(credit_val).strip())
        except:
            continue

        professor = item.get("lecture_professor", "").strip()
        lecture_name = item.get("lecture_name", "").strip()
        key = (lecture_name, professor)

        if course_type in course_types and credit_val == credit and key not in seen:
            seen.add(key)
            results.append(f"{lecture_name} (학정번호: {item['lecture_id']}) - {professor or '교수명 없음'}")

    if not results:
        return f"{credit}학점짜리 {', '.join(course_types)} 과목을 찾을 수 없습니다."

    if limit:
        random.shuffle(results)  # ✅ 랜덤 추천
        results = results[:limit]

    response = f"{credit}학점짜리 {', '.join(course_types)} 과목 목록입니다:\n"
    response += "\n".join(f"{i+1}. {lec}" for i, lec in enumerate(results))
    return response

분류된 기능에 맞춰 ChromaDB 검색

In [51]:
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
import re


llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.3)

def answer_query(user_input: str):
    # 전처리: 특수 공백 제거 및 트림
    user_input = user_input.replace('\u00A0', ' ').strip()

    task = classify_function(user_input)

    #학점 + 수업유형(교양, 전공)
    if re.search(r'\d\s*학점', user_input):
        return get_lectures_by_credit_and_type(user_input)

    # 팀플 조건 파악
    team_condition = None
    
    # ✅ 다양한 말 표현 -> 리스트 안에 넣으면 됨
    if any(phrase in user_input for phrase in ["팀플 없는", "팀플이 없는", "조별과제 없는", "조별과제가 없는"]):
        team_condition = "없음"
    elif any(phrase in user_input for phrase in ["팀플 보통", "팀플이 보통"]):
        team_condition = "보통"
    elif any(phrase in user_input for phrase in ["팀플 많은", "팀플이 많은"]):
        team_condition = "많은"
    else:
        team_condition = None

    # 개수 추출
    match = re.search(r'(\d+)\s*개', user_input)
    count = int(match.group(1)) if match else None

    # 팀플 관련 요청이면 필터링
    if team_condition:
        filtered_lectures = filter_lectures_by_team(
            ["./data/kw_chatbot_data - 강의 평점.json"],
            team_condition
        )

        if not filtered_lectures:
            return f"팀플이 {team_condition} 강의를 찾을 수 없습니다."

        # 중복 제거 및 개수 제한
        seen = set()
        unique_lectures = []
        for lec in filtered_lectures:
            if lec["lecture_name"] not in seen:
                seen.add(lec["lecture_name"])
                unique_lectures.append(lec)

        if count:
            unique_lectures = unique_lectures[:count]

        # 응답 구성
        response = f"팀플이 {team_condition} 전공 강의는 다음과 같습니다:\n"
        for i, lec in enumerate(unique_lectures, start=1):
            response += f"{i}. {lec['lecture_name']} (학정번호: {lec['lecture_id']})\n"

        return response

    # 컬렉션 로드
    vectorstore = Chroma(
         persist_directory="./chroma_db",
        collection_name=task,
        embedding_function=embeddings_model
    )

    # 유사 문서 검색
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff")

    # 답변 생성
    return qa.run(user_input)

In [60]:
print(answer_query("전선 중 3학점 5개만 추천해줘"))

3학점짜리 전필, 전선, 교필, 교선 과목 목록입니다:
1. 경영통계 (학정번호: 0000-1-0109-01) - 이석준
2. 초급일본어1 (학정번호: 0000-1-3593-13) - 이상이
3. C프로그래밍 (학정번호: 0000-1-0019-01) - 정진범
4. 현대사회와스포츠 (학정번호: 0000-2-4925-02) - 강지연
5. 영어발표연습 (학정번호: 0000-2-5839-01) - 정관희
