In [None]:
!pip install langchain langchain_community language-tool-python langchain-ollama jq

In [7]:
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import CommaSeparatedListOutputParser
from langchain_core.prompts import PromptTemplate
import csv
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import JSONLoader
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
import json
from langchain_chroma import Chroma

# 문서 로드

In [2]:
def load_example_sentences(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return [Document(page_content=sentence) for sentence in data]

example_sentences = load_example_sentences("/Users/mane/Documents/프로젝트/eng-word/utils/json_words/combined/combined_examples.json")

example_sentences

[Document(page_content='Let me introduce my aunt.'),
 Document(page_content='I am going to play soccer.'),
 Document(page_content='The boy is my brother.'),
 Document(page_content='Jimmy is going to exercise every day.'),
 Document(page_content='The girl is my sister.'),
 Document(page_content='Let me introduce my cousin to you.'),
 Document(page_content='We are going to learn Korean history.'),
 Document(page_content='I’m going to run a marathon.'),
 Document(page_content='He is going to visit his grandpa.'),
 Document(page_content='The woman is my mom.'),
 Document(page_content='He set a new world record for swimming.'),
 Document(page_content='Just sit down and relax for an hour.'),
 Document(page_content='He repairs computers.'),
 Document(page_content='The teacher repeated the question.'),
 Document(page_content='The doctor reported that the patient recovered.'),
 Document(page_content='The firefighters rescued children from the burning building.'),
 Document(page_content='You wil

# 모델 생성

In [22]:
# llm = ChatOllama(model="Lama3.2-korean:latest", max_token = 500)
llm = ChatOllama(model="llama3.2", max_token=100, temperature=.25)

# 프롬프트 

In [28]:
# PromptTemplate 설정
# prompt_template = PromptTemplate(
#     input_variables=["word"],
#     template="""
#     Write a simple English sentence (5-8 words) that an elementary school student can understand. 
#     The sentence must include the word '{word}' exactly as it is, without any modifications or variations.
#     Avoid using conjunctions and ensure that the sentence is not a compound sentence. 
#     Make sure to create only one complete sentence for each word. Use words commonly used in the US.
#     No describe, only sentence
#     """,
# )


# # 프롬프트 템플릿 정의
# prompt_template = PromptTemplate(
#     input_variables=["query"],
#     template="""
    
#     #QUERY
#     Write a simple English sentence (5-8 words) that an elementary school student can understand.
#     The sentence must include the word '{query}' exactly as it is, without any modifications or variations.
#     Avoid using conjunctions and ensure that the sentence is not a compound sentence.
#     Make sure to create only one complete sentence for each word. Use words commonly used in the US.
    
#     #IMPORTANCE
#     Create only one complete sentence.
#     """,
# )

# prompt_template = PromptTemplate(
#     input_variables=["query"],
#     template="""
    
#     #QUERY
#     Write one simple, complete English sentence (5-8 words) that an elementary school student can understand.
#     The sentence must include the word '{query}' exactly as it is, without any modifications, variations, or additional forms.
#     Use only a single statement, not a question or command, and avoid conjunctions, question forms, or compound sentences.
    
#     #IMPORTANCE
#     Ensure that only one sentence is created and that it contains the word '{query}' precisely as specified.
#     Use only words commonly used in the US.
#     """,
# )

prompt_template = PromptTemplate(
    input_variables=["query"],
    template="""
    #TASK
    Write one simple, complete English sentence that includes the word '{query}' exactly as given.
    
    #REQUIREMENTS
    - Use 5-8 words total
    - Include '{query}' without any modifications
    - Use only words from the Dolch sight word list for K-3
    - Write a simple statement (not a question or command)
    - Avoid conjunctions and compound sentences
    
    #OUTPUT FORMAT
    Return only the sentence without any explanation or additional text.
    
    #EXAMPLE
    For query "book":
    "The red book sits on table."
    """,
)

# 임베딩 생성

In [5]:
# embeddings = OpenAIEmbeddings()
# model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_name = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": "cpu","trust_remote_code":True},  # cuda, cpu
    encode_kwargs={"normalize_embeddings": True},
)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:31<00:00, 15.97s/it]


# vectoc store

In [8]:
# # 벡터스토어 저장
# vector_store = FAISS.from_documents(example_sentences, hf_embeddings)
# # 생성된 벡터 스토어를 디스크에 저장
# vector_store.save_local("./sentence_vectorstore")

# 저장할 경로 지정
DB_PATH = "./sentence_vectorstore"

# DB 생성
db = Chroma.from_documents(
    documents=example_sentences, embedding=hf_embeddings, collection_name="sentence_db",persist_directory=DB_PATH
)

In [9]:
# 디스크에서 문서를 로드합니다.
chroma_db = Chroma(
    persist_directory=DB_PATH,
    embedding_function=hf_embeddings,
    collection_name="sentence_db",
)

In [None]:
# # 저장된 벡터 스토어 로드
# loaded_vector_store = FAISS.load_local(
#     "./sentence_vectorstore",
#     hf_embeddings,
#     allow_dangerous_deserialization=True  # 신뢰된 파일에서만 사용
# )

# 검색기 생성

In [10]:
# retriever = loaded_vector_store.as_retriever()
retriever = chroma_db.as_retriever()


In [16]:
retriever.invoke("mouse")

[Document(page_content='Go ahead.'),
 Document(page_content='It’s red.'),
 Document(page_content='The plane gray.'),
 Document(page_content='Stand up.')]

# 출력 파서

In [16]:
# output_parser = CommaSeparatedListOutputParser()
from langchain_core.output_parsers import StrOutputParser
output_parser = StrOutputParser()

# 체인 생성

In [29]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt_template, "document_variable_name": "query"},
)

In [24]:
qa_chain.invoke({"query":"eraser"})

{'query': 'eraser', 'result': "I found It's her eraser on the floor."}

In [None]:
# chain = prompt_template | qa_chain | output_parser
# chain = prompt_template | qa_chain


# 실행 코드(테스트)

In [26]:
# JSON 파일에서 단어 목록 불러오기
with open("/Users/mane/Documents/프로젝트/eng-word/utils/json_words/combined_words.json", "r", encoding="utf-8") as file:
    words = json.load(file)

In [30]:
from tqdm import tqdm
# CSV 파일 생성
sentences = []

slice_word = words[:10]

# CSV 파일 생성
with open("grammar_sentences.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["문장 번호", "문장", "포함된 단어"])

    # 각 단어에 대해 문장 생성 및 저장
    for idx, word in tqdm(enumerate(slice_word, start=1)):
        response = qa_chain.invoke({"query": word})  # 체인 실행
        
        # # 파싱된 문장 기록
        # for sentence in response:
        #     writer.writerow([idx, sentence, word])
        #     print(sentence)
        writer.writerow([idx, response['result'], word])
        print(response['result'])
        

1it [01:02, 62.06s/it]

This is my aunt.


2it [02:06, 63.49s/it]

I’m going to the library.


3it [03:00, 58.99s/it]

Go ahead. Look at this fork.


4it [04:07, 62.32s/it]

Go ahead. Stand up. Go straight now. The plane is gray.


5it [05:12, 63.15s/it]

The plane gray.


6it [06:00, 58.11s/it]

Nice to meet you.


7it [06:47, 54.34s/it]

Go ahead. Look at this graph. It's red. Stand up.


8it [07:32, 51.55s/it]

I’m going to run a race.


9it [08:14, 48.33s/it]

Go ahead. Look at this graph.


10it [08:56, 53.67s/it]

The plane gray. It's a fork.



