In [None]:
import pandas as pd

# Load the existing grammar-sentence CSV into a DataFrame.
file_path = (
    "C:/Users/user/Desktop/eng-word/spell_stars/utils/Grammar_Score/data/"
    "grammar_sentences.csv"
)
# Decode the file explicitly as UTF-8.
df = pd.read_csv(filepath_or_buffer=file_path, encoding="utf-8")

In [2]:
# Render the DataFrame read from the CSV so its columns and rows can be inspected.
# NOTE(review): the recorded output shows an "Unnamed: 0" column, which suggests
# the CSV was written together with its index — consider index_col=0 when
# reading; confirm nothing downstream relies on that column.
df

Unnamed: 0,id,sentence,word
0,1,This is my aunt.,aunt
1,3,The boy is my twelve-year-old brother.,boy
2,4,I exercise every morning.,exercise
3,5,The girl is my smart sister.,girl
4,6,Let me introduce my cousin to you.,introduce
5,7,We all learn by doing things.,learn


In [None]:
import os
import json
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Path configuration: the word-list JSON, the vector-store folder, and the
# FAISS index file expected inside that folder.
word_list_path = "C:/Users/user/Desktop/eng-word/spell_stars/utils/generate/extracted_words.json"
vector_store_path = "C:/Users/user/Desktop/eng-word/spell_stars/utils/generate/sentence_vectorstore"
index_path = os.path.join(vector_store_path, "index.faiss")

# Embedding model configuration: run on CPU and L2-normalize the vectors.
model_name = "sentence-transformers/all-mpnet-base-v2"
hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=dict(device="cpu", trust_remote_code=True),
    encode_kwargs=dict(normalize_embeddings=True),
)


# Index creation function
def create_faiss_index():
    """Embed the word list and save a new FAISS index to ``vector_store_path``.

    Reads the JSON file at ``word_list_path``, embeds its contents with
    ``hf_embeddings``, builds a FAISS vector store from the resulting
    (text, vector) pairs, and writes it to disk with ``save_local``.
    A progress message is printed after each stage.

    Returns:
        None.
    """
    # Load the word list.
    # assumes the JSON is a flat list of strings, which is what
    # embed_documents takes — TODO confirm against extracted_words.json
    with open(word_list_path, "r", encoding="utf-8") as file:
        words = json.load(file)
    print("단어 리스트 로드 완료.")

    # Embed the whole word list in one call.
    embeddings = hf_embeddings.embed_documents(words)
    print("임베딩 생성 완료.")

    # Build the new FAISS index from the precomputed vectors.
    # from_embeddings expects `text_embeddings` as (text, vector) pairs and
    # `embedding` as the Embeddings object itself; the previous call passed
    # `documents=` and the raw vector list, which raises a TypeError.
    vector_store = FAISS.from_embeddings(
        text_embeddings=list(zip(words, embeddings)),
        embedding=hf_embeddings,
    )

    # Make sure the destination folder exists before saving.
    os.makedirs(vector_store_path, exist_ok=True)

    # Persist the index.
    vector_store.save_local(vector_store_path)
    print(f"새로운 FAISS 인덱스가 {vector_store_path}에 저장되었습니다.")


# Create the index when it does not exist yet; otherwise load the existing one.
# Either way `loaded_vector_store` ends up bound, so later cells can rely on it.
# NOTE: allow_dangerous_deserialization=True permits pickle-based loading of
# the saved store; only point this at an index this notebook created itself.
if not os.path.exists(index_path):
    print("FAISS 인덱스 파일을 찾을 수 없습니다. 새 인덱스를 생성합니다.")
    create_faiss_index()
    # Previously this branch left `loaded_vector_store` undefined on a first
    # run; load the index that was just written to disk.
    loaded_vector_store = FAISS.load_local(
        vector_store_path, hf_embeddings, allow_dangerous_deserialization=True
    )
else:
    print("기존 FAISS 인덱스를 불러옵니다.")
    loaded_vector_store = FAISS.load_local(
        vector_store_path, hf_embeddings, allow_dangerous_deserialization=True
    )
    print("기존 인덱스 로드 완료.")

RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
Descriptors cannot be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

In [None]:
import json

# Read the clustered wordbook. The encoding is given explicitly because the
# default for open() is the platform locale codec, which is not UTF-8 on
# Windows; the other JSON reads in this notebook already pass encoding="utf-8".
with open(
    "C:/Users/user/Desktop/eng-word/spell_stars/utils/generate/clustered_wordbook.json",
    encoding="utf-8",
) as file:
    data = json.load(file)

# Extract the words (the top-level keys of the JSON object).
words = list(data.keys())
len(words)

465

In [None]:
# NOTE(review): `file_path` is not assigned in this cell. The only assignment
# visible in this notebook points at grammar_sentences.csv, which is a CSV file
# and not JSON — confirm which JSON file this cell is meant to read.
with open(file_path, "r", encoding="utf-8") as file:
    word_data = json.load(file)

# Build the list of example sentences: for every entry that has a non-empty
# "examples" value, take the "english" field of its first example.
example_sentences = [
    entry["examples"][0]["english"]
    for word, entry in word_data.items()
    if "examples" in entry and entry["examples"]
]

In [6]:
# Display the `words` list for inspection.
# NOTE(review): presumably the list built from the top-level keys of
# clustered_wordbook.json — confirm against the kernel's execution order.
words

['knife',
 'scissors',
 'cut',
 'fork',
 'pull',
 'push',
 'help',
 'lunch box',
 'box',
 'card',
 'gift',
 'key',
 'ticket',
 'door',
 'window',
 'button',
 'breakfast',
 'lunch',
 'dinner',
 'morning',
 'afternoon',
 'evening',
 'night',
 'home',
 'time',
 'o’clock',
 'dance',
 'dancer',
 'walk',
 'train',
 'trip',
 'shop',
 'ski',
 'skiing',
 'skate',
 'bike',
 'swim',
 'pool',
 'swimming',
 'jump',
 'run',
 'fly',
 'go',
 'throw',
 'party',
 'have a party',
 'birthday',
 'picnic',
 'piano',
 'violin',
 'music',
 'musical',
 'light',
 'drum',
 'guitar',
 'recorder',
 'song',
 'radio',
 'singer',
 'fast',
 'slow',
 'long',
 'short',
 'smart',
 'foolish',
 'clean',
 'dirty',
 'bad',
 'good',
 'honest',
 'early',
 'late',
 'close',
 'come',
 'open',
 'quiet',
 'noisy',
 'sorry',
 'stop',
 'hear',
 'understand',
 'listen',
 'bookstore',
 'bookshelf',
 'bank',
 'library',
 'book',
 'post office',
 'closet',
 'fridge',
 'house',
 'wall',
 'apartment',
 'hospital',
 'ambulance',
 'nurse',


In [5]:
import pdfplumber
import fitz  # PyMuPDF
import os
import re
import json

In [7]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install pdfplumber




In [4]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# Pinned to the version shown in this cell's recorded output.
%pip install pymupdf==1.24.13

Collecting pymupdf
  Downloading PyMuPDF-1.24.13-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.13-cp39-abi3-win_amd64.whl (16.2 MB)
   ---------------------------------------- 0.0/16.2 MB ? eta -:--:--
   ------------------------------------ --- 14.7/16.2 MB 92.4 MB/s eta 0:00:01
   ---------------------------------------- 16.2/16.2 MB 40.9 MB/s eta 0:00:00
Installing collected packages: pymupdf
Successfully installed pymupdf-1.24.13


In [None]:
def pdf_to_json(pdf_path, output_directory):
    """Convert each page of a PDF into its own JSON file using pdfplumber.

    Per the original author's note, this is meant for every PDF except
    beyond.pdf.

    For each page, the result of ``extract_lesson_titles(page)`` is joined with
    " - " and passed through ``safe_filename`` to form part of the output file
    name (both helpers are defined outside this block; the first is assumed to
    return an iterable of strings). Every non-empty row of every table on the
    page is converted with ``extract_data_from_table``, and entries with at
    least one truthy value are collected. The collected entries are written to
    ``<pdf stem>_page_<n>_<title>.json`` in ``output_directory`` and printed.

    Args:
        pdf_path: Path of the PDF file to parse.
        output_directory: Directory that receives the per-page JSON files. It
            is created if it does not exist.

    Returns:
        None.
    """
    # Strip only the real extension. The former str.replace('.pdf', '') also
    # removed any '.pdf' occurring in the middle of the file name. Computed
    # once because it does not change from page to page.
    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]

    # open(..., "w") does not create parent directories, so make sure the
    # target exists. An empty string means the current directory.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages):
            # Build a file-name-safe title from the page's lesson titles.
            lesson_titles = extract_lesson_titles(page)
            title_text = " - ".join(lesson_titles)
            safe_title_text = safe_filename(title_text)

            # Convert every non-empty table row; keep entries that carry data.
            data = []
            for table in page.extract_tables():
                for row in table:
                    if len(row) > 0:
                        entry = extract_data_from_table(row)
                        if any(entry.values()):
                            data.append(entry)

            # Save the page as JSON; page numbers in the name are 1-based.
            output_file = os.path.join(
                output_directory,
                f"{pdf_stem}_page_{page_number + 1}_{safe_title_text}.json",
            )
            with open(output_file, "w", encoding="utf-8") as json_file:
                json.dump(data, json_file, ensure_ascii=False, indent=4)

            print(f"Parsed data for {pdf_path} page {page_number + 1}:", data)