In [None]:
import pandas as pd
import numpy as np
import faiss
from openai import OpenAI
import pickle
# ✅ OpenAI client setup.
# The API key is read from the OPENAI_API_KEY environment variable (the
# client's default when no api_key argument is given), so no credential is
# stored in the notebook. Construction fails immediately if the variable is unset.
client = OpenAI()

# ✅ Load the lyrics CSV.
# The location can be overridden with the LYRICS_CSV environment variable;
# the default is the path the notebook was originally written against.
import os

csv_path = os.environ.get(
    "LYRICS_CSV",
    "/Users/baehanjun/Downloads/kpop-lyrics-analytics-main/datasets/lyrics_by_year_1964_2023.csv",
)
df = pd.read_csv(csv_path)


def _as_text(value):
    """Return ``str(value)``, or an empty string when ``value`` is missing (NaN/None)."""
    return "" if pd.isna(value) else str(value)


# ✅ Build the text that gets embedded: "<title> / <singer> / <lyric>".
# Missing fields become "" so the literal string "nan" is never embedded.
# Iterating the columns directly avoids a row-wise DataFrame.apply.
df["text"] = [
    f"{_as_text(title)} / {_as_text(singer)} / {_as_text(lyric)}"
    for title, singer, lyric in zip(df["title"], df["singer"], df["lyric"])
]

# ✅ Create an embedding with the OpenAI embeddings API
def get_embedding(text, model="text-embedding-3-small", api_client=None):
    """Embed ``text`` and return the vector as a 1-D float32 NumPy array.

    Parameters
    ----------
    text : str
        The text to embed. Must not be an empty string (the embeddings
        endpoint does not accept one).
    model : str, optional
        Name of the embedding model. Defaults to ``"text-embedding-3-small"``.
    api_client : optional
        Object exposing ``embeddings.create(model=..., input=...)``. When
        omitted, the module-level ``client`` is used.

    Returns
    -------
    numpy.ndarray
        The first embedding in the response, converted to ``float32``.

    Raises
    ------
    ValueError
        If ``text`` is an empty string; raised before any request is sent.
    """
    if isinstance(text, str) and not text:
        raise ValueError("text must be a non-empty string")

    if api_client is None:
        api_client = client  # fall back to the notebook-wide client

    response = api_client.embeddings.create(model=model, input=text)
    emb = response.data[0].embedding
    return np.array(emb, dtype="float32")   # faiss requires float32

# Embed each row's text (one get_embedding call per row, in row order)
# and stack the vectors into a single (N, dim) matrix.
emb_list = [get_embedding(row_text) for row_text in df["text"]]
emb_array = np.vstack(emb_list)

# ✅ Build the FAISS index (L2 distance is used instead of cosine)
dim = emb_array.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(emb_array)

print("✅ FAISS Index size:", index.ntotal)

# ✅ Metadata records (id, year, title, singer, text), one dict per row,
# in the same row order as the vectors added to the index.
meta_columns = ["id", "year", "title", "singer", "text"]
metadata = df[meta_columns].to_dict(orient="records")

# ✅ Persist the index and the metadata to the working directory
faiss.write_index(index, "songs.index")

with open("songs_meta.pkl", "wb") as meta_file:
    pickle.dump(metadata, meta_file)

print("✅ 저장 완료")

✅ FAISS Index size: 4666
✅ 저장 완료


In [9]:
import faiss
# Sanity check: reload the index from the path the build cell writes
# ("songs.index" in the working directory) and print how many vectors it holds.
# Reading the file this notebook produces keeps the check valid after
# Restart Kernel -> Run All.
idx = faiss.read_index("songs.index")
print(idx.ntotal)

4666
