In [9]:
import pandas as pd
from openai import OpenAI
from sqlalchemy import create_engine, Column, String, MetaData, Table
from pgvector.sqlalchemy import Vector
import os
from dotenv import load_dotenv
import json

# Load the raw export; the records of interest live under the 'service' key.
with open('service_dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

df = pd.DataFrame(data['service'])

# Label for each content-type id; ids missing from the mapping become "".
content_type_labels = {"12": "관광지", "14": "문화시설"}
df['contentType'] = df['contentTypeId'].map(content_type_labels).fillna("")

# Collapse the verbose category name to its short form (exact-value match only).
df['cat1Nm'] = df['cat1Nm'].replace("인문(문화/예술/역사)", "인문")

# One row per distinct (contentTypeId, cat1, catNm) combination, where catNm is
# "<content type label> <category name>". The assigned Series is aligned to the
# selected rows by index.
final_df = (
    df[['contentTypeId', 'cat1']]
    .fillna("")
    .assign(catNm=df["contentType"] + " " + df["cat1Nm"].fillna(""))
    .drop_duplicates()
    .reset_index(drop=True)
)

print(final_df)

  contentTypeId cat1    catNm
0            12  A01   관광지 자연
1            12  A02   관광지 인문
2            14  A02  문화시설 인문


In [10]:
from sqlalchemy import create_engine, MetaData, Table, Column, String
from pgvector.sqlalchemy import Vector

# Pull OPENAI_API_KEY (and, optionally, DATABASE_URL) from a local .env file.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Prefer a connection URL supplied through the environment so credentials do not
# have to live in the notebook; the original local-development URL is kept as the
# fallback so existing setups keep working unchanged.
engine = create_engine(
    os.getenv(
        "DATABASE_URL",
        "postgresql+psycopg2://postgres:1234@localhost:5432/postgres",
    )
)

metadata = MetaData()

# Embed every category label with a single request instead of one request per row.
# NOTE(review): the API limits how many inputs one request may carry; split into
# chunks if the category list ever grows large — confirm the current limit.
category_texts = final_df["catNm"].tolist()
if not category_texts:
    raise ValueError("final_df has no rows; there is nothing to embed")

response = client.embeddings.create(
    model="text-embedding-3-small",
    input=category_texts,
)

# Order by the index each result reports so that embeddings[i] belongs to
# row i of final_df.
embeddings = [
    item.embedding
    for item in sorted(response.data, key=lambda item: item.index)
]

# The vector column width is taken from the model's actual output size.
dimension = len(embeddings[0])

table = Table(
    "service_category",
    metadata,
    Column("contentTypeId", String),
    Column("cat1", String),
    Column("catNm", String),
    Column("embedding", Vector(dimension))
)

# Creates the table only if it does not exist yet.
metadata.create_all(engine)

# 5. Preview the rows that are going to be stored (nothing is inserted here,
#    so no database connection is needed).
for i, row in final_df.iterrows():
    print(i, row)


0 contentTypeId        12
cat1                A01
catNm            관광지 자연
Name: 0, dtype: object
1 contentTypeId        12
cat1                A02
catNm            관광지 인문
Name: 1, dtype: object
2 contentTypeId         14
cat1                 A02
catNm            문화시설 인문
Name: 2, dtype: object


In [11]:
from sqlalchemy import insert

# zip() would silently drop trailing rows if the two sequences ever got out of
# sync (e.g. an earlier cell was re-run on its own), so fail loudly instead.
if len(embeddings) != len(final_df):
    raise ValueError(
        f"embeddings ({len(embeddings)}) and final_df ({len(final_df)}) "
        "have different lengths"
    )

# One parameter dict per row, pairing each row with its embedding by position.
rows = [
    {
        "contentTypeId": row["contentTypeId"],
        "cat1": row["cat1"],
        "catNm": row["catNm"],
        "embedding": embedding,
    }
    for (_idx, row), embedding in zip(final_df.iterrows(), embeddings)
]

# engine.begin() commits when the block exits normally and rolls back if it
# raises, so the rows are stored all-or-nothing without a manual commit().
# NOTE: re-running this cell appends the same rows again; the table has no
# uniqueness constraint.
if rows:
    with engine.begin() as conn:
        # A list of parameter dicts makes this a single executemany call.
        conn.execute(insert(table), rows)