In [1]:
import pandas as pd
from openai import OpenAI
from sqlalchemy import create_engine, Column, String, MetaData, Table
from pgvector.sqlalchemy import Vector
import os
from dotenv import load_dotenv
import json

# Load the raw service-category records from the JSON export.
with open('service_dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

df = pd.DataFrame(data['service'])

# Shorten the verbose top-level label before it is used to build catNm.
df['cat1Nm'] = df['cat1Nm'].replace("인문(문화/예술/역사)", "인문")

# Category codes; missing levels become empty strings.
final_df = df[['contentTypeId', 'cat1', 'cat2', 'cat3']].fillna("")

# Human-readable label "cat1Nm cat2Nm cat3Nm". Rows without a cat2/cat3 name
# would otherwise end in stray spaces (e.g. "자연  "), and that padded text is
# what gets embedded and stored, so collapse space runs and trim the ends.
joined_names = df["cat1Nm"].fillna("") + " " + df["cat2Nm"].fillna("") + " " + df["cat3Nm"].fillna("")
final_df["catNm"] = joined_names.str.replace(r" {2,}", " ", regex=True).str.strip()
final_df

Unnamed: 0,contentTypeId,cat1,cat2,cat3,catNm
0,12,A01,,,자연
1,12,A01,A0101,,자연 자연관광지
2,12,A01,A0101,A01010100,자연 자연관광지 국립공원
3,12,A01,A0101,A01010200,자연 자연관광지 도립공원
4,12,A01,A0101,A01010300,자연 자연관광지 군립공원
...,...,...,...,...,...
72,14,A02,A0206,A02061000,인문 문화시설 대형서점
73,14,A02,A0206,A02061100,인문 문화시설 문화전수시설
74,14,A02,A0206,A02061200,인문 문화시설 영화관
75,14,A02,A0206,A02061300,인문 문화시설 어학당


In [10]:
from sqlalchemy import create_engine, MetaData, Table, Column, String
from pgvector.sqlalchemy import Vector

# Pull secrets from a local .env file so they do not have to live in the notebook.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Prefer DATABASE_URL from the environment / .env. The literal is kept only as
# the local-development fallback this notebook originally used.
engine = create_engine(
    os.getenv("DATABASE_URL", "postgresql+psycopg2://postgres:1234@localhost:5432/postgres")
)

metadata = MetaData()

# Embed every category label in one request instead of one HTTP call per row.
category_texts = final_df["catNm"].tolist()
if not category_texts:
    raise ValueError("final_df has no rows to embed")

response = client.embeddings.create(
    model="text-embedding-3-small",
    input=category_texts
)
# Each result carries the index of its input; sort on it so that
# embeddings[i] is guaranteed to belong to the i-th row of final_df.
embeddings = [item.embedding for item in sorted(response.data, key=lambda item: item.index)]

# Size the vector column from the model's actual output length.
dimension = len(embeddings[0])

# NOTE: the Vector column type comes from pgvector; the target database needs
# that extension enabled before create_all() can create this table.
table = Table(
    "service_category",
    metadata,
    Column("contentTypeId", String),
    Column("cat1", String),
    Column("cat2", String),
    Column("cat3", String),
    Column("catNm", String),
    Column("embedding", Vector(dimension))
)

# Creates the table only if it does not exist yet.
metadata.create_all(engine)

# Preview the rows that will be inserted (the insert itself happens in the next cell).
for i, row in final_df.iterrows():
    print(i, row)


0 contentTypeId      12
cat1              A01
cat2                 
cat3                 
catNm            자연  
Name: 0, dtype: object
1 contentTypeId           12
cat1                   A01
cat2                 A0101
cat3                      
catNm            자연 자연관광지 
Name: 1, dtype: object
2 contentTypeId               12
cat1                       A01
cat2                     A0101
cat3                 A01010100
catNm            자연 자연관광지 국립공원
Name: 2, dtype: object
3 contentTypeId               12
cat1                       A01
cat2                     A0101
cat3                 A01010200
catNm            자연 자연관광지 도립공원
Name: 3, dtype: object
4 contentTypeId               12
cat1                       A01
cat2                     A0101
cat3                 A01010300
catNm            자연 자연관광지 군립공원
Name: 4, dtype: object
5 contentTypeId            12
cat1                    A01
cat2                  A0101
cat3              A01010400
catNm            자연 자연관광지 산
Name: 5, dtype: object


In [11]:
from sqlalchemy import insert

# zip() stops at the shorter input, so a length mismatch would silently drop
# rows; fail loudly instead.
if len(embeddings) != len(final_df):
    raise ValueError(
        f"expected one embedding per row: {len(embeddings)} embeddings for {len(final_df)} rows"
    )

# One parameter dict per row, pairing each row with its embedding by position.
value_columns = ["contentTypeId", "cat1", "cat2", "cat3", "catNm"]
records = [
    {**{name: row[name] for name in value_columns}, "embedding": embedding}
    for (_, row), embedding in zip(final_df.iterrows(), embeddings)
]

# NOTE: the table is defined without a primary key, so re-running this cell
# appends the same rows again.
if records:
    # engine.begin() commits when the block succeeds and rolls back if it raises;
    # passing the whole list lets the driver run a single executemany.
    with engine.begin() as conn:
        conn.execute(insert(table), records)