In [1]:
import pandas as pd
# Load the review ratings; the path is relative to the notebook's working directory.
RATINGS_CSV_PATH = "../data/test_rating.csv"
rating_df = pd.read_csv(RATINGS_CSV_PATH)

In [2]:
# Collect the distinct user_idx values present in the ratings.
unique_users = set(rating_df['user_idx'])

# Keep the N_SELECTED_USERS smallest user ids. A set has no guaranteed
# iteration order, so sort before slicing to make the sample reproducible
# regardless of interpreter or insertion order.
N_SELECTED_USERS = 10_000
selected_users = sorted(unique_users)[:N_SELECTED_USERS]

# Restrict the ratings to the rows that belong to the selected users.
sliced_df = rating_df[rating_df['user_idx'].isin(selected_users)]

# Sanity-check the size of the sample.
print(f"원래 사용자 수: {len(unique_users)}")
print(f"슬라이스된 사용자 수: {len(set(sliced_df['user_idx']))}")
print(f"슬라이스된 데이터프레임 크기: {sliced_df.shape}")
sliced_df.head()

원래 사용자 수: 28921
슬라이스된 사용자 수: 10000
슬라이스된 데이터프레임 크기: (34284, 4)


Unnamed: 0,review_idx,user_idx,product_idx,review_rating
2,3,861,1690,4
3,4,5391,3953,5
9,10,6266,1345,4
12,13,4427,61,2
19,20,8323,769,5


In [3]:
# Build the user x product rating matrix: one row per user_idx and one column
# per product_idx. Repeated (user, product) pairs are averaged, which is
# pivot_table's default aggregation spelled out here, and pairs without a
# rating are filled with 0.
user_item_matrix = sliced_df.pivot_table(
    values='review_rating',
    index='user_idx',
    columns='product_idx',
    aggfunc='mean',
    fill_value=0,
)

In [4]:
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from datetime import datetime
import time
import logging

# Logging setup: grab a handle on the logger named "elasticsearch", then
# configure the root logger so INFO-and-above records are emitted.
logger = logging.getLogger("elasticsearch")
logging.basicConfig(level=logging.INFO)

In [5]:
# Row labels (users) and column labels (items) of the rating matrix. They are
# kept so each embedding row can later be matched back to its id.
user_idx_list = user_item_matrix.index
item_idx_list = user_item_matrix.columns

# Sparse copies of the ratings: users as rows, and the transpose with items as rows.
ratings_dense = user_item_matrix.to_numpy()
user_matrix_sparse = csr_matrix(ratings_dense)
item_matrix_sparse = csr_matrix(ratings_dense.T)

n_components = 128  # number of latent dimensions (hyperparameter, tunable)

# Two independent decompositions with identical settings: one is fitted on the
# user-row matrix, the other on the item-row matrix.
svd_kwargs = {"n_components": n_components, "random_state": 42}
user_svd = TruncatedSVD(**svd_kwargs)
item_svd = TruncatedSVD(**svd_kwargs)

# Row i of User_svd corresponds to user_idx_list[i]; row j of Item_svd
# corresponds to item_idx_list[j].
User_svd = user_svd.fit_transform(user_matrix_sparse)
Item_svd = item_svd.fit_transform(item_matrix_sparse)

In [6]:
# Connect to Elasticsearch. The headers are sent with every request; the
# Accept value asks for the version-8 compatible JSON media type.
es_hosts = ["http://elasticsearch:9200"]
es_headers = {
    "Accept": "application/vnd.elasticsearch+json; compatible-with=8",
    "Content-Type": "application/json",
}
es = Elasticsearch(es_hosts, headers=es_headers)

In [7]:
def create_vector_indices(es, n_components):
    """Recreate the "user-based" and "item-based" vector indices from scratch.

    For each of the two indices, an already existing index is deleted first
    (dropping whatever it contained) and a fresh one is created with the same
    mapping and settings.

    Args:
        es: Elasticsearch client; only ``es.indices.exists``,
            ``es.indices.delete`` and ``es.indices.create`` are used.
        n_components: Dimensionality declared for the ``vector`` field.

    Returns:
        None. Progress is reported with ``print``.
    """
    # HNSW graph parameters for the approximate nearest-neighbour index.
    hnsw_options = {
        "type": "hnsw",
        "m": 16,
        "ef_construction": 200,
    }

    # The embedding itself: an indexed dense vector compared by cosine similarity.
    vector_field = {
        "type": "dense_vector",
        "dims": n_components,
        "index": True,
        "similarity": "cosine",
        "index_options": hnsw_options,
    }

    vector_mapping = {
        "properties": {
            "vector": vector_field,
            "last_updated": {"type": "date"},
        }
    }

    # Plain index settings only; no kNN-specific settings are passed.
    vector_settings = {
        "index": {
            "number_of_shards": 3,
            "number_of_replicas": 1,
        }
    }

    for target in ("user-based", "item-based"):
        # Drop a pre-existing index so it is rebuilt with the current mapping.
        already_present = es.indices.exists(index=target)
        if already_present:
            print(f"Deleting existing index: {target}")
            es.indices.delete(index=target)

        print(f"Creating index: {target}")
        es.indices.create(
            index=target,
            mappings=vector_mapping,
            settings=vector_settings,
        )
        print(f"Finished Creating index: {target}")


def _bulk_store_vectors(es, index_name, id_list, vectors, label):
    """Bulk-index one vector document per id into ``index_name``.

    Shared implementation behind ``bulk_store_user_vectors`` and
    ``bulk_store_item_vectors``.

    Args:
        es: Elasticsearch client.
        index_name: Name of the target index.
        id_list: Iterable of ids; each is stringified and used as the document ``_id``.
        vectors: Indexable collection where ``vectors[i]`` is the embedding for
            the i-th id and exposes ``.tolist()``.
        label: Word used in the progress message (e.g. "user" or "item").

    Returns:
        None. A summary is reported with ``print``.
    """
    start_time = time.time()

    def generate_actions():
        # Lazily yield one index action per id so the bulk helper can chunk
        # them without materialising every document up front.
        for i, doc_id in enumerate(id_list):
            yield {
                "_index": index_name,
                "_id": str(doc_id),
                "_source": {
                    "vector": vectors[i].tolist(),
                    # NOTE(review): naive local-time timestamp; confirm this
                    # is the intended time zone for the `date` field.
                    "last_updated": datetime.now().isoformat(),
                },
            }

    # Set the timeout on the client through options() rather than passing
    # request_timeout as a per-request keyword argument to the bulk helper.
    client = es.options(request_timeout=120)
    success, failed = bulk(client, generate_actions(), chunk_size=1000)

    end_time = time.time()
    print(f"Bulk indexed {success} {label} vectors in {end_time - start_time:.2f} seconds")
    # With the helper's default raise_on_error behaviour, indexing errors are
    # raised instead of returned, so this list is normally empty.
    if failed:
        print(f"Failed to index {len(failed)} documents")


def bulk_store_user_vectors(es, user_idx_list, User_svd):
    """Bulk-index the user embeddings into the "user-based" index.

    Args:
        es: Elasticsearch client.
        user_idx_list: User ids, in the same order as the rows of ``User_svd``.
        User_svd: Indexable collection of user vectors; row ``i`` belongs to
            ``user_idx_list[i]``.

    Returns:
        None.
    """
    _bulk_store_vectors(es, "user-based", user_idx_list, User_svd, "user")


def bulk_store_item_vectors(es, item_idx_list, Item_svd):
    """Bulk-index the item embeddings into the "item-based" index.

    Args:
        es: Elasticsearch client.
        item_idx_list: Item ids, in the same order as the rows of ``Item_svd``.
        Item_svd: Indexable collection of item vectors; row ``i`` belongs to
            ``item_idx_list[i]``.

    Returns:
        None.
    """
    _bulk_store_vectors(es, "item-based", item_idx_list, Item_svd, "item")

In [None]:
# (Re)create both vector indices; an existing index is deleted first.
create_vector_indices(es=es, n_components=n_components)

# Load the embeddings through the bulk API: users first, then items.
bulk_store_user_vectors(es=es, user_idx_list=user_idx_list, User_svd=User_svd)
bulk_store_item_vectors(es=es, item_idx_list=item_idx_list, Item_svd=Item_svd)

INFO:elastic_transport.transport:HEAD http://elasticsearch:9200/user-based [status:404 duration:0.479s]


Creating index: user-based


INFO:elastic_transport.transport:PUT http://elasticsearch:9200/user-based [status:200 duration:2.909s]
INFO:elastic_transport.transport:HEAD http://elasticsearch:9200/item-based [status:404 duration:0.069s]


Finished Creating index: user-based
Creating index: item-based


INFO:elastic_transport.transport:PUT http://elasticsearch:9200/item-based [status:200 duration:1.031s]


Finished Creating index: item-based


  success, failed = bulk(es, generate_user_actions(), chunk_size=1000, request_timeout=120)
INFO:elastic_transport.transport:PUT http://elasticsearch:9200/_bulk [status:200 duration:15.741s]
INFO:elastic_transport.transport:PUT http://elasticsearch:9200/_bulk [status:200 duration:8.663s]
INFO:elastic_transport.transport:PUT http://elasticsearch:9200/_bulk [status:200 duration:2.223s]
INFO:elastic_transport.transport:PUT http://elasticsearch:9200/_bulk [status:200 duration:2.860s]
INFO:elastic_transport.transport:PUT http://elasticsearch:9200/_bulk [status:200 duration:2.259s]
INFO:elastic_transport.transport:PUT http://elasticsearch:9200/_bulk [status:200 duration:2.067s]
INFO:elastic_transport.transport:PUT http://elasticsearch:9200/_bulk [status:200 duration:2.577s]
INFO:elastic_transport.transport:PUT http://elasticsearch:9200/_bulk [status:200 duration:1.751s]
INFO:elastic_transport.transport:PUT http://elasticsearch:9200/_bulk [status:200 duration:2.924s]
INFO:elastic_transport.tr

Bulk indexed 10000 user vectors in 45.79 seconds


INFO:elastic_transport.transport:PUT http://elasticsearch:9200/_bulk [status:200 duration:1.145s]
INFO:elastic_transport.transport:PUT http://elasticsearch:9200/_bulk [status:200 duration:1.230s]
INFO:elastic_transport.transport:PUT http://elasticsearch:9200/_bulk [status:200 duration:1.077s]


In [None]:
# 1. List the indices returned by the cat API together with their document counts.
print("=== 인덱스 리스트 ===")
indices = es.cat.indices(format="json")
for index in indices:
    print(f"인덱스: {index['index']}, 문서 수: {index['docs.count']}")

In [None]:
# 2. Inspect a sample of the documents stored in one index.
# The index name is defined once so the banner, the query and the
# "no documents" message always refer to the index actually searched.
target_index = "user-based"
print(f"\n=== '{target_index}' 인덱스 문서 내용 ===")
search_result = es.search(
    index=target_index,
    query={"match_all": {}},  # match every document
    size=10,  # return at most 10 documents
)

# Print each returned hit, or a notice when nothing came back.
hits = search_result['hits']['hits']
if hits:
    for hit in hits:
        print(f"문서 ID: {hit['_id']}, 내용: {hit['_source']}")
else:
    print(f"인덱스 '{target_index}'에 문서가 없습니다.")