## 1. Elasticsearch 설치

    sudo apt update
    sudo apt install openjdk-11-jdk -y


    wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo apt-key add -  
    sudo apt install apt-transport-https  
    echo "deb https://artifacts.elastic.co/packages/7.x/apt stable main" | sudo tee -a /etc/apt/sources.list.d/elastic-7.x.list  
    sudo apt update  
    sudo apt install elasticsearch  

    sudo systemctl start elasticsearch  
    sudo systemctl enable elasticsearch  

    sudo systemctl status elasticsearch  
    curl -X GET "localhost:9200/"

    pip install elasticsearch


    sudo vi /etc/elasticsearch/elasticsearch.yml  
    sudo systemctl restart elasticsearch


    sudo nano /etc/elasticsearch/elasticsearch.yml
    xpack.security.enabled: true ## 마지막 줄에 추가.  
    sudo systemctl restart elasticsearch  

    sudo /usr/share/elasticsearch/bin/elasticsearch-setup-passwords interactive ## 비밀번호 설정.

## 2. Elasticsearch 구동

In [10]:
# 엘라스틱서치의 데몬 인스턴스 만들기
import os
import json
import numpy as np
import pandas as pd

from subprocess import Popen, PIPE, STDOUT
from elasticsearch import Elasticsearch, helpers

es_server = Popen(['/home/pervinco/elasticsearch-8.8.0/bin/elasticsearch'],
                  stdout=PIPE, stderr=STDOUT,
                #   preexec_fn=lambda: os.setuid(1)  # as daemon
                 )

# # 인스턴스를 로드하는 데 약간의 시간이 걸림
# import time
# time.sleep(30)

In [2]:
# 서버가 구동되었는지 확인하는 명령어
os.system('ps -ef | grep elasticsearch')

elastic+  506926       1  4 22:23 ?        00:01:02 /usr/share/elasticsearch/jdk/bin/java -Xshare:auto -Des.networkaddress.cache.ttl=60 -Des.networkaddress.cache.negative.ttl=10 -XX:+AlwaysPreTouch -Xss1m -Djava.awt.headless=true -Dfile.encoding=UTF-8 -Djna.nosys=true -XX:-OmitStackTraceInFastThrow -XX:+ShowCodeDetailsInExceptionMessages -Dio.netty.noUnsafe=true -Dio.netty.noKeySetOptimization=true -Dio.netty.recycler.maxCapacityPerThread=0 -Dio.netty.allocator.numDirectArenas=0 -Dlog4j.shutdownHookEnabled=false -Dlog4j2.disable.jmx=true -Dlog4j2.formatMsgNoLookups=true -Djava.locale.providers=SPI,COMPAT --add-opens=java.base/java.io=ALL-UNNAMED -Djava.security.manager=allow -XX:+UseG1GC -Djava.io.tmpdir=/tmp/elasticsearch-981845695055292871 -XX:+HeapDumpOnOutOfMemoryError -XX:+ExitOnOutOfMemoryError -XX:HeapDumpPath=/var/lib/elasticsearch -XX:ErrorFile=/var/log/elasticsearch/hs_err_pid%p.log -Xlog:gc*,gc+age=trace,safepoint:file=/var/log/elasticsearch/gc.log:utctime,pid,tags:filecount

0

In [3]:
# Elasticsearch 접속 정보
username = 'elastic'
password = ''

# Elasticsearch 클라이언트 생성 (HTTP로 접속)
es = Elasticsearch(
    ['http://localhost:9200'],  # http로 변경
    basic_auth=(username, password)
)

# Elasticsearch 서버 정보 확인
resp = dict(es.info())
print(resp)


{'name': 'pervinco-B650-PG-Lightning', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'D1g_78dtTt2AY6aciu21vw', 'version': {'number': '7.17.24', 'build_flavor': 'default', 'build_type': 'deb', 'build_hash': 'fcf25fff740db6ab3ed5d145c58d70e4c3528ea7', 'build_date': '2024-09-05T07:34:51.812485320Z', 'build_snapshot': False, 'lucene_version': '8.11.3', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'}


## 3. 색인 및 검색 명령 실행

In [4]:
# 실제 다운로드 및 데이터셋 로딩
import datasets
dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train").to_pandas()
dataset.drop("id", axis=1, inplace=True)
print(f"shape of dataset: {dataset.shape}")
dataset.head()

shape of dataset: (287113, 2)


Unnamed: 0,article,highlights
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non..."
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical..."


In [5]:
## Elasticsearch 인덱스의 설정 및 매핑 정의
## 매핑은 DB의 스키마와 동일.
settings = {
    "settings":{
        "number_of_shards":1,
        "number_of_replicas":0
    },
    "mappings":{
        "properties":{
            "article":{
                "type":"text"
            },
            "highlights":{
                "type":"text"
            }
        }
    }
}

In [6]:
def json_formatter(dataset, index_name):
    """
    이 함수는 Elasticsearch 색인을 위한 JSON 형식의 딕셔너리를 생성하는 데 사용됩니다.

    Args:
      dataset: 이 함수를 적용하려는 데이터입니다.
      index_name: Elasticsearch의 인덱스 이름입니다.
    """
    try:
        List = []
        columns = dataset.columns
        for idx, row in dataset.iterrows(): ## 데이터셋을 row 단위로 순회함.
            dic = {}
            dic['_index'] = index_name
            source = {}
            for i in dataset.columns:
                source[i] = row[i]
            dic['_source'] = source
            List.append(dic)
        return List

    except Exception as e:
        print("There is a problem: {}".format(e))

In [7]:
MY_INDEX = es.indices.create(index="news_index", body=settings) ## 인덱스 생성
MY_INDEX

  MY_INDEX = es.indices.create(index="news_index", body=settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'news_index'})

In [8]:
# 너무 많아서 100개만 사용
dataset = dataset[:100]

json_Formatted_dataset = json_formatter(dataset=dataset, index_name='news_index')
json_Formatted_dataset[0]

{'_index': 'news_index',
 '_source': {'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his numbe

In [11]:
## Index에 데이터를 추가한다.
## 색인을 위해 elasticsearch.helpers API를 사용합니다.
res = helpers.bulk(es, json_Formatted_dataset[:100])
res

(100, [])

In [12]:
# 색인된 문서중 10개의 데이터 샘플 가져오기
query = es.search(
    index="news_index",
    body={
      "size":10,
      "query": {
        # 모든 문서가 매치된다는 의미 (_socre는 모두 1.0)
        "match_all":{}
      }
    }
)

output = pd.json_normalize((query['hits']['hits']))
output

  query = es.search(


Unnamed: 0,_index,_type,_id,_score,_source.article,_source.highlights
0,news_index,_doc,dbhiQ5IBSsti6fOEi_Rt,1.0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...
1,news_index,_doc,drhiQ5IBSsti6fOEi_Rt,1.0,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...
2,news_index,_doc,d7hiQ5IBSsti6fOEi_Rt,1.0,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa..."
3,news_index,_doc,eLhiQ5IBSsti6fOEi_Rt,1.0,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non..."
4,news_index,_doc,ebhiQ5IBSsti6fOEi_Rt,1.0,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical..."
5,news_index,_doc,erhiQ5IBSsti6fOEi_Rt,1.0,"BAGHDAD, Iraq (CNN) -- Dressed in a Superman s...","Parents beam with pride, can't stop from smili..."
6,news_index,_doc,e7hiQ5IBSsti6fOEi_Rt,1.0,"BAGHDAD, Iraq (CNN) -- The women are too afrai...","Aid workers: Violence, increased cost of livin..."
7,news_index,_doc,fLhiQ5IBSsti6fOEi_Rt,1.0,"BOGOTA, Colombia (CNN) -- A key rebel commande...",Tomas Medina Caracas was a fugitive from a U.S...
8,news_index,_doc,fbhiQ5IBSsti6fOEi_Rt,1.0,WASHINGTON (CNN) -- White House press secretar...,"President Bush says Tony Snow ""will battle can..."
9,news_index,_doc,frhiQ5IBSsti6fOEi_Rt,1.0,(CNN) -- Police and FBI agents are investigati...,Empty anti-tank weapon turns up in front of Ne...
