# このJupyter Notebookの使い方
これはレシピ集として作成されています。最初に、初期設定セクションにて利用するElasticsearch環境やOpenAI環境、HuggingFace環境の接続情報を設定します。その後は、実行したいセクションA. B. C. ..から始めて順番にコマンドを実行してください。
そのためにセクション間で重複したコードが繰り返しあります。

# 1.初期設定

## ライブラリの有効化

In [2]:
%pip install -q elasticsearch requests


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
from pprint import pprint
import os, json
import requests
from getpass import getpass
from elasticsearch import Elasticsearch, helpers
from elasticsearch.helpers import bulk

## Elasticsearchの設定

In [4]:
ELASTIC_CLOUD_ID = getpass("Elastic deployment Cloud ID")
ELASTIC_API_KEY = getpass("Elastic deployment API Key")
if ELASTIC_CLOUD_ID == '':
  ELASTIC_URL = getpass("Elastic deployment URL. No need if Cloud ID is provided.")
if ELASTIC_API_KEY == '':
  ELASTIC_USER = getpass("Elastic user. No need if API key is provided.")
  ELASTIC_PASSWORD = getpass("Elastic password. No need if API key is provided.")

if ELASTIC_CLOUD_ID != '' and ELASTIC_API_KEY != '':
  es = Elasticsearch(
    cloud_id=ELASTIC_CLOUD_ID,
    api_key=ELASTIC_API_KEY,
    request_timeout=300
  )
elif ELASTIC_URL != '' and ELASTIC_USER != '' and ELASTIC_PASSWORD != '':
  es = Elasticsearch(
    hosts = ELASTIC_URL,
    basic_auth=(ELASTIC_USER, ELASTIC_PASSWORD),
    request_timeout=300
  )
elif ELASTIC_URL != '' and ELASTIC_USER == '':
  es = Elasticsearch(
    hosts = ELASTIC_URL,
    # request_timeout=300,
    request_timeout=300
  )
else:
  print("env needs to set either ELASTIC_CLOUD_ID or ELASTIC_URL")


pprint(es.info()) # should return cluster info

ObjectApiResponse({'name': 'instance-0000000054', 'cluster_name': '507a2cf6ba204071943512e0537eee58', 'cluster_uuid': 'oF-xDLtXRCet87gRuM3eJg', 'version': {'number': '8.11.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '76013fa76dcbf144c886990c6290715f5dc2ae20', 'build_date': '2023-12-05T10:03:47.729926671Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})


## Elasticsearchで使うインデックス名の設定

In [5]:
INDEX_NAME=input("Elasticsearchのインデックスの名前 (空入力はqiita-blog-appsearchになります):")
if INDEX_NAME == '':
    INDEX_NAME='qiita-blog-appsearch'
INDEX_NAME

'qiita-blog-appsearch'

# 2.検索ドキュメントのセットアップ


## Qiita記事のダウンロード

In [10]:
# h = {'Authorization': 'Bearer xxxx'} # ユーザ認証する場合
h = {}
url = "https://qiita.com/api/v2/items?"

# tag別に記事をPAGEだけ繰り返し取得
query = "&query=org%3Aelasticsearch_japan"
# 検索で指定した期間内に作成された記事数を取得
res = requests.get(url=url + query, headers=h)
# サーバーからの応答
print(res.status_code, res.reason)
# print("指定しているタグ: " + tag_name)
total_count = int(res.headers['Total-Count'])
print("total_count: " + str(total_count))

page = f"page=1&per_page={total_count}"
os.makedirs("qiita-downloads", exist_ok=True)
res = requests.get(url=url + page + query, headers=h)
documents = json.loads(res.text)
for doc in documents:
    search_doc = {
        "title": doc["title"],
        "url": doc["url"],
        "body": doc["body"],
        "tags": doc["tags"],
        "created_at": doc["created_at"],
        "updated_at": doc["updated_at"],
        "id": doc["id"],
        "likes_count": doc["likes_count"],
        "reactions_count": doc["reactions_count"],
        "stocks_count": doc["stocks_count"],
        "page_views_count": doc["page_views_count"],
        "organization_url_name": doc["organization_url_name"],
        "comments_count": doc["comments_count"]
    }
    title = doc["title"].replace('/', '_')
    filename = "./qiita-downloads" + "/" + title + ".json"
    with open(filename, 'w') as f:
        json.dump(search_doc, f, indent=2, ensure_ascii=False)

200 OK
total_count: 72


In [11]:
DRIVE_FOLDER = "./qiita-downloads/"
json_docs = []

# 指定されたフォルダ内のファイルを走査
for filename in os.listdir(DRIVE_FOLDER):
    # ファイルがJSONファイルであることを確認
    if filename.endswith(".json"):
        file_path = os.path.join(DRIVE_FOLDER, filename)
        # JSONファイルを開いてデータを読み取り、dictオブジェクトとして追加
        with open(file_path, "r", encoding="utf-8") as json_file:
            json_data = json.load(json_file)
            tags_names = []
            for tag in json_data['tags']:
                tags_names.append(tag['name'])
            json_data['tags'] = tags_names
            json_docs.append(json_data)

# 全てのJSONドキュメントがjson_docsリストに格納されました
print(f'document count: {len(json_docs)}')
print(json_docs[0] if len(json_docs) > 0 else None)

document count: 72
{'title': 'Elastic Stack 8.0 の NLP で日本語センチメント分析を試してみた - 後編', 'url': 'https://qiita.com/ijokarumawak@github/items/6cc714060090160cf2d5', 'body': '先日 [Elastic Stack 8.0 の NLP で日本語センチメント分析を試してみた](https://qiita.com/ijokarumawak@github/items/9b0c2d650536488718a5) を書いたところ、「これ、ちゃんと日本語で処理できるのかな？中の動きが知りたい」とコメントいただきました。確かに、モデル側では fugashi などを使っているのに Elasticsearch 側では使ってないはずですね。\n\n今回は Elastic Stack 8.0.1 を使って、 inference で判定させるテキストをどうやって tokenize しているかを調査してみました。\n\n勿体ぶらずにまずは結論から。\n\n## 結論: Elastic Stack 8.0.1 時点では、日本語は Unigram で扱われている\n\n全体の処理の流れを Tokenizer を中心に整理してみました。\n\n![nlp-tokenizer.png](https://qiita-image-store.s3.ap-northeast-1.amazonaws.com/0/20515/3f752cb2-c072-badf-a117-075bdc4971b8.png)\n\n\n1. ***学習フェーズ*** 事前に学習する部分。 BertForSequenceClassification というアーキテクチャの一部で、入力文字列を機械学習で扱うために変換する Transformer 。学習の成果物として **モデル** と **ボキャブラリ** ができます。\n1. Eland を使って学習済みのモデルとボキャブラリを Hugging Face から Elasticsearch インデックスにインポートします。\n1. ***判定フェーズ*** こちらが今回のテーマ、 Elastic Stack で分析対象の文字列をトーク

# 3.様々なサーチを試そう

## 3-A. Elasticsearchのキーワード検索 (BM25)

### インデックス作成
Kuromojiをアナライザーとして設定したElasticsearchのインデックスを作成します。

In [120]:
if es.indices.exists(index=INDEX_NAME):
    # If it exists, delete the index
    es.indices.delete(index=INDEX_NAME)
    print(f"Index '{INDEX_NAME}' deleted successfully.")
else:
    print(f"Index '{INDEX_NAME}' does not exist.")

es.indices.create(
  index=INDEX_NAME,
  settings={
      "index": {
          "number_of_shards": 1,
          "number_of_replicas": 1
      }
  }
)


Index 'qiita-blog-appsearch' deleted successfully.


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'qiita-blog-appsearch'})

### Kuromojiアナライザの設定

In [121]:
es.indices.close(index=INDEX_NAME)

add_settings = {
  "index": {
    "analysis": {
      "char_filter": {
        "normalize": {
          "mode": "compose",
          "name": "nfkc",
          "type": "icu_normalizer"
        }
      }
    }
  }
}
es.indices.put_settings(index=INDEX_NAME, body=add_settings)

add_settings = {
  "index": {
    "analysis": {
      "tokenizer": {
        "ja_kuromoji_tokenizer": {
          "mode": "search",
          "discard_compound_token": "true",
          "type": "kuromoji_tokenizer"
        }
      }
    }
  }
}

es.indices.put_settings(index=INDEX_NAME, body=add_settings)

# Define the new settings you want to apply
add_settings = {
  "index": {
    "analysis": {
      "analyzer": {
        "ja_kuromoji_index_analyzer": {
          "filter": [
            "kuromoji_baseform",
            "kuromoji_part_of_speech",
            "cjk_width",
            "ja_stop",
            "kuromoji_stemmer",
            "lowercase"
          ],
          "char_filter": [
            "normalize"
          ],
          "type": "custom",
          "tokenizer": "ja_kuromoji_tokenizer"
        },
        "ja_kuromoji_search_analyzer": {
          "filter": [
            "kuromoji_baseform",
            "kuromoji_part_of_speech",
            "cjk_width",
            "ja_stop",
            "kuromoji_stemmer",
            "lowercase"
          ],
          "char_filter": [
            "normalize"
          ],
          "type": "custom",
          "tokenizer": "ja_kuromoji_tokenizer"
        }
      }
    }
  }
}

es.indices.put_settings(index=INDEX_NAME, body=add_settings)

es.indices.open(index=INDEX_NAME, request_timeout=60)


  es.indices.open(index=INDEX_NAME, request_timeout=60)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True})

### App Search用のアナライザの設定

In [7]:
es.indices.close(index=INDEX_NAME)
add_settings = {
  "index": {
    "analysis": {
        "filter": {
          "front_ngram": {
            "type": "edge_ngram",
            "min_gram": "1",
            "max_gram": "12"
          },
          "bigram_joiner": {
            "max_shingle_size": "2",
            "token_separator": "",
            "output_unigrams": "false",
            "type": "shingle"
          },
          "bigram_max_size": {
            "type": "length",
            "max": "16",
            "min": "0"
          },
          "bigram_joiner_unigrams": {
            "max_shingle_size": "2",
            "token_separator": "",
            "output_unigrams": "true",
            "type": "shingle"
          },
          "delimiter": {
            "split_on_numerics": "true",
            "generate_word_parts": "true",
            "preserve_original": "false",
            "catenate_words": "true",
            "generate_number_parts": "true",
            "catenate_all": "true",
            "split_on_case_change": "true",
            "type": "word_delimiter_graph",
            "catenate_numbers": "true",
            "stem_english_possessive": "true"
          },
          "ja-stop-words-filter": {
            "type": "stop",
            "stopwords": "_japanese_"
          },
          "ja-stem-filter": {
            "type": "kuromoji_stemmer",
            "minimum_length": 4
          }
        },
        "analyzer": {
          "i_prefix": {
            "filter": [
              "front_ngram"
            ],
            "tokenizer": "kuromoji_tokenizer"
          },
          "iq_text_delimiter": {
            "filter": [
              "delimiter",
              "kuromoji_baseform",
              "kuromoji_part_of_speech",
              "cjk_width",
              "ja_stop",
              "kuromoji_stemmer",
              "lowercase"
            ],
            "char_filter": [
              "normalize"
            ],
            "tokenizer": "whitespace"
          },
          "q_prefix": {
            "filter": [
              "front_ngram"
            ],
            "tokenizer": "kuromoji_tokenizer"
          },
          "iq_text_base": {
            "filter": [
              "kuromoji_baseform",
              "kuromoji_part_of_speech",
              "cjk_width",
              "ja_stop",
              "kuromoji_stemmer",
              "lowercase"
            ],
            "char_filter": [
              "normalize"
            ],
            "tokenizer": "kuromoji_tokenizer"
          },
          "iq_text_stem": {
            "filter": [
              "kuromoji_baseform",
              "kuromoji_part_of_speech",
              "cjk_width",
              "ja_stop",
              "kuromoji_stemmer",
              "lowercase"
            ],
            "char_filter": [
              "normalize"
            ],
            "tokenizer": "kuromoji_tokenizer"
          },
          "i_text_bigram": {
            "filter": [
              "delimiter",
              "kuromoji_baseform",
              "kuromoji_part_of_speech",
              "cjk_width",
              "ja_stop",
              "kuromoji_stemmer",
              "lowercase",
              "bigram_joiner",
              "bigram_max_size"
            ],
            "char_filter": [
              "normalize"
            ],
            "tokenizer": "kuromoji_tokenizer"
          },
          "q_text_bigram": {
            "filter": [
              "delimiter",
              "kuromoji_baseform",
              "kuromoji_part_of_speech",
              "cjk_width",
              "ja_stop",
              "kuromoji_stemmer",
              "lowercase",
              "bigram_joiner_unigrams",
              "bigram_max_size"
            ],
            "char_filter": [
              "normalize"
            ],
            "tokenizer": "kuromoji_tokenizer"
          }
        }
      }
  }
}

es.indices.put_settings(index=INDEX_NAME, body=add_settings)
es.indices.open(index=INDEX_NAME, request_timeout=60)

  es.indices.put_settings(index=INDEX_NAME, body=add_settings)
  es.indices.open(index=INDEX_NAME, request_timeout=60)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True})

### Mapping定義

In [8]:
es.indices.close(index=INDEX_NAME, request_timeout=60)

add_mapping = {
  "properties": {
    "title": {
      "type": "text",
      "search_analyzer": "ja_kuromoji_search_analyzer",
      "analyzer": "ja_kuromoji_index_analyzer",
      "fields": {
        "prefix": {
          "search_analyzer": "ja_kuromoji_search_analyzer",
          "analyzer": "ja_kuromoji_index_analyzer",
          "type": "text",
          "index_options": "docs"
        },
        "delimiter": {
          "analyzer": "iq_text_delimiter",
          "type": "text",
          "index_options": "freqs"
        },
        "joined": {
          "search_analyzer": "q_text_bigram",
          "analyzer": "i_text_bigram",
          "type": "text",
          "index_options": "freqs"
        },
        "stem": {
          "analyzer": "iq_text_stem",
          "type": "text"
        }
      }
    },
    "body": {
      "type": "text",
      "search_analyzer": "ja_kuromoji_search_analyzer",
      "analyzer": "ja_kuromoji_index_analyzer",
      "fields": {
        "prefix": {
          "search_analyzer": "ja_kuromoji_search_analyzer",
          "analyzer": "ja_kuromoji_index_analyzer",
          "type": "text",
          "index_options": "docs"
        },
        "delimiter": {
          "analyzer": "iq_text_delimiter",
          "type": "text",
          "index_options": "freqs"
        },
        "joined": {
          "search_analyzer": "q_text_bigram",
          "analyzer": "i_text_bigram",
          "type": "text",
          "index_options": "freqs"
        },
        "stem": {
          "analyzer": "iq_text_stem",
          "type": "text"
        }
      }
    },
    "seq_num": {
      "type": "long"
    },
    "source": {
      "type": "keyword"
    },
    "tags": {
      "type": "keyword"
    },
    "url": {
      "type": "keyword"
    },
    "id": {
      "type": "keyword"
    },
    "created_at": {
      "type": "date"
    },
    "updated_at": {
      "type": "date"
    },
    "likes_count": {
      "type": "long"
    },
    "reactions_count": {
      "type": "date"
    },
    "stocks_count": {
      "type": "long"
    },
    "page_views_count": {
      "type": "long"
    },
    "comments_count": {
      "type": "long"
    },
    "organization_url_name": {
      "type": "keyword"
    }
  }
}

es.indices.put_mapping(index=INDEX_NAME, body=add_mapping)

es.indices.open(index=INDEX_NAME)

  es.indices.close(index=INDEX_NAME, request_timeout=60)
  es.indices.put_mapping(index=INDEX_NAME, body=add_mapping)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True})

### インジェスト

In [12]:
from elasticsearch import Elasticsearch, helpers

# 前の実行で残っているDocumentはクリアしてからインジェストします
if es.indices.exists(index=INDEX_NAME):
    es.delete_by_query(index=INDEX_NAME, body={"query": {"match_all": {}}})

index_docs = []
for doc in json_docs:
    # doc_json['_run_ml_inference'] = True
    index_docs.append({
        "_index": INDEX_NAME,
        "_source": doc,
    })

try:
    r = helpers.bulk(es, index_docs)
except Exception as e:
    print(e)
    raise e

response = es.search(index=INDEX_NAME, query={"match_all": {}}, source=["title"])
for hit in response['hits']['hits']:
    print(hit['_source'])

  es.delete_by_query(index=INDEX_NAME, body={"query": {"match_all": {}}})


{'title': 'Elastic Stack 8.0 の NLP で日本語センチメント分析を試してみた - 後編'}
{'title': 'ElasticsearchでRAG (Retrieval Augmented Generation) を試す'}
{'title': 'Elastic Stack 8.0 の NLP で日本語センチメント分析を試してみた - 前編'}
{'title': 'Elastic Observability による Kubernetes クラスタの管理'}
{'title': 'Elasticsearchで日付周りをPainlessを使ってうまい具合にハンドリングする'}
{'title': '[v8.5版] ElasticsearchとKibanaとElastic Agentの最速インストール手順 (試用環境として）'}
{'title': 'Lookup Runtime Field\u3000〜Elasticsearch 8.2 新機能〜'}
{'title': 'ElasticsearchのFrozenデータティアにデータが入るのをテストしてみた (2)'}
{'title': 'Elasticsearchのマシン・ラーニング異常検知の動きを理解する(3) [変更設定編]'}
{'title': 'Elasticsearchにカスタム時系列データを取り込む (Elastic Agent編)'}


### App Seach用のALIASの設定

In [118]:
ALIAS=f"search-{INDEX_NAME}"
es.indices.put_alias(index=INDEX_NAME, name=ALIAS)
ALIAS

qiita-blog-appsearch


'search-qiita-blog-appsearch'