<a href="https://colab.research.google.com/github/nakamura196/cj_notes/blob/main/CJ_get_title_list.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cultural Japanの公開データから、タイトルと画像の一覧を取得する

`.env`ファイルを持っている内部の方向け

## セットアップ

`host`、`username`、`password`を含む`.env`ファイルをアップロードしてください。

In [None]:
from google.colab import files

# files.upload() returns a dict mapping each uploaded file name to its raw
# contents. Binding the result to a name keeps the notebook from echoing that
# dict (and therefore the credentials stored in .env) into the cell output.
uploaded = files.upload()

In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install python-dotenv

In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install opensearch-py

In [None]:
# Name of the search index queried by the aggregation request below.
index = 'items'

In [None]:
import os
import json
from dotenv import load_dotenv

from opensearchpy import OpenSearch

# Load the connection settings from the uploaded .env file. override=True lets
# values from the file replace variables that already exist in the environment.
load_dotenv(override=True)

host = os.getenv('host')
username = os.getenv('username')
password = os.getenv('password')

# Stop here with a readable message when a setting is absent, instead of
# handing None to the client and failing later with an obscure connection
# error. Only the setting names are reported, never their values.
_missing = [
    name
    for name, value in (('host', host), ('username', username), ('password', password))
    if not value
]
if _missing:
    raise RuntimeError(
        "Missing setting(s) in .env: " + ", ".join(_missing)
    )

# HTTPS connection (port 443) authenticated with the credentials from .env.
client = OpenSearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=(username, password),
    use_ssl=True
)

## source一覧のダウンロード

In [None]:
# Terms aggregation over "source.ja": one bucket per distinct value, ordered
# from the most to the fewest documents, limited to 5000 buckets.
source_terms = {
    "field": "source.ja",
    "order": {"_count": "desc"},
    "size": 5000,
}

# "size": 0 asks for the aggregation only, without any document hits.
query = {
    "size": 0,
    "aggs": {"source": {"terms": source_terms}},
}

In [None]:
# Serialise the aggregation request to a JSON string and run it against the
# configured index.
request_body = json.dumps(query)
result = client.search(index=index, body=request_body)

In [None]:
# Keep only the bucket list of the "source" aggregation; the download loop
# reads 'key' and 'doc_count' from each bucket.
source_agg = result['aggregations']['source']
buckets = source_agg['buckets']

## コレクションごとにダウンロードを実行

In [None]:
from tqdm import tqdm


def get_data(collection, size, per_page=1000):
    """Download all items of one collection and save every page as JSON.

    Pages are requested from the ``items`` index sorted by ``_uri`` ascending.
    Each following page is requested with ``search_after`` set to the sort
    values of the last hit of the previous page. Every raw response is written
    to ``result/<collection>/<zero-padded page number>.json``.

    Args:
        collection: Value matched against the ``source.ja`` field.
        size: Expected number of documents in the collection; it only decides
            how many pages are requested.
        per_page: Number of documents requested per page.
    """
    # Round up so that the last, partially filled page is fetched as well
    # (e.g. 1500 documents at 1000 per page need two requests, and a
    # collection smaller than one page still needs one request).
    loop = int((size + per_page - 1) // per_page)

    last = None

    for page in tqdm(range(0, loop)):

        # NOTE(review): "match" is a full-text query; if "source.ja" is not
        # mapped as an exact-value field this may also return items from
        # similarly named collections - confirm against the index mapping.
        query = {
            "query": {
                "match": {
                    "source.ja": collection
                }
            },
            "sort": [
                {
                    "_uri": {
                        "order": "asc"
                    }
                }
            ],
            "_source": [
                "_title_ja",
                "_image"
            ],

            "size": per_page
        }

        # From the second page on, continue right after the previous page.
        if last is not None:
            query["search_after"] = last

        result = client.search(index="items", body=json.dumps(query))

        hits = result['hits']['hits']
        if not hits:
            # Fewer documents than announced by `size`: nothing left to save,
            # and there is no last hit to continue from.
            break

        last = hits[-1]['sort']

        opath = f'result/{collection}/{str(page).zfill(8)}.json'
        os.makedirs(os.path.dirname(opath), exist_ok=True)

        with open(opath, 'w') as f:
            json.dump(result, f, ensure_ascii=False, indent=4,
                      sort_keys=True, separators=(',', ': '))

In [None]:
# Download every source returned by the aggregation, one after another. The
# bucket's document count is what get_data uses to decide how many pages to
# request.
for source_bucket in tqdm(buckets):
    get_data(collection=source_bucket['key'], size=source_bucket['doc_count'])

## 出力結果をマージ

In [None]:
import glob
import json
from tqdm import tqdm
import pandas as pd

# Merge all downloaded pages into one table with one [id, image, title] row
# per item that has at least one image.
files = glob.glob("result/*/*.json")
files.sort()

rows = []

for file in tqdm(files):
    with open(file, "r") as f:
        try:
            data = json.load(f)

        except Exception as e:
            # Report the unreadable page and carry on with the remaining ones.
            print(file, e)

            continue

    for item in data["hits"]["hits"]:
        # Named item_id rather than id so that the builtin id() is not
        # shadowed for every later cell of the notebook.
        item_id = item["_id"]
        source = item["_source"]

        # Items without an image (field absent, null or empty) are skipped.
        images = source.get("_image")
        if not images:
            continue

        # An item that has an image but no title is kept with an empty title
        # instead of aborting the whole merge with a KeyError/IndexError.
        titles = source.get("_title_ja")
        title = titles[0] if titles else ""

        rows.append([item_id, images[0], title])

df = pd.DataFrame(rows, columns=["id", "image", "title"])
df.to_json('items.jsonl', orient='records', force_ascii=False, lines=True)

In [None]:
# Pack the merged item list into a gzip-compressed archive.
!tar -czvf items.tar.gz items.jsonl

### 確認用のテストデータの作成

In [None]:
import random

# random.sample raises ValueError when asked for more rows than exist, so the
# sample size is capped at the number of available rows.
rows_rand = random.sample(rows, min(200, len(rows)))
df = pd.DataFrame(rows_rand, columns=["id", "image", "title"])
df.to_json('test.jsonl', orient='records', force_ascii=False, lines=True)