In [5]:
import pandas as pd
import json

In [3]:
# Load the raw article table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="fuseji.csv")

In [13]:
# Replace the "NN" author marker (presumably "no name" — TODO confirm) with None.
# Compare with == rather than `in`: `x in "NN"` is a substring test, so it would
# also blank out "N" and "", and it raises TypeError for non-string cells (e.g. NaN).
df["Author"] = df["Author"].apply(lambda author: None if author == "NN" else author)

In [14]:
# Serialize the frame to a JSON string (one object per row) and parse it back,
# which yields a list of plain dicts, one per row of the frame.
records_json = df.to_json(orient="records")
data = json.loads(records_json)

In [15]:
# Display the first record to sanity-check the keys and value types of `data`.
data[0]

{'Author': None,
 'Fuseji_number': 0,
 'Fuseji_pages': 0,
 'Fuseji_total': 0,
 'Month': 1,
 'Section': '巻頭言',
 'Title': '近く開かるべき軍縮大会議',
 'Year': 1926}

In [27]:
import os
from datetime import datetime

from elasticsearch import Elasticsearch
from elasticsearch import helpers
from tqdm import tqdm

In [28]:
# Connection URL of the Elasticsearch server. Credentials must not be stored in
# the notebook, so the URL (including any "user:password@" part) is read from the
# ES_SERVER environment variable; an unauthenticated local node is the fallback.
ES_SERVER = os.environ.get("ES_SERVER", "http://localhost:9200")

In [37]:
# Name of the index and of the document type the articles are stored under.
ARTICLE_INDEX = "ck_fuseji"
ARTICLE_DOC_TYPE = "article"

# Field mapping for the article document type:
#   - exact-match (keyword) fields: section, issue, authors
#   - integer fields: year, month and the three fuseji counters
#   - timestamp: date
#   - title: full text using the "kuromoji" analyzer, with fielddata enabled
ARTICLE_MAPPING = {
    ARTICLE_DOC_TYPE: dict(
        properties=dict(
            section=dict(type="keyword"),
            issue=dict(type="keyword"),
            year=dict(type="integer"),
            month=dict(type="integer"),
            timestamp=dict(type="date"),
            title=dict(type="text", fielddata=True, analyzer="kuromoji"),
            authors=dict(type="keyword"),
            fuseji_number=dict(type="integer"),
            fuseji_pages=dict(type="integer"),
            fuseji_total=dict(type="integer"),
        ),
    ),
}

In [38]:
es = Elasticsearch(ES_SERVER)

# Rebuild the index from scratch so that re-running this cell never leaves
# documents from a previous run behind.
if es.indices.exists(index=ARTICLE_INDEX):
    es.indices.delete(index=ARTICLE_INDEX)

es.indices.create(index=ARTICLE_INDEX)
es.indices.put_mapping(index=ARTICLE_INDEX, doc_type=ARTICLE_DOC_TYPE, body=ARTICLE_MAPPING)


def _calendar_month(raw_month):
    """Map the CSV month code to a real calendar month.

    The codes 71 and 72 are both mapped to 7 (presumably two separate July
    issues — TODO confirm); every other value is returned unchanged.
    """
    return 7 if raw_month in (71, 72) else raw_month


def _article_actions(articles):
    """Yield one bulk "index" action per article.

    The document id is the article's position in `articles`, as a string.
    """
    for position, article in enumerate(articles):
        # The timestamp is the first day of the issue's calendar month.
        timestamp = datetime(
            article["Year"], _calendar_month(article["Month"]), 1
        ).isoformat()
        yield {
            "_index": ARTICLE_INDEX,
            "_type": ARTICLE_DOC_TYPE,
            "_id": str(position),
            "_source": {
                # "issue" keeps the raw month code so 71 and 72 stay distinct.
                "issue": "{}/{}".format(article["Year"], article["Month"]),
                "year": article["Year"],
                # NOTE(review): "month" is indexed with the raw code (71/72 are
                # kept as-is) while "timestamp" uses the normalized month —
                # confirm this is intended.
                "month": article["Month"],
                "timestamp": timestamp,
                "section": article["Section"],
                "title": article["Title"],
                "authors": article["Author"],
                "fuseji_number": article["Fuseji_number"],
                "fuseji_pages": article["Fuseji_pages"],
                "fuseji_total": article["Fuseji_total"],
            },
        }


# Send the documents in batches instead of one HTTP request per article, and
# give tqdm the total so the progress bar shows a percentage and an ETA.
helpers.bulk(es, tqdm(_article_actions(data), total=len(data)))


4974it [03:57, 20.98it/s]
