In [18]:
from elasticsearch import Elasticsearch
from cassandra.cluster import Cluster
from cassandra.query import BatchStatement

# Client handles for both data stores. `es` and `session` are used by the
# statements and cells that follow.
es = Elasticsearch(hosts=['http://localhost:9200'])
print("Elasticsearch is ready.")

# Cassandra is reached on a non-default port (32769) on the local host.
cluster = Cluster(contact_points=['127.0.0.1'], port=32769)
session = cluster.connect()
print("Cassandra is ready.\n")

# --- Elasticsearch paging parameters ---------------------------------------
index_name = 'books'
batch_size = 1500                        # hits requested per search call
sort_field, sort_order = '_doc', 'asc'
search_after = None                      # no cursor exists before the first page
i = 0                                    # running count of copied documents

# --- Cassandra schema --------------------------------------------------------
# Every run starts from an empty `books` keyspace: drop it, then recreate it.
session.execute("DROP KEYSPACE IF EXISTS books")
session.execute("CREATE KEYSPACE IF NOT EXISTS books WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 1};")

# Adjacent literals are concatenated into a single-line CQL statement.
create_table_query_parents = (
    "CREATE TABLE books.parents ("
    "    title text,"
    "    url text,"
    "    contributors text,"
    "    date text,"
    "    format text,"
    "    trove_id text PRIMARY KEY,"
    "    pages int"
    ");"
)
insert_query_for_parents = "INSERT INTO books.parents (trove_id,title,url,contributors,date,format,pages) VALUES (?,?,?,?,?,?,?)"

# Create the table first, then prepare the insert that targets it.
session.execute(create_table_query_parents)
insert_statement = session.prepare(insert_query_for_parents)

def fetch_parents_page(after=None):
    """Run one paged search for documents whose `parent` term is not "".

    Reads `es`, `index_name`, `batch_size`, `sort_field` and `sort_order`
    from the notebook namespace at call time.

    Args:
        after: the `sort` values of the last hit of the previous page, or
            None to request the first page.

    Returns:
        The raw response returned by `es.search`.
    """
    body = {
        "query": {
            "bool": {
                "must": [],
                "must_not": [{"term": {"parent": ""}}],
            }
        },
        'sort': [{sort_field: sort_order}],
        'size': batch_size,
    }
    # Only send a cursor once one exists; the first page omits the key
    # instead of sending `"search_after": null`.
    if after is not None:
        body['search_after'] = after
    return es.search(index=index_name, body=body)


# Copy every matching document into books.parents, one page at a time.
res = fetch_parents_page(search_after)
while res['hits']['hits']:
    page_hits = res['hits']['hits']

    for hit in page_hits:
        doc = hit['_source']
        i += 1
        # Column order must match the prepared INSERT statement.
        row = (
            doc['trove_id'],
            doc['title'],
            doc['url'],
            doc['contributors'],
            doc['date'],
            doc['format'],
            int(doc['pages'])  # the `pages` column is declared as int
        )
        session.execute(insert_statement, row)
        print(f"{i} docs added to books.parents.", end="\r")

    # The sort values of the last hit are the cursor for the next page.
    search_after = page_hits[-1]['sort']
    res = fetch_parents_page(search_after)

print(f"\nTotal documents: {i}\n")


Elasticsearch is ready.
Cassandra is ready.



  res = es.search(


1423 docs added to books.parents.
Total documents: 1423



  res = es.search(


In [19]:
# Elasticsearch paging parameters, reset for the children pass.
index_name = 'books'
batch_size = 1000      # hits requested per search call
sort_field = '_doc'
sort_order = 'asc'
i = 0                  # running count of copied documents
search_after = None    # no cursor exists before the first page

# Cassandra statements for the children table.
insert_query_for_children = "INSERT INTO books.children (trove_id,title,url,contributors,date,format,pages) VALUES (?,?,?,?,?,?,?)"
create_table_query_children= "CREATE TABLE books.children (\
    title text,\
    url text,\
    contributors text,\
    date text,\
    format text,\
    trove_id text PRIMARY KEY,\
    pages int\
);"

# Drop any table left over from a previous execution of this cell so that
# re-running it starts from an empty table instead of failing on the
# CREATE TABLE below.
session.execute("DROP TABLE IF EXISTS books.children")
session.execute(create_table_query_children)

# Prepare the insert statement
insert_statement = session.prepare(insert_query_for_children)

def fetch_children_page(after=None):
    """Run one paged search for documents whose `children` term is not "".

    Reads `es`, `index_name`, `batch_size`, `sort_field` and `sort_order`
    from the notebook namespace at call time.

    Args:
        after: the `sort` values of the last hit of the previous page, or
            None to request the first page.

    Returns:
        The raw response returned by `es.search`.
    """
    body = {
        "query": {
            "bool": {
                "must_not": [{"term": {"children": ""}}],
            }
        },
        'sort': [{sort_field: sort_order}],
        'size': batch_size,
    }
    # Only send a cursor once one exists; the first page omits the key
    # instead of sending `"search_after": null`.
    if after is not None:
        body['search_after'] = after
    return es.search(index=index_name, body=body)


# Copy every matching document into books.children, one page at a time.
res = fetch_children_page(search_after)
while res['hits']['hits']:
    page_hits = res['hits']['hits']

    for hit in page_hits:
        doc = hit['_source']
        i += 1
        # Column order must match the prepared INSERT statement.
        row = (
            doc['trove_id'],
            doc['title'],
            doc['url'],
            doc['contributors'],
            doc['date'],
            doc['format'],
            int(doc['pages'])  # the `pages` column is declared as int
        )
        session.execute(insert_statement, row)
        print(f"{i} docs added to books.children.", end="\r")

    # The sort values of the last hit are the cursor for the next page.
    search_after = page_hits[-1]['sort']
    res = fetch_children_page(search_after)

print(f"\nTotal documents: {i}\n")


  res = es.search(


240 docs added to books.children.
Total documents: 240



  res = es.search(
