In [None]:
import os
import json
import re
import time
import pandas as pd
from elasticsearch import Elasticsearch, helpers
from kagglehub import dataset_download
from tqdm import tqdm
from requests.exceptions import ConnectionError

# ---- Utility Functions ----
def clean_text(text):
    """Normalize whitespace in ``text``.

    Leading and trailing whitespace is removed and every internal run of
    whitespace characters is replaced by a single space. Falsy input
    (``None``, ``""``) produces an empty string.
    """
    if text:
        trimmed = text.strip()
        return re.sub(r'\s+', ' ', trimmed)
    return ""

def safe_date(date_str):
    """Convert ``date_str`` to an ISO-8601 string, or ``None`` if unusable.

    Args:
        date_str: Any value accepted by ``pandas.to_datetime`` (typically a
            date/time string).

    Returns:
        The ``isoformat()`` string of the parsed timestamp, or ``None`` when
        the value is missing, empty, or cannot be parsed.
    """
    # Everything stays inside the try so that any unexpected input type
    # (e.g. a value whose parsed result has no ``isoformat``) still maps to None.
    try:
        parsed = pd.to_datetime(date_str)
        # An empty string parses to NaT, whose isoformat() is the literal
        # string "NaT" -- not a valid date. Treat it as missing instead.
        if parsed is pd.NaT:
            return None
        return parsed.isoformat()
    except Exception:
        return None

# ---- Preprocessing ----
def _format_authors(entry):
    """Return the entry's authors as a list of "First Last" display names."""
    names = []
    # `or []` also covers the key being present with a JSON null value.
    for parsed in entry.get('authors_parsed') or []:
        if not isinstance(parsed, list) or not parsed:
            continue
        last = parsed[0]
        first = parsed[1] if len(parsed) > 1 else ''
        # Drop empty parts so a missing first name does not leave a stray
        # leading space (e.g. for collaboration names).
        full_name = ' '.join(str(part).strip() for part in (first, last) if part)
        if full_name:
            names.append(full_name)
    if not names:
        # Fall back to the raw author string; `or ''` guards against null.
        names = [entry.get('authors') or '']
    return names


def _first_version_created(entry):
    """Return the 'created' value of the entry's first version, or ''."""
    versions = entry.get('versions')
    if isinstance(versions, list) and versions and isinstance(versions[0], dict):
        return versions[0].get('created', '')
    return ''


def preprocess_arxiv_json(json_file_path, max_records=None):
    """Read a JSON-lines arXiv metadata file into a flat DataFrame.

    Each line of the file is parsed as one JSON object. Lines that are not
    valid JSON, or that do not decode to an object, are skipped.

    Args:
        json_file_path: Path to the JSON-lines file.
        max_records: Optional cap on the number of *lines* read (skipped
            lines count towards it). ``None`` or ``0`` means no cap.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``submitter``,
        ``title``, ``summary``, ``authors``, ``comments``, ``journal_ref``,
        ``doi``, ``report_no``, ``categories``, ``license``, ``update_date``
        and ``published``. Missing text fields are empty strings; dates are
        whatever ``safe_date`` returns.
    """
    records = []
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # can mis-decode or reject non-ASCII characters.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if max_records and i >= max_records:
                break

            try:
                entry = json.loads(line)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            if not isinstance(entry, dict):
                continue

            # `.get(key, '')` only applies its default when the key is absent;
            # a key holding JSON null would yield None. `or ''` covers both,
            # matching how 'license' was already handled.
            records.append({
                'id': entry.get('id'),
                'submitter': entry.get('submitter') or '',
                'title': clean_text(entry.get('title', '')),
                'summary': clean_text(entry.get('abstract', '')),
                'authors': ', '.join(_format_authors(entry)),
                'comments': entry.get('comments') or '',
                'journal_ref': entry.get('journal-ref') or '',
                'doi': entry.get('doi') or '',
                'report_no': entry.get('report-no') or '',
                'categories': entry.get('categories') or '',
                'license': entry.get('license') or '',
                'update_date': safe_date(entry.get('update_date', '')),
                # Publication date is taken from the first listed version.
                'published': safe_date(_first_version_created(entry)),
            })
    return pd.DataFrame(records)

# ---- Elasticsearch Setup ----
def create_index(es, index_name):
    """(Re)create ``index_name`` with the paper field mappings.

    If the index already exists it is deleted first, so any documents it
    held are discarded. The new index uses one shard and no replicas.

    Args:
        es: Client object exposing ``indices.exists``, ``indices.delete``
            and ``indices.create``.
        index_name: Name of the index to create.
    """
    text_fields = ("submitter", "title", "summary", "authors", "comments", "journal_ref")
    keyword_fields = ("doi", "report_no", "categories", "license")
    date_fields = ("update_date", "published")
    date_format = "strict_date_optional_time||epoch_millis"

    # Assemble the field mappings group by group.
    properties = {}
    for field in text_fields:
        properties[field] = {"type": "text"}
    for field in keyword_fields:
        properties[field] = {"type": "keyword"}
    for field in date_fields:
        properties[field] = {"type": "date", "format": date_format}

    index_config = {
        "settings": {"number_of_shards": 1, "number_of_replicas": 0},
        "mappings": {"properties": properties},
    }

    # Start from a clean slate: drop any previous index of the same name.
    already_there = es.indices.exists(index=index_name)
    if already_there:
        es.indices.delete(index=index_name)

    es.indices.create(index=index_name, body=index_config)

# ---- Indexing with Sample Display ----
def index_data(es, df, index_name):
    """Bulk-index every row of ``df`` into ``index_name``.

    Prints a three-row preview of the data, then sends the rows in batches
    of 500 via ``helpers.bulk``. Each document uses the row's ``id`` value
    as its ``_id`` and the whole row as its ``_source``.

    Indexing is best-effort: a batch that raises is reported and skipped,
    and documents rejected individually are collected and summarised at
    the end instead of being silently dropped.

    Args:
        es: Elasticsearch client passed through to ``helpers.bulk``.
        df: DataFrame of documents; must contain an ``id`` column.
        index_name: Target index name.
    """
    batch_size = 500

    print("\n📊 Sample data to be indexed:")
    print(df.head(3).to_markdown(index=False))

    print("\n📤 Starting bulk indexing...")
    successes = 0
    failed_docs = []   # per-document error items returned by helpers.bulk
    unsent_docs = 0    # documents in batches that raised before completing

    def _send(batch, error_prefix):
        """Send one batch; return how many documents were indexed."""
        nonlocal unsent_docs
        try:
            # With raise_on_error=False, bulk returns (ok_count, error_items)
            # rather than raising on per-document failures.
            ok_count, errors = helpers.bulk(es, batch, raise_on_error=False)
        except Exception as e:
            print(error_prefix, e)
            unsent_docs += len(batch)
            return 0
        failed_docs.extend(errors)
        return ok_count

    actions = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Indexing"):
        doc = row.to_dict()
        actions.append({
            "_index": index_name,
            "_id": doc["id"],
            "_source": doc,
        })

        if len(actions) >= batch_size:
            successes += _send(actions, "Error during bulk index:")
            actions = []

    # Flush whatever is left over after the last full batch.
    if actions:
        successes += _send(actions, " Error during final batch indexing:")

    print(f"\n Indexing complete. Total successful documents: {successes}")
    if failed_docs:
        print(f"Documents rejected by Elasticsearch: {len(failed_docs)}. "
              f"First error: {failed_docs[0]}")
    if unsent_docs:
        print(f"Documents in batches that could not be sent: {unsent_docs}")

# ---- Connect to Elasticsearch ----
def connect_to_elasticsearch_http(max_retries=10, wait_seconds=5):
    """Connect to Elasticsearch at http://localhost:9200, retrying until ready.

    Args:
        max_retries: Maximum number of connection attempts.
        wait_seconds: Seconds to sleep between consecutive attempts.

    Returns:
        A connected ``Elasticsearch`` client whose ``ping()`` succeeded.

    Raises:
        Exception: If no attempt succeeded within ``max_retries`` tries.
    """
    # Local import: the module-level name `ConnectionError` is the one from
    # `requests`; the Elasticsearch client raises its own class.
    from elasticsearch import ConnectionError as ESConnectionError

    for attempt in range(max_retries):
        try:
            es = Elasticsearch("http://localhost:9200")
            reachable = es.ping()
        except (ConnectionError, ESConnectionError):
            reachable = False

        if reachable:
            print("Successfully connected to Elasticsearch via HTTP!")
            return es

        # A falsy ping() means "not ready yet" just like a connection error,
        # so wait and retry in both cases instead of burning through the
        # remaining attempts immediately.
        print(f"Waiting for Elasticsearch (attempt {attempt + 1})...")
        if attempt + 1 < max_retries:  # no point sleeping after the last try
            time.sleep(wait_seconds)
    raise Exception(" Could not connect to Elasticsearch via HTTP after several attempts.")

# ---- Main Pipeline ----
# End-to-end pipeline: download the dataset, preprocess a sample of it,
# connect to the local Elasticsearch node, recreate the index and load the data.
if __name__ == "__main__":
    print(" Downloading dataset...")
    # dataset_download returns the local directory holding the dataset files.
    dataset_path = dataset_download("Cornell-University/arxiv")
    print("Path to dataset files:", dataset_path)

    json_file_path = os.path.join(dataset_path, "arxiv-metadata-oai-snapshot.json")

    print(" Preprocessing data...")
    # Only the first 10000 lines of the snapshot are processed.
    df = preprocess_arxiv_json(json_file_path, max_records=10000)

    print(" Connecting to Elasticsearch...")
    # Raises if the node cannot be reached after the configured retries.
    es = connect_to_elasticsearch_http()

    index_name = "arxiv-papers"

    print(f" Creating index '{index_name}'...")
    # NOTE: create_index deletes an existing index of the same name first.
    create_index(es, index_name)

    index_data(es, df, index_name)

    print("\n Done!")


📦 Downloading dataset...
📂 Path to dataset files: C:\Users\Atharv\.cache\kagglehub\datasets\Cornell-University\arxiv\versions\227
🧼 Preprocessing data...
🔌 Connecting to Elasticsearch...
✅ Successfully connected to Elasticsearch via HTTP!
📁 Creating index 'arxiv-papers'...

📊 Sample data to be indexed:
|   id | submitter      | title                                                                                 | summary                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

Indexing: 100%|██████████| 10000/10000 [00:01<00:00, 5273.97it/s]


✅ Indexing complete. Total successful documents: 10000

🚀 Done!



