In [None]:
from pymongo import MongoClient

from inflection import underscore

import duckdb
import pandas as pd
from datetime import datetime

import pprint


In [None]:
def create_duckdb_file(filename):
    """
    Open a file-backed DuckDB database and hand back the live connection.

    Args:
      filename: Path of the DuckDB database file, forwarded to
        ``duckdb.connect`` as its ``database`` argument.

    Returns:
      duckdb.DuckDBPyConnection: The connection object produced by
      ``duckdb.connect``; the caller is responsible for closing it.
    """
    return duckdb.connect(database=filename)


In [None]:
# Example usage: open the DuckDB database file (a relative path) and keep the
# connection in `duckdb_conn`; later cells pass it to the extraction function
# and close it at the end of the notebook.
filename = "biosamples.duckdb"
duckdb_conn = create_duckdb_file(filename)

In [None]:
# MongoDB connection details: a server on localhost:27017, with the database
# and the collection both named "biosamples".
connection_string = "mongodb://localhost:27017/"
db_name = "biosamples"
collection_name = "biosamples"

# Connect to MongoDB and resolve the collection handle that the extraction
# cells iterate over. `client` is kept as well: the extraction function needs
# it to start a session, and the final cell closes it.
client = MongoClient(connection_string)
db = client[db_name]
collection = db[collection_name]

In [None]:
# Retrieve a single document (no filter) as a sample for manual inspection;
# it is only referenced by the commented-out pprint cell that follows.
first_document = collection.find_one()


In [None]:
# # Print the document
# pprint.pprint(first_document)

In [None]:
# df['content'].value_counts()

In [None]:
# # now processing everything except Description.Comment (antibiograms) and BioSample.Owner.Contacts
# paths = [
#     "BioSample",
#     "BioSample.Attributes.Attribute",
#     "BioSample.Curation",
#     "BioSample.Description.Comment.Paragraph",
#     "BioSample.Description.Organism",
#     "BioSample.Description.Organism.OrganismName",
#     "BioSample.Description.Synonym",
#     "BioSample.Description.Title",
#     "BioSample.Ids.Id",
#     "BioSample.Links.Link",
#     "BioSample.Models.Model",
#     "BioSample.Owner.Name",
#     "BioSample.Package",
#     "BioSample.Status"
# ]


In [None]:

def infer_duckdb_type(series, col_name):
    """
    Pick the DuckDB column type used for a pandas column.

    Args:
      series: The pandas Series whose dtype drives the inference.
      col_name: The column's name; a name equal to "id" (case-insensitive)
        short-circuits the inference.

    Returns:
      str: "BIGINT" for the id column and integer dtypes, "DOUBLE" for float
      dtypes, "BOOLEAN" for bool dtypes, and "TEXT" for everything else.
    """
    # The id column is pinned to BIGINT no matter what dtype pandas inferred
    if col_name.lower() == "id":
        return "BIGINT"

    # Ordered (dtype predicate, DuckDB type) pairs; the first match wins
    dtype_checks = (
        (pd.api.types.is_integer_dtype, "BIGINT"),
        (pd.api.types.is_float_dtype, "DOUBLE"),
        (pd.api.types.is_bool_dtype, "BOOLEAN"),
    )
    for matches, sql_type in dtype_checks:
        if matches(series):
            return sql_type

    # Fallback for object/string and any other dtype
    return "TEXT"

def ensure_columns_exist(conn, table_name, df):
    """
    Add to ``table_name`` every column of ``df`` that the table lacks.

    Column names are compared case-insensitively, both against the table and
    among the DataFrame's own columns. When the DataFrame carries two new
    columns that differ only by case, only the first one is added; issuing an
    ALTER for the second would be rejected as a duplicate column.

    Args:
      conn: DuckDB connection; must provide ``execute(sql)``.
      table_name: Name of an existing table; interpolated into the SQL as-is.
      df: DataFrame whose columns should all be present in the table. Each new
        column's type comes from ``infer_duckdb_type``.

    Returns:
      None. The table is altered in place.
    """
    table_info = conn.execute(f"PRAGMA table_info({table_name})").fetchall()
    # Index 1 of each PRAGMA table_info row holds the column name
    known = {row[1].lower() for row in table_info}

    new_columns = []
    for col in df.columns:
        key = col.lower()
        if key in known:
            continue
        # Remember the name so a later case-variant of it is not added twice
        known.add(key)
        new_columns.append(col)

    if new_columns:
        print(f"{datetime.now().isoformat()}: Adding {len(new_columns)} new column(s) to {table_name}.")

    for col in new_columns:
        dtype = infer_duckdb_type(df[col], col)
        # Double any embedded quote so the quoted identifier stays well-formed
        quoted = col.replace('"', '""')
        conn.execute(f'ALTER TABLE {table_name} ADD COLUMN "{quoted}" {dtype}')

def insert_df(conn, table_name, df):
    """
    Append the rows of ``df`` to ``table_name``, aligned to the table's columns.

    The frame handed to DuckDB has exactly the table's columns, in table order:
      * a DataFrame column with the same name is used directly;
      * otherwise a DataFrame column matching case-insensitively is renamed to
        the table's spelling (``ensure_columns_exist`` treats such names as the
        same column, so its data must not be replaced by NULLs here);
      * otherwise the column is filled with ``None``.
    DataFrame columns with no counterpart in the table are dropped.

    The caller's DataFrame is never modified.

    Args:
      conn: DuckDB connection; must provide ``execute``, ``register`` and
        ``unregister``.
      table_name: Name of an existing table; interpolated into the SQL as-is.
      df: Rows to insert.

    Returns:
      None.
    """
    table_info = conn.execute(f"PRAGMA table_info({table_name})").fetchall()
    # Index 1 of each PRAGMA table_info row holds the column name
    existing_columns = [row[1] for row in table_info]

    # Shallow copy: columns added below land on the copy, not on the caller's frame
    aligned = df.copy(deep=False)

    present = set(aligned.columns)
    by_lower = {}
    for name in aligned.columns:
        # First spelling wins when the frame itself has case-variants
        by_lower.setdefault(str(name).lower(), name)

    renames = {}
    missing = []
    for col in existing_columns:
        if col in present:
            continue
        source = by_lower.get(col.lower())
        if source is None:
            missing.append(col)
        else:
            renames[source] = col

    if renames:
        aligned = aligned.rename(columns=renames)
    for col in missing:
        aligned[col] = None

    # Reorder (and trim) to the table layout so SELECT * lines up positionally
    aligned = aligned[existing_columns]

    conn.register("temp_df", aligned)
    try:
        conn.execute(f"INSERT INTO {table_name} SELECT * FROM temp_df")
    finally:
        # Always drop the temporary view, even when the INSERT fails
        conn.unregister("temp_df")

def extract_biosample_data(collection, conn, path="BioSample", max_docs=None, client=None, batch_size=10000):
    """
    Extract the scalar fields found at ``path`` in each document and load them into DuckDB.

    For every document read from ``collection`` the function walks ``path``
    (dot-separated; the first segment is dropped, so "BioSample" alone means
    the document itself), keeps the values that are str/int/float/bool, adds
    the document's ``id`` converted to int, and buffers the result. Buffered
    rows are written to a table named after the last path segment (lower-cased,
    "-" replaced by "_"). The table is created from the first flushed batch and
    widened with new columns on later batches. The 'id' column is always BIGINT.

    Documents are skipped (but still counted) when the path is absent, passes
    through a non-dict value, or ends at something other than a dict or a list
    of dicts.

    Args:
      collection: MongoDB collection to scan with ``find({})``.
      conn: DuckDB connection that receives the table.
      path: Dot-separated location of the data inside each document.
      max_docs: Stop after this many documents have been processed; ``None``
        means no limit.
      client: MongoDB client used to start the session that the
        ``no_cursor_timeout`` cursor is bound to. Required.
      batch_size: Number of buffered documents that triggers a flush.

    Raises:
      ValueError: If ``client`` is None.
    """
    if client is None:
        raise ValueError("Client must be provided to start a session for no_cursor_timeout.")

    scalar_types = (str, int, float, bool)
    # Splitting "BioSample" itself yields no remaining segments, i.e. top level
    path_parts = path.split(".")[1:]
    table_name = path.split(".")[-1].replace("-", "_").replace(".", "_").lower()

    def scalar_row(mapping, id_value):
        """Keep the scalar entries of ``mapping`` and stamp the document id on the row."""
        row = {k: v for k, v in mapping.items() if isinstance(v, scalar_types)}
        # A missing/None id stays None; anything else must convert to int
        row["id"] = int(id_value) if id_value is not None else None
        return row

    def process_data(data, id_value):
        """Turn a dict (one row) or a list of dicts (one row each) into a DataFrame, else None."""
        if isinstance(data, dict):
            return pd.DataFrame([scalar_row(data, id_value)])
        if isinstance(data, list) and all(isinstance(item, dict) for item in data):
            rows = [scalar_row(item, id_value) for item in data]
            return pd.DataFrame(rows) if rows else None
        return None

    def resolve(doc):
        """Follow ``path_parts`` into ``doc``; None when the path cannot be followed."""
        node = doc
        for part in path_parts:
            # Only dicts can be descended into; lists/scalars end the walk
            if not isinstance(node, dict):
                return None
            node = node.get(part)
            if node is None:
                return None
        return node

    table_created = False

    def flush(frames):
        """Write the buffered frames, creating the table first or widening it as needed."""
        nonlocal table_created
        combined_df = pd.concat(frames, ignore_index=True)
        if table_created:
            ensure_columns_exist(conn, table_name, combined_df)
        else:
            # The first flushed batch defines the initial schema
            schema_sql = ", ".join(
                f'"{col}" {infer_duckdb_type(combined_df[col], col)}' for col in combined_df.columns
            )
            conn.execute(f"CREATE TABLE {table_name} ({schema_sql})")
            table_created = True
        insert_df(conn, table_name, combined_df)

    processed_docs = 0
    batch = []
    with client.start_session() as session:
        cursor = collection.find({}, no_cursor_timeout=True, session=session)
        try:
            for doc in cursor:
                if max_docs is not None and processed_docs >= max_docs:
                    break

                node = resolve(doc)
                df = process_data(node, doc.get("id")) if node is not None else None
                if df is not None and not df.empty:
                    batch.append(df)

                processed_docs += 1

                # `batch` check guards against concatenating nothing when batch_size <= 0
                if batch and len(batch) >= batch_size:
                    flush(batch)
                    print(f"{datetime.now().isoformat()}: Inserted {processed_docs} documents into {table_name} so far.")
                    batch.clear()
        finally:
            # The cursor never times out on its own, so release it on every exit path
            cursor.close()

    if batch:
        flush(batch)
        print(f"{datetime.now().isoformat()}: Final insert - total {processed_docs} documents processed for {table_name}.")
        batch.clear()


In [None]:
# Document paths to extract; extract_biosample_data names the target table
# after the last segment of each path
paths = [
    "BioSample",
    "BioSample.Attributes.Attribute",
    "BioSample.Package",
]

# Per-path document cap and flush threshold handed to extract_biosample_data
max_docs = 1_000_000
batch_size = 100_000

for current_path in paths:
    # Timestamped banner before the (potentially long) extraction
    print(f"Processing path: {current_path}")
    print(datetime.now().isoformat())

    extract_biosample_data(
        collection,
        duckdb_conn,
        path=current_path,
        max_docs=max_docs,
        client=client,
        batch_size=batch_size,
    )

    # Matching banner once the path has been fully loaded
    print(f"Completed path: {current_path}")
    print(datetime.now().isoformat())

In [None]:
# Close the DuckDB connection opened at the top of the notebook once all
# extraction cells have finished
duckdb_conn.close()

# Close the pymongo client; cells that use `client` or `collection` must be
# re-run from the connection cell after this point
client.close()