In [None]:
import json
from qdrant_client import QdrantClient
from qdrant_client import QdrantClient, models
import json
import gzip
from tqdm import tqdm
import os
import json
import gzip
import uuid
import time
import tiktoken
from tqdm import tqdm
from qdrant_client import QdrantClient, models
from openai import OpenAI, RateLimitError
from concurrent.futures import ThreadPoolExecutor
from qdrant_client import QdrantClient, models
from qdrant_client.models import PointStruct
import numpy as np


### Importing necessary information

In [25]:
with open("/Users/paolocadei/Documents/Masters/Thesis/Spider2/api_keys/api_keys.json", "r") as f:
    api_keys = dict(json.load(f))

qdrant_key =  api_keys["qdrant_key"]

qdrant_client = QdrantClient(
    url="https://17582e9e-4a04-4068-bf13-bd4fdc0d688d.us-east4-0.gcp.cloud.qdrant.io:6333", 
    api_key=qdrant_key
)

The different embeddings and information are stored in the following files:
- new_embeddings.json $\rightarrow$ stores the 1536 dimensions
- bm25_embedddings.json.gz $\rightarrow$ stores the sparse embeddings made using BM25
- final_structure_with_descriptions $\rightarrow$ stores all the other information that goes in the metadata

### Creating the Qdrant collection

Here we consider 3 different types of embeddings:
- dense column embeddings
- dense table embeddings
- sparse bm25 column embeddings

In [26]:
dense_dim = 1536  # for text-embedding-3-small (OpenAI)

try:
    qdrant_client.create_collection(
        collection_name="thesis",
        vectors_config={
            "openai_column": models.VectorParams(
                size=1536,  # or your dim
                distance=models.Distance.COSINE
            ),
            "openai_table": models.VectorParams(
                size=1536,
                distance=models.Distance.COSINE
            )
        },
        sparse_vectors_config={
            "bm25": models.SparseVectorParams(
                modifier=models.Modifier.IDF
            )
        }
    )

    print("✅ Qdrant collection 'thesis' created with dense_dim =", dense_dim)

except Exception as e:
    print(e)


✅ Qdrant collection 'thesis' created with dense_dim = 1536


### Uploading the Qdrant

In [4]:
# === FILE PATHS ===
TABLE_EMBEDDINGS_PATH = "3_table_embeddings.json"
COLUMN_EMBEDDINGS_PATH = "3_dense_embeddings.json"
BM25_PATH = "4_bm25_embeddings.json.gz"
PAYLOAD_PATH = "2_final_structure_all.json"

# === LOAD FILES ===
with open(TABLE_EMBEDDINGS_PATH) as f:
    table_embeddings = json.load(f)

print("\n✅ Loaded table embeddings.")

with open(COLUMN_EMBEDDINGS_PATH) as f:
    column_embeddings = json.load(f)

print("\n✅ Loaded column embeddings.")

with gzip.open(BM25_PATH, 'rt') as f:
    sparse_embeddings = json.load(f)

print("\n✅ Loaded sparse embeddings.")

with open(PAYLOAD_PATH) as f:
    metadata = json.load(f)

print("\n✅ Loaded metadata.")

# === INIT QDRANT ===
print("\n✅ Loaded all files. The Qdrant upload can be started.")


✅ Loaded table embeddings.

✅ Loaded column embeddings.

✅ Loaded sparse embeddings.

✅ Loaded metadata.

✅ Loaded all files. The Qdrant upload can be started.


In [None]:
import hashlib
import json
from tqdm.notebook import tqdm

COLLECTION_NAME = "thesis"
BATCH_SIZE = 10

def upload_batch(batch_points):
    qdrant_client.upsert(
        collection_name=COLLECTION_NAME,
        points=batch_points
    )

def get_payload_hash(payload):
    return hashlib.md5(json.dumps(payload, sort_keys=True).encode('utf-8')).hexdigest()

def get_uid(database, table, column_name, group=None, ungrouped_key=None):
    base = {
        "database": database,
        "table": table,
        "column": column_name,
        "group": group,
        "ungrouped_key": ungrouped_key,
    }
    key = json.dumps(base, sort_keys=True)
    return hashlib.md5(key.encode("utf-8")).hexdigest()

def get_existing_ids():
    existing_ids = set()
    scroll = qdrant_client.scroll(
        collection_name=COLLECTION_NAME,
        limit=10000,
        with_payload=False,
        with_vectors=False
    )
    while True:
        existing_ids.update([point.id for point in scroll[0]])
        if scroll[1] is None:
            break
        scroll = qdrant_client.scroll(
            collection_name=COLLECTION_NAME,
            limit=10000,
            offset=scroll[1],
            with_payload=False,
            with_vectors=False
        )
    return existing_ids

def upload_embeddings(metadata, table_embeddings, column_embeddings, sparse_embeddings, test_limit=None):
    total_inserted = 0
    points = []
    seen_payloads = set()
    existing_ids = get_existing_ids()

    planned_points = 0
    for db in metadata.values():
        for tbl in db.values():
            for group_entries in tbl.get("grouped", {}).values():
                for entry in group_entries:
                    planned_points += len(entry.get("details", {}).get("columns", {}))
            for entry in tbl.get("ungrouped", {}).values():
                planned_points += len(entry.get("details", {}).get("columns", {}))

    pbar = tqdm(total=planned_points, desc="Uploading missing points", dynamic_ncols=False)

    for database in metadata:
        for table in metadata[database]:

            for template in metadata[database][table].get('grouped', {}):
                entries = metadata[database][table]['grouped'][template]

                if 'table_embedding' not in table_embeddings.get(database, {}).get(table, {}).get('grouped', {}).get(template, {}):
                    continue

                table_embedding = table_embeddings[database][table]['grouped'][template]['table_embedding']

                for index, entry in enumerate(entries):
                    grouped_cols = entry['details']['columns']
                    col_embed_entry = column_embeddings.get(database, {}).get(table, {}).get('grouped', {}).get(template, [])
                    sparse_embed_entry = sparse_embeddings.get(database, {}).get(table, {}).get('grouped', {}).get(template, [])

                    if index >= len(col_embed_entry) or index >= len(sparse_embed_entry):
                        continue

                    col_embeds = col_embed_entry[index]['details']['column_embeddings']
                    sparse_embeds = sparse_embed_entry[index]['details']['column_embeddings']

                    table_payload = {
                        'database': database,
                        'table': table,
                        'template': template,
                        'variables': entry['variables'],
                        'combinations': entry['combinations'],
                        'table_keywords': entry['keywords'],
                        'table_description': entry['description']
                    }

                    for col in grouped_cols:
                        uid = get_uid(database, table, col, group=template)
                        pbar.update(1)

                        if test_limit is not None and total_inserted >= test_limit:
                            if points:
                                upload_batch(points)
                            pbar.close()
                            tqdm.write(f"\U0001f9ea Test limit reached: {total_inserted} points uploaded.")
                            return

                        if col not in col_embeds or col not in sparse_embeds:
                            continue

                        if uid in existing_ids:
                            continue

                        col_embedding = col_embeds[col]
                        column_sparse_embeddings = sparse_embeds[col]

                        col_payload = {
                            'column_name': col,
                            'column_type': grouped_cols[col],
                            'description': entry['details']['description'][col],
                            'sample_values': [row[col] for row in entry['details']['sample_row']] if entry['details'].get('sample_row') else [],
                            'column_keywords': entry['details']['keywords'].get(col, [])
                        }

                        full_payload = {**table_payload, **col_payload}
                        payload_hash = get_payload_hash(full_payload)

                        if payload_hash in seen_payloads:
                            continue

                        seen_payloads.add(payload_hash)

                        combined_embedding = (
                            0.3 * np.array(table_embedding) +
                            0.7 * np.array(col_embedding)
                        ).tolist()

                        vectors = {
                            "openai_combined": combined_embedding,
                            "bm25": models.SparseVector(
                                indices=column_sparse_embeddings["indices"],
                                values=column_sparse_embeddings["values"]
                            )
                        }

                        points.append(PointStruct(id=uid, vector=vectors, payload=full_payload))
                        total_inserted += 1

                        if len(points) == BATCH_SIZE:
                            upload_batch(points)
                            points = []

            for key, entry in metadata[database][table].get('ungrouped', {}).items():
                if 'table_embedding' not in table_embeddings.get(database, {}).get(table, {}).get('ungrouped', {}).get(key, {}):
                    continue

                table_embedding = table_embeddings[database][table]['ungrouped'][key]['table_embedding']
                col_embeds = column_embeddings.get(database, {}).get(table, {}).get('ungrouped', {}).get(key, {}).get('details', {}).get('column_embeddings', {})
                sparse_embeds = sparse_embeddings.get(database, {}).get(table, {}).get('ungrouped', {}).get(key, {}).get('details', {}).get('column_embeddings', {})

                table_payload = {
                    'database': database,
                    'table': table,
                    'ungrouped_key': key,
                    'table_keywords': entry['keywords'],
                    'table_description': entry['description']
                }

                for col in entry['details']['columns']:
                    uid = get_uid(database, table, col, ungrouped_key=key)
                    pbar.update(1)

                    if test_limit is not None and total_inserted >= test_limit:
                        if points:
                            upload_batch(points)
                        pbar.close()
                        tqdm.write(f"\U0001f9ea Test limit reached: {total_inserted} points uploaded.")
                        return

                    if col not in col_embeds or col not in sparse_embeds:
                        continue

                    if uid in existing_ids:
                        continue

                    col_embedding = col_embeds[col]
                    column_sparse_embeddings = sparse_embeds[col]

                    col_payload = {
                        'column_name': col,
                        'column_type': entry['details']['columns'][col],
                        'description': entry['details']['description'][col],
                        'sample_values': [row[col] for row in entry['details']['sample_row']] if entry['details'].get('sample_row') else [],
                        'column_keywords': entry['details']['keywords'].get(col, [])
                    }

                    full_payload = {**table_payload, **col_payload}
                    payload_hash = get_payload_hash(full_payload)

                    if payload_hash in seen_payloads:
                        continue

                    seen_payloads.add(payload_hash)

                    combined_embedding = (
                        0.3 * np.array(table_embedding) +
                        0.7 * np.array(col_embedding)
                    ).tolist()

                    vectors = {
                        "openai_combined": combined_embedding,
                        "bm25": models.SparseVector(
                            indices=column_sparse_embeddings["indices"],
                            values=column_sparse_embeddings["values"]
                        )
                    }

                    points.append(PointStruct(id=uid, vector=vectors, payload=full_payload))
                    total_inserted += 1

                    if len(points) == BATCH_SIZE:
                        upload_batch(points)
                        points = []

    if points:
        upload_batch(points)

    pbar.close()
    tqdm.write(f"\u2705 All done. Total points uploaded: {total_inserted}")

In [18]:
upload_embeddings(
    metadata=metadata,
    table_embeddings=table_embeddings,
    column_embeddings=column_embeddings,
    sparse_embeddings=sparse_embeddings,
    test_limit=None  # ⬅️ Upload only 20 points
)

Uploading missing points:  35%|███▍      | 46870/134095 [2:03:50<3:50:28,  6.31it/s]
Uploading missing points:   2%|▏         | 2210/134095 [2:03:02<122:22:21,  3.34s/it]


Uploading missing points:   0%|          | 0/134095 [00:00<?, ?it/s]



[A[A                                                                            
[A

✅ All done. Total points uploaded: 134095


In [20]:
actual_count = qdrant_client.count(COLLECTION_NAME, exact=True).count

print(f"Total points in 'thesis': {actual_count}")

Total points in 'thesis': 128249


In [None]:
import hashlib
import json
from tqdm.notebook import tqdm

COLLECTION_NAME = "thesis"
BATCH_SIZE = 10

def upload_batch(batch_points):
    qdrant_client.upsert(
        collection_name=COLLECTION_NAME,
        points=batch_points
    )

def get_payload_hash(payload):
    return hashlib.md5(json.dumps(payload, sort_keys=True).encode('utf-8')).hexdigest()

def get_uid(database, table, column_name, group=None, ungrouped_key=None, index=None):
    base = {
        "database": database,
        "table": table,
        "column": column_name,
        "group": group,
        "ungrouped_key": ungrouped_key,
        "index": index,
    }
    key = json.dumps(base, sort_keys=True)
    return hashlib.md5(key.encode("utf-8")).hexdigest()

def get_existing_ids():
    existing_ids = set()
    scroll = qdrant_client.scroll(
        collection_name=COLLECTION_NAME,
        limit=10000,
        with_payload=False,
        with_vectors=False
    )
    while True:
        existing_ids.update([point.id for point in scroll[0]])
        if scroll[1] is None:
            break
        scroll = qdrant_client.scroll(
            collection_name=COLLECTION_NAME,
            limit=10000,
            offset=scroll[1],
            with_payload=False,
            with_vectors=False
        )
    return existing_ids

def upload_embeddings(metadata, table_embeddings, column_embeddings, sparse_embeddings, test_limit=None):
    total_inserted = 0
    points = []
    seen_payloads = set()
    existing_ids = get_existing_ids()

    planned_points = 0
    for db in metadata.values():
        for tbl in db.values():
            for group_entries in tbl.get("grouped", {}).values():
                for entry in group_entries:
                    planned_points += len(entry.get("details", {}).get("columns", {}))
            for entry in tbl.get("ungrouped", {}).values():
                planned_points += len(entry.get("details", {}).get("columns", {}))

    pbar = tqdm(total=planned_points, desc="Uploading missing points", dynamic_ncols=False)

    for database in metadata:
        for table in metadata[database]:

            for template in metadata[database][table].get('grouped', {}):
                entries = metadata[database][table]['grouped'][template]

                if 'table_embedding' not in table_embeddings.get(database, {}).get(table, {}).get('grouped', {}).get(template, {}):
                    continue

                table_embedding = table_embeddings[database][table]['grouped'][template]['table_embedding']

                for index, entry in enumerate(entries):
                    grouped_cols = entry['details']['columns']
                    col_embed_entry = column_embeddings.get(database, {}).get(table, {}).get('grouped', {}).get(template, [])
                    sparse_embed_entry = sparse_embeddings.get(database, {}).get(table, {}).get('grouped', {}).get(template, [])

                    if index >= len(col_embed_entry) or index >= len(sparse_embed_entry):
                        continue

                    col_embeds = col_embed_entry[index]['details']['column_embeddings']
                    sparse_embeds = sparse_embed_entry[index]['details']['column_embeddings']

                    table_payload = {
                        'database': database,
                        'table': table,
                        'template': template,
                        'variables': entry['variables'],
                        'combinations': entry['combinations'],
                        'table_keywords': entry['keywords'],
                        'table_description': entry['description']
                    }

                    for col in grouped_cols:
                        uid = get_uid(database, table, col, group=template, index=index)
                        pbar.update(1)

                        if test_limit is not None and total_inserted >= test_limit:
                            if points:
                                upload_batch(points)
                            pbar.close()
                            tqdm.write(f"\U0001f9ea Test limit reached: {total_inserted} points uploaded.")
                            return

                        if col not in col_embeds or col not in sparse_embeds:
                            continue

                        if uid in existing_ids:
                            continue

                        col_embedding = col_embeds[col]
                        column_sparse_embeddings = sparse_embeds[col]

                        col_payload = {
                            'column_name': col,
                            'column_type': grouped_cols[col],
                            'description': entry['details']['description'][col],
                            'sample_values': [row[col] for row in entry['details']['sample_row']] if entry['details'].get('sample_row') else [],
                            'column_keywords': entry['details']['keywords'].get(col, [])
                        }

                        full_payload = {**table_payload, **col_payload}
                        payload_hash = get_payload_hash(full_payload)

                        if payload_hash in seen_payloads:
                            continue

                        seen_payloads.add(payload_hash)

                        combined_embedding = (
                            0.3 * np.array(table_embedding) +
                            0.7 * np.array(col_embedding)
                        ).tolist()

                        vectors = {
                            "openai_combined": combined_embedding,
                            "bm25": models.SparseVector(
                                indices=column_sparse_embeddings["indices"],
                                values=column_sparse_embeddings["values"]
                            )
                        }

                        points.append(PointStruct(id=uid, vector=vectors, payload=full_payload))
                        total_inserted += 1

                        if len(points) == BATCH_SIZE:
                            upload_batch(points)
                            points = []

            for key, entry in metadata[database][table].get('ungrouped', {}).items():
                if 'table_embedding' not in table_embeddings.get(database, {}).get(table, {}).get('ungrouped', {}).get(key, {}):
                    continue

                table_embedding = table_embeddings[database][table]['ungrouped'][key]['table_embedding']
                col_embeds = column_embeddings.get(database, {}).get(table, {}).get('ungrouped', {}).get(key, {}).get('details', {}).get('column_embeddings', {})
                sparse_embeds = sparse_embeddings.get(database, {}).get(table, {}).get('ungrouped', {}).get(key, {}).get('details', {}).get('column_embeddings', {})

                table_payload = {
                    'database': database,
                    'table': table,
                    'ungrouped_key': key,
                    'table_keywords': entry['keywords'],
                    'table_description': entry['description']
                }

                for idx, col in enumerate(entry['details']['columns']):
                    uid = get_uid(database, table, col, ungrouped_key=key, index=idx)
                    pbar.update(1)

                    if test_limit is not None and total_inserted >= test_limit:
                        if points:
                            upload_batch(points)
                        pbar.close()
                        tqdm.write(f"\U0001f9ea Test limit reached: {total_inserted} points uploaded.")
                        return

                    if col not in col_embeds or col not in sparse_embeds:
                        continue

                    if uid in existing_ids:
                        continue

                    col_embedding = col_embeds[col]
                    column_sparse_embeddings = sparse_embeds[col]

                    col_payload = {
                        'column_name': col,
                        'column_type': entry['details']['columns'][col],
                        'description': entry['details']['description'][col],
                        'sample_values': [row[col] for row in entry['details']['sample_row']] if entry['details'].get('sample_row') else [],
                        'column_keywords': entry['details']['keywords'].get(col, [])
                    }

                    full_payload = {**table_payload, **col_payload}
                    payload_hash = get_payload_hash(full_payload)

                    if payload_hash in seen_payloads:
                        continue

                    seen_payloads.add(payload_hash)

                    combined_embedding = (
                        0.3 * np.array(table_embedding) +
                        0.7 * np.array(col_embedding)
                    ).tolist()

                    vectors = {
                        "openai_combined": combined_embedding,
                        "bm25": models.SparseVector(
                            indices=column_sparse_embeddings["indices"],
                            values=column_sparse_embeddings["values"]
                        )
                    }

                    points.append(PointStruct(id=uid, vector=vectors, payload=full_payload))
                    total_inserted += 1

                    if len(points) == BATCH_SIZE:
                        upload_batch(points)
                        points = []

    if points:
        upload_batch(points)

    pbar.close()
    tqdm.write(f"\u2705 All done. Total points uploaded: {total_inserted}")


In [28]:
upload_embeddings(
    metadata=metadata,
    table_embeddings=table_embeddings,
    column_embeddings=column_embeddings,
    sparse_embeddings=sparse_embeddings,
    test_limit=None  # ⬅️ Upload only 20 points
)

Uploading missing points:   0%|          | 0/134095 [00:00<?, ?it/s]

✅ All done. Total points uploaded: 134095
