In [2]:
import os
import re
import traceback
from multiprocessing import Pool, cpu_count
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from sqlalchemy import create_engine
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Pull DB_SERVER / DB_NAME from the environment (a local .env file is loaded if present).
load_dotenv()

server = os.getenv("DB_SERVER")
database = os.getenv("DB_NAME")

# Fail fast: without this check a missing variable is formatted as the literal
# text "None" in the connection string and only surfaces as a confusing
# connection error much later.
_missing_env = [
    name for name, value in (("DB_SERVER", server), ("DB_NAME", database)) if not value
]
if _missing_env:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}"
    )

# Raw ODBC connection string (Trusted_Connection / TrustServerCertificate are
# passed through to the ODBC driver unchanged).
_odbc_connect = (
    f'DRIVER={{ODBC Driver 18 for SQL Server}};'
    f'SERVER={server};'
    f'DATABASE={database};'
    'Trusted_Connection=yes;'
    'TrustServerCertificate=yes;'
)

# The odbc_connect value is a URL query parameter, so it must be URL-encoded;
# otherwise characters such as '+', '%', '&' (or ';' on older Pythons) in the
# server/database name are mangled when SQLAlchemy parses the URL.
connection_string = 'mssql+pyodbc:///?odbc_connect=' + quote_plus(_odbc_connect)

engine = create_engine(connection_string)

In [4]:
# Per-process handle to the embedding model; stays None until init_model()
# has run in the current process.
_model = None


def init_model(model_name="BAAI/bge-small-en"):
    """Load a SentenceTransformer into the module-level ``_model``.

    Callable with no arguments, so it can be used directly as a
    ``multiprocessing.Pool`` initializer (each worker then holds its own
    copy of the model).

    Parameters
    ----------
    model_name : str, optional
        Model identifier passed to ``SentenceTransformer``. Defaults to
        ``"BAAI/bge-small-en"``.
    """
    global _model
    _model = SentenceTransformer(model_name)

In [5]:
def clean_claim(text):
    """Strip markup and boilerplate from a single claim string.

    Parameters
    ----------
    text : str or None
        Raw claim text. ``None``, empty strings and non-string values
        (e.g. a float NaN produced for a NULL database value) are treated
        as "no text".

    Returns
    -------
    str
        The cleaned text with surrounding whitespace removed, or ``""``
        when there is nothing to clean.
    """
    # NaN is truthy, so a plain `not text` check lets it through to re.sub,
    # which then raises TypeError; reject every non-string up front.
    if not isinstance(text, str) or not text:
        return ""

    # Remove HTML-like tags. Note: this drops *anything* between a '<' and the
    # next '>' on the same line, including comparisons such as "a < b ... c > d".
    text = re.sub(r'<.*?>', '', text)
    # Drop "the present invention ..." boilerplate up to the first period
    # that follows it on the same line (case-insensitive).
    text = re.sub(r'the present invention.*?\.', '', text, flags=re.IGNORECASE)

    return text.strip()

In [6]:
def encode_claims(claims):
    """Embed a sequence of claim strings with the per-process model.

    If the model has not been loaded in this process yet (i.e. the function
    is called outside a pool whose initializer is ``init_model``), it is
    loaded on first use instead of failing with an ``AttributeError`` on
    ``None``.

    Parameters
    ----------
    claims : sequence of str
        Texts to embed.

    Returns
    -------
    The result of ``_model.encode(claims, batch_size=256)``.
    """
    if _model is None:
        init_model()
    return _model.encode(claims, batch_size=256)

def process_batch_parallel(df, max_workers=16):
    """Clean the claims in ``df`` and embed them, spreading work over processes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``patent_id`` and ``claim_text`` columns. A
        ``cleaned_claim`` column is added to this frame in place.
    max_workers : int, optional
        Upper bound on worker processes (default 16). The actual number is
        also capped by ``cpu_count()`` and by the number of rows. When it
        comes out as 1 the claims are encoded in the current process and no
        pool is created.

    Returns
    -------
    tuple
        ``(df[['patent_id', 'cleaned_claim']], embeddings)`` where
        ``embeddings`` is a float32 array built by stacking the per-chunk
        encoder outputs in input order. For an empty ``df`` the array has
        shape ``(0, 0)``.

    Notes
    -----
    NOTE(review): with the "spawn" start method, worker processes re-import
    ``__main__``; functions defined only in a notebook cell are not importable
    there and the pool can stall at 0%. If that happens, pass
    ``max_workers=1`` or move the helper functions into an importable module.
    """
    df['cleaned_claim'] = df['claim_text'].apply(clean_claim)
    texts = df['cleaned_claim'].tolist()

    if not texts:
        return df[['patent_id', 'cleaned_claim']], np.empty((0, 0), dtype='float32')

    # Never start more workers than there are rows: an empty chunk would yield
    # an empty encoding that cannot be stacked with the others.
    num_workers = max(1, min(cpu_count(), max_workers, len(texts)))

    # Split into contiguous, near-equal slices of plain Python strings.
    # (np.array_split on a list of str builds a fixed-width unicode array whose
    # memory use is rows * longest-claim, which is wasteful for long claims.)
    base, extra = divmod(len(texts), num_workers)
    chunks = []
    start = 0
    for i in range(num_workers):
        stop = start + base + (1 if i < extra else 0)
        chunks.append(texts[start:stop])
        start = stop

    if num_workers == 1:
        # Single worker: encode in this process; load the model once if needed.
        if _model is None:
            init_model()
        results = [encode_claims(chunk) for chunk in tqdm(chunks, desc="Embedding chunks")]
    else:
        with Pool(processes=num_workers, initializer=init_model) as pool:
            # imap yields results in submission order, so rows stay aligned with df.
            results = list(tqdm(pool.imap(encode_claims, chunks), total=len(chunks), desc="Embedding chunks"))

    all_embeddings = np.vstack(results).astype('float32')

    return df[['patent_id', 'cleaned_claim']], all_embeddings

In [None]:
def main(batch_size=50000, output_dir="output"):
    """Embed every row of ``independent_claims_sample`` in fixed-size batches.

    Each batch is read with OFFSET/FETCH paging, cleaned and embedded via
    ``process_batch_parallel``, and written to ``output_dir`` as
    ``metadata_batch_<n>.csv`` plus ``embeddings_batch_<n>.npy``. The loop
    stops when a query returns no rows or when any step raises; in the latter
    case the error and its traceback are printed and the function returns.

    Parameters
    ----------
    batch_size : int, optional
        Rows fetched per query (default 50000). Must be positive.
    output_dir : str, optional
        Directory for the output files (default ``"output"``); created if
        it does not exist.

    Raises
    ------
    ValueError
        If ``batch_size`` is not a positive integer.
    """
    # Coerce before interpolating into SQL so only an integer can reach the query.
    batch_size = int(batch_size)
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Ensure output folder exists
    os.makedirs(output_dir, exist_ok=True)

    offset = 0
    batch_num = 0

    # Use engine.connect() to control transactions manually
    with engine.connect() as conn:
        while True:
            try:
                # Load one batch.
                # NOTE(review): OFFSET paging is only stable when the ORDER BY
                # key is unique; if patent_id can repeat in this table, rows
                # may be duplicated or skipped across batches — confirm.
                query = f"""
                    SELECT patent_id, claim_text
                    FROM independent_claims_sample
                    ORDER BY patent_id
                    OFFSET {offset} ROWS
                    FETCH NEXT {batch_size} ROWS ONLY
                """
                df = pd.read_sql(query, conn)

                if df.empty:
                    print("✅ Done! No more rows.")
                    break

                # Process the batch (clean + embed)
                meta_df, embed_matrix = process_batch_parallel(df)

                # Save outputs
                meta_path = os.path.join(output_dir, f"metadata_batch_{batch_num}.csv")
                embed_path = os.path.join(output_dir, f"embeddings_batch_{batch_num}.npy")
                meta_df.to_csv(meta_path, index=False)
                np.save(embed_path, embed_matrix)

                print(f"✅ Batch {batch_num} saved. Rows: {len(df)}")

                # Update offset
                offset += batch_size
                batch_num += 1

            except Exception as e:
                print(f"❌ Error at batch {batch_num} (offset {offset}): {e}")
                # Keep the full traceback; the message alone rarely shows
                # which step (query, embedding, or file write) failed.
                traceback.print_exc()
                conn.rollback()
                break


if __name__ == "__main__":
    main()

Embedding chunks:   0%|          | 0/16 [00:00<?, ?it/s]