In [3]:
import sys
# Put the sibling backend directory on the import path (presumably where the
# `app` package imported below lives — confirm against the repo layout).
backend_path = '../backend'
# The membership guard keeps re-runs of this cell from stacking duplicates.
if sys.path.count(backend_path) == 0:
    sys.path.append(backend_path)

In [47]:
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sqlalchemy import create_engine, select, values, update, and_, exists, text, create_engine, insert
from sqlalchemy.orm import sessionmaker, join
from dotenv import load_dotenv
from app.models.models import Notice, ResourceLink, SummaryChunks
from app.models.schema import NoticeBase, ResourceLinkBase, SummaryChunksBase
from dotenv import load_dotenv
import os
import requests
import json
import re
import pandas as pd
import pendulum
from openai import OpenAI
import tiktoken



In [46]:

# Database connection string. A DATABASE_URL environment variable takes
# precedence so credentials do not have to live in the notebook; the literal
# below is only the fallback used when that variable is unset.
DATABASE_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow",
)
# None when OPENAI_API_KEY is not set in the process environment.
# NOTE(review): `load_dotenv` is imported but never called in the visible
# cells — call it before this cell if the key is expected to come from a .env.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [8]:
# One engine and one session factory shared by every database cell below.
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)

# Yesterday's date in UTC, rendered as a compact YYYYMMDD string.
yesterday_utc = pendulum.now("utc").subtract(days=1)
selected_date = yesterday_utc.strftime("%Y%m%d")

## Fetch Data

In [28]:
# Fetch the id and summary of every resource link that has a summary.
with SessionLocal() as db:
    stmt = (
        select(ResourceLink.id, ResourceLink.summary)
        .where(ResourceLink.summary.isnot(None))
    )
    # .mappings() yields dict-like rows keyed by column name. Calling dict()
    # directly on a plain Row is not supported on SQLAlchemy 2.0, where rows
    # behave like named tuples.
    results = db.execute(stmt).mappings().all()
    # Plain dicts so later cells can index with entry['id'] / entry['summary'].
    data = [dict(row) for row in results]

## Split an example

In [40]:
# Splitter used for every summary: chunk size 200 with an overlap of 30,
# trying the separators in the listed order (paragraph, line, space, then
# the empty string as the last resort).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=200,
    chunk_overlap=30,
)

In [41]:
# Chunk the first fetched summary as a smoke test of the splitter settings;
# the trailing expression displays how many chunks came out.
first_summary = data[0]['summary']
split = text_splitter.split_text(first_summary)
len(split)

14

In [42]:
# Display the chunks produced from the first summary for a visual check.
split

["Based on the provided government contracting document, here is a summary of the key information that would help contractors assess the project's alignment with their capabilities and feasibility:",
 '1. Scope of Work: The project requires upgrading signage and wayfinding at the VANIHCS-Ft Wayne, IN location. The specific services or products needed are not explicitly stated in the document.',
 '2. Special Equipment Needed: The document does not mention any unique tools or machinery necessary for project completion.',
 '3. Domain of Expertise Required: The document does not specify the specialized knowledge or skills needed to execute this project. However, experience in signage and wayfinding design and',
 'and wayfinding design and installation may be required.',
 '4. Contractor Workforce Size: The document does not provide an estimate of the workforce size needed to meet the project demands. This would depend on the scope of work and timeline.',
 'Additional Factors:\n- Project Dur

## Split a Batch and Commit to db

In [49]:
def num_tokens_in_corpus(input: str, encoding_name: str = "gpt-3.5-turbo") -> int:
    """Count how many tokens tiktoken produces for ``input``.

    Args:
        input: Text to tokenize.
        encoding_name: Despite its name, this is looked up first as an OpenAI
            *model* name (e.g. ``"gpt-3.5-turbo"``). If tiktoken does not know
            it as a model, it is tried as an encoding name
            (e.g. ``"cl100k_base"``).

    Returns:
        The number of tokens in ``input``.

    Raises:
        KeyError: If ``encoding_name`` matches neither a known model nor a
            known encoding (the error from the model lookup is re-raised).
    """
    try:
        encoding = tiktoken.encoding_for_model(encoding_name)
    except KeyError as model_error:
        # Not a model name; honour the parameter's literal meaning and try it
        # as an encoding name before giving up.
        try:
            encoding = tiktoken.get_encoding(encoding_name)
        except ValueError:
            # Keep the exception type callers saw before the fallback existed.
            raise model_error from None
    return len(encoding.encode(input))

In [54]:
# Chunk each summary and store the chunks, skipping summaries that already
# have chunk rows. The [:1] slice limits this run to the first fetched entry.
for entry in data[:1]:
    with SessionLocal() as db:
        # Existence check makes the cell safe to re-run without duplicating rows.
        already_chunked = db.scalar(
            select(exists().where(SummaryChunks.resource_link_id == entry['id']))
        )
        if already_chunked:
            continue

        # Build every row first so the insert is one executemany call instead
        # of one round trip per chunk.
        chunk_rows = [
            {
                "chunk_text": chunk,
                "resource_link_id": entry['id'],
                "chunk_tokens": num_tokens_in_corpus(chunk),
            }
            for chunk in text_splitter.split_text(entry['summary'])
        ]
        # An empty parameter list is not a no-op for execute(), so skip the
        # insert entirely when the splitter produced nothing.
        if chunk_rows:
            db.execute(insert(SummaryChunks), chunk_rows)
            db.commit()

## Get Embeddings


In [48]:
# Create the OpenAI API client with default arguments; no key is passed here.
# NOTE(review): presumably the client picks up OPENAI_API_KEY from the
# environment — confirm.
client = OpenAI()

In [61]:
# Load every stored chunk that has text but no embedding yet.
with SessionLocal() as db:
    has_text = SummaryChunks.chunk_text.isnot(None)
    lacks_embedding = SummaryChunks.chunk_embedding.is_(None)
    stmt = select(SummaryChunks).where(and_(has_text, lacks_embedding))
    results = db.execute(stmt).scalars().all()
    # Validate while the session is still open so attribute access on the ORM
    # objects happens before they are detached.
    data = list(map(SummaryChunksBase.model_validate, results))

In [68]:
# Show the `id` of the first validated chunk record that still lacks an embedding.
data[0].id

55

In [70]:
# Embed every pending chunk and store the vector on its row.
EMBEDDING_MODEL = "text-embedding-3-small"
# Chunks sent per embeddings request; batching replaces one HTTP round trip
# per chunk with one per batch.
EMBEDDING_BATCH_SIZE = 100

with SessionLocal() as db:
    for start in range(0, len(data), EMBEDDING_BATCH_SIZE):
        batch = data[start:start + EMBEDDING_BATCH_SIZE]
        res = client.embeddings.create(
            input=[entry.chunk_text for entry in batch],
            model=EMBEDDING_MODEL,
        )
        # Pair each vector with its chunk through the response's `index`
        # field rather than relying on list position.
        for item in res.data:
            entry = batch[item.index]
            stmt = (
                update(SummaryChunks)
                .where(SummaryChunks.id == entry.id)
                .values(chunk_embedding=item.embedding)
            )
            db.execute(stmt)
        # Commit per batch so finished work survives a failure in a later request.
        db.commit()

In [67]:
# Display the first embedding held in `res`, the response left over from the
# most recent embeddings request.
res.data[0].embedding

[-0.018820807337760925,
 0.016265396028757095,
 0.05689508467912674,
 -0.047890301793813705,
 0.009870107285678387,
 -0.01252016332000494,
 0.01056642271578312,
 -0.04183303192257881,
 -0.007118646055459976,
 0.01155343372374773,
 0.03547830507159233,
 -0.004941814113408327,
 -0.07257909327745438,
 -0.026487041264772415,
 0.036424752324819565,
 0.00424211798235774,
 -0.038696229457855225,
 -0.05662466958165169,
 -0.008984501473605633,
 0.0049958969466388226,
 0.022065773606300354,
 0.05492106452584267,
 -0.07555364072322845,
 -0.022173939272761345,
 0.010350092314183712,
 -0.033801738172769547,
 0.014859243296086788,
 0.05002656951546669,
 0.036992620676755905,
 0.010546142235398293,
 -0.017401134595274925,
 -0.04440196231007576,
 -0.048971958458423615,
 0.008409872651100159,
 -0.00642909063026309,
 0.02113284543156624,
 0.0009058865834958851,
 -0.016765661537647247,
 0.024607663974165916,
 -0.03258487582206726,
 -0.023945150896906853,
 0.01634651981294155,
 -0.02322855405509472,
 0.01

In [None]:

# Scratch request for a single embedding.
# NOTE(review): `row` is not defined in any visible cell — confirm where it
# is supposed to come from before running this cell.
response = client.embeddings.create(
    model="text-embedding-3-small",
    input=row[0],
)