# Set-up

In [4]:
import sys
import os
from pathlib import Path
import logging
from pprint import pprint
from sqlalchemy import create_engine, text, func
from sqlalchemy.orm import declarative_base, relationship, sessionmaker

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(filename)s - %(levelname)s - %(message)s',
                    handlers=[logging.StreamHandler(sys.stdout)])
logger = logging.getLogger()

project_root = Path(os.getcwd()).resolve().parents[1]
sys.path.insert(0, str(project_root))
import group4py

# Functions that you want to test/work on
#from docchunk import DocChunk, Embedding
#from helpers import Test, TaskInfo, Logger

In [5]:
import requests
from requests.exceptions import RequestException
from urllib.parse import urlparse
import json
from time import sleep
from pprint import pprint
# Additional imports if needed for specific HTTP request handling
from tqdm import tqdm  # For progress bars if needed

In [8]:
s = requests.Session()

response = s.get("https://unfccc.int/NDCREG")

pprint(response.json)
response.text

<bound method Response.json of <Response [200]>>


'<html style="height:100%"><head><META NAME="ROBOTS" CONTENT="NOINDEX, NOFOLLOW"><meta name="format-detection" content="telephone=no"><meta name="viewport" content="initial-scale=1.0"><meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"><script type="text/javascript" src="/_Incapsula_Resource?SWJIYLWA=719d34d31c8e3a6e6fffd425f7e032f3"></script></head><body style="margin:0px;height:100%"><iframe id="main-iframe" src="/_Incapsula_Resource?SWUDNSAI=31&xinfo=13-94542570-0%202cNN%20RT%281748016538697%208109%29%20q%280%20-1%20-1%200%29%20r%280%20-1%29%20B12%284%2c315%2c0%29&incident_id=198000260131277796-390830960113223309&edet=12&cinfo=04000000&rpinfo=0&cts=FH1wrLO7efptEuDjHOrQMk38pUoTf7zrW3FUCHz7ZjO%2bwNoOwyvpOanL5UnVBZVx&mth=GET" frameborder=0 width="100%" height="100%" marginheight="0px" marginwidth="0px">Request unsuccessful. Incapsula incident ID: 198000260131277796-390830960113223309</iframe></body></html>'

### E.g. Testing a function

In [None]:
pprint(DocChunk().chunking_function(None))

[{'key1': 'value1',
  'key2': 42,
  'key3': [1, 2, 3],
  'key4': {'nested_key': 'nested_value'}},
 {'key1': 'value1',
  'key2': 42,
  'key3': [1, 2, 3],
  'key4': {'nested_key': 'nested_value'}},
 {'key1': 'value1',
  'key2': 42,
  'key3': [1, 2, 3],
  'key4': {'nested_key': 'nested_value'}}]


### E.g. Forcing an output in a specific format

In [3]:
@Test.dummy_chunk()
def one_plus_one():
    return 1 + 1

pprint(one_plus_one())

['I am dummy chunk 1', 'I am dummy chunk 2', 'I am dummy chunk 3']


### E.g. Forcing an input in a specific format

In [5]:
@Test.force_input(1)
def number_plus_one(number):
    return number + 1

number = "I don't have this number yet"
pprint(number_plus_one(number))

2


### Example use of decorators
- Rui Kai is working on this function
- He wants to let others know that this function has been completed
- I want to log outputs of everything related to this function in a specific file

In [6]:
@TaskInfo.ruikai()
@TaskInfo.completed()
@Logger.log(log_file=Path("logs/test.log"), log_level="INFO")
def some_complicated_function_that_calls_other_functions():
    object().some_complicated_operations()
    pass

## Create Vector Similarity Indices

In [9]:
db_url = os.getenv('DATABASE_URL')

engine = create_engine(db_url)
Session = sessionmaker(bind=engine)

with engine.connect() as conn:
    # Create the pgvector extension if it doesn't exist
    conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
    
    # Convert transformer_embedding column to vector type
    conn.execute(text("""
        ALTER TABLE doc_chunks 
        ALTER COLUMN transformer_embedding TYPE vector(768)
        USING transformer_embedding::vector(768);
    """))
    
    # Convert word2vec_embedding column to vector type
    conn.execute(text("""
        ALTER TABLE doc_chunks 
        ALTER COLUMN word2vec_embedding TYPE vector(300)
        USING word2vec_embedding::vector(300);
    """))
    
    # Create an index for transformer_embedding
    conn.execute(text("""
        CREATE INDEX IF NOT EXISTS transformer_embedding_idx 
        ON doc_chunks 
        USING ivfflat (transformer_embedding vector_cosine_ops)
        WITH (lists = 100);
    """))
    
    # Create an index for word2vec_embedding
    conn.execute(text("""
        CREATE INDEX IF NOT EXISTS word2vec_embedding_idx 
        ON doc_chunks 
        USING ivfflat (word2vec_embedding vector_cosine_ops)
        WITH (lists = 100);  -- Decrease this to improve performance, will reduce precision
    """))
