In [12]:
import json
with open("elasticcreds.json") as f:
    elastic_creds = json.load(f)

In [13]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk


# Create the client instance
client = Elasticsearch(
    cloud_id=elastic_creds['cloud_id'],
    api_key=(elastic_creds['id'],elastic_creds['api_key'])
)

# Successful response!
client.info()['name']

'instance-0000000000'

In [14]:
with open('./billsum_v4_1/us_test_data_final_OFFICIAL.jsonl', 'r') as json_file:
    docs_list = list(json_file)

json.loads(docs_list[0])

{'bill_id': '110_hr37',
 'text': "SECTION 1. SHORT TITLE.\n\n    This Act may be cited as the ``National Science Education Tax \nIncentive for Businesses Act of 2007''.\n\nSEC. 2. CREDITS FOR CERTAIN CONTRIBUTIONS BENEFITING SCIENCE, \n              TECHNOLOGY, ENGINEERING, AND MATHEMATICS EDUCATION AT THE \n              ELEMENTARY AND SECONDARY SCHOOL LEVEL.\n\n    (a) In General.--Subpart D of part IV of subchapter A of chapter 1 \nof the Internal Revenue Code of 1986 (relating to business related \ncredits) is amended by adding at the end the following new section:\n\n``SEC. 45O. CONTRIBUTIONS BENEFITING SCIENCE, TECHNOLOGY, ENGINEERING, \n              AND MATHEMATICS EDUCATION AT THE ELEMENTARY AND SECONDARY \n              SCHOOL LEVEL.\n\n    ``(a) In General.--For purposes of section 38, the elementary and \nsecondary science, technology, engineering, and mathematics (STEM) \ncontributions credit determined under this section for the taxable year \nis an amount equal to 100 pe

In [15]:
len(docs_list)

3269

In [16]:
import openai
with open(r'openaiconfig.json') as config_file:
    config_details = json.load(config_file)


openai.api_type = "azure"
openai.api_key = config_details["OPENAI_API_KEY"]

# The base URL for your Azure OpenAI resource. e.g. "https://<your resource name>.openai.azure.com"
openai.api_base = config_details['OPENAI_API_BASE']

openai.api_version = "2022-12-01"

In [17]:
response = openai.Embedding.create(input=json.loads(docs_list[0])['text'],deployment_id="adaembedding")
len(response['data'][0]['embedding'])

1536

In [18]:
client.indices.delete(index='bills', ignore=[404])

with open("index.json") as index_file:
    source = index_file.read().strip()
    client.indices.create(index='bills', body=source)

  client.indices.create(index='bills', body=source)


In [19]:
def embed_text(text):
    vectors = openai.Embedding.create(input=text,deployment_id="adaembedding")
    return vectors['data'][0]['embedding']

def index_batch(docs,embeddings):

    requests = []
    for i, doc in enumerate(docs):
        request = doc
        request["_op_type"] = "index"
        request["_index"] = index_name
        request["text_vector"] = embeddings[i]
        requests.append(request)
    bulk(client, requests)


In [33]:
batch_size = 100
index_name = 'bills'
docs = []
embeddings = []
count = 0
for d in docs_list:
    doc = json.loads(d)

    docs.append(doc)
    count+=1

    embeddings.append(embed_text(doc['text']))

    if count % batch_size == 0:
        index_batch(docs,embeddings)
        docs = []
        embeddings = []
        print("Indexed {} documents.".format(count))
if docs:
    index_batch(docs,embeddings)
    print("Indexed {} documents.".format(count))

client.indices.refresh(index=index_name)
print("Done indexing.")

Indexed 100 documents.
Indexed 200 documents.
Indexed 300 documents.
Indexed 400 documents.
Indexed 500 documents.
Indexed 600 documents.
Indexed 700 documents.
Indexed 800 documents.
Indexed 900 documents.
Indexed 1000 documents.
Indexed 1100 documents.
Indexed 1200 documents.
Indexed 1300 documents.
Indexed 1400 documents.
Indexed 1500 documents.
Indexed 1600 documents.
Indexed 1700 documents.
Indexed 1800 documents.
Indexed 1900 documents.
Indexed 2000 documents.
Indexed 2100 documents.
Indexed 2200 documents.
Indexed 2300 documents.
Indexed 2400 documents.
Indexed 2500 documents.
Indexed 2600 documents.
Indexed 2700 documents.
Indexed 2800 documents.
Indexed 2900 documents.
Indexed 3000 documents.
Indexed 3100 documents.
Indexed 3200 documents.
Indexed 3269 documents.
Done indexing.


In [36]:


def handle_query(query,n_results):
    query_vector = embed_text(query)

    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'text_vector') + 1.0",
                "params": {"query_vector": query_vector}
            }
        }
    }


    response = client.search(
        index=index_name,
        body={
            "size": n_results,
            "query": script_query,
            "_source": {"includes": ["title", "text"]}
        }
    )

    print()
    print("{} total hits.".format(response["hits"]["total"]["value"]))
    for hit in response["hits"]["hits"]:
        print("id: {}, score: {}".format(hit["_id"], hit["_score"]))
        print(hit["_source"])
        print()
    return response


In [37]:
query_response = handle_query("bill related to water rights",10)


6538 total hits.
id: 9YAf5YgBS_Mg6LDHGUf7, score: 1.8060461
{'text': "SECTION 1. AMENDMENTS RELATED TO SUPPLEMENTAL WATER SUPPLY AND FUNDING.\n\n    (a) Supplemental Water Supply.--Section 106(a) of the San Luis Rey \nIndian Water Rights Settlement Act (Public Law 100-675; 102 Stat. 4000) \nis amended to read as follows:\n    ``(a) Obligation To Arrange for Development of Water for Bands and \nLocal Entities.--\n            ``(1) To provide a supplemental water supply for the \n        benefit of the Bands and the local entities, subject to the \n        provisions of the settlement agreement, the Secretary shall--\n                    ``(A) arrange for the development of not more than \n                16,000 acre-feet per year of supplemental water from \n                public lands within the boundaries of the State of \n                California outside the service area of the Central \n                Valley Project;\n                    ``(B) obtain not more than 16,000 acre-f

  response = client.search(


In [38]:
query_response

{'took': 21,
 'timed_out': False,
 '_shards': {'total': 2, 'successful': 2, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 6538, 'relation': 'eq'},
  'max_score': 1.8060461,
  'hits': [{'_index': 'bills',
    '_id': '9YAf5YgBS_Mg6LDHGUf7',
    '_score': 1.8060461,
    '_source': {'text': "SECTION 1. AMENDMENTS RELATED TO SUPPLEMENTAL WATER SUPPLY AND FUNDING.\n\n    (a) Supplemental Water Supply.--Section 106(a) of the San Luis Rey \nIndian Water Rights Settlement Act (Public Law 100-675; 102 Stat. 4000) \nis amended to read as follows:\n    ``(a) Obligation To Arrange for Development of Water for Bands and \nLocal Entities.--\n            ``(1) To provide a supplemental water supply for the \n        benefit of the Bands and the local entities, subject to the \n        provisions of the settlement agreement, the Secretary shall--\n                    ``(A) arrange for the development of not more than \n                16,000 acre-feet per year of supplemental water from \n  