In [None]:
%pip install -q "unstructured[csv,xlsx]" "astrapy" "openai" "numpy" "cassandra-driver" "python-dotenv"


In [None]:
!ls -R "data/01 SMART Methane Project Registration Samples"


In [4]:
import os
from dotenv import load_dotenv  
# Load variables from a local .env file into the process environment;
# OPENAI_API_KEY is read from os.environ further down.
load_dotenv()


True

In [5]:
import glob
# Collect the sample workbooks. glob returns paths in arbitrary, filesystem-
# dependent order, so sort them to keep the parse order (and therefore the
# row ids derived from it) stable across runs and machines.
files = sorted(glob.glob("data/01 SMART Methane Project Registration Samples/*.xlsx"))


In [None]:
from unstructured.partition.auto import partition
from unstructured.staging.base import convert_to_dict
from IPython.display import display, HTML
import json

def to_astradb(el):
    """Flatten a parsed element into the flat row dict stored in the database.

    Args:
        el: An element exposing ``text`` and a ``metadata`` object with
            ``filename``, ``page_name``, ``page_number`` and ``text_as_html``.

    Returns:
        dict with keys ``source``, ``page``, ``number``, ``text`` and
        ``render``. ``text`` / ``render`` are ``None`` when the element has no
        such content, instead of raising ``AttributeError``.
    """
    def _strip_quotes(value):
        # Single quotes are dropped because these values are later spliced
        # into quoted CQL string literals.
        return None if value is None else value.replace("'", "")

    meta = el.metadata
    return {
        "source": meta.filename,
        "page": meta.page_name,
        "number": meta.page_number,
        "text": _strip_quotes(el.text),
        "render": _strip_quotes(meta.text_as_html),
    }
    
# Partition every workbook and keep only the elements that carry an HTML
# rendering; one progress dot is printed per element seen.
print("Starting to parse data: ", end="")
jsons = []
for path in files:
    for parsed in partition(filename=path):
        print(".", end="", flush=True)
        if parsed.metadata.text_as_html is None:
            continue
        jsons.append(to_astradb(parsed))

print("Finished parsing data", end="")


In [None]:
import pandas as pd

# One row per parsed element; rows with any missing field are discarded
# before printing a summary of the frame.
df = pd.DataFrame.from_dict(jsons, orient="columns").dropna()
df.info()


### Connect via CQL

In [None]:

from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
import json

# Load the database token (client id + secret) from a local JSON file.
with open("xxx-token.json") as f:
    secrets = json.load(f)

# Extract credentials. Named `client_id` rather than `id` so the builtin
# `id()` is not shadowed for the rest of the kernel session.
client_id = secrets.get("clientId")
secret = secrets.get("secret")
if not client_id or not secret:
    # Fail here with a clear message instead of handing None to the driver.
    raise ValueError(
        "xxx-token.json must contain non-empty 'clientId' and 'secret' entries"
    )

# Configure cloud connection via the secure connect bundle.
cloud_config = {
    'secure_connect_bundle': "secure-connect-xxx.zip"
}

# Create authentication provider
auth_provider = PlainTextAuthProvider(client_id, secret)

# Connect to the cluster and open a session.
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
session = cluster.connect()

# Smoke-test the connection by reading the server release version.
row = session.execute("select release_version from system.local").one()
if row:
    print(row[0])
    # NOTE(review): `a` is an alias of `session` that no visible cell uses;
    # kept in case another cell relies on it — confirm and remove.
    a = session
else:
    print("An error occurred.")


### OpenAI

In [12]:
from openai import Client
# The API key comes from the environment (None if OPENAI_API_KEY is unset).
api_key = os.environ.get("OPENAI_API_KEY")
client = Client(api_key=api_key)


### Embedding via Data API

In [13]:
import numpy as np

BATCH_SIZE = 20
MODEL_NAME = "text-embedding-3-small"

print("Starting to vectorize data ", end="")
vectors = []

# Embed the `text` column BATCH_SIZE rows at a time. Rows are grouped by
# position (not by index label), so gaps left in the index by dropna() do not
# affect batching.
for _, batch in df.groupby(np.arange(len(df)) // BATCH_SIZE):
    # Send a plain list of strings rather than a pandas Series.
    texts = batch["text"].tolist()
    response = client.embeddings.create(input=texts, model=MODEL_NAME)
    for item in response.data:
        vectors.append(item.embedding)
        print(".", end="", flush=True)

# Vectors are paired with rows positionally later on; a length mismatch would
# silently truncate or misalign that pairing, so stop here instead.
if len(vectors) != len(df):
    raise RuntimeError(
        f"Expected {len(df)} embeddings but received {len(vectors)}"
    )

vector_series = pd.Series(vectors, name="vector")
print("Finished vectorizing entries.")


Starting to vectorize data: .............................................................Finished vectorizing entries.


In [None]:
# Show up to 10 random rows; capping at len(df) avoids the ValueError that
# sample() raises when asked for more rows than the frame contains.
df.sample(n=min(10, len(df)))


In [17]:
# Switch the session to `default_keyspace`; the statements that follow refer
# to `well_data` without a keyspace prefix.
cql = "use default_keyspace;"
session.execute(cql)


<cassandra.cluster.ResultSet at 0x2a3d62b10>

In [18]:
# Create (if absent) the table holding one row per parsed element: its source
# metadata, its text and HTML rendering, and a 1536-dimensional float vector.
cql = """CREATE TABLE IF NOT EXISTS well_data (
        id INT PRIMARY KEY,
        source TEXT,
        page TEXT,
        number INT,
        text TEXT,
        render TEXT,
        vector VECTOR <FLOAT, 1536>
);"""
session.execute(cql)


<cassandra.cluster.ResultSet at 0x2a49eda90>

In [19]:
# Create an index named `vector_index` on the vector column; IF NOT EXISTS
# makes the statement safe to re-run.
cql = """CREATE INDEX IF NOT EXISTS vector_index ON well_data(vector);"""
session.execute(cql)


<cassandra.cluster.ResultSet at 0x29df8e310>

In [21]:
# Remove every existing row from well_data before (re)loading it.
cql = "truncate table well_data"
session.execute(cql)


<cassandra.cluster.ResultSet at 0x171855910>

In [25]:
def _cql_text(value):
    """Render *value* as a quoted CQL string literal, doubling embedded single quotes."""
    return "'" + str(value).replace("'", "''") + "'"


# Build one INSERT statement per row, pairing rows with vectors by position.
# The DataFrame index label is used as the primary key.
# NOTE(review): a prepared statement with bound parameters would be safer;
# statements are kept as plain strings because later cells slice and execute
# them as text.
insert_statements = []
for (indx, row), vector in zip(df.iterrows(), vectors):
    insert_statement = (
        "INSERT INTO well_data (id, source, page, number, text, render, vector) "
        f"VALUES ({int(indx)}, {_cql_text(row['source'])}, {_cql_text(row['page'])}, "
        # int() guards against pandas having stored the page number as a float,
        # which is not a valid literal for an INT column.
        f"{int(row['number'])}, {_cql_text(row['text'])}, {_cql_text(row['render'])}, "
        f"{vector});"
    )
    insert_statements.append(insert_statement)


In [26]:
# Sanity check: show the first 100 characters of the last statement built.
insert_statement[:100]


"INSERT INTO well_data (id, source, page, number, text, render, vector) VALUES (60, 'BPX Gas Quality "

In [None]:
# Run the prepared INSERT strings one at a time, echoing the first 100
# characters of each as a progress trace.
for stmt in insert_statements:
    print(stmt[:100])
    session.execute(stmt)


### Embedding manually via CQL

In [None]:
# Embed the question with the same model used for the stored rows, then fetch
# the 10 nearest rows by vector similarity over the CQL session.
# (The original read from an undefined `collection` object and never used the
# question embedding.)
question = "meter name meter number"
similarity = client.embeddings.create(input=question, model=MODEL_NAME)
query_vector = similarity.data[0].embedding

nearest_rows = session.execute(
    "SELECT text FROM well_data ORDER BY vector ANN OF %s LIMIT 10",
    (query_vector,),
)
# Concatenate the matched texts into the context handed to the prompt.
spreadsheet = ''.join(hit.text for hit in nearest_rows)


In [None]:
# Prompt template with two placeholders: {data} (retrieved context) and
# {question} (the analyst's question).
prompt_template = """Oil and Gas Environmental Impact Analysis

As an oil and gas analyst, I'm examining the environmental impact of well operations on the surrounding environment. I'm particularly interested in assessing factors such as air and water pollution, habitat disruption, and overall ecosystem health.

To conduct this analysis, I'll need to gather and analyze various types of unstructured data. Answer the question based on the data available bellow.

Data:

{data}

Question:
{question}
"""

# Fill in BOTH placeholders and keep the result: the original call omitted
# `question` (KeyError) and discarded the formatted string, so the raw
# template would have been sent to the model.
prompt = prompt_template.format(data=spreadsheet, question=question)
prompt


In [None]:
# Send the prompt as a single user message and print the model's reply.
chat_response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": prompt}],
)
print(chat_response.choices[0].message.content)


In [None]:
# List the origin of every parsed row as "source,page".
# The original cell did not parse (unterminated string) and referenced an
# undefined `embeddings` object and a `metadata` column that `df` does not have.
# NOTE(review): the second field was cut off in the original; the page name is
# assumed here — confirm.
for source, page in zip(df["source"], df["page"]):
    print(f"{source},{page}")
