In [1]:
import pandas as pd

# Load the incident reports workbook; the following cells read from `df`.
reports_path = "base_reports.xlsx"
df = pd.read_excel(reports_path)

In [2]:
# Free-text report fields that are concatenated, in this order and separated
# by a single space, into one document per row.
text_columns = [
    "what_happened",
    "what_could_have_happened",
    "why_did_it_happen",
    "causal_factors",
    "what_went_well",
    "lessons_to_prevent",
]

# Missing cells are replaced with "" before the string concatenation.
combined = df[text_columns[0]].fillna("")
for column in text_columns[1:]:
    combined = combined + " " + df[column].fillna("")

df["combined_text"] = combined

In [3]:
# Coerce the column to str; its values are later turned into a plain list
# and passed to the embedding model.
combined_as_str = df["combined_text"].astype(str)
df["combined_text"] = combined_as_str

In [4]:
import vertexai
from vertexai.preview.language_models import TextEmbeddingModel

# Vertex AI settings used to initialise the SDK and load the embedding model.
gcp_project = "methanex-safety"
gcp_location = "us-central1"
embedding_model_name = "text-embedding-004"

vertexai.init(project=gcp_project, location=gcp_location)

embedding_model = TextEmbeddingModel.from_pretrained(embedding_model_name)



In [5]:
# Embed every combined report text, sending the texts to the model in
# fixed-size batches and collecting one vector per text, in row order.
texts = df["combined_text"].tolist()
batch_size = 20
embeddings = []

for start in range(0, len(texts), batch_size):
    chunk = texts[start:start + batch_size]

    # Each returned item exposes its vector as `.values`.
    embeddings.extend(item.values for item in embedding_model.get_embeddings(chunk))

    print(f"Processed {start + len(chunk)} / {len(texts)}")

Processed 20 / 196
Processed 40 / 196
Processed 60 / 196
Processed 80 / 196
Processed 100 / 196
Processed 120 / 196
Processed 140 / 196
Processed 160 / 196
Processed 180 / 196
Processed 196 / 196


In [6]:
import chromadb

# On-disk Chroma store and the collection that holds the incident documents.
chroma_path = "./chroma_db"
collection_name = "safety_incidents"

client = chromadb.PersistentClient(path=chroma_path)
collection = client.get_or_create_collection(name=collection_name)

In [7]:
# Write every incident to the collection in a single call instead of one
# request per row. `upsert` (rather than `add`) keeps this cell safe to re-run
# against the persistent store: existing ids are updated, new ids are inserted.
incident_ids = [str(case_id) for case_id in df["case_id"]]
incident_texts = df["combined_text"].tolist()

# Each row needs exactly one embedding; fail loudly instead of silently
# dropping trailing rows if the embedding cell was only partially run.
assert len(embeddings) == len(incident_ids), (
    f"{len(embeddings)} embeddings for {len(incident_ids)} rows"
)

# NOTE(review): ids are expected to be unique within one call, and Chroma
# limits the size of a single batch — chunk this call if the dataset grows.
collection.upsert(
    ids=incident_ids,
    documents=incident_texts,
    embeddings=embeddings,
)

**Test Retrieval**

In [8]:
# Example question used to smoke-test retrieval; it is embedded with the same
# model that embedded the incident texts.
query = "office slip or trip incident"
query_embedding_items = embedding_model.get_embeddings([query])
query_embedding = query_embedding_items[0].values

In [9]:
# Fetch the stored incidents closest to the query embedding.
top_k = 5
results = collection.query(query_embeddings=[query_embedding], n_results=top_k)

In [10]:
# Number the retrieved documents ("Incident 1", "Incident 2", ...) and join
# them into one text block. `results["documents"][0]` holds the documents for
# the single query that was submitted.
context = "".join(
    f"\nIncident {number}:\n{text}\n"
    for number, text in enumerate(results["documents"][0], start=1)
)

In [11]:
# Instruction template for the model; `{context}` is replaced with the
# numbered incident texts assembled from the retrieval results.
prompt_template = """
You are a safety knowledge assistant for industrial operations.

Answer ONLY using the incidents below.
Do not add external safety advice or assumptions.

Summarize in three sections:

1. Common Causes
2. Typical Risks or Escalation Potential
3. Prevention Lessons Observed

Incidents:
{context}
"""

prompt = prompt_template.format(context=context)

**Adding Gemini**

In [12]:
pip install --upgrade google-generativeai

Note: you may need to restart the kernel to use updated packages.


In [13]:
from google import genai

# NOTE: this rebinds `client`, which earlier referred to the Chroma client;
# `collection` was already created from that client before this point.
gemini_model = "gemini-2.0-flash"  # updated model

client = genai.Client(vertexai=True, project="methanex-safety", location="us-central1")

# Send the assembled prompt and show the generated text.
response = client.models.generate_content(model=gemini_model, contents=prompt)

print(response.text)

Here is a summary of the incidents, based only on the information provided.

### 1. Common Causes

*   **Inadequate Electrical Equipment Management:** Lack of routine inspections for cords and equipment, improper installation, and use of unapproved personal devices.
*   **Poor Cable Routing and Management:** Cables obstructing walkways, tangled bundles, no defined pathways, and cords bent in tight spaces.
*   **Insufficient Risk Assessment:** Incomplete isolation planning, missing circuits, overlooking combined risks (electrical, falls, dropped objects), and inadequate contractor onboarding.
*   **Poor Housekeeping:** Accumulated items under desks, stacked materials on table edges, and lack of clear ownership for shared area maintenance.
*   **Workplace Layout:** Congested areas, limited space, and inadequate lighting beneath desks.
*   **Rushing/Multitasking:** Employees multitasking, rushing, and assuming isolations are complete without verification.

### 2. Typical Risks or Escalati

In [14]:
def _build_safety_prompt(query, documents):
    """Return the instruction prompt that wraps the retrieved incident texts."""
    incidents = "".join(
        f"\nIncident {number}:\n{text}\n"
        for number, text in enumerate(documents, start=1)
    )

    return f"""
You are a safety knowledge assistant for industrial operations.

Answer ONLY using the incidents below.
Do not add external safety advice or assumptions.

User question: {query}

Summarize in three sections:

1. Common Causes
2. Typical Risks or Escalation Potential
3. Prevention Lessons Observed

Incidents:
{incidents}
"""


def ask_safety_assistant(query, n_results=5):
    """Answer a safety question from the stored incident reports.

    The query is embedded with the notebook's `embedding_model`, the closest
    documents are fetched from `collection`, and the model behind `client` is
    asked to summarise them under an instruction prompt.

    Args:
        query: Free-text question or topic to search the incidents for.
        n_results: Maximum number of incidents to retrieve (default 5).

    Returns:
        The generated summary text, or a short notice when the collection
        returned no documents for the query.
    """
    query_embedding = embedding_model.get_embeddings([query])[0].values

    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results
    )

    # One query was submitted, so its documents are the first (only) entry.
    documents = results["documents"][0]
    if not documents:
        # Nothing to ground an answer on; skip the model call entirely.
        return "No matching incidents were found in the collection."

    # Send the full instruction prompt, not just the raw documents, so the
    # model is told to stay within the incidents and how to structure the reply.
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=_build_safety_prompt(query, documents)
    )

    return response.text