In [3]:
import os
import json
import uuid
import faiss
import numpy as np
from openai import OpenAI
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [35]:
# --- Configuration: credentials, models and the vector index ---
# Take the OpenAI key from the environment so the real secret never has to be
# typed into (or committed with) the notebook. "key" is only the original
# placeholder and is used when OPENAI_API_KEY is not set.
api_key = os.environ.get("OPENAI_API_KEY", "key")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
client = OpenAI(api_key=api_key)
# The index must be created with the same width as the embedding vectors.
dimension = embedding_model.get_sentence_embedding_dimension()
# Flat L2 index wrapped in an ID map so vectors can be stored under custom ids.
faiss_index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))


In [36]:
# Where the event store is persisted; make sure its directory exists up front.
metadata_file_path = "event_store/events_metadata.json"
idmap_file_path = "event_store/faiss_id_map.json"
os.makedirs(os.path.dirname(metadata_file_path), exist_ok=True)

# In-memory state, both starting empty:
#   event_metadata       -- event records keyed by event id
#   faiss_id_to_event_id -- FAISS numeric id -> real UUID event_id
event_metadata = dict()
faiss_id_to_event_id = dict()

In [37]:
#Save metadata
def sync_event_metadata_to_json(metadata):
    """Persist the event metadata and the FAISS-id map to their JSON files.

    Args:
        metadata: JSON-serialisable dict of event records; written to
            ``metadata_file_path``.

    The module-level ``faiss_id_to_event_id`` map is written to
    ``idmap_file_path``.

    Raises:
        TypeError: if either payload is not JSON-serialisable. In that case
            neither file is touched.
    """
    # JSON object keys are always strings. Normalising here collapses an int
    # key and its string twin (e.g. 7 and "7") into one entry instead of
    # emitting the same key twice in the output file.
    id_map = {
        str(faiss_id): event_id
        for faiss_id, event_id in faiss_id_to_event_id.items()
    }

    # Serialise both payloads BEFORE opening any file: open(..., "w") truncates
    # immediately, so an error raised halfway through json.dump would leave an
    # empty or partial file behind.
    metadata_json = json.dumps(metadata, indent=2)
    id_map_json = json.dumps(id_map, indent=2)

    with open(metadata_file_path, "w") as f:
        f.write(metadata_json)
    with open(idmap_file_path, "w") as f:
        f.write(id_map_json)

In [45]:
def _read_json_dict(path, warning):
    """Return the JSON object stored at ``path``, or ``{}`` if it is unusable.

    An empty (or whitespace-only) file yields ``{}`` silently. Malformed JSON,
    or a top-level value that is not an object, prints ``warning`` and yields
    ``{}``.
    """
    try:
        with open(path, "r") as f:
            content = f.read().strip()
        loaded = json.loads(content) if content else {}
    except json.JSONDecodeError:
        print(warning)
        return {}
    if not isinstance(loaded, dict):
        # Valid JSON but not a mapping (e.g. a list): the rest of the code
        # calls .items()/.get() on it, so treat it like a corrupted file.
        print(warning)
        return {}
    return loaded


def load_event_metadata_and_faiss():
    """Load the persisted event store into the module-level state.

    When the metadata file exists, ``event_metadata`` and
    ``faiss_id_to_event_id`` are read from their JSON files and the FAISS
    index is rebuilt via ``rebuild_faiss_index``. Otherwise all three globals
    are reset to empty.
    """
    global event_metadata, faiss_index, faiss_id_to_event_id

    if not os.path.exists(metadata_file_path):
        event_metadata = {}
        faiss_id_to_event_id = {}
        faiss_index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
        print("\U0001f195 No metadata found. Starting fresh.")
        return

    event_metadata = _read_json_dict(
        metadata_file_path,
        "⚠️ events_metadata.json is empty or corrupted. Initializing new store.",
    )

    if os.path.exists(idmap_file_path):
        faiss_id_to_event_id = _read_json_dict(
            idmap_file_path,
            "⚠️ faiss_id_map.json is empty or corrupted. Initializing new map.",
        )
    else:
        # No map on disk: start from an empty one rather than keeping whatever
        # an earlier run left in memory; the rebuild below repopulates it.
        faiss_id_to_event_id = {}

    rebuild_faiss_index()
    print(f"\u2705 Loaded {len(event_metadata)} events from disk.")

In [46]:
# Rebuilding the FAISS index
def get_faiss_compatible_id(event_id):
    """Map an event id to a stable, non-negative integer that fits in int64.

    The built-in ``hash()`` is salted per interpreter process for strings, so
    it yields a different number in every kernel session and ids written to
    disk stop matching after a restart. A SHA-256 digest gives the same value
    for the same ``event_id`` on every run.

    Args:
        event_id: The event identifier (converted with ``str()`` before hashing).

    Returns:
        An ``int`` in the range ``[0, 2**63 - 1]``.
    """
    # Local import: hashlib is not among the notebook's top-level imports.
    import hashlib

    digest = hashlib.sha256(str(event_id).encode("utf-8")).digest()
    # Use the first 8 bytes as an unsigned 64-bit integer, then clear the top
    # bit so the value is representable as a signed int64.
    return int.from_bytes(digest[:8], "big") & (2**63 - 1)

def rebuild_faiss_index():
    """Recreate the FAISS index from the summaries in ``event_metadata``.

    Replaces the module-level ``faiss_index`` with a fresh index holding one
    vector per event, and records ``faiss id -> event id`` (int keys) in
    ``faiss_id_to_event_id``. Entries of that map whose event is no longer in
    ``event_metadata`` are removed.
    """
    global faiss_index
    faiss_index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))

    # Prune mappings that point at events which no longer exist, so they are
    # neither returned by a lookup nor written back to disk.
    stale_keys = [
        key
        for key, mapped_event_id in faiss_id_to_event_id.items()
        if not (isinstance(mapped_event_id, str) and mapped_event_id in event_metadata)
    ]
    for key in stale_keys:
        del faiss_id_to_event_id[key]

    event_ids = list(event_metadata)
    if event_ids:  # nothing to encode or add for an empty store
        summaries = [event_metadata[event_id]["summary"] for event_id in event_ids]
        # One batched encode and one add call instead of one of each per event.
        embeddings = np.ascontiguousarray(
            embedding_model.encode(summaries), dtype=np.float32
        )
        int_ids = np.array(
            [get_faiss_compatible_id(event_id) for event_id in event_ids],
            dtype=np.int64,
        )
        faiss_index.add_with_ids(embeddings, int_ids)
        # .tolist() yields plain Python ints, which json can serialise as keys.
        for event_int_id, event_id in zip(int_ids.tolist(), event_ids):
            faiss_id_to_event_id[event_int_id] = event_id

    print(f"\U0001f501 FAISS rebuilt with {len(event_metadata)} vectors.")

In [47]:
def get_embedding(text):
    """Embed a single piece of text and return its vector.

    The text is encoded as a one-element batch with ``embedding_model`` and
    the first (only) row of the result is returned.
    """
    single_item_batch = [text]
    encoded_batch = embedding_model.encode(single_item_batch)
    return encoded_batch[0]

In [48]:
def call_gpt_extractor(tweet, existing_summary):
    """Ask the chat model to extract structured event details from a tweet.

    Args:
        tweet: Raw tweet text, interpolated into the prompt.
        existing_summary: Summary of the already-known event; any falsy value
            is rendered in the prompt as ``None``.

    Returns:
        The message content of the first completion choice, unparsed.
    """
    summary_for_prompt = existing_summary or 'None'
    prompt = f"""
    You are an expert in extracting structured information from tweets about disasters.
    Given the tweet and existing event summary, return a JSON with important details about the event.
    The JSON may include the following fields:
    - event_type
    - location
    - people_killed
    - people_trapped
    - infrastructure_damage
    - any other details you find relevant
    - summary (updated, more detailed)

    Tweet: "{tweet}"
    Existing Event Summary: "{summary_for_prompt}"

    Respond only in JSON format.
    """
    # Single-turn conversation: the whole instruction travels as one user message.
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[user_message],
        temperature=0.2,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [49]:
def find_matching_event(tweet_text, threshold=0.75):
    """Find the stored event whose summary is closest to ``tweet_text``.

    Args:
        tweet_text: Text to embed and search for.
        threshold: Minimum similarity score for the nearest neighbour to
            count as a match.

    Returns:
        ``(event_id, event_record, faiss_id)`` for a match, otherwise
        ``(None, None, None)``.
    """
    no_match = (None, None, None)
    if faiss_index.ntotal == 0:
        return no_match

    query_embedding = np.asarray(
        get_embedding(tweet_text), dtype=np.float32
    ).reshape(1, -1)
    D, I = faiss_index.search(query_embedding, k=1)

    best_faiss_id = int(I[0][0])
    if best_faiss_id < 0:
        # FAISS reports -1 when it has no neighbour to return.
        return no_match

    # Converts the L2 index's distance into a similarity score. This equals
    # cosine similarity only if the index returns squared distances and the
    # embeddings are unit-normalised -- TODO confirm for the model in use.
    best_score = 1 - float(D[0][0]) / 2

    # The map holds int keys for ids added in this session and string keys for
    # ids loaded from JSON (JSON object keys are always strings), so try both.
    event_id = faiss_id_to_event_id.get(best_faiss_id)
    if event_id is None:
        event_id = faiss_id_to_event_id.get(str(best_faiss_id))

    if event_id and best_score >= threshold:
        # A mapping can outlive its event; treat that as "no match" rather
        # than raising KeyError.
        matched_event = event_metadata.get(event_id)
        if matched_event is not None:
            return event_id, matched_event, best_faiss_id

    return no_match


In [50]:
def _strip_code_fence(text):
    """Remove a surrounding markdown code fence (``` or ```json) from ``text``."""
    stripped = text.strip()
    if stripped.startswith("```"):
        # Drop the opening fence line, including any language tag on it.
        first_newline = stripped.find("\n")
        stripped = stripped[first_newline + 1:] if first_newline != -1 else ""
        stripped = stripped.rstrip()
        if stripped.endswith("```"):
            stripped = stripped[:-3]
    return stripped.strip()


def process_tweet(tweet_text):
    """Run one tweet through the match -> extract -> store pipeline.

    The tweet is matched against known events, the extractor is called with
    the matched event's summary (if any), and the result either replaces the
    matched event's record or is stored as a new event under a fresh UUID.
    The FAISS index is then rebuilt and the store is written to disk.

    Args:
        tweet_text: Raw tweet text.

    Returns:
        ``{"event_id": ..., "data": ...}`` on success, or
        ``{"error": ..., "raw_output": ...}`` when the extractor output is not
        valid JSON or has no usable ``summary``. Nothing is stored on error.
    """
    matched_id, matched_event, _ = find_matching_event(tweet_text)
    existing_summary = matched_event["summary"] if matched_event else None

    gpt_output = call_gpt_extractor(tweet_text, existing_summary)

    try:
        # Chat models often wrap JSON in a markdown fence despite the prompt.
        parsed_data = json.loads(_strip_code_fence(gpt_output))
    except (json.JSONDecodeError, TypeError, AttributeError):
        # TypeError/AttributeError cover a non-string reply (e.g. None).
        return {"error": "Invalid JSON from GPT", "raw_output": gpt_output}

    # The summary is what gets embedded and indexed, so it must be present.
    updated_summary = parsed_data.get("summary") if isinstance(parsed_data, dict) else None
    if not isinstance(updated_summary, str) or not updated_summary.strip():
        return {"error": "GPT response missing 'summary'", "raw_output": gpt_output}

    # A matched event is overwritten in place; otherwise mint a new id.
    event_id = matched_id if matched_id else str(uuid.uuid4())
    event_metadata[event_id] = {
        "summary": updated_summary,
        "structured_data": parsed_data,
        "last_tweet": tweet_text,
    }

    # Rebuild first: the rebuild registers this event's FAISS id in the id
    # map, so syncing afterwards persists a map that includes it.
    rebuild_faiss_index()
    sync_event_metadata_to_json(event_metadata)

    return {"event_id": event_id, "data": parsed_data}

In [55]:
# Populate the in-memory event store and FAISS index from the files on disk
# (or reset them to empty when no metadata file exists yet).
load_event_metadata_and_faiss()

🔁 FAISS rebuilt with 0 vectors.
✅ Loaded 0 events from disk.


In [57]:
# Sample disaster tweets used to exercise the pipeline in the next cell.
tweet_1 = "A 6.8 magnitude earthquake strikes Tokyo, Japan. 18 people reported dead, several buildings have collapsed. Rescue operations underway."
tweet_2 = "Cyclone Mahina lashes Bangladesh coast. Winds up to 150 km/h. 40 fishermen missing and 200 homes destroyed."
tweet_3 = "Massive wildfire near Los Angeles spreads to 50,000 acres. Over 70 houses burned, 15 people injured, evacuation ongoing."
tweet_4 = "Explosion at chemical plant in Houston. Toxic smoke spreading, 3 confirmed dead, dozens hospitalized. Emergency declared."
tweet_5 = "Heavy rains cause flooding in Venice. 10,000 evacuated, historic sites submerged, damage expected to be in millions."


In [58]:
# Feed each sample tweet through the pipeline in order and show what came back.
sample_tweets = (tweet_1, tweet_2, tweet_3, tweet_4, tweet_5)
for tweet in sample_tweets:
    result = process_tweet(tweet)
    print(result)


🔁 FAISS rebuilt with 1 vectors.
{'event_id': 'd4a0daaa-79cf-4054-93fc-35dcac44fcb1', 'data': {'event_type': 'Earthquake', 'location': 'Tokyo, Japan', 'people_killed': 18, 'infrastructure_damage': 'Several buildings have collapsed', 'rescue_operations': 'Underway', 'summary': 'A 6.8 magnitude earthquake has struck Tokyo, Japan. 18 people have been reported dead and several buildings have collapsed. Rescue operations are currently underway.'}}
🔁 FAISS rebuilt with 2 vectors.
{'event_id': '92aa4802-a09b-42d8-a260-f3f14055cc3a', 'data': {'event_type': 'Cyclone', 'location': 'Bangladesh coast', 'people_missing': 40, 'infrastructure_damage': '200 homes destroyed', 'wind_speed': 'up to 150 km/h', 'summary': 'Cyclone Mahina has hit the Bangladesh coast with winds up to 150 km/h. 40 fishermen are reported missing and 200 homes have been destroyed.'}}
🔁 FAISS rebuilt with 3 vectors.
{'event_id': '7db33f71-3849-44ce-85e7-44887a79aacf', 'data': {'event_type': 'Wildfire', 'location': 'Los Angeles',