In [1]:
import os
import json
import faiss
import uuid
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification, BertTokenizer
import numpy as np
from openai import OpenAI
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open(file = "./key.txt") as f:
    api_key = f.read().strip()

In [3]:

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
client = OpenAI(api_key=api_key)

### 1: Loading our trained Binary and multiclass classifier models

In [6]:
def binary_classifier(tweet, model_path="./models/binary_model"):
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path)
    device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    #tokenize input
    inputs = tokenizer(tweet, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        pred = torch.argmax(logits, dim=1).item()

    label_map = {1: "non-informative", 0: "informative"}
    return label_map[pred]

In [7]:
def humanitarianClassifier(tweet, model_path="./models/multi-class-humanitarian_model"):
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path)
    device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    #tokenize input
    inputs = tokenizer(tweet, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        pred = torch.argmax(logits, dim=1).item()
    
    label_map = {0:'affected_individuals', 1: 'infrastructure_and_utility_damage', 2: 'injured_or_dead_people', 3:'missing_or_found_people', 4: 'not_humanitarian', 5:'other_relevant_information',6:'rescue_volunteering_or_donation_effort', 7:'vehicle_damage'}
    return label_map[pred]

In [8]:
def categorize_tweet(tweet):
    info_category = binary_classifier(tweet)
    if info_category=="non-informative":
        return "This tweet doesn't contain any disaster related information."
    humanitarian_category = humanitarianClassifier(tweet)
    return info_category, humanitarian_category

In [9]:

def call_gpt_extractor(tweet, existing_summary=None):
    prompt = f"""
    You are an expert in extracting structured information from tweets about disasters.
    Given the tweet and existing event summary, return a JSON with important details about the event.
    The JSON may include the following fields:
    - event_type
    - location
    - people_killed
    - people_trapped
    - infrastructure_damage
    - any other details you find relevant
    - summary (updated, more detailed)

    Tweet: "{tweet}"
    Existing Event Summary: "{existing_summary or 'None'}"

    Respond only in JSON format.
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2
    )
    return response.choices[0].message.content

In [10]:
string = "There were 19 killed in india and pakistan fight; chance of nuclear fight was positive"
info, humanitarian = categorize_tweet(string)
print(info, humanitarian)


informative injured_or_dead_people


In [11]:
event = call_gpt_extractor(string)

### 3: Implementing Fiass vectorDB and json storing

In [51]:
embedding_dim = 384
index = faiss.IndexFlatL2(embedding_dim)
json_storage_path = "./event_store/event_metadata.json"
id_to_metadata= {}

In [11]:
class RAGBackend:
    def __init__(self, json_path="./event_store/event_metadata.json", index_path= "./event_store/faiss_index.index",mapping_path="id_mapping.json"):
        self.json_path = json_path
        self.index_path = index_path
        self.mapping_path = mapping_path
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        self.dim = self.model.getSentenceEmbeddingDim()
        self.documents = self.load_or_create_json_store()
        self.index = self.load_or_create_faiss_index()
        self.id_mapping = self.load_or_create_id_mapping()


    def load_or_create_json_store(self):
        if os.path.exists(self.json_path):
            with open(self.json_path,"r") as f:
                return json.load(f)
        else:
            with open(self.json_path, "w") as f:
                json.dump([],f)
            return []
        
    def load_or_create_faiss_index(self):
        if os.path.exists(self.index_path):
            return faiss.read_index(self.index_path)
        else:
            return faiss.IndexFlatL2(self.dim)
        
    def load_or_create_id_mapping(self):
        if os.path.exists(self.mapping_path):
            with open(self.mapping_path, "r") as f:
                return json.load(f)
        else:
            return {}
        
    def save_all(self):
        with open(self.json_path, "w") as f:
            json.dump(self.documents, f, indent=2)
        faiss.write_index(self.index, self.index_path)
        with open(self.mapping_path, "w") as f:
            json.dump(self.id_mapping, f, indent=2)
    
    def add_documents(self, text):
        doc_id = str(uuid())
        new_doc = {"id":doc_id, "text": text}
        self.documents.append(new_doc)

        #adding to faiss
        vector = self.model.encode([text])
        self.index.add(vector)

        self.id_mapping[str(len(self.id_mapping))] = doc_id
        self.save_all()

        return doc_id
    
    def search_documents(self, query, top_k=3):
        query_vector = self.model.encode([query])
        distances, indices = self.index.search(query_vector, top_k)
        2





SyntaxError: incomplete input (735953769.py, line 59)

## Trying Chroma since it allows to update and delete individual docs which faiss doesn't

[31mERROR: Could not find a version that satisfies the requirement uuid4 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for uuid4[0m[31m
[0m

In [56]:
import chromadb
from chromadb.config import Settings
from uuid import uuid4
class RAGBackend:
    def __init__(self, json_path="events.json", db_path="./chroma_db", collection_name="events"):
        self.json_path = json_path
        self.db_path = db_path
        self.collection_name = collection_name

        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.similarity_threshold = 0.4  # Adjust as needed

        # Loading document store
        self.documents = self.load_or_create_json_store()

        # Initializing Chroma DB
        client = chromadb.PersistentClient(path="./chroma_db")
        self.collection = client.get_or_create_collection("events")

    def load_or_create_json_store(self):
        if os.path.exists(self.json_path):
            with open(self.json_path, "r") as f:
                return json.load(f)
        else:
            with open(self.json_path, "w") as f:
                json.dump([],f)
            return []
        
    def save_json_stor(self):
        with open(self.json_path, "w") as f:
            json.dump(self.documents, f, indent=2)

    def add_documents(self, text):
        doc_id = str(uuid4())
        doc = {"id": doc_id, "text":text}
        self.documents.append(doc)
        self.save_json_stor()
        #saving in chroma
        self.collection.add(documents=[text], ids=[doc_id])
        return doc_id
    
    def update_document(self, doc_id, new_text):
        found = False
        for doc in self.documents:
            if doc["id"] == doc_id:
                doc["text"] = new_text
                found = True
                break

        if not found:
            return False
        
        self.save_json_stor()
        self.collection.update(ids=[doc_id], documents=[new_text])
        return True
    
    def delete_document(self, doc_id):
        before = len(self.documents)
        self.documents = [doc for doc in self.documents if doc["id"]!=doc_id]
        after = len(self.documents)
        if before == after:
            return False
        
        self.save_json_store()
        self.collection.delete(ids=[doc_id])
        return True
    
    def search_similar_event(self, text, top_k=1):
        embedding = self.model.encode(text)
        results = self.collection.query(query_embeddings=[embedding], n_results=top_k)
        ids = results.get("ids",[[]])[0]
        distances = results.get("distances",[[]])[0]
        if ids and distances:
            return {"id": ids[0], "distance": distances[0]}
        return None
    
    def process_event(self, text):
        result = self.search_similar_event(text, top_k=1)
        print("result found: ", result)
        if result and result["distance"]< self.similarity_threshold:
            self.update_document(result["id"], text)
            return {"action": "updated", "id": result["id"], "distance": result["distance"]}
        else:
            new_id = self.add_documents(text)
            return {"action": "added", "id": new_id}
    


In [18]:
def call_gpt_extractor(tweet, existing_summary=None):
    prompt = f"""
    You are an expert in extracting structured information from tweets about disasters.
    Given the tweet and existing event summary, return a JSON with important details about the event.
    The JSON may include the following fields:
    - event_type
    - location
    - people_killed
    - people_trapped
    - infrastructure_damage
    - any other details you find relevant
    - summary (updated, more detailed)

    Tweet: "{tweet}"
    Existing Event Summary: "{existing_summary or 'None'}"

    Respond only in JSON format.
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2
    )
    return response.choices[0].message.content

In [4]:
import chromadb
from chromadb.config import Settings
from uuid import uuid4
class RAGBackend1:
    def __init__(self, json_path="events.json", db_path="./chroma_db", collection_name="events"):
        self.json_path = json_path
        self.db_path = db_path
        self.collection_name = collection_name

        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.similarity_threshold = 0.4  # Adjust as needed

        # Loading document store
        self.documents = self.load_or_create_json_store()

        # Initializing Chroma DB
        client = chromadb.PersistentClient(path="./chroma_db")
        self.collection = client.get_or_create_collection("events")

    def load_or_create_json_store(self):
        if os.path.exists(self.json_path):
            with open(self.json_path, "r") as f:
                return json.load(f)
        else:
            with open(self.json_path, "w") as f:
                json.dump([],f)
            return []
        
    def save_json_store(self):
        with open(self.json_path, "w") as f:
            json.dump(self.documents, f, indent=2)

    def add_document(self, event: dict):
        doc_id = str(uuid4())
        event["id"] = doc_id

        self.documents.append(event)
        self.save_json_store()

        summary = event.get("summary", "")
        self.collection.add(documents=[summary], ids=[doc_id])
        return doc_id

        
    
    def update_document(self, doc_id, new_event: dict):
        found = False
        for i, doc in enumerate(self.documents):
            if doc["id"] == doc_id:
                self.documents[i] = {**new_event, "id": doc_id}
                found = True
                break

        if not found:
            return False

        self.save_json_store()

        summary = new_event.get("summary", "")
        self.collection.update(ids=[doc_id], documents=[summary])
        return True
    
    def delete_document(self, doc_id):
        before = len(self.documents)
        self.documents = [doc for doc in self.documents if doc["id"]!=doc_id]
        after = len(self.documents)
        if before == after:
            return False
        
        self.save_json_store()
        self.collection.delete(ids=[doc_id])
        return True
    
    def search_similar_event(self, summary, top_k=1):
        embedding = self.model.encode(summary)
        results = self.collection.query(query_embeddings=[embedding], n_results=top_k)
        ids = results.get("ids", [[]])[0]
        distances = results.get("distances", [[]])[0]
        if ids and distances:
            return {"id": ids[0], "distance": distances[0]}
        return None
    

        
    def process_event(self, event: dict):
        if isinstance(event, str):
            try:
                event = json.loads(event)
                print(type(event))
            except json.JSONDecodeError:
                raise ValueError("Invalid JSON string passed to process_event")
        summary = event.get("summary", "")
        if not summary:
            raise ValueError("Event must include a summary field")

        result = self.search_similar_event(summary, top_k=1)
        print("result found:", result)

        if result and result["distance"] < self.similarity_threshold:
            self.update_document(result["id"], event)
            return {"action": "updated", "id": result["id"], "distance": result["distance"]}
        else:
            new_id = self.add_document(event)
            return {"action": "added", "id": new_id}

In [7]:

def call_gpt_extractor(tweet, existing_summary=None):
    prompt = f"""
    You are an expert in extracting structured information from tweets about disasters.
    Given the tweet and existing event summary, return a JSON with important details about the event.
    The JSON may include the following fields:
    - event_type
    - location
    - people_killed
    - people_trapped
    - infrastructure_damage
    - any other details you find relevant
    - summary (updated, more detailed)

    Tweet: "{tweet}"
    Existing Event Summary: "{existing_summary or 'None'}"

    Respond only in JSON format.
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2
    )
    return response.choices[0].message.content

In [8]:
rag = RAGBackend1()


In [15]:
# Step 1: Submit a new event
text = "India and pakistan war is started again due to pakistan hitting india different cities"

gpt_response = call_gpt_extractor(text)
print(gpt_response)
result_1 = rag.process_event(gpt_response)
print("First event result:", result_1)

{
    "event_type": "War",
    "location": ["India", "Pakistan"],
    "people_killed": "Unknown",
    "people_trapped": "Unknown",
    "infrastructure_damage": "Unknown",
    "other_details": "Conflict initiated due to Pakistan hitting different cities in India",
    "summary": "War has erupted between India and Pakistan, instigated by Pakistan's attacks on various Indian cities."
}
<class 'dict'>
result found: {'id': '7bff9c0c-f1cd-4b56-912b-3a5be16cab61', 'distance': 0.7950828075408936}
First event result: {'action': 'added', 'id': 'a5ed4dcc-f114-49d4-aee7-4a38acf6ff4d'}


In [16]:
def DisasterExtraction(tweet):
    rag = RAGBackend1()
    info_category = binary_classifier(tweet)
    if info_category=="non-informative":
        return "This tweet doesn't contain any disaster related information."
    humanitarian_category = humanitarianClassifier(tweet)
    tweet+= f", Category: {humanitarian_category}"
    gpt_response = call_gpt_extractor(tweet)
    result = rag.process_event(gpt_response)