In [4]:
import os
from glob import glob
import hashlib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from copy import deepcopy
import ipynbname
import matplotlib.pyplot as plt
import shap
from datetime import datetime
import re
import requests
import json
import platform

# Constants
# Experiment label; it is hashed further down in this notebook to derive the base random seed.
EXPERIMENT_NAME = "nursing_home"
# Data directory, relative to the working directory. Built with os.sep so the separator
# matches the host OS, and kept with a trailing separator because it is later used as a
# plain string prefix inside a glob pattern.
DATA_PATH = f"..{os.sep}data{os.sep}"

In [5]:
def query_vector_db(query_text, top_k=5, similarity_threshold=0.75, timeout=30):
    """
    Query the vector database service.

    Sends a POST request to ``{SERVER_URL}/query`` with the query and the
    retrieval settings as a JSON body. ``SERVER_URL`` is a module-level name
    that is looked up when the function is called, not when it is defined.

    Args:
        query_text: The query to send (forwarded as the "query" field).
        top_k: Forwarded as the "top_k" field.
        similarity_threshold: Forwarded as the "similarity_threshold" field.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.post``. Defaults to 30.

    Returns:
        The decoded JSON response body, or None if anything went wrong
        (connection error, timeout, HTTP error status, invalid JSON). In the
        failure case the error is printed rather than raised.
    """
    try:
        response = requests.post(
            f"{SERVER_URL}/query",
            json={
                "query": query_text,
                "top_k": top_k,
                "similarity_threshold": similarity_threshold
            },
            # Without an explicit timeout, requests waits indefinitely on an
            # unresponsive server and the notebook cell never finishes.
            timeout=timeout,
        )
        # Turn 4xx/5xx responses into exceptions so they hit the handler below.
        response.raise_for_status()
        return response.json()
    except Exception as e:
        # Deliberately best-effort: report the problem and signal failure with None.
        print(f"Error querying vector DB: {str(e)}")
        return None
 
def rag_model(test_query, top_k, similarity):
    """
    Queries the vector database and extracts Source and Content as a JSON string.

    Args:
        test_query (str): The query string to search in the vector database.
        top_k (int): Number of top results to retrieve.
        similarity (float): Minimum similarity score threshold for the results;
            forwarded to query_vector_db as ``similarity_threshold``.

    Returns:
        str: JSON-formatted string holding a list of objects, each with a
        "Source" and a "Content" key. The string is "[]" when the query
        failed, the response has no textual 'context' field, or nothing in
        the context matched the expected layout.
    """

    # Query the vector database
    results = query_vector_db(
        query_text=test_query,
        top_k=top_k,
        similarity_threshold=similarity
    )

    # query_vector_db returns None on failure, and a response is not guaranteed
    # to carry a 'context' string; treat both cases as "no context" instead of
    # crashing on the subscript or inside re.findall.
    context = results.get('context') if isinstance(results, dict) else None
    if not isinstance(context, str):
        context = ""

    # Regular expression to match Source and Content.
    # Each entry is "Source: ...\nContent: ..." and ends at the next
    # "Relevance Score:" line, a "---" separator line, or the end of the text.
    rag_pattern = r"Source: (.+?)\nContent: (.+?)(?:\nRelevance Score:|\n---|$)"

    # Find all matches; DOTALL lets the content span multiple lines
    rag_matches = re.findall(rag_pattern, context, re.DOTALL)

    # Convert matches into a list of dictionaries
    rag_data = [{"Source": source.strip(), "Content": content.strip()} for source, content in rag_matches]

    # Convert the list of dictionaries to JSON
    rag_json = json.dumps(rag_data, indent=4)

    return rag_json

In [7]:
# Candidate endpoints for the vector database service
win_url = "http://127.0.0.1:5000"
ubuntu_url = "http://146.189.163.52:5000"

# Windows hosts use win_url; every other platform uses ubuntu_url
SERVER_URL = win_url if platform.system() == "Windows" else ubuntu_url

print(f"SERVER_URL set to: {SERVER_URL}")

# NOTE: this shadows the earlier definition of the same name in this notebook;
# keep the two in sync, or delete one of them.
def query_vector_db(query_text, top_k=5, similarity_threshold=0.75, timeout=30):
    """
    Query the vector database service.

    Sends a POST request to ``{SERVER_URL}/query`` with the query and the
    retrieval settings as a JSON body.

    Args:
        query_text: The query to send (forwarded as the "query" field).
        top_k: Forwarded as the "top_k" field.
        similarity_threshold: Forwarded as the "similarity_threshold" field.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.post``. Defaults to 30.

    Returns:
        The decoded JSON response body, or None if anything went wrong
        (connection error, timeout, HTTP error status, invalid JSON). In the
        failure case the error is printed rather than raised.
    """
    try:
        response = requests.post(
            f"{SERVER_URL}/query",
            json={
                "query": query_text,
                "top_k": top_k,
                "similarity_threshold": similarity_threshold
            },
            # Without an explicit timeout, requests waits indefinitely on an
            # unresponsive server and the notebook cell never finishes.
            timeout=timeout,
        )
        # Turn 4xx/5xx responses into exceptions so they hit the handler below.
        response.raise_for_status()
        return response.json()
    except Exception as e:
        # Deliberately best-effort: report the problem and signal failure with None.
        print(f"Error querying vector DB: {str(e)}")
        return None

# Test the function
# Smoke test against the live service: this only produces output when the server at
# SERVER_URL is reachable and answers successfully.
test_query = "what is alzheimer's disease"
results = query_vector_db(
    query_text=test_query,
    top_k=5,
    similarity_threshold=0.75
)

# query_vector_db returns None on failure (it prints the error itself), so nothing
# further is shown in that case.
if results:
    # Top-level fields are read with .get(), so a missing key prints None / nothing
    # instead of raising.
    print(f"\nMessage: {results.get('message')}")
    print("\nResults:")
    # Number the hits starting from 1 for display.
    for i, result in enumerate(results.get('results', []), 1):
        # Per-hit fields are subscripted directly: each entry is expected to carry
        # 'db_name', 'source', 'relevance_score' and 'content'.
        print(f"\nResult {i}:")
        print(f"Database: {result['db_name']}")
        print(f"Source: {result['source']}")
        print(f"Relevance Score: {result['relevance_score']:.3f}")
        # Preview only the first 200 characters of the content.
        print(f"Content: {result['content'][:200]}...")
    # 'context' is the field that rag_model parses into Source/Content pairs.
    print("\nFull Context:")
    print(results.get('context', ''))

SERVER_URL set to: http://146.189.163.52:5000

Message: Found 5 relevant context entries

Results:

Result 1:
Database: chroma_db1
Source: PMC8599320.xml
Relevance Score: 0.832
Content: will need to obtain permission directly from the copyright holder. to view a copy of this licence, visit http://creativecommons.org/licenses/by/4.0/. alzheimer's disease (ad) is a progressive neurodeg...

Result 2:
Database: chroma_db1
Source: PMC9792538.xml
Relevance Score: 0.832
Content: alzheimer's disease (ad) is considered one of the most diseases that much prevalent among elderly people all over the world. ad is an incurable neurodegenerative disease affecting cognitive functions ...

Result 3:
Database: chroma_db1
Source: PMC8434202.xml
Relevance Score: 0.832
Content: alzheimer’s disease (ad) is a progressive age-related neurodegenerative disease recognized as the most common form of dementia among elderly people. due to the fact that the exact pathogenesis of ad s...

Result 4:
Database: chroma_

In [9]:
# Record when this run was executed (local wall-clock time).
current_time = datetime.now()
print("Current time is:",current_time)

# Name of this notebook as reported by ipynbname; the recorded output shows it
# without a file extension.
nb_fname = ipynbname.name()
print("File:",nb_fname)

Current time is: 2025-02-26 09:41:28.761058
File: ADAM_summarization


In [10]:
# Function to generate a consistent hash number for a given file name
def generate_hash_number(file_name):
    """
    Map a name to a stable integer in the range [0, 2**32).

    The value depends only on the text of ``file_name``, so the same name
    always yields the same number.

    Args:
        file_name (str): The name to hash.

    Returns:
        int: The low 32 bits of the name's MD5 digest.
    """
    # Hash the encoded name in a single step
    digest_hex = hashlib.md5(file_name.encode()).hexdigest()

    # Read the digest as one big integer and keep only its lowest 32 bits
    return int(digest_hex, 16) & 0xFFFFFFFF

# Generate experiment seed from experiment name
# Hashing the name (rather than picking a number by hand) ties the base seed to the
# experiment label: the same EXPERIMENT_NAME always gives the same initial_seed.
initial_seed = generate_hash_number(EXPERIMENT_NAME)
print(f"Experiment Name: {EXPERIMENT_NAME}, Initial Seed: {initial_seed}")

Experiment Name: nursing_home, Initial Seed: 127573839


In [11]:
# Seed NumPy's global RNG so the draws below are reproducible
np.random.seed(initial_seed)

# Draw 30 candidate seeds from [0, 2**31 - 1), one per experiment slot
random_integers_list = np.random.randint(low=0, high=2**31 - 1, size=30).tolist()
print("Random Integers List:", random_integers_list)

# Use the slot selected by the 1-based `experiment_number` when that name is defined
# and in range; otherwise fall back to the base seed.
if 'experiment_number' in locals() and 0 < experiment_number <= len(random_integers_list):
    seed = random_integers_list[experiment_number - 1]
else:
    seed = initial_seed

# Load clinical microbiome data
# glob() returns matches in arbitrary order, so sort them to make the chosen file
# deterministic, and fail with a clear message (instead of a bare IndexError) when
# nothing matches.
data_file_candidates = sorted(glob(f'{DATA_PATH}*clinical_microbiome_df*'))
if not data_file_candidates:
    raise FileNotFoundError(
        f"No file matching '*clinical_microbiome_df*' found under {DATA_PATH!r}"
    )
df_path = data_file_candidates[0]
print(f"Data File Path: {df_path}")

clinical_microbiome_df = pd.read_csv(df_path)
print("Clinical Microbiome Data Loaded.")

# Display value counts for the 'Alzheimers' column (absolute and as fractions)
alzheimers_counts = clinical_microbiome_df['Alzheimers'].value_counts()
alzheimers_counts_normalized = clinical_microbiome_df['Alzheimers'].value_counts(normalize=True)

print("Alzheimers Counts:\n", alzheimers_counts)
print("Normalized Alzheimers Counts:\n", alzheimers_counts_normalized)
print(f"Seed in this experiment: {seed}")

Random Integers List: [315491657, 135304577, 1572185508, 1029933563, 1950160822, 563269619, 1573902777, 513873336, 442822283, 1770168448, 2042199113, 441301073, 236793113, 387615796, 1899788307, 788877054, 681076636, 504404028, 1620098870, 847071637, 1967649092, 1010220053, 644534191, 844067037, 59236795, 1814102137, 787036043, 1853222053, 745124550, 2136397237]
Data File Path: ../data/clinical_microbiome_df.csv
Clinical Microbiome Data Loaded.
Alzheimers Counts:
 0.0    225
1.0    110
Name: Alzheimers, dtype: int64
Normalized Alzheimers Counts:
 0.0    0.671642
1.0    0.328358
Name: Alzheimers, dtype: float64
Seed in this experiment: 127573839


In [12]:
# Inspect the distribution of the "Dementia Other" column before filtering on it.
clinical_microbiome_df["Dementia Other"].value_counts()

0    335
Name: Dementia Other, dtype: int64

In [13]:
# Keep only the rows whose "Dementia Other" value is not 1
keep_rows = clinical_microbiome_df["Dementia Other"] != 1
clinical_microbiome_df = clinical_microbiome_df[keep_rows]

# Recompute the 'Alzheimers' class balance on the filtered frame
alzheimers_column = clinical_microbiome_df['Alzheimers']
alzheimers_counts = alzheimers_column.value_counts()
alzheimers_counts_normalized = alzheimers_column.value_counts(normalize=True)
print("Alzheimers Counts:\n", alzheimers_counts)
print("Normalized Alzheimers Counts:\n", alzheimers_counts_normalized)

Alzheimers Counts:
 0.0    225
1.0    110
Name: Alzheimers, dtype: int64
Normalized Alzheimers Counts:
 0.0    0.671642
1.0    0.328358
Name: Alzheimers, dtype: float64


In [21]:
# List classification agent output files across all experiment folders under
# local_resources; sorted() gives a stable display order since glob() order is arbitrary.
sorted(glob(f"local_resources{os.sep}*{os.sep}classification{os.sep}*agent_output*"))

['local_resources/experiment01/classification/classification_agent_output_gpt-4o-mini-2024-07-18 - Copy.csv',
 'local_resources/experiment01/classification/classification_agent_output_gpt-4o-mini-2024-07-18.csv',
 'local_resources/experiment02/classification/classification_agent_output_gpt-4o-mini-2024-07-18.csv',
 'local_resources/experiment07/classification/classification_agent_output_gpt-4o-mini-2024-07-18.csv']

In [22]:
# List summarization agent output files across all experiment folders under
# local_resources; sorted() gives a stable display order since glob() order is arbitrary.
sorted(glob(f"local_resources{os.sep}*{os.sep}summarization{os.sep}*agent_output*"))

['local_resources/experiment01/summarization/summarization_agent_output.csv',
 'local_resources/experiment01/summarization/summarization_agent_output_gpt-4o-2024-11-20.csv',
 'local_resources/experiment01/summarization/summarization_agent_output_gpt-4o-2024-11-20_bugfix.csv',
 'local_resources/experiment01/summarization/summarization_agent_outputgpt-4o-2024-11-20_bug.csv',
 'local_resources/experiment02/summarization/summarization_agent_output_gpt-4o-2024-11-20.csv',
 'local_resources/experiment07/summarization/summarization_agent_output_gpt-4o-2024-11-20.csv',
 'local_resources/experiment15/summarization/summarization_agent_output.csv']