<a href="https://colab.research.google.com/github/nobrainghost/SymptoChat/blob/main/transforming%26vectorgeneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Together AI and Pinecone client libraries into the notebook runtime.
!pip install together pinecone
from google.colab import files
# Upload disease_symptom_list.csv from your machine here; the following cells
# read it by that file name with pd.read_csv.
files.upload()

In [None]:
# The original dataset was transformed into disease_symptom_list.csv, a simple
# disease -> symptoms mapping. It is read below through its 'disease' column and
# its 'symptoms' column, which holds a comma-separated list of symptoms.

import pandas as pd

df2 = pd.read_csv('disease_symptom_list.csv')

# Each entry is a (disease, [symptom, ...]) tuple.
disease_symptom_pairs = []
for disease, raw_symptoms in zip(df2['disease'], df2['symptoms']):
    # An empty CSV cell is read as NaN (a float), which has no .split();
    # treat it as "no symptoms listed" instead of crashing the whole load.
    if pd.isna(raw_symptoms):
        raw_symptoms = ''
    # Trim the whitespace that "a, b, c" style lists leave around each item and
    # drop the empty items produced by blank cells or trailing commas.
    symptoms = [s.strip() for s in str(raw_symptoms).split(',') if s.strip()]
    disease_symptom_pairs.append((disease, symptoms))

# Preview the first few pairs as a sanity check.
for disease, symptoms in disease_symptom_pairs[:5]:
    print(f"Disease: {disease}\nSymptoms: {', '.join(symptoms)}\n")



In [None]:
import pandas as pd

# Same CSV load as the previous cell (repeated here, presumably so that this
# cell can be run on its own -- confirm before removing either copy).
df2 = pd.read_csv('disease_symptom_list.csv')

# Each entry is a (disease, [symptom, ...]) tuple.
disease_symptom_pairs = []
for disease, raw_symptoms in zip(df2['disease'], df2['symptoms']):
    # An empty CSV cell is read as NaN (a float), which has no .split();
    # treat it as "no symptoms listed" instead of crashing the whole load.
    if pd.isna(raw_symptoms):
        raw_symptoms = ''
    # Trim the whitespace that "a, b, c" style lists leave around each item and
    # drop the empty items produced by blank cells or trailing commas.
    symptoms = [s.strip() for s in str(raw_symptoms).split(',') if s.strip()]
    disease_symptom_pairs.append((disease, symptoms))

# Preview the first few pairs as a sanity check.
for disease, symptoms in disease_symptom_pairs[:5]:
    print(f"Disease: {disease}\nSymptoms: {', '.join(symptoms)}\n")


import requests
import re
import time
import together
import pinecone
from pinecone import Pinecone
from google.colab import userdata
from pinecone import Pinecone, ServerlessSpec

# Initialize the API clients.
# To execute this cell you need to set these keys in your Colab secrets:
#   pinecone_key     --> key to your Pinecone project (used to store the vectors)
#   TOGETHER_API_KEY --> token for your Together AI account

try:
    import os

    # Colab secrets are not exported as environment variables, so a bare
    # together.Client() only finds the key if it was exported by hand. Prefer an
    # already-exported variable and otherwise read the documented secret.
    together_key = os.environ.get("TOGETHER_API_KEY") or userdata.get('TOGETHER_API_KEY')
    client = together.Client(api_key=together_key)
    pinecone_key = userdata.get('pinecone_key')
    pc = Pinecone(api_key=pinecone_key)
    index_name = "llama-text-embed-v2-index"
    index = pc.Index(index_name)
except Exception as e:
    print(f"Error during initialization: {e}")
    # Everything below needs client, pc and index. Carrying on would only
    # resurface as NameErrors plus a full retry/backoff cycle for every disease,
    # so stop the cell here with the original error.
    raise

# The current version uses a general-purpose LLM until a suitable domain-specific
# model is found. Exact model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free
def generate_description(disease, symptoms):
    """Ask the Together chat model for an in-depth description of a disease.

    Args:
        disease: Name of the disease to describe.
        symptoms: Iterable of symptom strings; joined with ", " into the prompt.

    Returns:
        The message content of the first completion choice, or None when the
        initial request and all three retries fail. Failures are printed,
        never raised.
    """
    max_retries = 3

    # Attempt 0 is the initial request; attempts 1..max_retries are the retries.
    # A single loop replaces the former copy-pasted request in the retry path.
    for attempt in range(max_retries + 1):
        if attempt:
            print(f"Retrying ({attempt}/{max_retries}) after waiting {2**attempt} seconds...")
            time.sleep(2**attempt)  # Exponential backoff
        try:
            # Built inside the try so that malformed input is reported like any
            # other failure instead of escaping as an exception.
            prompt = f"""Give an in depth description of what is {disease} with its symptoms and any other necessary information for its diagnosis. Explain common indicators/signs and also discuss its symptoms. Some of its symptoms are {', '.join(symptoms)}.

        Explain how it's diagnosed and any other relevant clinical information."""

            response = client.chat.completions.create(
                model="deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
                messages=[{
                    "role": "user",
                    "content": prompt,
                }],
                max_tokens=5400,
                temperature=0.7
            )
            return response.choices[0].message.content
        except Exception as e:
            if attempt == 0:
                print(f"Error generating description for {disease}: {e}")
            else:
                print(f"Retry {attempt} failed: {e}")

    return None

# Embeddings are generated with Pinecone's hosted "llama-text-embed-v2" model.
def generate_embeddings(description):
    """Embed a description with Pinecone's inference API.

    Args:
        description: Text to embed; sent as ``inputs`` with input_type "passage".

    Returns:
        Whatever ``pc.inference.embed`` returns (the caller iterates it and reads
        ``['values']`` from each item), or None when the initial request and all
        three retries fail. Failures are printed, never raised.
    """
    max_retries = 3

    # Attempt 0 is the initial request; attempts 1..max_retries are the retries.
    # A single loop replaces the former copy-pasted request in the retry path.
    for attempt in range(max_retries + 1):
        if attempt:
            print(f"Retrying embeddings ({attempt}/{max_retries}) after waiting {2**attempt} seconds...")
            time.sleep(2**attempt)  # Exponential backoff
        try:
            return pc.inference.embed(
                model="llama-text-embed-v2",
                inputs=description,
                parameters={
                    "input_type": "passage"
                }
            )
        except Exception as e:
            if attempt == 0:
                print(f"Error generating embeddings: {e}")
            else:
                print(f"Retry {attempt} failed: {e}")

    return None

# Main processing loop: for every disease, generate a description, embed it and
# upsert the resulting vector (id = disease name, metadata = its symptoms).

def _strip_reasoning(text):
    """Return text with the model's <think> reasoning removed and whitespace trimmed."""
    # Complete <think>...</think> blocks.
    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    # A closing tag without an opening one: everything before it is reasoning.
    if "</think>" in text:
        text = text.rsplit("</think>", 1)[1]
    # An opening tag that is never closed (e.g. a reply cut off at max_tokens):
    # everything after it is unfinished reasoning, not an answer.
    if "<think>" in text:
        text = text.split("<think>", 1)[0]
    return text.strip()


for disease, symptoms in disease_symptom_pairs:
    try:
        print(f"Processing disease: {disease}")
        description = generate_description(disease, symptoms)

        if description is None:
            print(f"Skipping {disease} due to description generation failure")
            continue

        # Remove the thinking part from the response to reduce chances of hallucinations
        description = _strip_reasoning(description)
        if not description:
            print(f"Skipping {disease}: description is empty after removing the reasoning")
            continue

        embeddings = generate_embeddings(description)
        if embeddings is None:
            print(f"Skipping embeddings for {disease} due to embedding generation failure")
            continue

        max_retries = 3
        for embedding in embeddings:
            # Built outside the upsert try: if this fails, a retry must not
            # re-send a stale `vectors` left over from the previous disease.
            vectors = [{"id": disease, "values": embedding['values'], "metadata": {"symptoms": ', '.join(symptoms)}}]
            print(vectors)

            # Attempt 0 is the initial upsert; attempts 1..max_retries are retries.
            for attempt in range(max_retries + 1):
                if attempt:
                    print(f"Retrying upsert ({attempt}/{max_retries}) after waiting {2**attempt} seconds...")
                    time.sleep(2**attempt)  # Exponential backoff
                try:
                    index.upsert(vectors=vectors)
                except Exception as upsert_e:
                    if attempt == 0:
                        print(f"Error upserting vector for {disease}: {upsert_e}")
                    else:
                        print(f"Upsert retry {attempt} failed: {upsert_e}")
                else:
                    if attempt == 0:
                        print(f"Successfully uploaded vector for: {disease}")
                    else:
                        print(f"Successfully uploaded vector for: {disease} on retry {attempt}")
                    break
            else:
                # Every attempt failed; say so instead of moving on silently.
                print(f"Giving up on upsert for {disease} after {max_retries} retries")

    except Exception as e:
        print(f"Error processing {disease}: {e}")

In [None]:
#Initial attempt at using the HuggingFace Inference API to generate the descriptions with microsoft/biogpt
#This approach does not work

from google.colab import userdata
import requests
# Settings for the Hugging Face Inference API request made by
# generate_description_via_api below.
model_name="microsoft/biogpt"
# Access token read from the Colab secret named 'hf_token'.
hf_token=userdata.get('hf_token')
# Endpoint URL for the chosen model.
api_url=f"https://api-inference.huggingface.co/models/{model_name}"
# The token is sent as a Bearer credential on every request.
headers = {
    "Authorization": f"Bearer {hf_token}"
}
def generate_description_via_api(disease, symptoms):
    """Request a generated disease description from the Hugging Face Inference API.

    Posts a text-generation payload to the module-level ``api_url`` using the
    module-level ``headers``.

    Args:
        disease: Name of the disease to describe.
        symptoms: Iterable of symptom strings; joined with ", " into the prompt.

    Returns:
        The 'generated_text' of the first result, or None when the request
        fails, the API answers with a non-200 status, or the response body does
        not have the expected shape. The reason is printed in each case.
    """
    input_text = f"Give an in depth description of {disease} with its symptoms and any other necessary information for its diagnosis {', '.join(symptoms)}:"

    payload = {
        "inputs": input_text,
        "parameters": {
            "max_length": 500,
            "temperature": 0.7,
            "top_k": 50
        }
    }

    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.post(api_url, headers=headers, json=payload, timeout=60)
    except requests.exceptions.RequestException as e:
        print("Error:", e)
        return None

    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return None

    try:
        # Expected shape: a list whose first item carries 'generated_text'.
        return response.json()[0]['generated_text']
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # A 200 with a non-JSON body or a different structure takes the same
        # None path as every other failure instead of raising.
        print("Error: unexpected response format:", e)
        return None

# disease_description_pairs = []
# for disease, symptoms in disease_symptom_pairs:
#     description = generate_description_via_api(disease, symptoms)
#     if description:
#         disease_description_pairs.append((disease, description))
#         print(f"Disease: {disease}\nGenerated Description: {description}\n")
#     else:
#         print(f"Failed to generate description for {disease}")

# Trial run: request a description for the first disease/symptoms pair only and
# collect it as a (disease, description) tuple.
disease_description_pairs = []
for disease, symptoms in disease_symptom_pairs[:1]:
    description = generate_description_via_api(disease, symptoms)
    if not description:
        # Nothing usable came back; leave the result list untouched.
        continue
    disease_description_pairs.append((disease, description))
    print(f"Disease: {disease}\nGenerated Description: {description}\n")
