# Loading the libraries

In [9]:
import pandas as pd

# Load medication guides dataset
medication_guides = pd.read_csv("csv/Medication Guides.csv")

# Display basic info
print(medication_guides.head())
print(medication_guides.columns)
# Extract relevant columns
relevant_columns = ["Drug Name", "Active Ingredient", "Form; Route", "Date", "URL"]
medication_data = medication_guides[relevant_columns]

# Clean and preprocess data
def clean_drug_name(name):
    return name.strip().lower() if isinstance(name, str) else ""

medication_data["Drug Name"] = medication_data["Drug Name"].apply(clean_drug_name)
medication_data["Active Ingredient"] = medication_data["Active Ingredient"].apply(clean_drug_name)

# Save cleaned data
medication_data.to_csv("csv/cleaned_medication_guides.csv", index=False)

                 Drug Name            Active Ingredient  \
0  ABACAVIR and LAMIVUDINE  ABACAVIR SULFATE;LAMIVUDINE   
1                  ABILIFY                 ARIPIPRAZOLE   
2                  ABILIFY                 ARIPIPRAZOLE   
3                  ABILIFY                 ARIPIPRAZOLE   
4                  ABILIFY                 ARIPIPRAZOLE   

                          Form; Route  Appl. No.                     Company  \
0                         TABLET;ORAL     204311  MYLAN LABORATORIES LIMITED   
1                       SOLUTION;ORAL      21713                      OTSUKA   
2  TABLET, ORALLY DISINTEGRATING;ORAL      21729                      OTSUKA   
3                         TABLET;ORAL      21436                      OTSUKA   
4            INJECTABLE;INTRAMUSCULAR      21866                      OTSUKA   

         Date                                                URL  
0  12/22/2023  https://www.accessdata.fda.gov/drugsatfda_docs...  
1  02/05/2020  https://www.acc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medication_data["Drug Name"] = medication_data["Drug Name"].apply(clean_drug_name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medication_data["Active Ingredient"] = medication_data["Active Ingredient"].apply(clean_drug_name)


In [11]:
import pandas as pd

# Load symptom prediction dataset
symptom_data = pd.read_csv("csv/symbipredict_2022.csv")

# Display basic info
print(symptom_data.head())

# Check for missing values
print(symptom_data.isnull().sum())

# Separate symptoms and prognosis
symptoms = symptom_data.iloc[:, :-1]  # All columns except the last one
prognosis = symptom_data["prognosis"]

# Convert binary symptoms to a list of active symptoms for each row
def get_active_symptoms(row):
    return [col for col, value in row.items() if value == 1]

symptom_data["active_symptoms"] = symptoms.apply(get_active_symptoms, axis=1)

# Combine active symptoms and prognosis into a single dataset
processed_data = pd.DataFrame({
    "symptoms": symptom_data["active_symptoms"],
    "diagnosis": prognosis
})

# Save processed data
processed_data.to_json("csv/processed_symptom_data.jsonl", orient="records", lines=True)

   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  \
0       0           0             0        0                 0  ...   
1       0           0             0        0                 0  ...   
2       0           0             0        0                 0  ...   
3       0           0             0        0                 0  ...   
4       0           0             0        0                 0  ...   

   blackheads  scurring  skin_peeling  silver_like_dusting  \
0           0         0             

In [13]:
import json

# Load processed symptom data
with open("csv/processed_symptom_data.jsonl", "r") as f:
    symptom_data = [json.loads(line) for line in f]

# Load medication data
medication_data = pd.read_csv("csv/cleaned_medication_guides.csv")

# Create combined dataset
combined_data = []

# Add symptom-diagnosis data
for entry in symptom_data:
    combined_data.append({
        "input": ", ".join(entry["symptoms"]),
        "output": f"Diagnosis: {entry['diagnosis']}"
    })

# Add medication data
for _, row in medication_data.iterrows():
    combined_data.append({
        "input": f"Drug Name: {row['Drug Name']}, Active Ingredient: {row['Active Ingredient']}",
        "output": f"Form: {row['Form; Route']}, Last Updated: {row['Date']}, More Info: {row['URL']}"
    })

# Save combined data as JSONL
with open("combined_data.jsonl", "w") as f:
    for item in combined_data:
        f.write(json.dumps(item) + "\n")

In [27]:
import pdfplumber

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract text
            page_text = page.extract_text()
            if page_text:  # Ensure text exists
                text += page_text + "\n"
    return text

# Extract text from guideline-170-en.pdf
guideline_text = extract_text_from_pdf("pdf/guideline-170-en.pdf")

# Save extracted text to a file
with open("pdf/guideline_text.txt", "w", encoding="utf-8") as f:
    f.write(guideline_text)

print("Guideline text extracted and saved.")

Guideline text extracted and saved.


In [28]:
# Function to extract tables from PDF
def extract_tables_from_pdf(pdf_path):
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract tables
            page_tables = page.extract_tables()
            if page_tables:
                tables.extend(page_tables)
    return tables

# Extract tables from guideline-170-en.pdf
guideline_tables = extract_tables_from_pdf("pdf/guideline-170-en.pdf")

# Save tables to a file
with open("pdf/guideline_tables.txt", "w", encoding="utf-8") as f:
    for table in guideline_tables:
        f.write(str(table) + "\n\n")

print("Guideline tables extracted and saved.")

Guideline tables extracted and saved.


In [29]:
import pdfplumber
import re

# Function to extract drug safety data
def extract_drug_safety_data(pdf_path):
    data = []
    conclusion_definitions = {}

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()

            # Extract table rows using regex
            rows = re.findall(r"([A-Za-z\s]+)\s+([A-Za-z\s]+)\s+([A-Za-z\s\-]+)\s+([A-Za-z\s\-]+)\s+([A-Za-z\!\?]+)\s+([\d\,\s]+)", text)
            for row in rows:
                class_, type_, generic_name, brand_name, conclusion, references = row
                data.append({
                    "Class": class_.strip(),
                    "Type": type_.strip(),
                    "Generic Name": generic_name.strip(),
                    "Brand Name": brand_name.strip(),
                    "Conclusion": conclusion.strip(),
                    "References": references.strip()
                })

            # Extract conclusion definitions
            if "Definitions" in text:
                definitions = re.findall(r"([A-Za-z\!\?]+):\s+(.+)", text)
                for key, value in definitions:
                    conclusion_definitions[key.strip()] = value.strip()

    return data, conclusion_definitions

# Extract drug safety data and definitions
drug_safety_data, conclusion_definitions = extract_drug_safety_data("pdf/Drug Safety Database Print - American Porphyria Foundation.pdf")

# Save drug safety data to JSON
import json
with open("pdf/drug_safety_data.json", "w", encoding="utf-8") as f:
    json.dump(drug_safety_data, f, indent=4)

# Save conclusion definitions to JSON
with open("pdf/conclusion_definitions.json", "w", encoding="utf-8") as f:
    json.dump(conclusion_definitions, f, indent=4)

print("Drug safety data and conclusion definitions extracted and saved.")

Drug safety data and conclusion definitions extracted and saved.


In [30]:
# Normalize drug safety data
for entry in drug_safety_data:
    for key, value in entry.items():
        entry[key] = value.strip().lower() if isinstance(value, str) else value

# Save normalized data
with open("pdf/normalized_drug_safety_data.json", "w", encoding="utf-8") as f:
    json.dump(drug_safety_data, f, indent=4)

print("Drug safety data normalized and saved.")

Drug safety data normalized and saved.


In [31]:
# Load guideline text
with open("pdf/guideline_text.txt", "r", encoding="utf-8") as f:
    guideline_text = f.read()

# Load drug safety data
with open("pdf/normalized_drug_safety_data.json", "r", encoding="utf-8") as f:
    drug_safety_data = json.load(f)

# Combine data into a single JSONL file
combined_data = []

# Add guideline text
combined_data.append({"input": "Clinical Guidelines", "output": guideline_text})

# Add drug safety data
for entry in drug_safety_data:
    combined_data.append({
        "input": f"{entry['Generic Name']} ({entry['Brand Name']})",
        "output": f"Class: {entry['Class']}, Type: {entry['Type']}, Conclusion: {entry['Conclusion']}, References: {entry['References']}"
    })

# Save combined data
with open("combined_data_pdf.jsonl", "w", encoding="utf-8") as f:
    for item in combined_data:
        f.write(json.dumps(item) + "\n")

print("Combined data saved as JSONL.")

Combined data saved as JSONL.


In [32]:
import json

# File paths for the two input files and the output file
file1_path = "combined_data_pdf.jsonl"  # Adjust this to your first file's name/path
file2_path = "combined_data.jsonl"  # Adjust this to your second file's name/path
output_path = "combined_data_final.jsonl"

# Step 1: Read and combine the data from both files
combined_data = []

# Read the first file
try:
    with open(file1_path, "r") as f1:
        for line in f1:
            if line.strip():  # Skip empty lines
                combined_data.append(json.loads(line))
    print(f"Loaded {len(combined_data)} entries from {file1_path}")
except FileNotFoundError:
    print(f"Error: {file1_path} not found")
    exit()
except json.JSONDecodeError as e:
    print(f"Error decoding JSON in {file1_path}: {e}")
    exit()

# Read the second file
try:
    with open(file2_path, "r") as f2:
        initial_count = len(combined_data)
        for line in f2:
            if line.strip():  # Skip empty lines
                combined_data.append(json.loads(line))
    print(f"Added {len(combined_data) - initial_count} entries from {file2_path}")
except FileNotFoundError:
    print(f"Error: {file2_path} not found")
    exit()
except json.JSONDecodeError as e:
    print(f"Error decoding JSON in {file2_path}: {e}")
    exit()

# Step 2: Write the combined data to a new file
try:
    with open(output_path, "w") as f_out:
        for item in combined_data:
            f_out.write(json.dumps(item) + "\n")
    print(f"Combined data saved to {output_path} with {len(combined_data)} total entries")
except Exception as e:
    print(f"Error writing to {output_path}: {e}")

Loaded 42 entries from combined_data_pdf.jsonl
Added 5923 entries from combined_data.jsonl
Combined data saved to combined_data_final.jsonl with 5965 total entries


In [5]:
import json
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network

def load_data(file_path):
    with open(file_path, 'r') as file:
        data = [json.loads(line) for line in file]
    return data

def build_graph(data, simple=False):
    G = nx.DiGraph()
    sample_size = 5 if simple else len(data)
    
    for entry in data[:sample_size]:
        user_query = entry['messages'][1]['content']
        assistant_response = entry['messages'][2]['content']
        
        # Extract structured attributes correctly
        response_parts = assistant_response.split(', ')
        attributes = {}
        for part in response_parts:
            if ': ' in part:
                key, value = part.split(': ')
                attributes[key.strip()] = value.strip()
            else:
                attributes.setdefault('Other', []).append(part.strip())
        
        G.add_node(user_query, color='blue', size=20)
        for key, value in attributes.items():
            if isinstance(value, list):
                for v in value:
                    G.add_node(v, color='red', size=10)
                    G.add_edge(user_query, v, label=key)
            else:
                G.add_node(value, color='red', size=10)
                G.add_edge(user_query, value, label=key)
    
    return G

def visualize_graph(G, title="Knowledge Graph"):
    net = Network(notebook=False, height="800px", width="100%", directed=True)
    
    for node, data in G.nodes(data=True):
        net.add_node(node, label=node, color=data.get('color', 'black'), size=data.get('size', 10))
    
    for source, target, data in G.edges(data=True):
        net.add_edge(source, target, title=data.get('label', ''))
    
    html_file = title.replace(" ", "_") + ".html"
    net.write_html(html_file)
    print(f"Graph saved as {html_file}")

# Example usage
file_path = "/Users/pavankonam/Desktop/RRL_Project/transformed_data.jsonl"
data = load_data(file_path)

# Generate simple graph
simple_G = build_graph(data, simple=True)
visualize_graph(simple_G, "Simple_Knowledge_Graph")

# Generate complex graph
complex_G = build_graph(data, simple=False)
visualize_graph(complex_G, "Complex_Knowledge_Graph")


Graph saved as Simple_Knowledge_Graph.html
Graph saved as Complex_Knowledge_Graph.html
