In [None]:
import pandas as pd

df = pd.read_csv('diabetes.csv')  # exact file name, case-sensitive

df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [None]:
# Create Hospital B style (different column names + JSON format)
hospital_b = df.copy()  # copy the data

# Change column names to make it "different hospital"
hospital_b = hospital_b.rename(columns={
    'Glucose': 'glucose_level',
    'Outcome': 'condition',
    'Pregnancies': 'pregnancies_count'
})

# Change values to words (e.g., 1 → "Type 2 Diabetes")
hospital_b['condition'] = hospital_b['condition'].map({1: 'Type 2 Diabetes', 0: 'No Diabetes'})

# Show first 10 rows of Hospital B
hospital_b.head(10)

# Save as JSON file (Hospital B format)
hospital_b.to_json('hospital_b.json', orient='records', indent=4)

In [None]:
!pip install fhir.resources

Collecting fhir.resources
  Downloading fhir_resources-8.1.0-py2.py3-none-any.whl.metadata (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fhir-core>=1.1.3 (from fhir.resources)
  Downloading fhir_core-1.1.4-py2.py3-none-any.whl.metadata (11 kB)
Downloading fhir_resources-8.1.0-py2.py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fhir_core-1.1.4-py2.py3-none-any.whl (29 kB)
Installing collected packages: fhir-core, fhir.resources
Successfully installed fhir-core-1.1.4 fhir.resources-8.1.0


In [None]:
# Levenshtein Distance Function (pure Python, no libraries)
def levenshtein_distance(s1, s2):
    m, n = len(s1), len(s2)
    if m == 0:
        return n
    if n == 0:
        return m

    # Create matrix
    matrix = [[0] * (n + 1) for _ in range(m + 1)]

    # Fill first row/column
    for i in range(m + 1):
        matrix[i][0] = i
    for j in range(n + 1):
        matrix[0][j] = j

    # Fill matrix
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if s1[i - 1] == s2[j - 1]:
                cost = 0
            else:
                cost = 1
            matrix[i][j] = min(
                matrix[i - 1][j] + 1,      # deletion
                matrix[i][j - 1] + 1,      # insertion
                matrix[i - 1][j - 1] + cost  # substitution
            )

    return matrix[m][n]

# Standard terms with ICD codes
standard_terms = {
    'Type 2 Diabetes Mellitus': 'E11',
    'Type 2 Diabetes': 'E11',
    'Diabetes': 'E11',
    'Normal': 'Z00'
}

# Function to map using Levenshtein
def map_term_levenshtein(input_term, threshold=5):
    best_match = None
    best_code = 'UNKNOWN'
    min_dist = float('inf')

    for std_term, code in standard_terms.items():
        dist = levenshtein_distance(input_term.lower(), std_term.lower())
        if dist < min_dist:
            min_dist = dist
            best_match = std_term
            best_code = code

    if min_dist <= threshold:
        return best_match, best_code, min_dist  # Return distance too
    else:
        return input_term, 'UNKNOWN', min_dist

# Test it
print("=== Levenshtein Mapping Test ===")
test_terms = ['Sugar', 'Type 2 Diabetes Mellitus', 'Diabetis', 'diabetic patient']
for term in test_terms:
    match, code, dist = map_term_levenshtein(term)
    print(f"Input: '{term}' → Match: '{match}' (Code: {code}, Distance: {dist})")

# Create FHIR with Levenshtein mapping
from fhir.resources.condition import Condition

def create_fhir_condition_lev(patient_id, input_term):
    mapped_term, code, dist = map_term_levenshtein(input_term)

    condition = Condition.construct()
    condition.code = {
        "coding": [{
            "system": "http://hl7.org/fhir/sid/icd-10",
            "code": code,
            "display": mapped_term
        }]
    }
    condition.subject = {"reference": f"Patient/{patient_id}"}

    return condition.json(indent=2), dist

# Test FHIR
print("\n=== FHIR for 'Sugar' (Distance Test) ===")
fhir_sugar, dist_sugar = create_fhir_condition_lev(101, 'Sugar')
print(fhir_sugar)
print(f"Distance: {dist_sugar}")

print("\n=== FHIR for 'Diabetis' (Typo Test) ===")
fhir_typo, dist_typo = create_fhir_condition_lev(101, 'Diabetis')
print(fhir_typo)
print(f"Distance: {dist_typo}")

=== Levenshtein Mapping Test ===
Input: 'Sugar' → Match: 'Normal' (Code: Z00, Distance: 5)
Input: 'Type 2 Diabetes Mellitus' → Match: 'Type 2 Diabetes Mellitus' (Code: E11, Distance: 0)
Input: 'Diabetis' → Match: 'Diabetes' (Code: E11, Distance: 1)
Input: 'diabetic patient' → Match: 'diabetic patient' (Code: UNKNOWN, Distance: 9)

=== FHIR for 'Sugar' (Distance Test) ===
{
  "resourceType": "Condition",
  "code": {
    "coding": [
      {
        "system": "http://hl7.org/fhir/sid/icd-10",
        "code": "Z00",
        "display": "Normal"
      }
    ]
  },
  "subject": {
    "reference": "Patient/101"
  }
}
Distance: 5

=== FHIR for 'Diabetis' (Typo Test) ===
{
  "resourceType": "Condition",
  "code": {
    "coding": [
      {
        "system": "http://hl7.org/fhir/sid/icd-10",
        "code": "E11",
        "display": "Diabetes"
      }
    ]
  },
  "subject": {
    "reference": "Patient/101"
  }
}
Distance: 1


/tmp/ipython-input-1462105279.py:72: PydanticDeprecatedSince20: The `construct` method is deprecated; use `model_construct` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  condition = Condition.construct()
/tmp/ipython-input-1462105279.py:72: PydanticDeprecatedSince20: The `construct` method is deprecated; use `model_construct` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  condition = Condition.construct()


In [None]:
# Updated Standard Terms (added diabetes variations)
standard_terms = {
    'Type 2 Diabetes Mellitus': 'E11',
    'Type 2 Diabetes': 'E11',
    'Diabetes': 'E11',
    'Sugar': 'E11',  # Direct for "Sugar"
    'High Sugar': 'E11',  # Variation
    'Blood Sugar High': 'E11',  # Common term
    'Normal': 'Z00'
}

# Same Levenshtein function as before (copy from previous cell if needed)
def levenshtein_distance(s1, s2):
    m, n = len(s1), len(s2)
    if m == 0:
        return n
    if n == 0:
        return m

    matrix = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(m + 1):
        matrix[i][0] = i
    for j in range(n + 1):
        matrix[0][j] = j

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if s1[i - 1] == s2[j - 1]:
                cost = 0
            else:
                cost = 1
            matrix[i][j] = min(
                matrix[i - 1][j] + 1,
                matrix[i][j - 1] + 1,
                matrix[i - 1][j - 1] + cost
            )

    return matrix[m][n]

# Updated Mapping Function
def map_term_levenshtein(input_term, threshold=6):  # Slightly higher threshold
    best_match = None
    best_code = 'UNKNOWN'
    min_dist = float('inf')

    for std_term, code in standard_terms.items():
        dist = levenshtein_distance(input_term.lower(), std_term.lower())
        if dist < min_dist:
            min_dist = dist
            best_match = std_term
            best_code = code

    if min_dist <= threshold:
        return best_match, best_code, min_dist
    else:
        return input_term, 'UNKNOWN', min_dist

# Test with fixed dict
print("=== Fixed Levenshtein Test ===")
test_terms = ['Sugar', 'High Sugar', 'Type 2 Diabetes Mellitus', 'Diabetis']
for term in test_terms:
    match, code, dist = map_term_levenshtein(term)
    print(f"Input: '{term}' → Match: '{match}' (Code: {code}, Distance: {dist})")

# FHIR with fixed mapping
from fhir.resources.condition import Condition

def create_fhir_condition_lev(patient_id, input_term):
    mapped_term, code, dist = map_term_levenshtein(input_term)

    condition = Condition.construct()
    condition.code = {
        "coding": [{
            "system": "http://hl7.org/fhir/sid/icd-10",
            "code": code,
            "display": mapped_term
        }]
    }
    condition.subject = {"reference": f"Patient/{patient_id}"}

    return condition.json(indent=2), dist

print("\n=== Fixed FHIR for 'Sugar' ===")
fhir_sugar, dist_sugar = create_fhir_condition_lev(101, 'Sugar')
print(fhir_sugar)
print(f"Distance: {dist_sugar}")

print("\n=== Fixed FHIR for 'High Sugar' ===")
fhir_high, dist_high = create_fhir_condition_lev(101, 'High Sugar')
print(fhir_high)
print(f"Distance: {dist_high}")

=== Fixed Levenshtein Test ===
Input: 'Sugar' → Match: 'Sugar' (Code: E11, Distance: 0)
Input: 'High Sugar' → Match: 'High Sugar' (Code: E11, Distance: 0)
Input: 'Type 2 Diabetes Mellitus' → Match: 'Type 2 Diabetes Mellitus' (Code: E11, Distance: 0)
Input: 'Diabetis' → Match: 'Diabetes' (Code: E11, Distance: 1)

=== Fixed FHIR for 'Sugar' ===
{
  "resourceType": "Condition",
  "code": {
    "coding": [
      {
        "system": "http://hl7.org/fhir/sid/icd-10",
        "code": "E11",
        "display": "Sugar"
      }
    ]
  },
  "subject": {
    "reference": "Patient/101"
  }
}
Distance: 0

=== Fixed FHIR for 'High Sugar' ===
{
  "resourceType": "Condition",
  "code": {
    "coding": [
      {
        "system": "http://hl7.org/fhir/sid/icd-10",
        "code": "E11",
        "display": "High Sugar"
      }
    ]
  },
  "subject": {
    "reference": "Patient/101"
  }
}
Distance: 0


/tmp/ipython-input-1154858714.py:72: PydanticDeprecatedSince20: The `construct` method is deprecated; use `model_construct` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  condition = Condition.construct()


In [None]:
# Jaccard Similarity (Word Overlap Algorithm)
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0

# Map using Jaccard
def map_term_jaccard(input_term, threshold=0.2):  # Low threshold for overlap
    input_words = set(input_term.lower().split())
    best_match = None
    best_code = 'UNKNOWN'
    max_sim = 0

    for std_term, code in standard_terms.items():  # Use the same dict
        std_words = set(std_term.lower().split())
        sim = jaccard_similarity(input_words, std_words)
        if sim > max_sim:
            max_sim = sim
            best_match = std_term
            best_code = code

    if max_sim >= threshold:
        return best_match, best_code, max_sim
    else:
        return input_term, 'UNKNOWN', max_sim

# Test it
print("=== Jaccard Mapping Test ===")
test_terms = ['Sugar', 'High Sugar', 'Type 2 Diabetes Mellitus', 'Blood Sugar High', 'diabetic patient']
for term in test_terms:
    match, code, sim = map_term_jaccard(term)
    print(f"Input: '{term}' → Match: '{match}' (Code: {code}, Similarity: {sim:.2f})")

# FHIR with Jaccard
def create_fhir_condition_jac(patient_id, input_term):
    mapped_term, code, sim = map_term_jaccard(input_term)

    condition = Condition.construct()
    condition.code = {
        "coding": [{
            "system": "http://hl7.org/fhir/sid/icd-10",
            "code": code,
            "display": mapped_term
        }]
    }
    condition.subject = {"reference": f"Patient/{patient_id}"}

    return condition.json(indent=2), sim

print("\n=== Jaccard FHIR for 'Blood Sugar High' ===")
fhir_high, sim_high = create_fhir_condition_jac(101, 'Blood Sugar High')
print(fhir_high)
print(f"Similarity: {sim_high:.2f}")

=== Jaccard Mapping Test ===
Input: 'Sugar' → Match: 'Sugar' (Code: E11, Similarity: 1.00)
Input: 'High Sugar' → Match: 'High Sugar' (Code: E11, Similarity: 1.00)
Input: 'Type 2 Diabetes Mellitus' → Match: 'Type 2 Diabetes Mellitus' (Code: E11, Similarity: 1.00)
Input: 'Blood Sugar High' → Match: 'Blood Sugar High' (Code: E11, Similarity: 1.00)
Input: 'diabetic patient' → Match: 'diabetic patient' (Code: UNKNOWN, Similarity: 0.00)

=== Jaccard FHIR for 'Blood Sugar High' ===
{
  "resourceType": "Condition",
  "code": {
    "coding": [
      {
        "system": "http://hl7.org/fhir/sid/icd-10",
        "code": "E11",
        "display": "Blood Sugar High"
      }
    ]
  },
  "subject": {
    "reference": "Patient/101"
  }
}
Similarity: 1.00


/tmp/ipython-input-1598739545.py:38: PydanticDeprecatedSince20: The `construct` method is deprecated; use `model_construct` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  condition = Condition.construct()


In [None]:
import pandas as pd

# Compare algorithms
data = {
    'Term': ['Sugar', 'High Sugar', 'Diabetis'],
    'Levenshtein Code': ['E11', 'E11', 'E11'],
    'Levenshtein Score': [0, 0, 1],
    'Jaccard Code': ['E11', 'E11', 'E11'],  # Run Jaccard to fill
    'Jaccard Score': [0.0, 0.0, 0.25]  # Example
}

comparison_df = pd.DataFrame(data)
print("\n=== Algorithm Comparison ===")
comparison_df


=== Algorithm Comparison ===


Unnamed: 0,Term,Levenshtein Code,Levenshtein Score,Jaccard Code,Jaccard Score
0,Sugar,E11,0,E11,0.0
1,High Sugar,E11,0,E11,0.0
2,Diabetis,E11,1,E11,0.25


In [None]:
pip install transformers torch scikit-learn pandas




In [None]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# -----------------------------
# Load BioBERT
# -----------------------------
MODEL_NAME = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# -----------------------------
# Text Normalization Layer
# -----------------------------
def normalize_term(term):
    term = term.lower()

    mapping = {
        "sugar": "high blood sugar, diabetes, hyperglycemia",
        "high sugar": "high blood sugar, diabetes, hyperglycemia",
        "bp": "high blood pressure, hypertension",
        "pressure": "high blood pressure, hypertension",
        "asthma": "asthma, chronic respiratory disease",
        "heart problem": "cardiac disease, heart disease"
    }

    return mapping.get(term, term)

# -----------------------------
# BERT Embedding Function
# -----------------------------
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1)
    return embedding.numpy()

# -----------------------------
# ICD Knowledge Base
# -----------------------------
icd_data = [
    {"code": "E11", "description": "Type 2 diabetes mellitus"},
    {"code": "E10", "description": "Type 1 diabetes mellitus"},
    {"code": "I10", "description": "Essential (primary) hypertension"},
    {"code": "J45", "description": "Asthma"},
    {"code": "K21", "description": "Gastro-esophageal reflux disease"}
]

icd_df = pd.DataFrame(icd_data)
icd_df["embedding"] = icd_df["description"].apply(get_embedding)

# -----------------------------
# Input Term
# -----------------------------
raw_input = "Sugar"
input_term = normalize_term(raw_input)

print("Normalized Input:", input_term)

input_embedding = get_embedding(input_term)

# -----------------------------
# Similarity Matching
# -----------------------------
icd_df["similarity"] = icd_df["embedding"].apply(
    lambda x: cosine_similarity(input_embedding, x)[0][0]
)

icd_df = icd_df.sort_values(by="similarity", ascending=False)

best_match = icd_df.iloc[0]

# -----------------------------
# Result
# -----------------------------
print("\nUser Term:", raw_input)
print("Mapped ICD Code:", best_match["code"])
print("Mapped Disease:", best_match["description"])
print("Similarity Score:", round(best_match["similarity"], 4))


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Normalized Input: high blood sugar, diabetes, hyperglycemia

User Term: Sugar
Mapped ICD Code: E11
Mapped Disease: Type 2 diabetes mellitus
Similarity Score: 0.9072


In [None]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# -----------------------------
# Load BioBERT
# -----------------------------
MODEL_NAME = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# -----------------------------
# Text Normalization Layer
# -----------------------------
def normalize_term(term):
    term = term.lower()

    mapping = {
        "sugar": "high blood sugar, diabetes, hyperglycemia",
        "high sugar": "high blood sugar, diabetes, hyperglycemia",
        "bp": "high blood pressure, hypertension",
        "pressure": "high blood pressure, hypertension",
        "asthma": "asthma, chronic respiratory disease",
        "heart problem": "cardiac disease, heart disease"
    }

    return mapping.get(term, term)

# -----------------------------
# BERT Embedding Function
# -----------------------------
def get_embedding(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128   # <---- this removes the warning
    )
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1)
    return embedding.numpy()

# -----------------------------
# ICD Knowledge Base
# -----------------------------
icd_data = [
    {"code": "E11", "description": "Type 2 diabetes mellitus"},
    {"code": "E10", "description": "Type 1 diabetes mellitus"},
    {"code": "I10", "description": "Essential (primary) hypertension"},
    {"code": "J45", "description": "Asthma"},
    {"code": "K21", "description": "Gastro-esophageal reflux disease"}
]

icd_df = pd.DataFrame(icd_data)
icd_df["embedding"] = icd_df["description"].apply(get_embedding)

# -----------------------------
# Input Term
# -----------------------------
raw_input = "Sugar"
input_term = normalize_term(raw_input)

print("Normalized Input:", input_term)

input_embedding = get_embedding(input_term)

# -----------------------------
# Similarity Matching
# -----------------------------
icd_df["similarity"] = icd_df["embedding"].apply(
    lambda x: cosine_similarity(input_embedding, x)[0][0]
)

icd_df = icd_df.sort_values(by="similarity", ascending=False)

best_match = icd_df.iloc[0]

# -----------------------------
# Result
# -----------------------------
print("\nUser Term:", raw_input)


print("Mapped ICD Code:", best_match["code"])
print("Mapped Disease:", best_match["description"])
print("Similarity Score:", round(best_match["similarity"], 4))


Normalized Input: high blood sugar, diabetes, hyperglycemia

User Term: Sugar
Mapped ICD Code: E11
Mapped Disease: Type 2 diabetes mellitus
Similarity Score: 0.9072


In [None]:
import pandas as pd

# Comparison data (fill with your real scores)
data = {
    'Term': ['Sugar', 'High Blood Sugar', 'Diabetis', 'diabetic patient'],
    'Levenshtein Code': ['E11', 'E11', 'E11', 'UNKNOWN'],
    'Levenshtein Score': [0, 0, 1, 9],
    'Jaccard Code': ['E11', 'E11', 'E11', 'UNKNOWN'],
    'Jaccard Score': [1.00, 1.00, 0.25, 0.00],
    'BERT Code': ['E11', 'E11', 'E11', 'UNKNOWN'],  # From your run
    'BERT Score': [0.91, 0.85, 0.75, 0.20]  # Example; use your actual 0.9072 for 'Sugar'
}

comparison_df = pd.DataFrame(data)
print("=== Full Algorithm Comparison ===")
comparison_df

=== Full Algorithm Comparison ===


Unnamed: 0,Term,Levenshtein Code,Levenshtein Score,Jaccard Code,Jaccard Score,BERT Code,BERT Score
0,Sugar,E11,0,E11,1.0,E11,0.91
1,High Blood Sugar,E11,0,E11,1.0,E11,0.85
2,Diabetis,E11,1,E11,0.25,E11,0.75
3,diabetic patient,UNKNOWN,9,UNKNOWN,0.0,UNKNOWN,0.2


In [None]:
import pandas as pd

heart_df = pd.read_csv('heart.csv')
heart_df.head(10)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
5,6,56,Male,Cleveland,atypical angina,120.0,236.0,False,normal,178.0,False,0.8,upsloping,0.0,normal,0
6,7,62,Female,Cleveland,asymptomatic,140.0,268.0,False,lv hypertrophy,160.0,False,3.6,downsloping,2.0,normal,3
7,8,57,Female,Cleveland,asymptomatic,120.0,354.0,False,normal,163.0,True,0.6,upsloping,0.0,normal,0
8,9,63,Male,Cleveland,asymptomatic,130.0,254.0,False,lv hypertrophy,147.0,False,1.4,flat,1.0,reversable defect,2
9,10,53,Male,Cleveland,asymptomatic,140.0,203.0,True,lv hypertrophy,155.0,True,3.1,downsloping,0.0,reversable defect,1


In [None]:
# Updated dict with heart disease (add to your existing standard_terms)
standard_terms = {
    # Diabetes (keep these)
    'Type 2 Diabetes Mellitus': 'E11',
    'Type 2 Diabetes': 'E11',
    'Diabetes': 'E11',
    'Sugar': 'E11',
    'High Sugar': 'E11',
    'Normal': 'Z00',
    # NEW: Heart disease (ICD I25 for coronary artery disease)
    'Coronary Artery Disease': 'I25',
    'Chest Pain': 'I25',
    'Angina': 'I25',
    'Heart Attack': 'I25',
    'Myocardial Infarction': 'I25',  # Advanced term
    'No Heart Issue': 'Z00'
}

print("Updated dict ready! Added heart terms with I25 code.")

Updated dict ready! Added heart terms with I25 code.


In [None]:
# Test heart terms with your 3 algorithms
heart_tests = ['Chest Pain', 'Angina', 'Heart Attack', 'chestpain', 'myocardial infraction']  # Typo in last one

print("=== Heart Disease Mapping Test (Levenshtein) ===")
for term in heart_tests:
    match, code, dist = map_term_levenshtein(term)  # Your Levenshtein function
    print(f"Input: '{term}' → Match: '{match}' (Code: {code}, Distance: {dist})")

print("\n=== Jaccard for Heart ===")
for term in heart_tests:
    match, code, sim = map_term_jaccard(term)  # Your Jaccard function
    print(f"Input: '{term}' → Match: '{match}' (Code: {code}, Similarity: {sim:.2f})")

print("\n=== BERT for Heart ===")
for term in heart_tests:
    match, code, sim = map_term_bert(term)  # Your BERT function
    print(f"Input: '{term}' → Match: '{match}' (Code: {code}, Similarity: {sim:.2f})")

=== Heart Disease Mapping Test (Levenshtein) ===
Input: 'Chest Pain' → Match: 'Chest Pain' (Code: I25, Distance: 0)
Input: 'Angina' → Match: 'Angina' (Code: I25, Distance: 0)
Input: 'Heart Attack' → Match: 'Heart Attack' (Code: I25, Distance: 0)
Input: 'chestpain' → Match: 'Chest Pain' (Code: I25, Distance: 1)
Input: 'myocardial infraction' → Match: 'Myocardial Infarction' (Code: I25, Distance: 2)

=== Jaccard for Heart ===
Input: 'Chest Pain' → Match: 'Chest Pain' (Code: I25, Similarity: 1.00)
Input: 'Angina' → Match: 'Angina' (Code: I25, Similarity: 1.00)
Input: 'Heart Attack' → Match: 'Heart Attack' (Code: I25, Similarity: 1.00)
Input: 'chestpain' → Match: 'chestpain' (Code: UNKNOWN, Similarity: 0.00)
Input: 'myocardial infraction' → Match: 'Myocardial Infarction' (Code: I25, Similarity: 0.33)

=== BERT for Heart ===


NameError: name 'map_term_bert' is not defined

In [None]:
# Consolidated Multi-Disease Mapping (Diabetes + Heart) with 3 Algorithms

# 1. Updated standard terms (diabetes + heart)
standard_terms = {
    'Type 2 Diabetes Mellitus': 'E11',
    'Type 2 Diabetes': 'E11',
    'Diabetes': 'E11',
    'Sugar': 'E11',
    'High Sugar': 'E11',
    'Normal': 'Z00',
    'Coronary Artery Disease': 'I25',
    'Chest Pain': 'I25',
    'Angina': 'I25',
    'Heart Attack': 'I25',
    'Myocardial Infarction': 'I25'
}

# 2. Levenshtein Distance Function (pure Python)
def levenshtein_distance(s1, s2):
    m, n = len(s1), len(s2)
    if m == 0: return n
    if n == 0: return m
    matrix = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1): matrix[i][0] = i
    for j in range(n + 1): matrix[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if s1[i-1] == s2[j-1] else 1
            matrix[i][j] = min(
                matrix[i-1][j] + 1,
                matrix[i][j-1] + 1,
                matrix[i-1][j-1] + cost
            )
    return matrix[m][n]

def map_term_levenshtein(input_term, threshold=6):
    best_match = None
    best_code = 'UNKNOWN'
    min_dist = float('inf')
    for std_term, code in standard_terms.items():
        dist = levenshtein_distance(input_term.lower(), std_term.lower())
        if dist < min_dist:
            min_dist = dist
            best_match = std_term
            best_code = code
    if min_dist <= threshold:
        return best_match, best_code, min_dist
    else:
        return input_term, 'UNKNOWN', min_dist

# 3. Jaccard Similarity
def jaccard_similarity(set1, set2):
    return len(set1.intersection(set2)) / len(set1.union(set2)) if set1.union(set2) else 0

def map_term_jaccard(input_term, threshold=0.2):
    input_words = set(input_term.lower().split())
    best_match = None
    best_code = 'UNKNOWN'
    max_sim = 0
    for std_term, code in standard_terms.items():
        std_words = set(std_term.lower().split())
        sim = jaccard_similarity(input_words, std_words)
        if sim > max_sim:
            max_sim = sim
            best_match = std_term
            best_code = code
    if max_sim >= threshold:
        return best_match, best_code, max_sim
    else:
        return input_term, 'UNKNOWN', max_sim

# 4. BERT (re-initialize with updated terms)
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('dmis-lab/biobert-base-cased-v1.1')  # Medical BERT

std_sentences = list(standard_terms.keys())
std_codes = list(standard_terms.values())
std_embeddings = model.encode(std_sentences)

def map_term_bert(input_term, threshold=0.4):
    input_embedding = model.encode(input_term)
    similarities = util.cos_sim(input_embedding, std_embeddings)[0]
    best_idx = torch.argmax(similarities).item()
    best_sim = similarities[best_idx].item()
    best_term = std_sentences[best_idx]
    best_code = std_codes[best_idx]
    if best_sim >= threshold:
        return best_term, best_code, best_sim
    else:
        return input_term, 'UNKNOWN', best_sim

# 5. Test with heart terms
heart_tests = ['Chest Pain', 'Angina', 'Heart Attack', 'chestpain', 'myocardial infraction']

print("=== Levenshtein (Heart) ===")
for term in heart_tests:
    match, code, score = map_term_levenshtein(term)
    print(f"Input: '{term}' → Match: '{match}' (Code: {code}, Distance: {score})")

print("\n=== Jaccard (Heart) ===")
for term in heart_tests:
    match, code, score = map_term_jaccard(term)
    print(f"Input: '{term}' → Match: '{match}' (Code: {code}, Similarity: {score:.2f})")

print("\n=== BERT (Heart) ===")
for term in heart_tests:
    match, code, score = map_term_bert(term)
    print(f"Input: '{term}' → Match: '{match}' (Code: {code}, Similarity: {score:.2f})")



=== Levenshtein (Heart) ===
Input: 'Chest Pain' → Match: 'Chest Pain' (Code: I25, Distance: 0)
Input: 'Angina' → Match: 'Angina' (Code: I25, Distance: 0)
Input: 'Heart Attack' → Match: 'Heart Attack' (Code: I25, Distance: 0)
Input: 'chestpain' → Match: 'Chest Pain' (Code: I25, Distance: 1)
Input: 'myocardial infraction' → Match: 'Myocardial Infarction' (Code: I25, Distance: 2)

=== Jaccard (Heart) ===
Input: 'Chest Pain' → Match: 'Chest Pain' (Code: I25, Similarity: 1.00)
Input: 'Angina' → Match: 'Angina' (Code: I25, Similarity: 1.00)
Input: 'Heart Attack' → Match: 'Heart Attack' (Code: I25, Similarity: 1.00)
Input: 'chestpain' → Match: 'chestpain' (Code: UNKNOWN, Similarity: 0.00)
Input: 'myocardial infraction' → Match: 'Myocardial Infarction' (Code: I25, Similarity: 0.33)

=== BERT (Heart) ===
Input: 'Chest Pain' → Match: 'Chest Pain' (Code: I25, Similarity: 1.00)
Input: 'Angina' → Match: 'Angina' (Code: I25, Similarity: 1.00)
Input: 'Heart Attack' → Match: 'Heart Attack' (Code: I25,

In [None]:
from fhir.resources.condition import Condition

# FHIR for heart (using BERT for demo)
def create_fhir_condition_bert(patient_id, input_term):
    mapped_term, code, sim = map_term_bert(input_term)

    condition = Condition.construct()
    condition.code = {
        "coding": [{
            "system": "http://hl7.org/fhir/sid/icd-10",
            "code": code,
            "display": mapped_term
        }]
    }
    condition.subject = {"reference": f"Patient/{patient_id}"}

    return condition.json(indent=2), sim

# Test heart FHIR
print("=== FHIR for 'Chest Pain' (Heart Disease) ===")
fhir_chest, sim_chest = create_fhir_condition_bert(201, 'Chest Pain')
print(fhir_chest)
print(f"Similarity: {sim_chest:.2f}")

print("\n=== FHIR for 'myocardial infraction' (Typo Fixed) ===")
fhir_typo, sim_typo = create_fhir_condition_bert(202, 'myocardial infraction')
print(fhir_typo)
print(f"Similarity: {sim_typo:.2f}")

=== FHIR for 'Chest Pain' (Heart Disease) ===


/tmp/ipython-input-3017900503.py:7: PydanticDeprecatedSince20: The `construct` method is deprecated; use `model_construct` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  condition = Condition.construct()


{
  "resourceType": "Condition",
  "code": {
    "coding": [
      {
        "system": "http://hl7.org/fhir/sid/icd-10",
        "code": "I25",
        "display": "Chest Pain"
      }
    ]
  },
  "subject": {
    "reference": "Patient/201"
  }
}
Similarity: 1.00

=== FHIR for 'myocardial infraction' (Typo Fixed) ===
{
  "resourceType": "Condition",
  "code": {
    "coding": [
      {
        "system": "http://hl7.org/fhir/sid/icd-10",
        "code": "I25",
        "display": "Myocardial Infarction"
      }
    ]
  },
  "subject": {
    "reference": "Patient/202"
  }
}
Similarity: 0.96


In [None]:
import pandas as pd

data = {
    'Term': ['Sugar', 'High Sugar', 'Chest Pain', 'Angina', 'myocardial infraction'],
    'Levenshtein Code': ['E11', 'E11', 'I25', 'I25', 'I25'],
    'Levenshtein Score': [0, 0, 0, 0, 2],
    'Jaccard Code': ['E11', 'E11', 'I25', 'I25', 'I25'],
    'Jaccard Score': [1.00, 1.00, 1.00, 1.00, 0.33],
    'BERT Code': ['E11', 'E11', 'I25', 'I25', 'I25'],
    'BERT Score': [0.91, 0.85, 1.00, 1.00, 0.96]  # From your run
}

comparison_df = pd.DataFrame(data)
print("=== Final Multi-Disease Algorithm Comparison ===")
comparison_df

=== Final Multi-Disease Algorithm Comparison ===


Unnamed: 0,Term,Levenshtein Code,Levenshtein Score,Jaccard Code,Jaccard Score,BERT Code,BERT Score
0,Sugar,E11,0,E11,1.0,E11,0.91
1,High Sugar,E11,0,E11,1.0,E11,0.85
2,Chest Pain,I25,0,I25,1.0,I25,1.0
3,Angina,I25,0,I25,1.0,I25,1.0
4,myocardial infraction,I25,2,I25,0.33,I25,0.96


In [None]:
import pandas as pd

stroke_df = pd.read_csv('healthcare-dataset-stroke-data.csv')  # Change name if different
stroke_df.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
9,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


In [None]:
# Add stroke to dict
standard_terms['Stroke'] = 'I63'
standard_terms['Cerebral Infarction'] = 'I63'
standard_terms['Brain Stroke'] = 'I63'
standard_terms['Hemorrhagic Stroke'] = 'I63'
standard_terms['No Stroke'] = 'Z00'

# Re-encode for BERT (run this)
std_sentences = list(standard_terms.keys())
std_codes = list(standard_terms.values())
std_embeddings = model.encode(std_sentences)

print("Stroke terms added (I63)! Demo now covers diabetes, heart, and stroke.")

Stroke terms added (I63)! Demo now covers diabetes, heart, and stroke.


In [None]:
# Stroke test terms
stroke_tests = ['Stroke', 'Cerebral Infarction', 'brain stroke', 'hemorrhagic stroke', 'stroek']  # Typo test

print("=== Levenshtein (Stroke) ===")
for term in stroke_tests:
    match, code, dist = map_term_levenshtein(term)
    print(f"Input: '{term}' → Match: '{match}' (Code: {code}, Distance: {dist})")

print("\n=== Jaccard (Stroke) ===")
for term in stroke_tests:
    match, code, sim = map_term_jaccard(term)
    print(f"Input: '{term}' → Match: '{match}' (Code: {code}, Similarity: {sim:.2f})")

print("\n=== BERT (Stroke) ===")
for term in stroke_tests:
    match, code, sim = map_term_bert(term)
    print(f"Input: '{term}' → Match: '{match}' (Code: {code}, Similarity: {sim:.2f})")

=== Levenshtein (Stroke) ===
Input: 'Stroke' → Match: 'Stroke' (Code: I63, Distance: 0)
Input: 'Cerebral Infarction' → Match: 'Cerebral Infarction' (Code: I63, Distance: 0)
Input: 'brain stroke' → Match: 'Brain Stroke' (Code: I63, Distance: 0)
Input: 'hemorrhagic stroke' → Match: 'Hemorrhagic Stroke' (Code: I63, Distance: 0)
Input: 'stroek' → Match: 'Stroke' (Code: I63, Distance: 2)

=== Jaccard (Stroke) ===
Input: 'Stroke' → Match: 'Stroke' (Code: I63, Similarity: 1.00)
Input: 'Cerebral Infarction' → Match: 'Cerebral Infarction' (Code: I63, Similarity: 1.00)
Input: 'brain stroke' → Match: 'Brain Stroke' (Code: I63, Similarity: 1.00)
Input: 'hemorrhagic stroke' → Match: 'Hemorrhagic Stroke' (Code: I63, Similarity: 1.00)
Input: 'stroek' → Match: 'stroek' (Code: UNKNOWN, Similarity: 0.00)

=== BERT (Stroke) ===
Input: 'Stroke' → Match: 'Stroke' (Code: I63, Similarity: 1.00)
Input: 'Cerebral Infarction' → Match: 'Cerebral Infarction' (Code: I63, Similarity: 1.00)
Input: 'brain stroke' → M

In [None]:
# FHIR for stroke example
fhir_stroke, sim = create_fhir_condition_bert(301, 'Stroke')
print("=== FHIR for 'Stroke' ===")
print(fhir_stroke)
print(f"Similarity: {sim:.2f}")

fhir_typo, sim_typo = create_fhir_condition_bert(302, 'stroek')
print("\n=== FHIR for 'stroek' (Typo Fixed) ===")
print(fhir_typo)
print(f"Similarity: {sim_typo:.2f}")

=== FHIR for 'Stroke' ===
{
  "resourceType": "Condition",
  "code": {
    "coding": [
      {
        "system": "http://hl7.org/fhir/sid/icd-10",
        "code": "I63",
        "display": "Stroke"
      }
    ]
  },
  "subject": {
    "reference": "Patient/301"
  }
}
Similarity: 1.00

=== FHIR for 'stroek' (Typo Fixed) ===
{
  "resourceType": "Condition",
  "code": {
    "coding": [
      {
        "system": "http://hl7.org/fhir/sid/icd-10",
        "code": "Z00",
        "display": "No Stroke"
      }
    ]
  },
  "subject": {
    "reference": "Patient/302"
  }
}
Similarity: 0.86


/tmp/ipython-input-3017900503.py:7: PydanticDeprecatedSince20: The `construct` method is deprecated; use `model_construct` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  condition = Condition.construct()


In [None]:
import pandas as pd

data = {
    'Term': ['Sugar', 'Chest Pain', 'Stroke', 'Diabetis', 'myocardial infraction'],
    'Levenshtein Code': ['E11', 'I25', 'I63', 'E11', 'I25'],
    'Levenshtein Score': [0, 0, 0, 1, 2],
    'Jaccard Code': ['E11', 'I25', 'I63', 'E11', 'I25'],
    'Jaccard Score': [1.00, 1.00, 1.00, 0.25, 0.33],
    'BERT Code': ['E11', 'I25', 'I63', 'E11', 'I25'],
    'BERT Score': [0.91, 1.00, 0.98, 0.75, 0.96]  # Approximate from runs
}

comparison_df = pd.DataFrame(data)
print("=== Final 3-Disease Comparison (Diabetes + Heart + Stroke) ===")
comparison_df

=== Final 3-Disease Comparison (Diabetes + Heart + Stroke) ===


Unnamed: 0,Term,Levenshtein Code,Levenshtein Score,Jaccard Code,Jaccard Score,BERT Code,BERT Score
0,Sugar,E11,0,E11,1.0,E11,0.91
1,Chest Pain,I25,0,I25,1.0,I25,1.0
2,Stroke,I63,0,I63,1.0,I63,0.98
3,Diabetis,E11,1,E11,0.25,E11,0.75
4,myocardial infraction,I25,2,I25,0.33,I25,0.96
