In [1]:
#imports libraries and packages

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from IPython.display import display


In [2]:
#file_path = "dataset/ed/diagnosis.csv"  # Remove 'thesis/' from the path
data = pd.read_csv('dataset/ed/finals/5_finalpart1.csv')
snomed = pd.read_csv('dataset/ed/finals/snomed.csv')
look_up = pd.read_csv('dataset/ed/finals/static_mort_predictor.csv')

In [3]:
data.head()

Unnamed: 0,subject_id,stay_id,intime,outtime,gender,race,chiefcomplaint,anchor_age,anchor_year,anchor_year_group,dod,dead_in_days,died_within_30_days,race_standard,age_group
0,10000032,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,"Abd pain, Abdominal distention",52.0,2180.0,2014 - 2016,2180-09-09,125.0,0,White,46-65
1,10000032,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,F,WHITE,Abdominal distention,52.0,2180.0,2014 - 2016,2180-09-09,74.0,0,White,46-65
2,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,2180-09-09,34.0,0,White,46-65
3,10000032,32952584,2180-07-22 16:24:00,2180-07-23 05:54:00,F,WHITE,Hypotension,52.0,2180.0,2014 - 2016,2180-09-09,48.0,0,White,46-65
4,10000032,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,"Abdominal distention, Abd pain, LETHAGIC",52.0,2180.0,2014 - 2016,2180-09-09,47.0,0,White,46-65


In [4]:

# Check if subject_id is unique
is_unique = data['subject_id'].is_unique

print("Is 'subject_id' unique across rows? ", is_unique)

Is 'subject_id' unique across rows?  False


In [6]:
subject_counts = data['subject_id'].value_counts()
print(subject_counts)


subject_id
15496609    321
16233333    210
16662316    184
14394983    170
17903094    167
           ... 
14034659      1
14034667      1
14034740      1
14034999      1
19999987      1
Name: count, Length: 205449, dtype: int64


In [7]:
data['unique_visit_id'] = data['subject_id'].astype(str) + '_' + data['stay_id'].astype(str)

is_unique_visit = data['unique_visit_id'].is_unique

print("Is (subject_id + stay_id) unique across rows? ", is_unique_visit)


Is (subject_id + stay_id) unique across rows?  True


In [9]:

# Check for exact duplicate rows
duplicates = data.duplicated()

# How many duplicate rows?
num_duplicates = duplicates.sum()
print(f"Number of exact duplicate rows: {num_duplicates}")

# If you want to see the duplicated rows:
if num_duplicates > 0:
    duplicated_rows = data[duplicates]
    print(duplicated_rows.head())  # Show a few examples

Number of exact duplicate rows: 0


In [10]:
# Find duplicate subject_id rows
duplicates = data[data.duplicated(subset=['subject_id'], keep=False)]
print(duplicates[['subject_id', 'stay_id', 'chiefcomplaint']])


        subject_id   stay_id                            chiefcomplaint
0         10000032  33258284            Abd pain, Abdominal distention
1         10000032  38112554                      Abdominal distention
2         10000032  35968195                           n/v/d, Abd pain
3         10000032  32952584                               Hypotension
4         10000032  39399961  Abdominal distention, Abd pain, LETHAGIC
...            ...       ...                                       ...
424983    19999784  37972930                              Abnormal MRI
424984    19999784  34149746                                Wound eval
424985    19999784  35692999                            L Leg weakness
424986    19999828  32917002                      Abd pain, Wound eval
424987    19999828  30712109                      Abd pain, Wound eval

[289474 rows x 3 columns]


In [16]:
is_stay_id_unique = data['stay_id'].is_unique
print("Is 'stay_id' unique across rows? ", is_stay_id_unique)


Is 'stay_id' unique across rows?  True


In [18]:
duplicated_stays = data[data.duplicated(subset=['stay_id'], keep=False)]
print(duplicated_stays[['stay_id', 'subject_id', 'chiefcomplaint']])


Empty DataFrame
Columns: [stay_id, subject_id, chiefcomplaint]
Index: []


In [20]:
# Group by stay_id and count unique intime and outtime values
time_check = data.groupby('stay_id')[['intime', 'outtime']].nunique()

# Filter where there are multiple intime or outtime values per stay_id
non_unique_times = time_check[(time_check['intime'] > 1) | (time_check['outtime'] > 1)]

# Print results
if non_unique_times.empty:
    print("✅ Each stay_id has exactly one intime and one outtime.")
else:
    print("🚨 Some stay_ids have multiple intime or outtime values:")
    print(non_unique_times)


✅ Each stay_id has exactly one intime and one outtime.


| Column(s)                       | Unique? | Notes                             |
| ------------------------------- | ------- | --------------------------------- |
| `subject_id`                    | ❌ No    | Patients can have multiple visits |
| `stay_id`                       | ✅ Yes   | Unique per visit — perfect key    |
| `stay_id` → `intime`, `outtime` | ✅ Yes   | Each visit has exact timestamps   |
| `subject_id + stay_id`          | ✅ Yes   | Visit is uniquely tied to patient |



In [27]:
look_up.head()

Unnamed: 0,symptom,term,is_male,ed_age,mu,sigma
0,1023001,Apnea,True,18,-6.247295,0.994007
1,1023001,Apnea,True,19,-6.122227,0.95043
2,1023001,Apnea,True,20,-5.997159,0.909199
3,1023001,Apnea,True,21,-5.872091,0.870649
4,1023001,Apnea,True,22,-5.747023,0.83515


In [29]:
# 1. Find unique symptom codes in look_up
unique_symptoms = look_up['symptom'].unique()
print(f"Number of unique symptoms in look_up: {len(unique_symptoms)}")

# 2. Save the unique symptoms to CSV
unique_symptoms_df = pd.DataFrame(unique_symptoms, columns=['symptom'])
unique_symptoms_df.to_csv('unique_symptoms.csv', index=False)
print("Unique symptoms saved to 'unique_symptoms.csv'")


Number of unique symptoms in look_up: 437
Unique symptoms saved to 'unique_symptoms.csv'


In [31]:
snomed.head()

Unnamed: 0,count,text,snomed
0,45577.0,abd pain,21522001.0
1,,nausea and vomiting,16932000.0
2,27394.0,chest pain,29857009.0
3,23480.0,transfer,
4,21395.0,s/p fall,1912002.0


In [33]:
# 1. Find unique symptom codes in look_up
unique_symptoms_in_snomed = snomed['snomed'].unique()
print(f"Number of unique symptoms in snomed: {len(unique_symptoms_in_snomed)}")

Number of unique symptoms in snomed: 77


In [35]:


# Step 1: Convert both columns to string for safe comparison
lookup_symptoms = look_up['symptom'].astype(str).unique()
snomed_codes = snomed['snomed'].dropna().astype(str).unique()

# Step 2: Find which lookup symptoms are missing in snomed codes
missing_symptoms = set(lookup_symptoms) - set(snomed_codes)

# Step 3: Show results
print(f"Number of symptoms in look_up: {len(lookup_symptoms)}")
print(f"Number of codes in snomed: {len(snomed_codes)}")
print(f"Number of symptoms missing in snomed: {len(missing_symptoms)}")

if missing_symptoms:
    print("\nMissing symptom codes:")
    print(sorted(missing_symptoms))
    
    # Optional: save to file
    pd.DataFrame(sorted(missing_symptoms), columns=['missing_symptom']).to_csv('dataset/ed/finals/6_missing_symptoms.csv', index=False)
    print("✅ Saved missing symptom codes to 'missing_symptoms.csv'")
else:
    print("✅ All symptom codes from look_up exist in snomed!")


Number of symptoms in look_up: 437
Number of codes in snomed: 76
Number of symptoms missing in snomed: 365

Missing symptom codes:
['1023001', '102570003', '102874004', '10443009', '105481005', '106004004', '108365000', '110030002', '110468005', '111479008', '1157096002', '116308004', '116312005', '116337000', '11687002', '118235002', '118931002', '118936007', '118952005', '1208523001', '125593007', '125595000', '125596004', '125597008', '125598003', '125600009', '125601008', '125604000', '125621009', '125643001', '125665001', '125666000', '125670008', '127278005', '127279002', '128053003', '128069005', '128351009', '129231000119103', '130951007', '131148009', '13187008', '135818000', '14448006', '16001004', '160957000', '161152002', '161885008', '161972006', '162016000', '162059005', '162138001', '162246009', '162673000', '16269008', '165311008', '165397008', '170599006', '17369002', '180300007', '183737004', '185317003', '185389009', '18876004', '18949003', '191124002', '19471005', '

In [37]:
# Convert missing set to DataFrame
missing_symptom_df = look_up[look_up['symptom'].astype(str).isin(missing_symptoms)]

# See top few
print(missing_symptom_df.head(30))

# Optionally save for analysis
missing_symptom_df.to_csv('dataset/ed/finals/7_missing_symptom_details.csv', index=False)
print("num of rows that are missed:",len(missing_symptom_df))

l=missing_symptom_df['symptom'].unique()
print("num of uniques symtoms' snomed cods are: ",len(l))


    symptom   term  is_male  ed_age        mu     sigma
0   1023001  Apnea     True      18 -6.247295  0.994007
1   1023001  Apnea     True      19 -6.122227  0.950430
2   1023001  Apnea     True      20 -5.997159  0.909199
3   1023001  Apnea     True      21 -5.872091  0.870649
4   1023001  Apnea     True      22 -5.747023  0.835150
5   1023001  Apnea     True      23 -5.621977  0.803115
6   1023001  Apnea     True      24 -5.497061  0.774990
7   1023001  Apnea     True      25 -5.372406  0.751162
8   1023001  Apnea     True      26 -5.248142  0.731907
9   1023001  Apnea     True      27 -5.124400  0.717362
10  1023001  Apnea     True      28 -5.001310  0.707501
11  1023001  Apnea     True      29 -4.879002  0.702120
12  1023001  Apnea     True      30 -4.757607  0.700850
13  1023001  Apnea     True      31 -4.637256  0.703173
14  1023001  Apnea     True      32 -4.518079  0.708464
15  1023001  Apnea     True      33 -4.400206  0.716024
16  1023001  Apnea     True      34 -4.283767  0

In [39]:
# exploding the sympotoms in the 

In [41]:
# Summarize one row per missing symptom code
missing_symptom_summary = (
    look_up[look_up['symptom'].astype(str).isin(missing_symptoms)]
    .drop_duplicates(subset=['symptom', 'term'])  # drop repeated rows per code
    .loc[:, ['symptom', 'term', 'mu', 'sigma']]
    .sort_values('symptom')
)

# View top 30
print(missing_symptom_summary.head(30))

# Confirm the total
print(f"\nUnique missing symptom codes: {missing_symptom_summary['symptom'].nunique()}")

# Save
missing_symptom_summary.to_csv("dataset/ed/finals/8_missing_symptom_summary.csv", index=False)
print("✅ Saved summary of missing symptoms to 'missing_symptom_summary.csv'")


         symptom                                            term        mu  \
0        1023001                                           Apnea -6.247295   
186    102570003                                   Inguinal pain -8.887124   
372    102874004                              Possible pregnancy -8.429332   
558     10443009                             Localized infection -8.256367   
744    105481005                      Refusal of food by patient -8.221643   
930    106004004           Hemorrhagic complication of pregnancy -8.766381   
1116   108365000                               Infection of skin -8.044164   
1302   110030002                      Concussion injury of brain -8.702403   
1488   110468005                              Ambulatory surgery -8.250761   
1674   111479008                         Organic mental disorder -8.556442   
2046  1157096002                       Self destructive behavior -8.255906   
2232   116308004                      Finding of shoulder region

In [43]:
# Step 1: Prepare both datasets
snomed_full = snomed.copy()
lookup_terms = look_up[['symptom', 'term']].drop_duplicates()
lookup_terms['term'] = lookup_terms['term'].str.strip().str.lower()
lookup_terms['symptom'] = lookup_terms['symptom'].astype(str)

snomed_full['text'] = snomed_full['text'].str.strip().str.lower()

# Step 2: Merge to find and fill missing snomed codes
# Merge on the name fields
snomed_merged = snomed_full.merge(lookup_terms, how='left', left_on='text', right_on='term')

# Step 3: Where 'snomed' is missing, fill it using 'symptom' from the merge
snomed_merged['snomed'] = snomed_merged['snomed'].fillna(snomed_merged['symptom'])

# Step 4: Drop extra columns and clean up
snomed_filled = snomed_merged.drop(columns=['symptom', 'term'])

# Optional: Convert SNOMED back to integer if possible
try:
    snomed_filled['snomed'] = snomed_filled['snomed'].astype(int)
except:
    snomed_filled['snomed'] = snomed_filled['snomed'].astype(str)  # keep as string if needed

# Step 5: Save result
snomed_filled.to_csv("dataset/ed/finals/9_snomed_filled.csv", index=False)
print(f"✅ Filled missing SNOMED codes and saved to 'snomed_filled.csv'")


✅ Filled missing SNOMED codes and saved to 'snomed_filled.csv'


In [45]:
# Get symptom codes from look_up
lookup_codes = look_up['symptom'].astype(str).unique()

# Get snomed codes after filling
snomed_codes = snomed_merged['snomed'].dropna().astype(str).unique()

# Compare sets
remaining_missing = set(lookup_codes) - set(snomed_codes)

print(f"🔎 Unique symptom codes in look_up: {len(lookup_codes)}")
print(f"✅ Unique SNOMED codes in snomed_merged: {len(snomed_codes)}")
print(f"❌ Remaining missing codes: {len(remaining_missing)}")

# Show some of the missing codes
if remaining_missing:
    print("Still missing codes (sample):")
    print(sorted(list(remaining_missing))[:20])
else:
    print("🎉 All symptom codes from look_up are now present in snomed!")


🔎 Unique symptom codes in look_up: 437
✅ Unique SNOMED codes in snomed_merged: 151
❌ Remaining missing codes: 290
Still missing codes (sample):
['102874004', '10443009', '105481005', '106004004', '108365000', '110030002', '110468005', '111479008', '116308004', '116312005', '116337000', '11687002', '118235002', '118931002', '118936007', '118952005', '1208523001', '125593007', '125595000', '125596004']


In [47]:
# Step 1: Get unique symptom codes from look_up
lookup_codes = look_up['symptom'].astype(str).unique()

# Step 2: Get unique snomed codes from updated snomed_filled
snomed_codes = snomed_filled['snomed'].dropna().astype(str).unique()

# Step 3: Compare and find missing
remaining_missing = set(lookup_codes) - set(snomed_codes)

# Step 4: Report
print(f"🔎 Unique symptom codes in look_up: {len(lookup_codes)}")
print(f"✅ Unique SNOMED codes in snomed_filled: {len(snomed_codes)}")
print(f"❌ Remaining missing codes: {len(remaining_missing)}")

# Step 5: Show some examples if any are missing
if remaining_missing:
    print("Still missing codes (sample):")
    print(sorted(list(remaining_missing))[:20])
else:
    print("🎉 All symptom codes from look_up are now present in snomed!")


🔎 Unique symptom codes in look_up: 437
✅ Unique SNOMED codes in snomed_filled: 152
❌ Remaining missing codes: 290
Still missing codes (sample):
['102874004', '10443009', '105481005', '106004004', '108365000', '110030002', '110468005', '111479008', '116308004', '116312005', '116337000', '11687002', '118235002', '118931002', '118936007', '118952005', '1208523001', '125593007', '125595000', '125596004']


In [50]:
"""


from rapidfuzz import process, fuzz

# Lowercase all lookup terms for comparison
lookup_terms = look_up[['symptom', 'term']].drop_duplicates()
lookup_terms['term_clean'] = lookup_terms['term'].str.strip().str.lower()

# Initialize list of matches
matched_terms = []

# Try matching each snomed text to look_up terms
for idx, row in snomed[snomed['snomed'].isna()].iterrows():
    snomed_text = str(row['text']).strip().lower()
    match, score, _ = process.extractOne(
        snomed_text,
        lookup_terms['term_clean'],
        scorer=fuzz.ratio
    )
    if score >= 85:
        symptom_code = lookup_terms[lookup_terms['term_clean'] == match]['symptom'].values[0]
        matched_terms.append((row['text'], match, symptom_code, score))

# Print top matches
for original, matched, code, score in matched_terms[:10]:
    print(f"{original} → {matched} → {code} (score: {score})")

    
"""

'\n\n\nfrom rapidfuzz import process, fuzz\n\n# Lowercase all lookup terms for comparison\nlookup_terms = look_up[[\'symptom\', \'term\']].drop_duplicates()\nlookup_terms[\'term_clean\'] = lookup_terms[\'term\'].str.strip().str.lower()\n\n# Initialize list of matches\nmatched_terms = []\n\n# Try matching each snomed text to look_up terms\nfor idx, row in snomed[snomed[\'snomed\'].isna()].iterrows():\n    snomed_text = str(row[\'text\']).strip().lower()\n    match, score, _ = process.extractOne(\n        snomed_text,\n        lookup_terms[\'term_clean\'],\n        scorer=fuzz.ratio\n    )\n    if score >= 85:\n        symptom_code = lookup_terms[lookup_terms[\'term_clean\'] == match][\'symptom\'].values[0]\n        matched_terms.append((row[\'text\'], match, symptom_code, score))\n\n# Print top matches\nfor original, matched, code, score in matched_terms[:10]:\n    print(f"{original} → {matched} → {code} (score: {score})")\n\n    \n'

In [53]:
"""

from rapidfuzz import process, fuzz

lookup_terms = look_up[['symptom', 'term']].drop_duplicates()
lookup_terms['term_clean'] = lookup_terms['term'].str.strip().str.lower()

matched_terms = []

# Go through only rows with missing snomed and valid text
for idx, row in snomed[snomed['snomed'].isna()].iterrows():
    text_val = row['text']
    
    # Skip if text is NaN or not a string
    if not isinstance(text_val, str):
        continue

    snomed_text = text_val.strip().lower()
    
    match, score, _ = process.extractOne(
        snomed_text,
        lookup_terms['term_clean'],
        scorer=fuzz.ratio
    )
    
    if score >= 85:
        symptom_code = lookup_terms[lookup_terms['term_clean'] == match]['symptom'].values[0]
        matched_terms.append((row['text'], match, symptom_code, score))

# Print a sample of matches
for original, matched, code, score in matched_terms[:10]:
    print(f"{original} → {matched} → {code} (score: {score})")


"""

'\n\nfrom rapidfuzz import process, fuzz\n\nlookup_terms = look_up[[\'symptom\', \'term\']].drop_duplicates()\nlookup_terms[\'term_clean\'] = lookup_terms[\'term\'].str.strip().str.lower()\n\nmatched_terms = []\n\n# Go through only rows with missing snomed and valid text\nfor idx, row in snomed[snomed[\'snomed\'].isna()].iterrows():\n    text_val = row[\'text\']\n    \n    # Skip if text is NaN or not a string\n    if not isinstance(text_val, str):\n        continue\n\n    snomed_text = text_val.strip().lower()\n    \n    match, score, _ = process.extractOne(\n        snomed_text,\n        lookup_terms[\'term_clean\'],\n        scorer=fuzz.ratio\n    )\n    \n    if score >= 85:\n        symptom_code = lookup_terms[lookup_terms[\'term_clean\'] == match][\'symptom\'].values[0]\n        matched_terms.append((row[\'text\'], match, symptom_code, score))\n\n# Print a sample of matches\nfor original, matched, code, score in matched_terms[:10]:\n    print(f"{original} → {matched} → {code} (sc

In [55]:
"""


unmatched_texts = snomed[snomed['snomed'].isna()]['text'].unique()
print("Texts with missing codes:")
print(sorted(unmatched_texts))


"""

'\n\n\nunmatched_texts = snomed[snomed[\'snomed\'].isna()][\'text\'].unique()\nprint("Texts with missing codes:")\nprint(sorted(unmatched_texts))\n\n\n'

In [57]:
"""


mport pandas as pd
from rapidfuzz import process, fuzz

# Load your tables (adjust path if needed)
snomed = pd.read_csv('dataset/ed/finals/snomed.csv')
look_up = pd.read_csv('dataset/ed/finals/static_mort_predictor.csv')

# Clean and lowercase 'term' column in look_up
lookup_terms = look_up[['symptom', 'term']].drop_duplicates()
lookup_terms['term_clean'] = lookup_terms['term'].astype(str).str.strip().str.lower()
lookup_terms['symptom'] = lookup_terms['symptom'].astype(str)

# Clean 'text' column in snomed
snomed['text_clean'] = snomed['text'].astype(str).str.strip().str.lower()



"""

"\n\n\nmport pandas as pd\nfrom rapidfuzz import process, fuzz\n\n# Load your tables (adjust path if needed)\nsnomed = pd.read_csv('dataset/ed/finals/snomed.csv')\nlook_up = pd.read_csv('dataset/ed/finals/static_mort_predictor.csv')\n\n# Clean and lowercase 'term' column in look_up\nlookup_terms = look_up[['symptom', 'term']].drop_duplicates()\nlookup_terms['term_clean'] = lookup_terms['term'].astype(str).str.strip().str.lower()\nlookup_terms['symptom'] = lookup_terms['symptom'].astype(str)\n\n# Clean 'text' column in snomed\nsnomed['text_clean'] = snomed['text'].astype(str).str.strip().str.lower()\n\n\n\n"

In [59]:
"""


matched_terms = []

# Loop over rows where snomed code is missing
for idx, row in snomed[snomed['snomed'].isna()].iterrows():
    text = row['text_clean']
    
    # Skip if blank
    if not isinstance(text, str) or text == '':
        continue

    # Fuzzy match to look_up terms
    match, score, _ = process.extractOne(
        text,
        lookup_terms['term_clean'],
        scorer=fuzz.ratio
    )

    # Accept only good matches (adjust threshold if needed)
    if score >= 85:
        symptom_code = lookup_terms[lookup_terms['term_clean'] == match]['symptom'].values[0]
        matched_terms.append((row['text'], match, symptom_code, score))



"""

"\n\n\nmatched_terms = []\n\n# Loop over rows where snomed code is missing\nfor idx, row in snomed[snomed['snomed'].isna()].iterrows():\n    text = row['text_clean']\n    \n    # Skip if blank\n    if not isinstance(text, str) or text == '':\n        continue\n\n    # Fuzzy match to look_up terms\n    match, score, _ = process.extractOne(\n        text,\n        lookup_terms['term_clean'],\n        scorer=fuzz.ratio\n    )\n\n    # Accept only good matches (adjust threshold if needed)\n    if score >= 85:\n        symptom_code = lookup_terms[lookup_terms['term_clean'] == match]['symptom'].values[0]\n        matched_terms.append((row['text'], match, symptom_code, score))\n\n\n\n"

In [61]:
"""


# Convert to DataFrame to inspect
matched_df = pd.DataFrame(matched_terms, columns=['text', 'matched_term', 'snomed_filled', 'score'])

# Show a few examples
print(matched_df.head())

# Save for reuse or review
matched_df.to_csv('matched_terms_fuzzy.csv', index=False)
print("✅ Full matched_terms list saved to 'matched_terms_fuzzy.csv'")


"""

'\n\n\n# Convert to DataFrame to inspect\nmatched_df = pd.DataFrame(matched_terms, columns=[\'text\', \'matched_term\', \'snomed_filled\', \'score\'])\n\n# Show a few examples\nprint(matched_df.head())\n\n# Save for reuse or review\nmatched_df.to_csv(\'matched_terms_fuzzy.csv\', index=False)\nprint("✅ Full matched_terms list saved to \'matched_terms_fuzzy.csv\'")\n\n\n'

In [63]:
"""


import pandas as pd
from rapidfuzz import process, fuzz

# === STEP 1: Load and prepare data ===
snomed = pd.read_csv('dataset/ed/finals/snomed.csv')
look_up = pd.read_csv('dataset/ed/finals/static_mort_predictor.csv')

# Clean text fields
snomed['text_clean'] = snomed['text'].astype(str).str.strip().str.lower()
lookup_terms = look_up[['symptom', 'term']].drop_duplicates()
lookup_terms['term_clean'] = lookup_terms['term'].astype(str).str.strip().str.lower()
lookup_terms['symptom'] = lookup_terms['symptom'].astype(str)

# === STEP 2: Fuzzy match snomed text (missing codes) to look_up terms ===
matched_terms = []

for idx, row in snomed[snomed['snomed'].isna()].iterrows():
    snomed_text = row['text_clean']
    
    if not isinstance(snomed_text, str) or snomed_text == '':
        continue

    match, score, _ = process.extractOne(
        snomed_text,
        lookup_terms['term_clean'],
        scorer=fuzz.ratio
    )

    if score >= 85:  # threshold for acceptable match
        symptom_code = lookup_terms[lookup_terms['term_clean'] == match]['symptom'].values[0]
        matched_terms.append((row['text'], match, symptom_code, score))

# Convert to DataFrame
matched_df = pd.DataFrame(matched_terms, columns=['text', 'matched_term', 'snomed_filled', 'score'])
matched_df['text'] = matched_df['text'].astype(str).str.strip().str.lower()

# === STEP 3: Merge matches back to snomed ===
snomed['text_clean'] = snomed['text'].astype(str).str.strip().str.lower()

snomed_merged = snomed.merge(
    matched_df[['text', 'snomed_filled']],
    how='left',
    left_on='text_clean',
    right_on='text'
)

# Fill missing snomed codes from matched results
snomed_merged['snomed'] = snomed_merged['snomed'].fillna(snomed_merged['snomed_filled'])

# ✅ FIX: Only drop helper columns if they exist
snomed_merged.drop(columns=[col for col in ['text_clean', 'text', 'snomed_filled'] if col in snomed_merged.columns], inplace=True)

# Convert SNOMED to string for consistency
snomed_merged['snomed'] = snomed_merged['snomed'].astype(str)

# === STEP 4: Save the updated table and matched results ===
snomed_merged.to_csv('snomed_filled_with_fuzzy.csv', index=False)
matched_df.to_csv('matched_terms_fuzzy.csv', index=False)

print("✅ SNOMED table saved to 'snomed_filled_with_fuzzy.csv'")
print("✅ Matched terms saved to 'matched_terms_fuzzy.csv'")

# === STEP 5: Check if all look_up symptoms are now covered ===
lookup_codes = look_up['symptom'].astype(str).unique()
snomed_codes = snomed_merged['snomed'].dropna().astype(str).unique()
remaining_missing = set(lookup_codes) - set(snomed_codes)

print(f"\n🔎 Total unique symptom codes in look_up: {len(lookup_codes)}")
print(f"✅ Total unique SNOMED codes in updated snomed: {len(snomed_codes)}")
print(f"❌ Remaining unmatched codes: {len(remaining_missing)}")

if remaining_missing:
    print("Sample of remaining unmatched codes:")
    print(sorted(list(remaining_missing))[:20])
else:
    print("🎉 All symptom codes from look_up are now matched in the SNOMED table!")



"""

'\n\n\nimport pandas as pd\nfrom rapidfuzz import process, fuzz\n\n# === STEP 1: Load and prepare data ===\nsnomed = pd.read_csv(\'dataset/ed/finals/snomed.csv\')\nlook_up = pd.read_csv(\'dataset/ed/finals/static_mort_predictor.csv\')\n\n# Clean text fields\nsnomed[\'text_clean\'] = snomed[\'text\'].astype(str).str.strip().str.lower()\nlookup_terms = look_up[[\'symptom\', \'term\']].drop_duplicates()\nlookup_terms[\'term_clean\'] = lookup_terms[\'term\'].astype(str).str.strip().str.lower()\nlookup_terms[\'symptom\'] = lookup_terms[\'symptom\'].astype(str)\n\n# === STEP 2: Fuzzy match snomed text (missing codes) to look_up terms ===\nmatched_terms = []\n\nfor idx, row in snomed[snomed[\'snomed\'].isna()].iterrows():\n    snomed_text = row[\'text_clean\']\n    \n    if not isinstance(snomed_text, str) or snomed_text == \'\':\n        continue\n\n    match, score, _ = process.extractOne(\n        snomed_text,\n        lookup_terms[\'term_clean\'],\n        scorer=fuzz.ratio\n    )\n\n    

In [66]:
'''

import pandas as pd
import re

# Step 1: Clean and standardize the chief complaints
def clean_chiefcomplaint(text):
    if pd.isna(text):
        return []
    text = text.lower()
    text = re.sub(r"[\'\"+\?]", "", text)  # remove special characters
    #text = re.sub(r"\b(llq|rlq|luq|ruq|l|r|lower|upper)\s*", "", text)  # remove quadrant references
    text = re.sub(r"\babdominal\b", "abd", text)  # standardize 'abdominal' to 'abd'
    text = re.sub(r"\s+", " ", text)  # normalize whitespace
    return [t.strip() for t in text.split(",") if t.strip()]

# Step 2: Apply cleaning function
data['terms'] = data['chiefcomplaint'].apply(clean_chiefcomplaint)


'''


  '''


'\n\nimport pandas as pd\nimport re\n\n# Step 1: Clean and standardize the chief complaints\ndef clean_chiefcomplaint(text):\n    if pd.isna(text):\n        return []\n    text = text.lower()\n    text = re.sub(r"[\'"+\\?]", "", text)  # remove special characters\n    #text = re.sub(r"\x08(llq|rlq|luq|ruq|l|r|lower|upper)\\s*", "", text)  # remove quadrant references\n    text = re.sub(r"\x08abdominal\x08", "abd", text)  # standardize \'abdominal\' to \'abd\'\n    text = re.sub(r"\\s+", " ", text)  # normalize whitespace\n    return [t.strip() for t in text.split(",") if t.strip()]\n\n# Step 2: Apply cleaning function\ndata[\'terms\'] = data[\'chiefcomplaint\'].apply(clean_chiefcomplaint)\n\n\n'

In [68]:

# + is replaced with a comma via text.replace("+", ",")
# "abd pain + nausea" → "abd pain", "nausea" (becomes two terms)
# More accurate splitting of multi-symptom entries

import pandas as pd
import re

# Step 1: Clean and standardize the chief complaints

def clean_chiefcomplaint(text):
    if pd.isna(text):
        return []
    text = text.lower()
    text = re.sub(r"[\'\"\?]", "", text)  # remove quotes and question marks
    text = text.replace("+", ",")         # replace plus sign with comma
    #text = re.sub(r"\b(llq|rlq|luq|ruq|l|r|lower|upper)\s*", "", text)
    text = re.sub(r"\babdominal\b", "abd", text)
    text = re.sub(r"\s+", " ", text)  # normalize whitespace
    return [t.strip() for t in text.split(",") if t.strip()]


# Step 2: Apply cleaning function
data['terms'] = data['chiefcomplaint'].apply(clean_chiefcomplaint)

In [69]:
data.head()

Unnamed: 0,subject_id,stay_id,intime,outtime,gender,race,chiefcomplaint,anchor_age,anchor_year,anchor_year_group,dod,dead_in_days,died_within_30_days,race_standard,age_group,unique_visit_id,terms
0,10000032,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,"Abd pain, Abdominal distention",52.0,2180.0,2014 - 2016,2180-09-09,125.0,0,White,46-65,10000032_33258284,"[abd pain, abd distention]"
1,10000032,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,F,WHITE,Abdominal distention,52.0,2180.0,2014 - 2016,2180-09-09,74.0,0,White,46-65,10000032_38112554,[abd distention]
2,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,2180-09-09,34.0,0,White,46-65,10000032_35968195,"[n/v/d, abd pain]"
3,10000032,32952584,2180-07-22 16:24:00,2180-07-23 05:54:00,F,WHITE,Hypotension,52.0,2180.0,2014 - 2016,2180-09-09,48.0,0,White,46-65,10000032_32952584,[hypotension]
4,10000032,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,"Abdominal distention, Abd pain, LETHAGIC",52.0,2180.0,2014 - 2016,2180-09-09,47.0,0,White,46-65,10000032_39399961,"[abd distention, abd pain, lethagic]"


In [70]:
data.terms.head(20)

0                 [abd pain, abd distention]
1                           [abd distention]
2                          [n/v/d, abd pain]
3                              [hypotension]
4       [abd distention, abd pain, lethagic]
5                [confusion, hallucinations]
6     [altered mental status, b pedal edema]
7             [left cheek swelling, abscess]
8                          [l cheek abscess]
9                        [l facial swelling]
10                          [suture removal]
11                    [laceration, s/p fall]
12           [throat foreign body sensation]
13                              [l hip pain]
14                             [head injury]
15                             [r foot pain]
16                         [anemia s/p fall]
17                                [abd pain]
18                            [luq abd pain]
19                                     [ili]
Name: terms, dtype: object

In [71]:
# Step 3: Create 'unique_visit_id' column
data['unique_visit_id'] = data['subject_id'].astype(str) + '_' + data['stay_id'].astype(str)


In [72]:
data.head(10)

Unnamed: 0,subject_id,stay_id,intime,outtime,gender,race,chiefcomplaint,anchor_age,anchor_year,anchor_year_group,dod,dead_in_days,died_within_30_days,race_standard,age_group,unique_visit_id,terms
0,10000032,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,"Abd pain, Abdominal distention",52.0,2180.0,2014 - 2016,2180-09-09,125.0,0,White,46-65,10000032_33258284,"[abd pain, abd distention]"
1,10000032,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,F,WHITE,Abdominal distention,52.0,2180.0,2014 - 2016,2180-09-09,74.0,0,White,46-65,10000032_38112554,[abd distention]
2,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,2180-09-09,34.0,0,White,46-65,10000032_35968195,"[n/v/d, abd pain]"
3,10000032,32952584,2180-07-22 16:24:00,2180-07-23 05:54:00,F,WHITE,Hypotension,52.0,2180.0,2014 - 2016,2180-09-09,48.0,0,White,46-65,10000032_32952584,[hypotension]
4,10000032,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,"Abdominal distention, Abd pain, LETHAGIC",52.0,2180.0,2014 - 2016,2180-09-09,47.0,0,White,46-65,10000032_39399961,"[abd distention, abd pain, lethagic]"
5,10000084,35203156,2160-11-20 20:36:00,2160-11-21 03:20:00,M,WHITE,"Confusion, Hallucinations",72.0,2160.0,2017 - 2019,2161-02-13,84.0,0,White,66-80,10000084_35203156,"[confusion, hallucinations]"
6,10000084,36954971,2160-12-27 18:32:00,2160-12-28 16:07:00,M,WHITE,"Altered mental status, B Pedal edema",72.0,2160.0,2017 - 2019,2161-02-13,47.0,0,White,66-80,10000084_36954971,"[altered mental status, b pedal edema]"
7,10000108,36533795,2163-09-27 16:18:00,2163-09-28 09:04:00,M,WHITE,"LEFT CHEEK SWELLING, Abscess",25.0,2163.0,2014 - 2016,,,0,White,18-30,10000108_36533795,"[left cheek swelling, abscess]"
8,10000108,32522732,2163-09-16 16:34:00,2163-09-16 18:13:00,M,WHITE,L CHEEK ABSCESS,25.0,2163.0,2014 - 2016,,,0,White,18-30,10000108_32522732,[l cheek abscess]
9,10000108,39513268,2163-09-24 16:14:00,2163-09-24 21:02:00,M,WHITE,L FACIAL SWELLING,25.0,2163.0,2014 - 2016,,,0,White,18-30,10000108_39513268,[l facial swelling]


In [73]:
# Symptom expansion dictionary
symptom_expand_map = {
    'depression/suicidal/deliberate self harm': ['depressed mood', 'suicide risk assessment', 'intentional self-harm'],
    'vomiting and/or nausea': ['nausea and vomiting'],
    'ha/htn post partum': ['headache', 'hypertension', 'postpartum state'],
    'htn nausea and dizzy': ['hypertension', 'nausea', 'dizziness'],
    'htn , abd pain': ['hypertension', 'abdominal pain'],
    'n/v/d': ['nausea and vomiting', 'diarrhea'],
    'nvd': ['nausea and vomiting', 'diarrhea'],
    'n/v': ['nausea and vomiting'],
    'v/n': ['nausea and vomiting'],
    'n&v': ['nausea and vomiting'],
    'nv': ['nausea and vomiting'],
    'n.v': ['nausea and vomiting'],
    'n/d': ['nausea', 'diarrhea'],
    'v/d': ['vomiting', 'diarrhea'],
    'vomiting/nausea': ['nausea and vomiting'],
    'nausea/vomiting': ['nausea and vomiting'],
    'constipated': ['constipation'],
    'diarrhoea': ['diarrhea'],
    'n/v /d': ['nausea and vomiting', 'diarrhea']
}

# Map and expand terms
def apply_mapping(term_list):
    new_terms = []
    for term in term_list:
        if term in symptom_expand_map:
            expanded = symptom_expand_map[term]
            if isinstance(expanded, list):
                new_terms.extend(expanded)
            else:
                new_terms.append(expanded)
        else:
            new_terms.append(term)
    return ', '.join(new_terms)

# Apply and store in new column
data['terms_new'] = data['terms'].apply(apply_mapping)


In [79]:
data.head(20)  # or any number of rows you want to preview

Unnamed: 0,subject_id,stay_id,intime,outtime,gender,race,chiefcomplaint,anchor_age,anchor_year,anchor_year_group,dod,dead_in_days,died_within_30_days,race_standard,age_group,unique_visit_id,terms,terms_new
0,10000032,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,"Abd pain, Abdominal distention",52.0,2180.0,2014 - 2016,2180-09-09,125.0,0,White,46-65,10000032_33258284,"[abd pain, abd distention]","abd pain, abd distention"
1,10000032,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,F,WHITE,Abdominal distention,52.0,2180.0,2014 - 2016,2180-09-09,74.0,0,White,46-65,10000032_38112554,[abd distention],abd distention
2,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,2180-09-09,34.0,0,White,46-65,10000032_35968195,"[n/v/d, abd pain]","nausea and vomiting, diarrhea, abd pain"
3,10000032,32952584,2180-07-22 16:24:00,2180-07-23 05:54:00,F,WHITE,Hypotension,52.0,2180.0,2014 - 2016,2180-09-09,48.0,0,White,46-65,10000032_32952584,[hypotension],hypotension
4,10000032,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,"Abdominal distention, Abd pain, LETHAGIC",52.0,2180.0,2014 - 2016,2180-09-09,47.0,0,White,46-65,10000032_39399961,"[abd distention, abd pain, lethagic]","abd distention, abd pain, lethagic"
5,10000084,35203156,2160-11-20 20:36:00,2160-11-21 03:20:00,M,WHITE,"Confusion, Hallucinations",72.0,2160.0,2017 - 2019,2161-02-13,84.0,0,White,66-80,10000084_35203156,"[confusion, hallucinations]","confusion, hallucinations"
6,10000084,36954971,2160-12-27 18:32:00,2160-12-28 16:07:00,M,WHITE,"Altered mental status, B Pedal edema",72.0,2160.0,2017 - 2019,2161-02-13,47.0,0,White,66-80,10000084_36954971,"[altered mental status, b pedal edema]","altered mental status, b pedal edema"
7,10000108,36533795,2163-09-27 16:18:00,2163-09-28 09:04:00,M,WHITE,"LEFT CHEEK SWELLING, Abscess",25.0,2163.0,2014 - 2016,,,0,White,18-30,10000108_36533795,"[left cheek swelling, abscess]","left cheek swelling, abscess"
8,10000108,32522732,2163-09-16 16:34:00,2163-09-16 18:13:00,M,WHITE,L CHEEK ABSCESS,25.0,2163.0,2014 - 2016,,,0,White,18-30,10000108_32522732,[l cheek abscess],l cheek abscess
9,10000108,39513268,2163-09-24 16:14:00,2163-09-24 21:02:00,M,WHITE,L FACIAL SWELLING,25.0,2163.0,2014 - 2016,,,0,White,18-30,10000108_39513268,[l facial swelling],l facial swelling


In [81]:
print(data[['terms', 'terms_new']].head(10))  # to focus on these two columns

                                    terms  \
0              [abd pain, abd distention]   
1                        [abd distention]   
2                       [n/v/d, abd pain]   
3                           [hypotension]   
4    [abd distention, abd pain, lethagic]   
5             [confusion, hallucinations]   
6  [altered mental status, b pedal edema]   
7          [left cheek swelling, abscess]   
8                       [l cheek abscess]   
9                     [l facial swelling]   

                                 terms_new  
0                 abd pain, abd distention  
1                           abd distention  
2  nausea and vomiting, diarrhea, abd pain  
3                              hypotension  
4       abd distention, abd pain, lethagic  
5                confusion, hallucinations  
6     altered mental status, b pedal edema  
7             left cheek swelling, abscess  
8                          l cheek abscess  
9                        l facial swelling  


In [84]:
'''

# Step 4: Explode terms into separate rows
data = data.explode('terms')  # in-place replacement of 'data'

# Step 5: Rename the exploded column
data = data.rename(columns={'terms': 'indiv_symptoms'})

# Step 6: Create 'stayed_id_num' as <stay_id>_<symptom_index>
data['symptom_index'] = data.groupby('stay_id').cumcount() + 1
data['stayed_id_num'] = data['stay_id'].astype(str) + '_' + data['symptom_index'].astype(str)

'''

"\n\n# Step 4: Explode terms into separate rows\ndata = data.explode('terms')  # in-place replacement of 'data'\n\n# Step 5: Rename the exploded column\ndata = data.rename(columns={'terms': 'indiv_symptoms'})\n\n# Step 6: Create 'stayed_id_num' as <stay_id>_<symptom_index>\ndata['symptom_index'] = data.groupby('stay_id').cumcount() + 1\ndata['stayed_id_num'] = data['stay_id'].astype(str) + '_' + data['symptom_index'].astype(str)\n\n"

In [88]:

exploded_rows = []

for _, row in data.iterrows():
    symptoms_str = row['terms_new']
    
    if pd.isna(symptoms_str):
        continue

    symptoms_list = [s.strip() for s in symptoms_str.split(',') if s.strip()]

    for idx, symptom in enumerate(symptoms_list, start=1):
        new_row = row.copy()
        new_row['indiv_symptom'] = symptom
        new_row['counter'] = idx
        new_row['unique_ids_exploded'] = f"{row['stay_id']}_{idx}"
        exploded_rows.append(new_row)

# Overwrite the original dataset
data = pd.DataFrame(exploded_rows)

# Save to CSV
data.to_csv("dataset/ed/finals/10_exploded_symptom_data.csv", index=False)


In [89]:
data.head(22)

Unnamed: 0,subject_id,stay_id,intime,outtime,gender,race,chiefcomplaint,anchor_age,anchor_year,anchor_year_group,...,dead_in_days,died_within_30_days,race_standard,age_group,unique_visit_id,terms,terms_new,indiv_symptom,counter,unique_ids_exploded
0,10000032,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,"Abd pain, Abdominal distention",52.0,2180.0,2014 - 2016,...,125.0,0,White,46-65,10000032_33258284,"[abd pain, abd distention]","abd pain, abd distention",abd pain,1,33258284_1
0,10000032,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,"Abd pain, Abdominal distention",52.0,2180.0,2014 - 2016,...,125.0,0,White,46-65,10000032_33258284,"[abd pain, abd distention]","abd pain, abd distention",abd distention,2,33258284_2
1,10000032,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,F,WHITE,Abdominal distention,52.0,2180.0,2014 - 2016,...,74.0,0,White,46-65,10000032_38112554,[abd distention],abd distention,abd distention,1,38112554_1
2,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,...,34.0,0,White,46-65,10000032_35968195,"[n/v/d, abd pain]","nausea and vomiting, diarrhea, abd pain",nausea and vomiting,1,35968195_1
2,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,...,34.0,0,White,46-65,10000032_35968195,"[n/v/d, abd pain]","nausea and vomiting, diarrhea, abd pain",diarrhea,2,35968195_2
2,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,...,34.0,0,White,46-65,10000032_35968195,"[n/v/d, abd pain]","nausea and vomiting, diarrhea, abd pain",abd pain,3,35968195_3
3,10000032,32952584,2180-07-22 16:24:00,2180-07-23 05:54:00,F,WHITE,Hypotension,52.0,2180.0,2014 - 2016,...,48.0,0,White,46-65,10000032_32952584,[hypotension],hypotension,hypotension,1,32952584_1
4,10000032,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,"Abdominal distention, Abd pain, LETHAGIC",52.0,2180.0,2014 - 2016,...,47.0,0,White,46-65,10000032_39399961,"[abd distention, abd pain, lethagic]","abd distention, abd pain, lethagic",abd distention,1,39399961_1
4,10000032,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,"Abdominal distention, Abd pain, LETHAGIC",52.0,2180.0,2014 - 2016,...,47.0,0,White,46-65,10000032_39399961,"[abd distention, abd pain, lethagic]","abd distention, abd pain, lethagic",abd pain,2,39399961_2
4,10000032,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,"Abdominal distention, Abd pain, LETHAGIC",52.0,2180.0,2014 - 2016,...,47.0,0,White,46-65,10000032_39399961,"[abd distention, abd pain, lethagic]","abd distention, abd pain, lethagic",lethagic,3,39399961_3


In [90]:
# Filter rows where indiv_symptom is less than 5 characters (excluding whitespace)
short_symptoms = data[data['indiv_symptom'].str.strip().str.len() < 5]

# Get unique short terms as a list
short_term_list = sorted(short_symptoms['indiv_symptom'].str.strip().str.lower().unique().tolist())

# Show number and Python list format
print("number of short_terms less than 5 characters is:", len(short_term_list))
print("📋 Unique short symptom terms (< 5 characters):")
print(short_term_list)  # ✅ Print as Python list

# Print each term on a new line (optional)
#print("🩺 Short symptom terms (each on a line):")
#for term in short_term_list:
#    print("-", term)

# Save list to a text file
#with open('dataset/ed/finals/11_short_term_list.txt', 'w') as f:
#    for term in short_term_list:
#        f.write(term + '\n')


number of short_terms less than 5 characters is: 353
📋 Unique short symptom terms (< 5 characters):
['-', '--', '.', '.si', '/', '/___', '/cp', '0', '1', '150', '2', '21', '353', '\\', '___', '___.', 'aaa', 'aar', 'ab', 'abd', 'ac', 'achy', 'af', 'afb', 'afib', 'ah', 'aki', 'al', 'am', 'ams', 'amy', 'anx', 'ap', 'appe', 'appy', 'arf', 'arm', 'aura', 'auto', 'avm', 'b', 'back', 'bc', 'bil', 'bite', 'bl', 'ble', 'bmt', 'boil', 'bp', 'bpr', 'buns', 'burn', 'c.p', 'c.p.', 'c/p', 'c1', 'c1fx', 'c2fx', 'c5fx', 'c6fx', 'ca', 'calf', 'ch', 'chb', 'chf', 'chi', 'chol', 'chp', 'cll', 'clot', 'cm', 'code', 'cold', 'copd', 'cp', 'cp..', 'cp/', 'cpr', 'crf', 'cri', 'cva', 'cyst', 'd', 'dark', 'dia', 'dic', 'dka', 'dm', 'dms', 'doe', 'dvt', 'dvts', 'dysu', 'dz', 'e', 'ea', 'ear', 'ect', 'eohs', 'ercp', 'ersd', 'esld', 'etoh', 'ett', 'eval', 'eye', 'f', 'f/c', 'face', 'fall', 'fast', 'fb', 'fbe', 'fell', 'fev', 'flu', 'foot', 'ftt', 'fuo', 'fx', 'geni', 'gerd', 'gi', 'gib', 'gout', 'gsw', 'gu', 'gyn'

In [91]:

'''
# Filter rows where indiv_symptom is less than 5 characters (excluding whitespace)
short_symptoms = data[data['indiv_symptom'].str.strip().str.len() < 5]

# Get only the unique short terms (optional, for summary)
unique_short_terms = short_symptoms['indiv_symptom'].str.strip().str.lower().unique()

# Display them
print("number ofshort_terms", len(unique_short_terms))


print("🩺 Short symptom terms (less than 5 characters):")
for term in unique_short_terms:
    print("-", term)


# Save list to a text file
with open('dataset/ed/finals/short_term_list.txt', 'w') as f:
    for term in short_term_list:
        f.write(term + '\n')

'''


'\n# Filter rows where indiv_symptom is less than 5 characters (excluding whitespace)\nshort_symptoms = data[data[\'indiv_symptom\'].str.strip().str.len() < 5]\n\n# Get only the unique short terms (optional, for summary)\nunique_short_terms = short_symptoms[\'indiv_symptom\'].str.strip().str.lower().unique()\n\n# Display them\nprint("number ofshort_terms", len(unique_short_terms))\n\n\nprint("🩺 Short symptom terms (less than 5 characters):")\nfor term in unique_short_terms:\n    print("-", term)\n\n\n# Save list to a text file\nwith open(\'dataset/ed/finals/short_term_list.txt\', \'w\') as f:\n    for term in short_term_list:\n        f.write(term + \'\n\')\n\n'

In [92]:


# Save short terms data to CSV
short_symptoms.to_csv('dataset/ed/finals/11_short_symptom_5char_terms.csv', index=False)

'''
# Display the full DataFrame rows
import pandas as pd
pd.set_option('display.max_rows', None)  # Optional: show all rows
pd.set_option('display.max_columns', None)  # Optional: show all columns
pd.set_option('display.width', None)  # Optional: avoid column wrapping
pd.set_option('display.max_colwidth', None)  # Optional: show full text in each column

# Show the DataFrame
short_symptom_rows
'''



"\n# Display the full DataFrame rows\nimport pandas as pd\npd.set_option('display.max_rows', None)  # Optional: show all rows\npd.set_option('display.max_columns', None)  # Optional: show all columns\npd.set_option('display.width', None)  # Optional: avoid column wrapping\npd.set_option('display.max_colwidth', None)  # Optional: show full text in each column\n\n# Show the DataFrame\nshort_symptom_rows\n"

In [93]:
short_symptoms.head(20)

Unnamed: 0,subject_id,stay_id,intime,outtime,gender,race,chiefcomplaint,anchor_age,anchor_year,anchor_year_group,...,dead_in_days,died_within_30_days,race_standard,age_group,unique_visit_id,terms,terms_new,indiv_symptom,counter,unique_ids_exploded
19,10000473,33267868,2138-03-15 20:07:00,2138-03-16 04:04:00,M,ASIAN - SOUTH EAST ASIAN,ILI,81.0,2138.0,2017 - 2019,...,,0,Asian,80+,10000473_33267868,[ili],ili,ili,1,33267868_1
21,10000507,31021946,2151-07-02 03:31:00,2151-07-02 06:45:00,M,WHITE,ETOH,19.0,2151.0,2017 - 2019,...,,0,White,18-30,10000507_31021946,[etoh],etoh,etoh,1,31021946_1
22,10000526,31939255,2160-06-12 19:02:00,2160-06-12 20:33:00,F,UNKNOWN,ILI,30.0,2160.0,2014 - 2016,...,,0,Other/Unknown/Not Reported,18-30,10000526_31939255,[ili],ili,ili,1,31939255_1
39,10000980,34277585,2188-01-03 12:23:00,2188-01-03 18:42:00,F,BLACK/AFRICAN AMERICAN,SOB,73.0,2186.0,2008 - 2010,...,2061.0,0,Black,66-80,10000980_34277585,[sob],sob,sob,1,34277585_1
47,10001038,34301067,2149-08-07 08:51:00,2149-08-07 11:02:00,M,WHITE,ILI,20.0,2149.0,2014 - 2016,...,,0,White,18-30,10001038_34301067,[ili],ili,ili,1,34301067_1
63,10001629,34377937,2175-07-28 12:26:00,2175-07-28 16:20:00,F,WHITE,___,29.0,2175.0,2011 - 2013,...,,0,White,18-30,10001629_34377937,[___],___,___,1,34377937_1
105,10002177,37215132,2176-07-19 12:06:00,2176-07-19 15:52:00,F,OTHER,"ILI, Sore throat, Cough",20.0,2176.0,2014 - 2016,...,,0,Other/Unknown/Not Reported,18-30,10002177_37215132,"[ili, sore throat, cough]","ili, sore throat, cough",ili,1,37215132_1
110,10002315,33941204,2161-03-23 17:53:00,2161-03-24 01:39:00,M,WHITE,SI,27.0,2161.0,2017 - 2019,...,,0,White,18-30,10002315_33941204,[si],si,si,1,33941204_1
119,10002430,31293660,2125-09-28 15:31:00,2125-09-28 22:22:00,M,WHITE,SOB,86.0,2125.0,2014 - 2016,...,1565.0,0,White,80+,10002430_31293660,[sob],sob,sob,1,31293660_1
134,10002557,39824070,2156-08-24 12:09:00,2156-08-24 14:42:00,F,WHITE - RUSSIAN,Rash,75.0,2145.0,2008 - 2010,...,1304.0,0,White,66-80,10002557_39824070,[rash],rash,rash,1,39824070_1


In [94]:

# Filter rows where indiv_symptom is less than 5 characters (excluding whitespace)
short_symptoms_2 = data[data['indiv_symptom'].str.strip().str.len() < 8]

# Get unique short terms as a list
short_term_list_2 = sorted(short_symptoms_2['indiv_symptom'].str.strip().str.lower().unique().tolist())


# Show number and Python list format
print("number of short_terms less than 8 characters:", len(short_term_list_2))
print("📋 Unique short symptom terms (< 8 characters):")
print(short_term_list_2)  # ✅ Print as Python list



# Save short terms data to CSV
short_symptoms_2.to_csv('dataset/ed/finals/12_short_symptom_8char_terms.csv', index=False)



# Display them
#print("number ofshort_terms", len(unique_short_terms))
#print("🩺 Short symptom terms (less than 7 characters):")
#for term in unique_short_terms:
#    print("-", term)


# Filter rows where indiv_symptom is less than 5 characters (excluding whitespace)
#short_symptoms = data[data['indiv_symptom'].str.strip().str.len() < 7]

# Get only the unique short terms (optional, for summary)
#unique_short_terms = short_symptoms['indiv_symptom'].str.strip().str.lower().unique()

# Display them
#print("number ofshort_terms", len(unique_short_terms))
#print("🩺 Short symptom terms (less than 7 characters):")
#for term in unique_short_terms:
#    print("-", term)


# Save list to a text file
#with open('dataset/ed/finals/short_term_list.txt', 'w') as f:
#    for term in short_term_list:
#        f.write(term + '\n')





number of short_terms less than 8 characters: 1311
📋 Unique short symptom terms (< 8 characters):
['!anemia', '& n&v', '-', '--', '.', '.si', '/', '/___', '/cp', '/o uti', '0', '1', '10 ft', '12 fall', '15 fall', '150', '2', '20 fall', '20fall', '21', '24 fall', '25 fall', '25fall', '353', '8 fall', '\\', '___', '___ ___', '___ n/v', '___ od', '___ s.i', '___ si', '___- si', '___-fx', '___.', '___/ si', '___/hi', '___/n/v', '___/si', '___/sob', 'a fib', 'a-fib', 'a. fib', 'a/o x3', 'aaa', 'aar', 'ab', 'ab pain', 'abcess', 'abd', 'abd lac', 'abd mri', 'abd n/v', 'abd ttp', 'abd/cp', 'abdpain', 'abn ct', 'abn cta', 'abn cxr', 'abn eeg', 'abn ekg', 'abn mre', 'abn mri', 'abn u/s', 'abn vbg', 'abnl bx', 'abnl ct', 'abnl mr', 'abnl us', 'absces', 'abscess', 'ac', 'aches', 'achy', 'acities', 'admit', 'af', 'afb', 'afib', 'again', 'ah', 'ah/si', 'ah/vh', 'airway', 'aki', 'al', 'all rxn', 'allergi', 'alt ms', 'alt.ms', 'altered', 'am', 'amnesia', 'ams', 'ams/ ha', 'amy', 'anaemia', 'anemia', '

In [95]:
# Step 1: Extract and clean unique terms
lookup_terms = set(look_up['term'].astype(str).str.strip().str.lower().unique())

# Step 2: Sort them for easier viewing
sorted_lookup_terms = sorted(lookup_terms)

# Step 3: Print all
for i, term in enumerate(sorted_lookup_terms, start=1):
    print(f"{i}. {term}")

# Optional: Check count
print(f"\nTotal terms: {len(sorted_lookup_terms)}")


1. 24 hr blood pressure monitoring
2. abdominal pain
3. abnormal heart beat
4. abscess
5. absences
6. accident caused by diving or jumping into water
7. accident caused by fireworks
8. accident caused by lightning
9. accidents caused by electric current
10. acute abdomen
11. acute pain
12. acute stress disorder
13. addiction
14. addison's disease
15. administration of rabies vaccine
16. adrenal cortical hypofunction
17. alcohol influence
18. alcohol intoxication
19. allergic condition
20. ambulatory surgery
21. anaphylaxis
22. anorectal problem
23. anxiety
24. apnea
25. ascites
26. assessment following possible exposure to contagion
27. assessment of miscarriage
28. asthma
29. attention to plaster cast
30. auditory dysfunction
31. back problem
32. backache
33. balanitis
34. bartholinitis
35. battery
36. bends
37. bite
38. bite of animal
39. bite of insect
40. bladder retention of urine
41. bleeding
42. bleeding from nose
43. bleeding in mouth or pharynx
44. bleeding of pharynx
45. blis

In [96]:
symptom_expand_map = {
    'depression/suicidal/deliberate self harm': ['depressed mood', 'suicide risk assessment', 'intentional self-harm'],
    'vomiting and/or nausea':['nausea and vomiting'],
    'ha/htn post partum': ['headache', 'hypertension', 'postpartum state'],
    'htn nausea and dizzy': ['hypertension', 'nausea', 'dizziness'],
    'htn , abd pain': ['hypertension', 'abdominal pain'],

      
    'n/v/d': ['nausea and vomiting', 'diarrhea'],
    'n&v&d': ['nausea and vomiting', 'diarrhea'],
    'nvd': ['nausea and vomiting', 'diarrhea'], 
    'n/vd': ['nausea and vomiting', 'diarrhea'],
    'n/d/v': ['nausea, diarrhea, vomiting'],
    'n/v /d': ['nausea, vomiting, diarrhea'],
    'n/v//d': ['nausea and vomiting', 'diarrhea'],
    'n/v/d/': ['nausea and vomiting', 'diarrhea'],
    'nausea/vomiting/diarrhea':  ['nausea and vomiting', 'diarrhea'],

    'vomiting and nausea': ['nausea and vomiting'],
    'vomitting and nausea': ['nausea and vomiting'],
    'nausea and vomitting': ['nausea and vomiting'],
    'nausea/diarrhea':['nausea', 'diarrhea'],
    'vomiting/diarrhea': ['vomiting', 'diarrhea'],
    'n/v': ['nausea and vomiting'],
    'v/n': ['nausea and vomiting'],
    'n&v': ['nausea and vomiting'],
    'nv': ['nausea and vomiting'],
    'n.v': ['nausea and vomiting'],
    'n//v': ['nausea and vomiting'],
    'n./v': ['nausea and vomiting'],
    'n/v/': ['nausea and vomiting'],
    'n / v': ['nausea and vomiting'],
    'vomiting/nausea': ['nausea and vomiting'],
    'nausea/vomiting': ['nausea and vomiting'],
    '& n&v': ['nausea and vomiting'],
    '___ n/v': ['nausea and vomiting'],
    '___/n/v': ['nausea and vomiting'],

    
    'v/d':['vomiting', 'diarrhea'],
    
    'n/d':['nausea , diarrhea'],
   
    '___- n/v/cramps': ['nausea and vomiting'],
    '___ pain/ vomiting': ['abdominal pain', 'vomiting'],
    '___/ abd pain vag dc': ['abdominal pain'],

    'n/v abd': ['nausea and vomiting','abdominal pain'],
    'abd pain with n/v': ['nausea and vomiting','abdominal pain'],
    'abd pain/ n/v': ['nausea and vomiting','abdominal pain'],
    'abd pain. n/v': ['nausea and vomiting','abdominal pain'],
    'abd pain n/v/d': ['nausea and vomiting', 'diarrhea','abdominal pain'],
    'abd pain nausea': ['nausea','abdominal pain'],
    
 
    'n/d/ha': ['nausea', 'diarrhea', 'headache'],
    'n/v ha': ['nausea and vomiting', 'headache'],
    'n/v/h/a':['nausea and vomiting', 'headache'],
    'n/v/ha': ['nausea and vomiting', 'headache'],

    'n/v cp': ['nausea and vomiting', 'chest pain'],
    'abd and chest pain': ['abdominal pain', 'chest pain'],
    'n/v/d and abd pain': ['nausea and vomiting', 'diarrhea', 'abdominal pain'],
    'n/v/d and cp': ['nausea and vomiting', 'diarrhea', 'chest pain'],
    
    'n/v sob': ['nausea and vomiting','dyspnea'],
    'n/v/sob': ['nausea, vomiting', 'dyspnea'],

    'n':['nausea'],
    'nasuea':['nausea'],
    'nauea': ['nausea'],
    'nausa': ['nausea'],
    'nausea': ['nausea'],
    'nausea.': ['nausea'],
    'n/': ['nausea'],

    'v':['vomiting'],
    'v/': ['vomiting'],
    '___-vomiting':['vomiting'],


    'd':['diarrhea'],  
    'diarrhoea': ['diarrhea'], 
    'diahrea': ['diarrhea'],
    'diarhea': ['diarrhea'],
    'diarr': ['diarrhea'],
 
    'constipated': ['constipation'],

    'abd n/v': ['abdominal pain', 'nausea and vomiting'],
    'abd pain/n/v':['abdominal pain', 'nausea and vomiting'],
    'ap/n/v': ['abdominal pain', 'nausea and vomiting'],
    'nausea/diarrhea/abd pain': ['nausea', 'diarrhea', 'abdominal pain'],



    'shortness of breath/abd pain':['shortness of breath, abd pain'],

    'body aches/constipation': ['body aches', 'constipation'],


    'cp': ['chest pain'],
    'c.p': ['chest pain'],
    'c/p': ['chest pain'],
    '/cp': ['chest pain'],

    'cp /sob': ['chest pain', 'shortness of breath'],
    'cp h/a': ['chest pain',' headache'],
    'cp sob': ['chest pain', 'shortness of breath'],
    'cp svt': ['chest pain', 'supraventricular tachycardia'],
    'cp..': ['chest pain'], 
    'cp./sob': ['chest pain', 'shortness of breath'],
    'cp/': ['chest pain'],
    'cp/ sob': ['chest pain', 'shortness of breath'],
    'cp/___': ['chest pain'],
    'cp/afib': ['chest pain', 'atrial fibrillation'],
    'cp/dka': ['chest pain', 'diabetic ketoacidosis'],
    'cp/etoh': ['chest pain', 'alcohol intoxication'],
    'cp/ha': ['chest pain', 'headache'],
    'cp/htn': ['chest pain', 'hypertension'],
    'cp/ili': ['chest pain', 'iliac region pain'],
    #'cp/ili': ['chest pain', 'iliac region pain (or unclear term "ili")', '29857009'],  # "ili" may need clarification
    'cp/n/v': ['chest pain', 'nausea and vomiting'],
    'cp/pna': ['chest pain', 'pneumonia'],
    'cp/sob': ['chest pain', 'dyspnea'],

    'chest discomfort': ['chest pain'],
    'central pain': ['central chest pain'],

    'ha': ['headache'],
    'h/a': ['headache'],
    'head pain': ['headache'],
    'ha persists': ['headache'], 
    'h/a cp': ['headache', 'chest pain'],
    'h/a cva': ['headache', 'stroke'],
    'h/a htn': ['headache', 'hypertension'],
    'h/a lbp': ['headache', 'low back pain'],

    'h/a n/v': ['headache', 'nausea and vomiting'],
    'h/a sob': ['headache', 'shortness of breath'],
    'h/a st': ['headache', 'sore throat'],
    'h/a/cp': ['headache', 'chest pain'],
    'h/a/htn': ['headache', 'hypertension'],
    'h/as': ['headaches'],
    'ha.cp': ['headache', 'chest pain'],
    'ha/ cp': ['headache', 'chest pain'],
    




    'abd pain': ['abdominal pain'],
    'abd': ['abdominal pain'],
    'abdpain': ['abdominal pain'],
    'abdo pain': ['abdominal pain'],
    'stomach ache': ['abdominal pain'],
    'epigastric pain': ['abdominal pain'],
    'flank pain': ['flank pain'],
    'abd pain /bilat pe': ['abd pain', 'bilat pe'],
    
    'gi bleed': ['gastrointestinal hemorrhage'],
    'gu pain': ['genitourinary pain'],
    'gu': ['genitourinary symptoms'],
    'gu eval': ['evaluation of genitourinary system'],
    'gu evaluation': ['evaluation of genitourinary system'],

    'uti': ['urinary tract infectious disease'],
    'urinary infection': ['urinary tract infectious disease'],

    'dvt': ['deep venous thrombosis'],
    'leg clot': ['deep venous thrombosis'],

    'fainted': ['syncope'],
    'syncopal episode': ['syncope'],

    'hypoglycemia': ['hypoglycemic state in diabetes'],
    'low sugar': ['hypoglycemic state in diabetes'],

    'rash': ['eruption'],
    'skin rash': ['eruption'],
    'hives': ['eruption'],

    'fever': ['fever'],
    'fev': ['fever'],    
    'febrile': ['fever'],
    'high temperature': ['fever'],

    'back pain': ['backache'],
    'leg pain': ['pain in limb'],
    'arm pain': ['pain in limb'],
    'shoulder pain': ['pain in shoulder'],

    'blurred vision': ['visual disturbance'],
    'vision loss': ['visual disturbance'],

    'numbness': ['paresthesia'],
    'tingling': ['paresthesia'],

    'confused': ['confusional state'],
    'weak': ['muscle weakness'],
    'tired': ['fatigue'],

    'bleeding': ['bleeding'],
    'nosebleed': ['bleeding from nose'],
    'blood in vomit': ['hematemesis'],
    'blood in urine': ['blood in urine'],
    'blood in stool': ['blood in feces symptom'],


    'injury': ['traumatic injury'],
    'trauma': ['traumatic injury'],
    'fall': ['fall'],
    's/p': ['fall'],  
    # status post fall
    'accident': ['victim of vehicular and_or traffic accident'],

    'pregnant': ['pregnancy problem'],
    'delivery': ['human parturition, function'],

    'sad': ['depressed mood'],
    'suicidal': ['suicide attempt'],
    'depression': ['depressed mood'],

    'burns': ['burn'],
    'seizure': ['seizure undetermined whether focal or generalized'],
    'convulsion': ['febrile convulsion'],

    'cold': ['common cold'],
    'sore throat': ['sore throat'],
    'coughing': ['cough'],

    #shortness of breath
    'dypnea': ['dyspnea'],
    'dypsnea': ['dyspnea'],
    'dysnea': ['dyspnea'],
    'dsypnea': ['dyspnea'],
    'dypsnea': ['dyspnea'],
    'dyspena': ['dyspnea'],    
    'sob': ['dyspnea'],
    'dyspnea': ['dyspnea'],
    'breathlessness': ['dyspnea'],

    #(painful urination)
    'dysu':  ['dysuria'],
    'dysuira':  ['dysuria'],
    'dysurea': ['dysuria'],
    'dysuria': ['dysuria'],
    'dsyuria': ['dysuria'],

    'ect': ['ectopic pregnancy'],
    'ect tx': ['ectopic pregnancy'],
    'ectopic': ['ectopic pregnancy'],
    #'ectasy': ['MDMA use / Ecstasy overdose'],
    'eczema': ['eczema'],
    #swelling
    'edema': ['edema'],


    'dizzy': ['dizziness'],
    '___/dizziness': ['dizziness'],
    'dizzy..': ['dizziness'],
    'dizzy/': ['dizziness'],
    'dizzy and vomiting': ['dizziness', 'vomiting'],

    'ear pain': ['otalgia'],
    'throat pain': ['sore throat'],
    'eye pain': ['problem of eye'],
    'jaw pain': ['problem of jaw'],

    'wound eval': ['wound evaluation'],

    
    'rlq': ['abdominal pain'],
    'llq': ['abdominal pain'],
    'luq': ['abdominal pain'],
    'ruq': ['abdominal pain'],
    
    'bp': ['blood pressure'], 
    'tia': ['transient ischemic attack'],


    'pna': ['pneumonia'],
    'pnx': ['pneumonia'],
    'pna ___': ['Pneumonia'],
    'pna/arf': ['Pneumonia', 'acute respiratory failure'],
    #'pneumo': ['Pneumothorax or Pneumonia (needs context)', '67782005'],  # Assuming pneumothorax
    #'pnx': ['Pneumothorax', '67782005']


    'htn': ['hypertension'],
    'af': ['atrial fibrillation'],
    'pe': ['pulmonary embolism'],
    'dka': ['diabetic ketoacidosis'],
    'loc': ['loss of consciousness'],
    'ftt': ['failure to thrive'],
    'ili': ['influenza-like illness'],  
    'ukn': ['Unknown'],
    'fx': ['Fracture'],
    'lbp': ['backache'],
    'bp eVAL': ['evaluation of blood pressure'],
    #'bp':['Blood pressure']
    #'bp':['']
    'mvc': ['motor vehicle accident'],

    'cva': ['cerebrovascular accident'],
    'cva / tpa': ['cerebrovascular accident', 'tissue plasminogen activator treatment'],
    #73899007 
    #snomed code= 95891005 or  103001002   check it by yourself  !!!!!
    # OD	Drug overdose (likely self-harm):	703442003 (intentional)/405612005 (accidental)	*Depends on clinical notes/context

    'od': ['overdose'],
    'heroin od': ['heroin overdose'],
    'accidental od': ['accidental overdose'],

    'gib': ['gastrointestinal bleed'],
    #'ah': ['failure to thrive'],
    'sdh': ['subdural hematoma'],  
    #Subdural hematoma (disorder) → SNOMED CT Code: 95435005

    'aaa': ['abdominal aortic aneurysm'], 
    #233985008 



    's.a.': ['suicide attempt'],          
    #'suicidal ideation': ['SI', 'S.I', 'SI WITH PLAN', 'SI / ETOH', 'SI DEPRESSION', 'S.I COMBATIVE', 'S.I / DETOX'],
    'si': ['suicidal ideation'],   
    #6471006	
    's.i': ['suicidal ideation'],
    'si with plan': ['suicidal ideation'],
    'si w plan': ['suicidal ideation'],
    'si/': ['suicidal ideation'],
    '.si': ['suicidal ideation'],
    's.i.': ['suicidal ideation'],
    '___- si': ['suicidal ideation'],
    '___ si': ['suicidal ideation'],
    '___/ si': ['suicidal ideation'],
    '___/si': ['suicidal ideation'],
    '___- si hi': ['suicidal ideation', 'homicidal ideation'],
    's.i ___': ['suicidal ideation'],
    'si hi': ['suicidal ideation', 'homicidal ideation'],
    'si- ah': ['suicidal ideation', 'auditory hallucinations'],
    'si/ ah':  ['suicidal ideation', 'auditory hallucinations'],
    'si/ hi':['suicidal ideation', 'homicidal ideation'],
    'si/ od': ['suicidal ideation', 'overdose'],
    'si/ sa': ['suicidal ideation', 'suicide attempt'],
    'si/___': ['suicidal ideation'],
    'si/ah':  ['suicidal ideation', 'auditory hallucinations'],
    'si/cp': ['suicidal ideation', 'chest pain'],



    's.t .': ['sore throat'],
    's.t.': ['sore throat'],


    'hi': ['homicidal ideation'],
    '___/hi': ['homicidal ideation'],

    'od/ si': ['overdose', 'suicidal ideation'],
    'od/sa': ['overdose', 'suicide attempt'],
    'od/si': ['overdose','suicidal ideation'],


    #'etoh': ['alcoholism']  #7200002 – Alcoholism (disorder)
    #'etoh': ['alcohol intoxication']  #191816009 – Alcohol intoxication (disorder)
    #ingeneral:   ETOH → Alcohol use / intoxication

            
    #'od':	Overdose or Right eye (oculus dexter)
    #'ah': Atrial hypertrophy or auditory hallucination
    #SSCP (Substernal Chest Pain)	Retrosternal chest pain (finding)	28770003

    '!anemia': ['anemia'], 
    'ah': ['auditory hallucinations'],  
    'ah/si': ['auditory hallucinations', 'suicidal ideation'], 
    'ah/vh': ['auditory hallucinations', 'visual hallucinations'],  
    'vh/ah': ['visual hallucinations', 'auditory hallucinations'], 



    'ams': ['altered mental status'],
    'ams/ ha': ['altered mental status', 'headache'],
    #'ah': ['auditory hallucinations', '279039007'],
    #'ah/si': ['auditory hallucinations, suicidal ideation', '279039007, 35489007'],
    #'ah/vh': ['auditory hallucinations, visual hallucinations', '279039007, 30087000'],


    'anemic': ['anemia'],
    'anaemia': ['anemia'], 
    'anemia': ['anemia'],

    'anxiety': ['Anxiety'],
    'anxious': ['Anxiety'],
    'anxity': ['Anxiety'],
    'anxoius': ['Anxiety'],
    'anx': ['Anxiety'],
    'apnea': ['apnea'],

    'ascites': ['ascites'],
    'ascitis': ['ascites'],


    '___- hypotension': ['hypotension'],
    '___/ depression/si': ['depression','suicidal ideation'],
    '___ & body pain': ['body pain'],
    '___/ r hand numbness r/o comp syndrome': ['r hand numbness r/o comp syndrome'],
    '/ infection': ['infection'], 
    'infx': ['infection'], 

    #'b/l pe': ['bilateral pulmonary embolism', '59282003']
    #'back': ['back pain', '279039007'],
    'back fx': ['back fracture'],  
    # more generally "fracture of spine"
    'bad h/a': ['severe headache'],  
    # SNOMED for headache
    'bat bit': ['bat bite'],  
    # animal bite, specific to bat


    'bc': ['birth control'],  
    # assumed context (birth control), or blood culture
    'bedtime': ['problem at bedtime/sleep disturbance'],  
    # or insomnia related
    'bi pap': ['bilevel positive airway pressure support'], 
    # BiPAP support in respiratory distress


    #sweating
    'diaph': ['diaphoresis'],
    #DIC
    'dic': ['disseminated Intravascular Coagulation'],

    'itch': ['itching'],
    'itching': ['itching'],
    'itchy': ['itching'],


    'htn/cp': ['hypertension', 'chest pain'],
    'htn/h/a': ['hypertension',  'headache'],
    'htn/ha': ['hypertension',  'headache'],
    'htn/sob': ['hypertension', 'dyspnea'],
    'ili cp': ['iliac region', 'chest pain'],
    'ili/ha': ['iliac pain',  'headache'],
    'ili/sob': ['iliac pain', 'dyspnea'],


    'head/cp': ['head pain', 'chest pain'],
    'headace': ['headache'],
    'hedache': ['headache'],


    'ha/cp': ['headache', 'chest pain'],
    'ha/etoh': ['headache', 'alcohol intoxication'],
    'ha/fall': ['headache', 'fall'],
    'ha/htn': ['headache', 'hypertension'],

    'ha/sob': ['headache', 'dyspnea'],
    'ha/st': ['headache', 'sore throat'],
    'ha/vom': ['headache', 'vomiting'],

    'gu sx': ['genitourinary symptoms'],
    'guaic': ['guaiac positive stool'],
    'guiac': ['guaiac positive stool'],
    'guiacc': ['Guaiac positive stool'],
    'gyn': ['gynecological complaint'],
    'h.i.': ['head injury'],


    'etoh': ['Alcohol use'],
    'etoh/cp': ['Alcohol use', 'chest pain'],
    'etoh/si': ['Alcohol use', ' suicidal ideation'],
    'mvc/h/a': ['Motor vehicle collision', 'headache'],


    'low hb': ['hemoglobin below reference range'],
    'low hct': ['Low hematocrit', '36582005'],
    'low hg': ['hemoglobin below reference range'],
    'low hgb': ['hemoglobin below reference range'],
    'low h/h': ['hemoglobin below reference range', 'hematocrit'],
    'lle dvt': ['deep vein thrombosis'],
    'low bp': ['hypotension'],
    'low bps': ['low blood pressures'],

    'lethagy': ['lethargy'],
    'lethary': ['lethargy'], 

    'si/ etoh': ['suicidal ideation', 'alcohol intoxication'],
    'si/etoh': ['suicidal ideation', 'alcohol intoxication'],
    'si/hi': ['suicidal', 'homicidal ideation'],
    'si/od': ['suicidal ideation', 'overdose'],
    'si/sa': ['suicidal ideation', 'suicide attempt'],



    #  "siatica": ["sciatica", "230265002"],
    #  "sick": ["feeling sick / general malaise", "162057007"],
    #  "sizure": ["seizure (misspelling of seizure)", "91175000"],
    #  "skin": ["skin complaint (general)", "399963005"],
    #  "skull": ["skull fracture", "20917003"],
    #  "sleepy": ["excessive drowsiness", "24823004"],
    #  "slipped": ["fall or slipped disc (context dependent)", "7895008 (fall), 367391008 (slipped disc)"],
    #  "slow hr": ["bradycardia (slow heart rate)", "48867003"],
    #  "snycope": ["syncope (misspelled)", "271594007"],
    

    'shortness of breath': ['dyspnea'],
    'sob /cp': ['dyspnea', 'chest pain'],
    'sob af': ['dyspnea', 'atrial fibrillation'],
    'sob cp': ['dyspnea', 'chest pain'],
    'sob ili': ['dyspnea', 'iliac pain'],
    'sob. cp': ['dyspnea', 'chest pain'],


    'sob/': ['dyspnea'],
    'sob/ cp': ['dyspnea', 'chest pain'],
    'sob/cp': ['dyspnea','chest pain'],
    'sob/doe': ['dyspnea', 'dyspnea on exertion'],
    'sob/dvt': ['dyspnea', 'deep vein thrombosis'],
    'sob/htn': ['dyspnea', 'hypertension'],
    'sob/pe': ['dyspnea', 'pulmonary embolism'],
    'sob/pna': ['dyspnea', 'pneumonia'],
    'sobbing': ['sobbing'],

    'ulcers': ['ulcer'],
    'throat': ['sore throat'],
    'st/ha': ['sore throat', 'hdyspnea'],
    'st/uri': ['sore throat', 'upper respiratory infection'],
    
    'spasms':['muscle spasms'],
    
    #'ssc': ['substernal chest pain (likely)	Possibly part of CP: 29857009
    'sscp':	['substernal chest pain'],
    #'st': ['Sore throat / Sinus tachycardia'] 
    #(context needed)	87979003 (sore throat) or 698247007 (sinus tachycardia)




    'sycopal': ['syncope'],
    'sycope':  ['syncope'],
    'syncope': ['syncope'],
    'syncopy': ['syncope'],
    'snycope': ['syncope'],
    'syncope/presyncope': ['syncope', 'presyncope'],
    'abd pain with syncope': ['abdominal pain', 'syncope'],
    'near sycope/abd pain': ['syncope', 'abdominal pain'],
    'headache/fever': ['headache', 'fever'],
    'chest pain/abd pain': ['chest pain', 'abdominal pain'],

    '___ meth today/cp': ['meth today', 'chest pain'],
    'cough/congestion': ['cough', 'congestion'],
    'dizziness/h/a/nausea':['dizziness', 'headache','nausea'],
    'anxiety/depression':['anxiety', 'depression'],
    'cp/nasal pain s/p mvc': ['chest pain','nasal pain s/p', 'mvc'],
    'nausea/weakness': ['nausea','weakness'],
    'weakness/vomiting': ['weakness', 'vomiting'],
    'vomiting s/p chemo': ['vomiting', 's/p chemo'],
    'dehydration/vomiting': ['dehydration', 'vomiting'],
    'ha/dizzy/ dissection': ['headache', 'dizziness',  'dissection'],
    'palpitation/flutter': ['palpitation', 'flutter'],
    'h/a bodyaches': ['headache', 'bodyaches'],
    'fatigue/nausea': ['fatigue','nausea'],
    'nausea, dizziness': ['nausea', 'dizziness'],
    'ha/vomiting s/p head injury': ['headache', 'vomiting', 's/p head injury'],
    'abd mass/ agitation': ['abd mass', 'agitation'],
    'chest tightness/dyspnea': ['chest tightness', 'dyspnea'],
    'vomiting and/or nausea/diarrhea': ['vomiting','nausea', 'diarrhea'],
    'chest pain/headache': ['chest pain', 'headache'],
    'chest pain/pna': ['chest pain', 'pneumonia'],
    'n/v/fever': ['nausea and vomiting', 'fever'],
    'fever/abd pain': ['fever', 'abdominal pain'],


    'dizzy and left eye blurry': ['dizziness' ,'left eye blurry'],
    'dizzy and weak':['dizziness' ,'weakness'],
    'chest and back pain':['chest pain' ,'back pain'],

    'vomiting and chills /hallucinated':['vomiting' , 'chills' ,'hallucinated'],
    'cp and palpatations': ['chest pain', 'palpatations'],
    'dizzy and blurred vision':['dizziness','blurred vision'],
    'n/v and flank pain':['nausea, vomiting' , 'flank pain'],
    'cp and back pain/nausea':['chest pain', 'back pain','nausea'],
    'fever and sob':['fever', 'dyspnea'],
    'fever and diarrhea':['fever' , 'diarrhea'],
    'cough and chest pain':['cough' , 'chest pain'],
    'weakness and dizziness':['weakness' , 'dizziness'],
    'chest pain and sob':['chest pain' , 'dyspnea'],
    'h/a and vomiting':['headache' , 'vomiting'],
    'n/v and cough':['nausea, vomiting' , 'cough'],
    'headache and dizziness':['headache','dizziness'],
    'chest and abd pain':['chest pain','abdominal pain'],

    
    'fevers & vomiting':['fever','vomiting'],
    'dizziness/ha/ syncope':['dizziness','headache','syncope'],
    'diarrhea/ hypotensive': ['diarrhea','hypotensive'],
    'sore throat/ dizziness': ['sore throat','dizziness'],
    'fever/ nausea': ['fever','nausea'],
    'abd pain / nausea': ['abdominal pain','nausea'],
    'ha / abd pain': ['headache','abdominal pain'],
    'syncope/ hit head': ['syncope','hit head'],
    'dizzieness/ sob':['dizziness','dyspnea'],
    'rlq pain/ dizziness':['abdominal pain','dizziness'],
    'abd pain/ nvd':['abdominal pain', 'nausea and vomiting', 'diarrhea'],
    'body aches/ diarrhea': ['body aches','diarrhea'],

    'hiccups/ n/v abd pain': ['hiccups', 'nausea and vomiting', 'abdominal pain'],
    'sob/ asthma flare': ['dyspnea', 'asthma flare'],
    'headache/ neck pain': ['headache', 'neck pain'],
    'neck/ back pain': ['neck pain', 'back pain'],
    'dyspnea/ gen weakness/facial numbness': ['dyspnea', 'generalized weakness', 'facial numbness'],
    'left leg pain/ htn/dizzy': ['left leg pain', 'hypertension', 'dizziness'],
    'left elbow injury/ polysubstance abuse': ['left elbow injury', 'polysubstance abuse'],
    'post op abd pain/ diarrhea': ['post-operative abdominal pain', 'diarrhea'],
    'abd / rlq pain': ['abdominal pain', 'right lower quadrant pain'],
    'n/v/d/ depression': ['nausea and vomiting', 'diarrhea', 'depression'],
    'weakness/ fever': ['weakness', 'fever'],
    'brbpr/ urinary pain': ['bright red blood per rectum', 'urinary pain'],
    'n/v/d/ abd pain': ['nausea and vomiting', 'diarrhea', 'abdominal pain'],
    'nasal congestion / hay fever': ['nasal congestion', 'hay fever'],
    'diarrhea/ abd pain': ['diarrhea', 'abdominal pain'],
    'weakness/ wound eval': ['weakness', 'wound evaluation'],
    'anxiety/ feels at risk': ['anxiety', 'feeling at risk'],
    'chest pain/ pressure': ['chest pain', 'chest pressure'],
    'hyperglycemia/ near syncope': ['hyperglycemia', 'near syncope'],
    'n/v lightheaded/ abd pain': ['nausea and vomiting', 'lightheadedness', 'abdominal pain'],
    'cp w/ l arm numbness': ['chest pain', 'left arm numbness'],
    'dizziness/ cp': ['dizziness', 'chest pain'],
    'abd pain/ gi bleed': ['abdominal pain', 'gastrointestinal bleed'],
    'abnormal labs/ jaundiced': ['abnormal labs', 'jaundiced'],
    '___ pain/ fever': ['pain', 'fever'],
    'sore throat/ r chest pain': ['sore throat', 'right chest pain'],
    'h/a/ spinal pain': ['headache', 'spinal pain'],
    'back pain/ sob': ['back pain', 'dyspnea'],
    'abd pain/ vomiting': ['abdominal pain', 'vomiting'],
    'abd pain / diarhea': ['abdominal pain', 'diarrhea'],
    'si/ cough': ['suicidal ideation', 'cough'],
    'constipation/ fever': ['constipation', 'fever'],
    'cp/ palpatations': ['chest pain', 'palpitations'],
    'h/a/ vomiting': ['headache', 'vomiting'],
    'vomiting/ ___ pain': ['vomiting', 'pain'],
    '___- nausea/ leg cramps': ['nausea', 'leg cramps'],
    'h/a / weakness': ['headache', 'weakness'],
    'sore throat/ bil. ear pain': ['sore throat','bil. ear pain'],
    'dizziness/ episodic blurred vision': ['dizziness','episodic blurred vision'],



    'abcess/ i & d': ['abscess', 'incision and drainage'],
    'i & d': ['incision', 'drainage'],
    'major trauma &#___; penetrating': ['major trauma', 'penetrating injury'],
    'gib & nstemi': ['gastrointestinal bleeding', 'non-ST elevation myocardial infarction'],
    'r/o hot&low': ['rule out hot', 'low blood pressure'],  # needs clarification
    'fall & confused': ['fall', 'confusion'],
    'abd pain & vomitting': ['abdominal pain', 'vomiting'],
    'sob & wheezing': ['dyspnea', 'wheezing'],
    'hip & back pain': ['hip pain', 'back pain'],
    'vag bleed s/p d&c': ['vaginal bleeding', 'status post dilation and curettage'],
    'l & r wrist injury': ['left wrist injury', 'right wrist injury'],
    'earache& cough': ['earache', 'cough'],
    'ha & nausea': ['headache', 'nausea'],
    'abd pain & distention': ['abdominal pain', 'abdominal distention'],
    'lle pain & swelling': ['left leg pain', 'swelling'],
    'ha & stomach ache': ['headache', 'abdominal pain'],
    'weak & dizzy': ['weakness', 'dizziness'],
    'pe & dvt': ['pulmonary embolism', 'deep vein thrombosis'],
    'wheezing &#___; no other complaints': ['wheezing'],
    'shoulder & leg pain': ['shoulder pain', 'leg pain'],
    'st & fevers': ['sore throat', 'fever'],
    'abd pain & vag bleeding': ['abdominal pain', 'vaginal bleeding'],
    'migrane & palpitations': ['migraine', 'palpitations'],
    'abd pain & vomiting': ['abdominal pain', 'vomiting'],
    'r sided weakness w/ edema &shift': ['right-sided weakness', 'edema', 'shift'],  
    # "shift" unclear
    'ha & dizzy': ['headache', 'dizziness'],
    'weakness & bodyaches': ['weakness', 'body aches'],
    'weakness & nausea': ['weakness', 'nausea'],
    'dizzy & fall': ['dizziness', 'fall'],
    'neck & face pain': ['neck pain', 'facial pain'],
    'n&v dizziness x 3d': ['nausea', 'vomiting', 'dizziness'],
    's/p mvc- neck & headache': ['status post motor vehicle collision', 'neck pain', 'headache'],
    'h/a & back pain': ['headache', 'back pain'],
    'i&d site bleeding': ['incision and drainage site', 'bleeding'],
    'pain & tingling': ['pain', 'tingling'],
    'n/v & dizziness': ['nausea and vomiting', 'dizziness'],
    'hallucinations & unsafe behavior': ['hallucinations', 'unsafe behavior'],
    'fevers & weakness': ['fever', 'weakness'],
    'l foot swelling & pain': ['left foot swelling', 'pain'],
    'major trauma &#___; blunt': ['major trauma', 'blunt injury'],
    'dizzy & vomiting': ['dizziness', 'vomiting'],
    'dizzy & n/v': ['dizziness', 'nausea and vomiting'],
    'isolated chest trauma &#___; blunt': ['chest trauma', 'blunt injury'],
    'finger & knee injury': ['finger injury', 'knee injury'],
    'sob & cp': ['dyspnea', 'chest pain'],
    'sob & shoulder pain': ['dyspnea', 'shoulder pain'],
    'fevers & vomitting': ['fever', 'vomiting'],
    'ha & vomitting': ['headache', 'vomiting'],
    '& pin point pupils': ['pinpoint pupils'],
    'fever & l arm pain': ['fever', 'left arm pain'],
    '& fevers': ['fever'],
    'elev trop & bnp': ['elevated troponin', 'elevated bnp'],
    'n&v & abd pain': ['nausea', 'vomiting', 'abdominal pain'],
    'i&d abscess': ['incision and drainage', 'abscess'],
    'bun & creat elevated': ['elevated BUN', 'elevated creatinine'],
    's/p d&c': ['status post dilation', 'curettage'],
    'nausea & dizzy': ['nausea', 'dizziness'],
    'fever & malaise': ['fever', 'malaise'],
    'rle swelling & redness': ['right leg swelling', 'redness'],
    'c & t1 fx': ['cervical fracture', 'thoracic fracture'],
    'vertigo &palpitations': ['vertigo', 'palpitations'],
    'incd ascites &dehydration': ['increased ascites', 'dehydration'],
    'l knee pain & r arm pain': ['left knee pain', 'right arm pain'],
    'cough & sob': ['cough', 'dyspnea'],
    'cp & syncope': ['chest pain', 'syncope'],
    'cough & congested': ['cough', 'congestion'],
    'h/a & neck pain': ['headache', 'neck pain'],
    'fevers & ha': ['fever', 'headache'],
    'back & leg pain': ['back pain', 'leg pain'],
    's/p trip & fall': ['status post fall'],
    '3&4th digit injury': ['third digit injury', 'fourth digit injury'],
    'drsg change & new abscess eval': ['dressing change', 'abscess evaluation'],
    'chills & feeling sick': ['chills', 'malaise'],
    'neck & back pain': ['neck pain', 'back pain'],
    'abd pain & cough': ['abdominal pain', 'cough'],
    'preg/abd&back pain': ['pregnancy', 'abdominal pain', 'back pain'],
    'fevers & high bs': ['fever', 'high blood sugar'],
    'lightheaded & dizzy': ['lightheadedness', 'dizziness'],
    'cold &cough': ['cold', 'cough'],
    'stiff neck & fevers': ['stiff neck', 'fever'],
    'cough & fevers': ['cough', 'fever'],
    'tired & weak': ['fatigue', 'weakness'],
    '&dizziness': ['dizziness'],
    'lac to left thumb & index finger': ['laceration to left thumb', 'laceration to left index finger'],
    'cold s&s': ['cold symptoms'],
    'abnl labs & sob': ['abnormal labs', 'dyspnea'],
    'sob & lethargic': ['dyspnea', 'lethargy'],
    'pain following i&d': ['pain', 'post-procedural pain'],
    'chin & tongue lacerations': ['chin laceration', 'tongue laceration'],
    'cyst i&d': ['cyst', 'incision and drainage'],


    #'soc": ["standard of care / seen on call (context-dependent, unclear)", "no specific SNOMED – depends on context"],
    #'sores": ["skin sores or ulcers", "128477000"],
  
    #"sp etoh": ["status post alcohol use/intoxication", "191816009"],
    #"sp fall": ["status post fall", "271593001"],
    #"sp mva": ["status post motor vehicle accident", "271327008"],
    #"sp sz": ["status post seizure", "91175000"],
    #"sp tpa": ["status post tPA treatment", "705129003"],
    #"spasm": ["muscle spasm", "263204007"]
}








In [97]:
from collections import Counter
import re

# Paste the dictionary as a raw string here
raw_text = """

'depression/suicidal/deliberate self harm': ['depressed mood', 'suicide risk assessment', 'intentional self-harm'],
'vomiting and/or nausea':['nausea and vomiting'],
'ha/htn post partum': ['headache', 'hypertension', 'postpartum state'],
'htn nausea and dizzy': ['hypertension', 'nausea', 'dizziness'],
'htn , abd pain': ['hypertension', 'abdominal pain'],

  
'n/v/d': ['nausea and vomiting', 'diarrhea'],
'n&v&d': ['nausea and vomiting', 'diarrhea'],
'nvd': ['nausea and vomiting', 'diarrhea'], 
'n/vd': ['nausea and vomiting', 'diarrhea'],
'n/d/v': ['nausea, diarrhea, vomiting'],
'n/v /d': ['nausea, vomiting, diarrhea'],
'n/v//d': ['nausea and vomiting', 'diarrhea'],
'n/v/d/': ['nausea and vomiting', 'diarrhea'],
'nausea/vomiting/diarrhea':  ['nausea and vomiting', 'diarrhea'],

'vomiting and nausea': ['nausea and vomiting'],
'vomitting and nausea': ['nausea and vomiting'],
'nausea and vomitting': ['nausea and vomiting'],
'nausea/diarrhea':['nausea', 'diarrhea'],
'vomiting/diarrhea': ['vomiting', 'diarrhea'],
'n/v': ['nausea and vomiting'],
'v/n': ['nausea and vomiting'],
'n&v': ['nausea and vomiting'],
'nv': ['nausea and vomiting'],
'n.v': ['nausea and vomiting'],
'n//v': ['nausea and vomiting'],
'n./v': ['nausea and vomiting'],
'n/v/': ['nausea and vomiting'],
'n / v': ['nausea and vomiting'],
'vomiting/nausea': ['nausea and vomiting'],
'nausea/vomiting': ['nausea and vomiting'],
'& n&v': ['nausea and vomiting'],
'___ n/v': ['nausea and vomiting'],
'___/n/v': ['nausea and vomiting'],


'v/d':['vomiting', 'diarrhea'],

'n/d':['nausea , diarrhea'],

'___- n/v/cramps': ['nausea and vomiting'],
'___ pain/ vomiting': ['abdominal pain', 'vomiting'],
'___/ abd pain vag dc': ['abdominal pain'],

'n/v abd': ['nausea and vomiting','abdominal pain'],
'abd pain with n/v': ['nausea and vomiting','abdominal pain'],
'abd pain/ n/v': ['nausea and vomiting','abdominal pain'],
'abd pain. n/v': ['nausea and vomiting','abdominal pain'],
'abd pain n/v/d': ['nausea and vomiting', 'diarrhea','abdominal pain'],
'abd pain nausea': ['nausea','abdominal pain'],


'n/d/ha': ['nausea', 'diarrhea', 'headache'],
'n/v ha': ['nausea and vomiting', 'headache'],
'n/v/h/a':['nausea and vomiting', 'headache'],
'n/v/ha': ['nausea and vomiting', 'headache'],

'n/v cp': ['nausea and vomiting', 'chest pain'],
'abd and chest pain': ['abdominal pain', 'chest pain'],
'n/v/d and abd pain': ['nausea and vomiting', 'diarrhea', 'abdominal pain'],
'n/v/d and cp': ['nausea and vomiting', 'diarrhea', 'chest pain'],

'n/v sob': ['nausea and vomiting','dyspnea'],
'n/v/sob': ['nausea, vomiting', 'dyspnea'],

'n':['nausea'],
'nasuea':['nausea'],
'nauea': ['nausea'],
'nausa': ['nausea'],
'nausea': ['nausea'],
'nausea.': ['nausea'],
'n/': ['nausea'],

'v':['vomiting'],
'v/': ['vomiting'],
'___-vomiting':['vomiting'],


'd':['diarrhea'],  
'diarrhoea': ['diarrhea'], 
'diahrea': ['diarrhea'],
'diarhea': ['diarrhea'],
'diarr': ['diarrhea'],

'constipated': ['constipation'],

'abd n/v': ['abdominal pain', 'nausea and vomiting'],
'abd pain/n/v':['abdominal pain', 'nausea and vomiting'],
'ap/n/v': ['abdominal pain', 'nausea and vomiting'],
'nausea/diarrhea/abd pain': ['nausea', 'diarrhea', 'abdominal pain'],



'shortness of breath/abd pain':['shortness of breath, abd pain'],

'body aches/constipation': ['body aches', 'constipation'],


'cp': ['chest pain'],
'c.p': ['chest pain'],
'c/p': ['chest pain'],
'/cp': ['chest pain'],

'cp /sob': ['chest pain', 'shortness of breath'],
'cp h/a': ['chest pain',' headache'],
'cp sob': ['chest pain', 'shortness of breath'],
'cp svt': ['chest pain', 'supraventricular tachycardia'],
'cp..': ['chest pain'], 
'cp./sob': ['chest pain', 'shortness of breath'],
'cp/': ['chest pain'],
'cp/ sob': ['chest pain', 'shortness of breath'],
'cp/___': ['chest pain'],
'cp/afib': ['chest pain', 'atrial fibrillation'],
'cp/dka': ['chest pain', 'diabetic ketoacidosis'],
'cp/etoh': ['chest pain', 'alcohol intoxication'],
'cp/ha': ['chest pain', 'headache'],
'cp/htn': ['chest pain', 'hypertension'],
'cp/ili': ['chest pain', 'iliac region pain'],
#'cp/ili': ['chest pain', 'iliac region pain (or unclear term "ili")', '29857009'],  # "ili" may need clarification
'cp/n/v': ['chest pain', 'nausea and vomiting'],
'cp/pna': ['chest pain', 'pneumonia'],
'cp/sob': ['chest pain', 'dyspnea'],

'chest discomfort': ['chest pain'],
'central pain': ['central chest pain'],

'ha': ['headache'],
'h/a': ['headache'],
'head pain': ['headache'],
'ha persists': ['headache'], 
'h/a cp': ['headache', 'chest pain'],
'h/a cva': ['headache', 'stroke'],
'h/a htn': ['headache', 'hypertension'],
'h/a lbp': ['headache', 'low back pain'],

'h/a n/v': ['headache', 'nausea and vomiting'],
'h/a sob': ['headache', 'shortness of breath'],
'h/a st': ['headache', 'sore throat'],
'h/a/cp': ['headache', 'chest pain'],
'h/a/htn': ['headache', 'hypertension'],
'h/as': ['headaches'],
'ha.cp': ['headache', 'chest pain'],
'ha/ cp': ['headache', 'chest pain'],

'abd pain': ['abdominal pain'],
'abd': ['abdominal pain'],
'abdpain': ['abdominal pain'],
'abdo pain': ['abdominal pain'],
'stomach ache': ['abdominal pain'],
'epigastric pain': ['abdominal pain'],
'flank pain': ['flank pain'],
'abd pain /bilat pe': ['abd pain', 'bilat pe'],

'gi bleed': ['gastrointestinal hemorrhage'],
'gu pain': ['genitourinary pain'],
'gu': ['genitourinary symptoms'],
'gu eval': ['evaluation of genitourinary system'],
'gu evaluation': ['evaluation of genitourinary system'],

'uti': ['urinary tract infectious disease'],
'urinary infection': ['urinary tract infectious disease'],

'dvt': ['deep venous thrombosis'],
'leg clot': ['deep venous thrombosis'],

'fainted': ['syncope'],
'syncopal episode': ['syncope'],

'hypoglycemia': ['hypoglycemic state in diabetes'],
'low sugar': ['hypoglycemic state in diabetes'],

'rash': ['eruption'],
'skin rash': ['eruption'],
'hives': ['eruption'],

'fever': ['fever'],
'fev': ['fever'],    
'febrile': ['fever'],
'high temperature': ['fever'],

'back pain': ['backache'],
'leg pain': ['pain in limb'],
'arm pain': ['pain in limb'],
'shoulder pain': ['pain in shoulder'],

'blurred vision': ['visual disturbance'],
'vision loss': ['visual disturbance'],

'numbness': ['paresthesia'],
'tingling': ['paresthesia'],

'confused': ['confusional state'],
'weak': ['muscle weakness'],
'tired': ['fatigue'],

'bleeding': ['bleeding'],
'nosebleed': ['bleeding from nose'],
'blood in vomit': ['hematemesis'],
'blood in urine': ['blood in urine'],
'blood in stool': ['blood in feces symptom'],


'injury': ['traumatic injury'],
'trauma': ['traumatic injury'],
'fall': ['fall'],
's/p': ['fall'],  
# status post fall
'accident': ['victim of vehicular and_or traffic accident'],

'pregnant': ['pregnancy problem'],
'delivery': ['human parturition, function'],

'sad': ['depressed mood'],
'suicidal': ['suicide attempt'],
'depression': ['depressed mood'],

'burns': ['burn'],
'seizure': ['seizure undetermined whether focal or generalized'],
'convulsion': ['febrile convulsion'],

'cold': ['common cold'],
'sore throat': ['sore throat'],
'coughing': ['cough'],

#shortness of breath
'dypnea': ['dyspnea'],
'dypsnea': ['dyspnea'],
'dysnea': ['dyspnea'],
'dsypnea': ['dyspnea'],
'dypsnea': ['dyspnea'],
'dyspena': ['dyspnea'],    
'sob': ['dyspnea'],
'dyspnea': ['dyspnea'],
'breathlessness': ['dyspnea'],

#(painful urination)
'dysu':  ['dysuria'],
'dysuira':  ['dysuria'],
'dysurea': ['dysuria'],
'dysuria': ['dysuria'],
'dsyuria': ['dysuria'],

'ect': ['ectopic pregnancy'],
'ect tx': ['ectopic pregnancy'],
'ectopic': ['ectopic pregnancy'],
#'ectasy': ['MDMA use / Ecstasy overdose'],
'eczema': ['eczema'],
#swelling
'edema': ['edema'],


'dizzy': ['dizziness'],
'___/dizziness': ['dizziness'],
'dizzy..': ['dizziness'],
'dizzy/': ['dizziness'],
'dizzy and vomiting': ['dizziness', 'vomiting'],

'ear pain': ['otalgia'],
'throat pain': ['sore throat'],
'eye pain': ['problem of eye'],
'jaw pain': ['problem of jaw'],

'wound eval': ['wound evaluation'],

'rlq': ['abdominal pain'],
'llq': ['abdominal pain'],
'luq': ['abdominal pain'],
'ruq': ['abdominal pain'],

'bp': ['blood pressure'], 
'tia': ['transient ischemic attack'],


'pna': ['pneumonia'],
'pnx': ['pneumonia'],
'pna ___': ['Pneumonia'],
'pna/arf': ['Pneumonia', 'acute respiratory failure'],
#'pneumo': ['Pneumothorax or Pneumonia (needs context)', '67782005'],  # Assuming pneumothorax
#'pnx': ['Pneumothorax', '67782005']


'htn': ['hypertension'],
'af': ['atrial fibrillation'],
'pe': ['pulmonary embolism'],
'dka': ['diabetic ketoacidosis'],
'loc': ['loss of consciousness'],
'ftt': ['failure to thrive'],
'ili': ['influenza-like illness'],  
'ukn': ['Unknown'],
'fx': ['Fracture'],
'lbp': ['backache'],
'bp eVAL': ['evaluation of blood pressure'],
#'bp':['Blood pressure']
#'bp':['']
'mvc': ['motor vehicle accident'],

'cva': ['cerebrovascular accident'],
'cva / tpa': ['cerebrovascular accident', 'tissue plasminogen activator treatment'],
#73899007 
#snomed code= 95891005 or  103001002   check it by yourself  !!!!!
# OD	Drug overdose (likely self-harm):	703442003 (intentional)/405612005 (accidental)	*Depends on clinical notes/context

'od': ['overdose'],
'heroin od': ['heroin overdose'],
'accidental od': ['accidental overdose'],

'gib': ['gastrointestinal bleed'],
#'ah': ['failure to thrive'],
'sdh': ['subdural hematoma'],  
#Subdural hematoma (disorder) → SNOMED CT Code: 95435005

'aaa': ['abdominal aortic aneurysm'], 
#233985008 



's.a.': ['suicide attempt'],          
#'suicidal ideation': ['SI', 'S.I', 'SI WITH PLAN', 'SI / ETOH', 'SI DEPRESSION', 'S.I COMBATIVE', 'S.I / DETOX'],
'si': ['suicidal ideation'],   
#6471006	
's.i': ['suicidal ideation'],
'si with plan': ['suicidal ideation'],
'si w plan': ['suicidal ideation'],
'si/': ['suicidal ideation'],
'.si': ['suicidal ideation'],
's.i.': ['suicidal ideation'],
'___- si': ['suicidal ideation'],
'___ si': ['suicidal ideation'],
'___/ si': ['suicidal ideation'],
'___/si': ['suicidal ideation'],
'___- si hi': ['suicidal ideation', 'homicidal ideation'],
's.i ___': ['suicidal ideation'],
'si hi': ['suicidal ideation', 'homicidal ideation'],
'si- ah': ['suicidal ideation', 'auditory hallucinations'],
'si/ ah':  ['suicidal ideation', 'auditory hallucinations'],
'si/ hi':['suicidal ideation', 'homicidal ideation'],
'si/ od': ['suicidal ideation', 'overdose'],
'si/ sa': ['suicidal ideation', 'suicide attempt'],
'si/___': ['suicidal ideation'],
'si/ah':  ['suicidal ideation', 'auditory hallucinations'],
'si/cp': ['suicidal ideation', 'chest pain'],



's.t .': ['sore throat'],
's.t.': ['sore throat'],


'hi': ['homicidal ideation'],
'___/hi': ['homicidal ideation'],

'od/ si': ['overdose', 'suicidal ideation'],
'od/sa': ['overdose', 'suicide attempt'],
'od/si': ['overdose','suicidal ideation'],


#'etoh': ['alcoholism']  #7200002 – Alcoholism (disorder)
#'etoh': ['alcohol intoxication']  #191816009 – Alcohol intoxication (disorder)
#ingeneral:   ETOH → Alcohol use / intoxication

        
#'od':	Overdose or Right eye (oculus dexter)
#'ah': Atrial hypertrophy or auditory hallucination
#SSCP (Substernal Chest Pain)	Retrosternal chest pain (finding)	28770003

'!anemia': ['anemia'], 
'ah': ['auditory hallucinations'],  
'ah/si': ['auditory hallucinations', 'suicidal ideation'], 
'ah/vh': ['auditory hallucinations', 'visual hallucinations'],  
'vh/ah': ['visual hallucinations', 'auditory hallucinations'], 



'ams': ['altered mental status'],
'ams/ ha': ['altered mental status', 'headache'],
#'ah': ['auditory hallucinations', '279039007'],
#'ah/si': ['auditory hallucinations, suicidal ideation', '279039007, 35489007'],
#'ah/vh': ['auditory hallucinations, visual hallucinations', '279039007, 30087000'],


'anemic': ['anemia'],
'anaemia': ['anemia'], 
'anemia': ['anemia'],

'anxiety': ['Anxiety'],
'anxious': ['Anxiety'],
'anxity': ['Anxiety'],
'anxoius': ['Anxiety'],
'anx': ['Anxiety'],
'apnea': ['apnea'],

'ascites': ['ascites'],
'ascitis': ['ascites'],


'___- hypotension': ['hypotension'],
'___/ depression/si': ['depression','suicidal ideation'],
'___ & body pain': ['body pain'],
'___/ r hand numbness r/o comp syndrome': ['r hand numbness r/o comp syndrome'],
'/ infection': ['infection'], 
'infx': ['infection'], 

#'b/l pe': ['bilateral pulmonary embolism', '59282003']
#'back': ['back pain', '279039007'],
'back fx': ['back fracture'],  
# more generally "fracture of spine"
'bad h/a': ['severe headache'],  
# SNOMED for headache
'bat bit': ['bat bite'],  
# animal bite, specific to bat


'bc': ['birth control'],  
# assumed context (birth control), or blood culture
'bedtime': ['problem at bedtime/sleep disturbance'],  
# or insomnia related
'bi pap': ['bilevel positive airway pressure support'], 
# BiPAP support in respiratory distress


#sweating
'diaph': ['diaphoresis'],
#DIC
'dic': ['disseminated Intravascular Coagulation'],

'itch': ['itching'],
'itching': ['itching'],
'itchy': ['itching'],


'htn/cp': ['hypertension', 'chest pain'],
'htn/h/a': ['hypertension',  'headache'],
'htn/ha': ['hypertension',  'headache'],
'htn/sob': ['hypertension', 'dyspnea'],
'ili cp': ['iliac region', 'chest pain'],
'ili/ha': ['iliac pain',  'headache'],
'ili/sob': ['iliac pain', 'dyspnea'],


'head/cp': ['head pain', 'chest pain'],
'headace': ['headache'],
'hedache': ['headache'],


'ha/cp': ['headache', 'chest pain'],
'ha/etoh': ['headache', 'alcohol intoxication'],
'ha/fall': ['headache', 'fall'],
'ha/htn': ['headache', 'hypertension'],

'ha/sob': ['headache', 'dyspnea'],
'ha/st': ['headache', 'sore throat'],
'ha/vom': ['headache', 'vomiting'],

'gu sx': ['genitourinary symptoms'],
'guaic': ['guaiac positive stool'],
'guiac': ['guaiac positive stool'],
'guiacc': ['Guaiac positive stool'],
'gyn': ['gynecological complaint'],
'h.i.': ['head injury'],


'etoh': ['Alcohol use'],
'etoh/cp': ['Alcohol use', 'chest pain'],
'etoh/si': ['Alcohol use', ' suicidal ideation'],
'mvc/h/a': ['Motor vehicle collision', 'headache'],


'low hb': ['hemoglobin below reference range'],
'low hct': ['Low hematocrit', '36582005'],
'low hg': ['hemoglobin below reference range'],
'low hgb': ['hemoglobin below reference range'],
'low h/h': ['hemoglobin below reference range', 'hematocrit'],
'lle dvt': ['deep vein thrombosis'],
'low bp': ['hypotension'],
'low bps': ['low blood pressures'],

'lethagy': ['lethargy'],
'lethary': ['lethargy'], 

'si/ etoh': ['suicidal ideation', 'alcohol intoxication'],
'si/etoh': ['suicidal ideation', 'alcohol intoxication'],
'si/hi': ['suicidal', 'homicidal ideation'],
'si/od': ['suicidal ideation', 'overdose'],
'si/sa': ['suicidal ideation', 'suicide attempt'],



#  "siatica": ["sciatica", "230265002"],
#  "sick": ["feeling sick / general malaise", "162057007"],
#  "sizure": ["seizure (misspelling of seizure)", "91175000"],
#  "skin": ["skin complaint (general)", "399963005"],
#  "skull": ["skull fracture", "20917003"],
#  "sleepy": ["excessive drowsiness", "24823004"],
#  "slipped": ["fall or slipped disc (context dependent)", "7895008 (fall), 367391008 (slipped disc)"],
#  "slow hr": ["bradycardia (slow heart rate)", "48867003"],
#  "snycope": ["syncope (misspelled)", "271594007"],


'shortness of breath': ['dyspnea'],
'sob /cp': ['dyspnea', 'chest pain'],
'sob af': ['dyspnea', 'atrial fibrillation'],
'sob cp': ['dyspnea', 'chest pain'],
'sob ili': ['dyspnea', 'iliac pain'],
'sob. cp': ['dyspnea', 'chest pain'],


'sob/': ['dyspnea'],
'sob/ cp': ['dyspnea', 'chest pain'],
'sob/cp': ['dyspnea','chest pain'],
'sob/doe': ['dyspnea', 'dyspnea on exertion'],
'sob/dvt': ['dyspnea', 'deep vein thrombosis'],
'sob/htn': ['dyspnea', 'hypertension'],
'sob/pe': ['dyspnea', 'pulmonary embolism'],
'sob/pna': ['dyspnea', 'pneumonia'],
'sobbing': ['sobbing'],

'ulcers': ['ulcer'],
'throat': ['sore throat'],
'st/ha': ['sore throat', 'hdyspnea'],
'st/uri': ['sore throat', 'upper respiratory infection'],

'spasms':['muscle spasms'],

#'ssc': ['substernal chest pain (likely)	Possibly part of CP: 29857009
'sscp':	['substernal chest pain'],
#'st': ['Sore throat / Sinus tachycardia'] 
#(context needed)	87979003 (sore throat) or 698247007 (sinus tachycardia)




'sycopal': ['syncope'],
'sycope':  ['syncope'],
'syncope': ['syncope'],
'syncopy': ['syncope'],
'snycope': ['syncope'],
'syncope/presyncope': ['syncope', 'presyncope'],
'abd pain with syncope': ['abdominal pain', 'syncope'],
'near sycope/abd pain': ['syncope', 'abdominal pain'],
'headache/fever': ['headache', 'fever'],
'chest pain/abd pain': ['chest pain', 'abdominal pain'],

'___ meth today/cp': ['meth today', 'chest pain'],
'cough/congestion': ['cough', 'congestion'],
'dizziness/h/a/nausea':['dizziness', 'headache','nausea'],
'anxiety/depression':['anxiety', 'depression'],
'cp/nasal pain s/p mvc': ['chest pain','nasal pain s/p', 'mvc'],
'nausea/weakness': ['nausea','weakness'],
'weakness/vomiting': ['weakness', 'vomiting'],
'vomiting s/p chemo': ['vomiting', 's/p chemo'],
'dehydration/vomiting': ['dehydration', 'vomiting'],
'ha/dizzy/ dissection': ['headache', 'dizziness',  'dissection'],
'palpitation/flutter': ['palpitation', 'flutter'],
'h/a bodyaches': ['headache', 'bodyaches'],
'fatigue/nausea': ['fatigue','nausea'],
'nausea, dizziness': ['nausea', 'dizziness'],
'ha/vomiting s/p head injury': ['headache', 'vomiting', 's/p head injury'],
'abd mass/ agitation': ['abd mass', 'agitation'],
'chest tightness/dyspnea': ['chest tightness', 'dyspnea'],
'vomiting and/or nausea/diarrhea': ['vomiting','nausea', 'diarrhea'],
'chest pain/headache': ['chest pain', 'headache'],
'chest pain/pna': ['chest pain', 'pneumonia'],
'n/v/fever': ['nausea and vomiting', 'fever'],
'fever/abd pain': ['fever', 'abdominal pain'],


'dizzy and left eye blurry': ['dizziness' ,'left eye blurry'],
'dizzy and weak':['dizziness' ,'weakness'],
'chest and back pain':['chest pain' ,'back pain'],

'vomiting and chills /hallucinated':['vomiting' , 'chills' ,'hallucinated'],
'cp and palpatations': ['chest pain', 'palpatations'],
'dizzy and blurred vision':['dizziness','blurred vision'],
'n/v and flank pain':['nausea, vomiting' , 'flank pain'],
'cp and back pain/nausea':['chest pain', 'back pain','nausea'],
'fever and sob':['fever', 'dyspnea'],
'fever and diarrhea':['fever' , 'diarrhea'],
'cough and chest pain':['cough' , 'chest pain'],
'weakness and dizziness':['weakness' , 'dizziness'],
'chest pain and sob':['chest pain' , 'dyspnea'],
'h/a and vomiting':['headache' , 'vomiting'],
'n/v and cough':['nausea, vomiting' , 'cough'],
'headache and dizziness':['headache','dizziness'],
'chest and abd pain':['chest pain','abdominal pain'],





'fevers & vomiting':['fever','vomiting'],
'dizziness/ha/ syncope':['dizziness','headache','syncope'],
'diarrhea/ hypotensive': ['diarrhea','hypotensive'],
'sore throat/ dizziness': ['sore throat','dizziness'],
'fever/ nausea': ['fever','nausea'],
'abd pain / nausea': ['abdominal pain','nausea'],
'ha / abd pain': ['headache','abdominal pain'],
'syncope/ hit head': ['syncope','hit head'],
'dizzieness/ sob':['dizziness','dyspnea'],
'rlq pain/ dizziness':['abdominal pain','dizziness'],
'abd pain/ nvd':['abdominal pain', 'nausea and vomiting', 'diarrhea'],
'body aches/ diarrhea': ['body aches','diarrhea'],

'hiccups/ n/v abd pain': ['hiccups', 'nausea and vomiting', 'abdominal pain'],
'sob/ asthma flare': ['dyspnea', 'asthma flare'],
'headache/ neck pain': ['headache', 'neck pain'],
'neck/ back pain': ['neck pain', 'back pain'],
'dyspnea/ gen weakness/facial numbness': ['dyspnea', 'generalized weakness', 'facial numbness'],
'left leg pain/ htn/dizzy': ['left leg pain', 'hypertension', 'dizziness'],
'left elbow injury/ polysubstance abuse': ['left elbow injury', 'polysubstance abuse'],
'post op abd pain/ diarrhea': ['post-operative abdominal pain', 'diarrhea'],
'abd / rlq pain': ['abdominal pain', 'right lower quadrant pain'],
'n/v/d/ depression': ['nausea and vomiting', 'diarrhea', 'depression'],
'weakness/ fever': ['weakness', 'fever'],
'brbpr/ urinary pain': ['bright red blood per rectum', 'urinary pain'],
'n/v/d/ abd pain': ['nausea and vomiting', 'diarrhea', 'abdominal pain'],
'nasal congestion / hay fever': ['nasal congestion', 'hay fever'],
'diarrhea/ abd pain': ['diarrhea', 'abdominal pain'],
'weakness/ wound eval': ['weakness', 'wound evaluation'],
'anxiety/ feels at risk': ['anxiety', 'feeling at risk'],
'chest pain/ pressure': ['chest pain', 'chest pressure'],
'hyperglycemia/ near syncope': ['hyperglycemia', 'near syncope'],
'n/v lightheaded/ abd pain': ['nausea and vomiting', 'lightheadedness', 'abdominal pain'],
'cp w/ l arm numbness': ['chest pain', 'left arm numbness'],
'dizziness/ cp': ['dizziness', 'chest pain'],
'abd pain/ gi bleed': ['abdominal pain', 'gastrointestinal bleed'],
'abnormal labs/ jaundiced': ['abnormal labs', 'jaundiced'],
'___ pain/ fever': ['pain', 'fever'],
'sore throat/ r chest pain': ['sore throat', 'right chest pain'],
'h/a/ spinal pain': ['headache', 'spinal pain'],
'back pain/ sob': ['back pain', 'dyspnea'],
'abd pain/ vomiting': ['abdominal pain', 'vomiting'],
'abd pain / diarhea': ['abdominal pain', 'diarrhea'],
'si/ cough': ['suicidal ideation', 'cough'],
'constipation/ fever': ['constipation', 'fever'],
'cp/ palpatations': ['chest pain', 'palpitations'],
'h/a/ vomiting': ['headache', 'vomiting'],
'vomiting/ ___ pain': ['vomiting', 'pain'],
'___- nausea/ leg cramps': ['nausea', 'leg cramps'],
'h/a / weakness': ['headache', 'weakness'],
'sore throat/ bil. ear pain': ['sore throat','bil. ear pain'],
'dizziness/ episodic blurred vision': ['dizziness','episodic blurred vision'],



'abcess/ i & d': ['abscess', 'incision and drainage'],
'i & d': ['incision', 'drainage'],
'major trauma &#___; penetrating': ['major trauma', 'penetrating injury'],
'gib & nstemi': ['gastrointestinal bleeding', 'non-ST elevation myocardial infarction'],
'r/o hot&low': ['rule out hot', 'low blood pressure'],  # needs clarification
'fall & confused': ['fall', 'confusion'],
'abd pain & vomitting': ['abdominal pain', 'vomiting'],
'sob & wheezing': ['dyspnea', 'wheezing'],
'hip & back pain': ['hip pain', 'back pain'],
'vag bleed s/p d&c': ['vaginal bleeding', 'status post dilation and curettage'],
'l & r wrist injury': ['left wrist injury', 'right wrist injury'],
'earache& cough': ['earache', 'cough'],
'ha & nausea': ['headache', 'nausea'],
'abd pain & distention': ['abdominal pain', 'abdominal distention'],
'lle pain & swelling': ['left leg pain', 'swelling'],
'ha & stomach ache': ['headache', 'abdominal pain'],
'weak & dizzy': ['weakness', 'dizziness'],
'pe & dvt': ['pulmonary embolism', 'deep vein thrombosis'],
'wheezing &#___; no other complaints': ['wheezing'],
'shoulder & leg pain': ['shoulder pain', 'leg pain'],
'st & fevers': ['sore throat', 'fever'],
'abd pain & vag bleeding': ['abdominal pain', 'vaginal bleeding'],
'migrane & palpitations': ['migraine', 'palpitations'],
'abd pain & vomiting': ['abdominal pain', 'vomiting'],
'r sided weakness w/ edema &shift': ['right-sided weakness', 'edema', 'shift'],  
# "shift" unclear
'ha & dizzy': ['headache', 'dizziness'],
'weakness & bodyaches': ['weakness', 'body aches'],
'weakness & nausea': ['weakness', 'nausea'],
'dizzy & fall': ['dizziness', 'fall'],
'neck & face pain': ['neck pain', 'facial pain'],
'n&v dizziness x 3d': ['nausea', 'vomiting', 'dizziness'],
's/p mvc- neck & headache': ['status post motor vehicle collision', 'neck pain', 'headache'],
'h/a & back pain': ['headache', 'back pain'],
'i&d site bleeding': ['incision and drainage site', 'bleeding'],
'pain & tingling': ['pain', 'tingling'],
'n/v & dizziness': ['nausea and vomiting', 'dizziness'],
'hallucinations & unsafe behavior': ['hallucinations', 'unsafe behavior'],
'fevers & weakness': ['fever', 'weakness'],
'l foot swelling & pain': ['left foot swelling', 'pain'],
'major trauma &#___; blunt': ['major trauma', 'blunt injury'],
'dizzy & vomiting': ['dizziness', 'vomiting'],
'dizzy & n/v': ['dizziness', 'nausea and vomiting'],
'isolated chest trauma &#___; blunt': ['chest trauma', 'blunt injury'],
'finger & knee injury': ['finger injury', 'knee injury'],
'sob & cp': ['dyspnea', 'chest pain'],
'sob & shoulder pain': ['dyspnea', 'shoulder pain'],
'fevers & vomitting': ['fever', 'vomiting'],
'ha & vomitting': ['headache', 'vomiting'],
'& pin point pupils': ['pinpoint pupils'],
'fever & l arm pain': ['fever', 'left arm pain'],
'& fevers': ['fever'],
'elev trop & bnp': ['elevated troponin', 'elevated bnp'],
'n&v & abd pain': ['nausea', 'vomiting', 'abdominal pain'],
'i&d abscess': ['incision and drainage', 'abscess'],
'bun & creat elevated': ['elevated BUN', 'elevated creatinine'],
's/p d&c': ['status post dilation', 'curettage'],
'nausea & dizzy': ['nausea', 'dizziness'],
'fever & malaise': ['fever', 'malaise'],
'rle swelling & redness': ['right leg swelling', 'redness'],
'c & t1 fx': ['cervical fracture', 'thoracic fracture'],
'vertigo &palpitations': ['vertigo', 'palpitations'],
'incd ascites &dehydration': ['increased ascites', 'dehydration'],
'l knee pain & r arm pain': ['left knee pain', 'right arm pain'],
'cough & sob': ['cough', 'dyspnea'],
'cp & syncope': ['chest pain', 'syncope'],
'cough & congested': ['cough', 'congestion'],
'h/a & neck pain': ['headache', 'neck pain'],
'fevers & ha': ['fever', 'headache'],
'back & leg pain': ['back pain', 'leg pain'],
's/p trip & fall': ['status post fall'],
'3&4th digit injury': ['third digit injury', 'fourth digit injury'],
'drsg change & new abscess eval': ['dressing change', 'abscess evaluation'],
'chills & feeling sick': ['chills', 'malaise'],
'neck & back pain': ['neck pain', 'back pain'],
'abd pain & cough': ['abdominal pain', 'cough'],
'preg/abd&back pain': ['pregnancy', 'abdominal pain', 'back pain'],
'fevers & high bs': ['fever', 'high blood sugar'],
'lightheaded & dizzy': ['lightheadedness', 'dizziness'],
'cold &cough': ['cold', 'cough'],
'stiff neck & fevers': ['stiff neck', 'fever'],
'cough & fevers': ['cough', 'fever'],
'tired & weak': ['fatigue', 'weakness'],
'&dizziness': ['dizziness'],
'lac to left thumb & index finger': ['laceration to left thumb', 'laceration to left index finger'],
'cold s&s': ['cold symptoms'],
'abnl labs & sob': ['abnormal labs', 'dyspnea'],
'sob & lethargic': ['dyspnea', 'lethargy'],
'pain following i&d': ['pain', 'post-procedural pain'],
'chin & tongue lacerations': ['chin laceration', 'tongue laceration'],
'cyst i&d': ['cyst', 'incision and drainage'],


#'soc": ["standard of care / seen on call (context-dependent, unclear)", "no specific SNOMED – depends on context"],
#'sores": ["skin sores or ulcers", "128477000"],

#"sp etoh": ["status post alcohol use/intoxication", "191816009"],
#"sp fall": ["status post fall", "271593001"],
#"sp mva": ["status post motor vehicle accident", "271327008"],
#"sp sz": ["status post seizure", "91175000"],
#"sp tpa": ["status post tPA treatment", "705129003"],
#"spasm": ["muscle spasm", "263204007"]


"""

# Extract keys using regex (this assumes keys are followed by a colon)
keys = re.findall(r"['\"]([^'\"]+)['\"]\s*:", raw_text)

# Count duplicates
key_counts = Counter(keys)

# Print keys that appear more than once
duplicates = {k: v for k, v in key_counts.items() if v > 1}
print(duplicates)


{'cp/ili': 2, 'dypsnea': 2, 'bp': 3, 'pnx': 2, 'od': 2, 'ah': 4, 'etoh': 3, 'ah/si': 2, 'ah/vh': 2, 'snycope': 2}


In [98]:
# Function to apply mapping or keep original
def expand_symptom(term, mapping):
    term_clean = term.lower().strip()
    return mapping.get(term_clean, [term_clean])

# Apply to DataFrame
data['expanded_symptoms'] = data['indiv_symptom'].apply(lambda x: expand_symptom(x, symptom_expand_map))

In [99]:
data.head(43)

Unnamed: 0,subject_id,stay_id,intime,outtime,gender,race,chiefcomplaint,anchor_age,anchor_year,anchor_year_group,...,died_within_30_days,race_standard,age_group,unique_visit_id,terms,terms_new,indiv_symptom,counter,unique_ids_exploded,expanded_symptoms
0,10000032,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,"Abd pain, Abdominal distention",52.0,2180.0,2014 - 2016,...,0,White,46-65,10000032_33258284,"[abd pain, abd distention]","abd pain, abd distention",abd pain,1,33258284_1,[abdominal pain]
0,10000032,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,"Abd pain, Abdominal distention",52.0,2180.0,2014 - 2016,...,0,White,46-65,10000032_33258284,"[abd pain, abd distention]","abd pain, abd distention",abd distention,2,33258284_2,[abd distention]
1,10000032,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,F,WHITE,Abdominal distention,52.0,2180.0,2014 - 2016,...,0,White,46-65,10000032_38112554,[abd distention],abd distention,abd distention,1,38112554_1,[abd distention]
2,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,...,0,White,46-65,10000032_35968195,"[n/v/d, abd pain]","nausea and vomiting, diarrhea, abd pain",nausea and vomiting,1,35968195_1,[nausea and vomiting]
2,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,...,0,White,46-65,10000032_35968195,"[n/v/d, abd pain]","nausea and vomiting, diarrhea, abd pain",diarrhea,2,35968195_2,[diarrhea]
2,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,...,0,White,46-65,10000032_35968195,"[n/v/d, abd pain]","nausea and vomiting, diarrhea, abd pain",abd pain,3,35968195_3,[abdominal pain]
3,10000032,32952584,2180-07-22 16:24:00,2180-07-23 05:54:00,F,WHITE,Hypotension,52.0,2180.0,2014 - 2016,...,0,White,46-65,10000032_32952584,[hypotension],hypotension,hypotension,1,32952584_1,[hypotension]
4,10000032,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,"Abdominal distention, Abd pain, LETHAGIC",52.0,2180.0,2014 - 2016,...,0,White,46-65,10000032_39399961,"[abd distention, abd pain, lethagic]","abd distention, abd pain, lethagic",abd distention,1,39399961_1,[abd distention]
4,10000032,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,"Abdominal distention, Abd pain, LETHAGIC",52.0,2180.0,2014 - 2016,...,0,White,46-65,10000032_39399961,"[abd distention, abd pain, lethagic]","abd distention, abd pain, lethagic",abd pain,2,39399961_2,[abdominal pain]
4,10000032,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,"Abdominal distention, Abd pain, LETHAGIC",52.0,2180.0,2014 - 2016,...,0,White,46-65,10000032_39399961,"[abd distention, abd pain, lethagic]","abd distention, abd pain, lethagic",lethagic,3,39399961_3,[lethagic]


In [100]:
data.columns

Index(['subject_id', 'stay_id', 'intime', 'outtime', 'gender', 'race',
       'chiefcomplaint', 'anchor_age', 'anchor_year', 'anchor_year_group',
       'dod', 'dead_in_days', 'died_within_30_days', 'race_standard',
       'age_group', 'unique_visit_id', 'terms', 'terms_new', 'indiv_symptom',
       'counter', 'unique_ids_exploded', 'expanded_symptoms'],
      dtype='object')

In [101]:
import string


def expand_symptoms_directly(df):
    expanded_rows = []
    for _, row in df.iterrows():
        symptoms = row['expanded_symptoms'] if isinstance(row['expanded_symptoms'], list) else []
        if not symptoms:
            # Case 3: No symptoms
            new_row = row.copy()
            new_row['expanded_symptoms_new'] = None
            new_row['counter_new'] = None
            new_row['unique_ids_exploded_new'] = None
            expanded_rows.append(new_row)
        else:
            # Case 1 & 2: One or more symptoms
            for i, symptom in enumerate(symptoms):
                new_row = row.copy()
                letter = string.ascii_lowercase[i]
                new_row['expanded_symptoms_new'] = symptom
                new_row['counter_new'] = letter
                new_row['unique_ids_exploded_new'] = f"{row['unique_ids_exploded']}_{letter}"
                expanded_rows.append(new_row)
    return pd.DataFrame(expanded_rows)

# Apply corrected logic
final_expanded_data_fixed = expand_symptoms_directly(data)


In [102]:
final_expanded_data_fixed.head(20)

Unnamed: 0,subject_id,stay_id,intime,outtime,gender,race,chiefcomplaint,anchor_age,anchor_year,anchor_year_group,...,unique_visit_id,terms,terms_new,indiv_symptom,counter,unique_ids_exploded,expanded_symptoms,expanded_symptoms_new,counter_new,unique_ids_exploded_new
0,10000032,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,"Abd pain, Abdominal distention",52.0,2180.0,2014 - 2016,...,10000032_33258284,"[abd pain, abd distention]","abd pain, abd distention",abd pain,1,33258284_1,[abdominal pain],abdominal pain,a,33258284_1_a
0,10000032,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,"Abd pain, Abdominal distention",52.0,2180.0,2014 - 2016,...,10000032_33258284,"[abd pain, abd distention]","abd pain, abd distention",abd distention,2,33258284_2,[abd distention],abd distention,a,33258284_2_a
1,10000032,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,F,WHITE,Abdominal distention,52.0,2180.0,2014 - 2016,...,10000032_38112554,[abd distention],abd distention,abd distention,1,38112554_1,[abd distention],abd distention,a,38112554_1_a
2,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,...,10000032_35968195,"[n/v/d, abd pain]","nausea and vomiting, diarrhea, abd pain",nausea and vomiting,1,35968195_1,[nausea and vomiting],nausea and vomiting,a,35968195_1_a
2,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,...,10000032_35968195,"[n/v/d, abd pain]","nausea and vomiting, diarrhea, abd pain",diarrhea,2,35968195_2,[diarrhea],diarrhea,a,35968195_2_a
2,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,...,10000032_35968195,"[n/v/d, abd pain]","nausea and vomiting, diarrhea, abd pain",abd pain,3,35968195_3,[abdominal pain],abdominal pain,a,35968195_3_a
3,10000032,32952584,2180-07-22 16:24:00,2180-07-23 05:54:00,F,WHITE,Hypotension,52.0,2180.0,2014 - 2016,...,10000032_32952584,[hypotension],hypotension,hypotension,1,32952584_1,[hypotension],hypotension,a,32952584_1_a
4,10000032,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,"Abdominal distention, Abd pain, LETHAGIC",52.0,2180.0,2014 - 2016,...,10000032_39399961,"[abd distention, abd pain, lethagic]","abd distention, abd pain, lethagic",abd distention,1,39399961_1,[abd distention],abd distention,a,39399961_1_a
4,10000032,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,"Abdominal distention, Abd pain, LETHAGIC",52.0,2180.0,2014 - 2016,...,10000032_39399961,"[abd distention, abd pain, lethagic]","abd distention, abd pain, lethagic",abd pain,2,39399961_2,[abdominal pain],abdominal pain,a,39399961_2_a
4,10000032,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,"Abdominal distention, Abd pain, LETHAGIC",52.0,2180.0,2014 - 2016,...,10000032_39399961,"[abd distention, abd pain, lethagic]","abd distention, abd pain, lethagic",lethagic,3,39399961_3,[lethagic],lethagic,a,39399961_3_a


In [103]:
# Save short terms data to CSV
final_expanded_data_fixed.to_csv('dataset/ed/finals/14_exploded_dataset.csv', index=False)

In [104]:
final_expanded_data_fixed.columns

Index(['subject_id', 'stay_id', 'intime', 'outtime', 'gender', 'race',
       'chiefcomplaint', 'anchor_age', 'anchor_year', 'anchor_year_group',
       'dod', 'dead_in_days', 'died_within_30_days', 'race_standard',
       'age_group', 'unique_visit_id', 'terms', 'terms_new', 'indiv_symptom',
       'counter', 'unique_ids_exploded', 'expanded_symptoms',
       'expanded_symptoms_new', 'counter_new', 'unique_ids_exploded_new'],
      dtype='object')

In [105]:
# Prepare the SNOMED dataset to include only the relevant columns
snomed_clean = snomed[['text', 'snomed']].drop_duplicates(subset='text')

# Merge only the SNOMED code using a left join
merged_final = pd.merge(
    final_expanded_data_fixed,
    snomed_clean,
    left_on='expanded_symptoms_new',
    right_on='text',
    how='left'
)

# Drop the redundant 'text' column after the merge
merged_final.drop(columns=['text'], inplace=True)



In [106]:
print(merged_final.columns)
print(merged_final[['expanded_symptoms_new', 'snomed']].head())


Index(['subject_id', 'stay_id', 'intime', 'outtime', 'gender', 'race',
       'chiefcomplaint', 'anchor_age', 'anchor_year', 'anchor_year_group',
       'dod', 'dead_in_days', 'died_within_30_days', 'race_standard',
       'age_group', 'unique_visit_id', 'terms', 'terms_new', 'indiv_symptom',
       'counter', 'unique_ids_exploded', 'expanded_symptoms',
       'expanded_symptoms_new', 'counter_new', 'unique_ids_exploded_new',
       'snomed'],
      dtype='object')
  expanded_symptoms_new     snomed
0        abdominal pain        NaN
1        abd distention  609624008
2        abd distention  609624008
3   nausea and vomiting   16932000
4              diarrhea   62315008


In [107]:
merged_final.head()

Unnamed: 0,subject_id,stay_id,intime,outtime,gender,race,chiefcomplaint,anchor_age,anchor_year,anchor_year_group,...,terms,terms_new,indiv_symptom,counter,unique_ids_exploded,expanded_symptoms,expanded_symptoms_new,counter_new,unique_ids_exploded_new,snomed
0,10000032,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,"Abd pain, Abdominal distention",52.0,2180.0,2014 - 2016,...,"[abd pain, abd distention]","abd pain, abd distention",abd pain,1,33258284_1,[abdominal pain],abdominal pain,a,33258284_1_a,
1,10000032,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,"Abd pain, Abdominal distention",52.0,2180.0,2014 - 2016,...,"[abd pain, abd distention]","abd pain, abd distention",abd distention,2,33258284_2,[abd distention],abd distention,a,33258284_2_a,609624008.0
2,10000032,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,F,WHITE,Abdominal distention,52.0,2180.0,2014 - 2016,...,[abd distention],abd distention,abd distention,1,38112554_1,[abd distention],abd distention,a,38112554_1_a,609624008.0
3,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,...,"[n/v/d, abd pain]","nausea and vomiting, diarrhea, abd pain",nausea and vomiting,1,35968195_1,[nausea and vomiting],nausea and vomiting,a,35968195_1_a,16932000.0
4,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,...,"[n/v/d, abd pain]","nausea and vomiting, diarrhea, abd pain",diarrhea,2,35968195_2,[diarrhea],diarrhea,a,35968195_2_a,62315008.0


In [108]:
# Count how many rows are missing SNOMED codes
missing_snomed_count = merged_final['snomed'].isna().sum()

# Check for duplicates in 'unique_ids_exploded_new'
duplicated_unique_ids = merged_final['unique_ids_exploded_new'].duplicated().sum()

# Return both results
missing_snomed_count, duplicated_unique_ids


(347976, 0)

In [109]:
# Filter rows where SNOMED code is missing
missing_snomed_rows = merged_final[merged_final['snomed'].isna()]

# Get unique symptom names with missing SNOMED code
unique_missing_symptoms = missing_snomed_rows['expanded_symptoms_new'].dropna().unique()

# Convert to list for easy viewing
unique_missing_symptoms_list = list(unique_missing_symptoms)

unique_missing_symptoms_list[:20], len(unique_missing_symptoms_list)  # Show first 20 and total count


(['abdominal pain',
  'lethagic',
  'hallucinations',
  'altered mental status',
  'b pedal edema',
  'left cheek swelling',
  'l cheek abscess',
  'l facial swelling',
  'suture removal',
  'laceration',
  'throat foreign body sensation',
  'l hip pain',
  'r foot pain',
  'anemia s/p fall',
  'luq abd pain',
  'influenza-like illness',
  'Alcohol use',
  'l shoulder pain',
  'visual changes',
  'abnormal labs'],
 24011)

In [110]:
# Ensure lowercase terms in lookup
look_up['term'] = look_up['term'].str.strip().str.lower()

# Create a copy of the main dataset
merged_with_lookup = merged_final.copy()

# Convert gender to is_male
merged_with_lookup['is_male'] = merged_with_lookup['gender'].str.upper().map({'M': True, 'F': False})

# Round anchor_age to get ed_age
merged_with_lookup['ed_age'] = merged_with_lookup['anchor_age'].round().astype('Int64')

# Prepare for merging
merged_with_lookup = merged_with_lookup.merge(
    look_up[['term', 'symptom', 'is_male', 'ed_age', 'mu', 'sigma']],
    how='left',
    left_on=['expanded_symptoms_new', 'is_male', 'ed_age'],
    right_on=['term', 'is_male', 'ed_age']
)

# Drop the redundant 'term' column after merging
merged_with_lookup.drop(columns=['term'], inplace=True)

In [111]:
merged_with_lookup.head(20)

Unnamed: 0,subject_id,stay_id,intime,outtime,gender,race,chiefcomplaint,anchor_age,anchor_year,anchor_year_group,...,expanded_symptoms,expanded_symptoms_new,counter_new,unique_ids_exploded_new,snomed,is_male,ed_age,symptom,mu,sigma
0,10000032,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,"Abd pain, Abdominal distention",52.0,2180.0,2014 - 2016,...,[abdominal pain],abdominal pain,a,33258284_1_a,,False,52,21522001.0,-5.522575,0.052208
1,10000032,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,"Abd pain, Abdominal distention",52.0,2180.0,2014 - 2016,...,[abd distention],abd distention,a,33258284_2_a,609624008.0,False,52,,,
2,10000032,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,F,WHITE,Abdominal distention,52.0,2180.0,2014 - 2016,...,[abd distention],abd distention,a,38112554_1_a,609624008.0,False,52,,,
3,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,...,[nausea and vomiting],nausea and vomiting,a,35968195_1_a,16932000.0,False,52,16932000.0,-4.561999,0.151079
4,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,...,[diarrhea],diarrhea,a,35968195_2_a,62315008.0,False,52,62315008.0,-5.942055,0.206403
5,10000032,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,"n/v/d, Abd pain",52.0,2180.0,2014 - 2016,...,[abdominal pain],abdominal pain,a,35968195_3_a,,False,52,21522001.0,-5.522575,0.052208
6,10000032,32952584,2180-07-22 16:24:00,2180-07-23 05:54:00,F,WHITE,Hypotension,52.0,2180.0,2014 - 2016,...,[hypotension],hypotension,a,32952584_1_a,45007003.0,False,52,45007003.0,-3.693187,0.2986
7,10000032,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,"Abdominal distention, Abd pain, LETHAGIC",52.0,2180.0,2014 - 2016,...,[abd distention],abd distention,a,39399961_1_a,609624008.0,False,52,,,
8,10000032,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,"Abdominal distention, Abd pain, LETHAGIC",52.0,2180.0,2014 - 2016,...,[abdominal pain],abdominal pain,a,39399961_2_a,,False,52,21522001.0,-5.522575,0.052208
9,10000032,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,"Abdominal distention, Abd pain, LETHAGIC",52.0,2180.0,2014 - 2016,...,[lethagic],lethagic,a,39399961_3_a,,False,52,,,


In [112]:
# Check if all values in 'unique_ids_exploded_new' are unique
is_unique = merged_with_lookup['unique_ids_exploded_new'].is_unique

# Count of duplicated entries, if any
duplicate_count = merged_with_lookup['unique_ids_exploded_new'].duplicated().sum()

is_unique, duplicate_count

(True, 0)

In [113]:

# Save short terms data to CSV
merged_with_lookup.to_csv('dataset/ed/finals/15_finaldata_with prohability.csv', index=False)

In [114]:
# Check unique symptom codes from the 'symptom' column (SNOMED codes)
unique_symptom_codes = merged_with_lookup['symptom'].dropna().unique()
unique_symptom_codes_count = len(unique_symptom_codes)

unique_symptom_codes_count, unique_symptom_codes[:10]  # Show total count and first 10 unique codes for preview


(134,
 array(['21522001', '16932000', '62315008', '45007003', '128477000',
        '422400008', '404640003', '29857009', '267036007', '237633009'],
       dtype=object))

In [None]:
# Step 1: Your mapping dictionary (already defined as symptom_expand_map)

# Step 2: Apply mapping and handle explosion
expanded_rows = []

for _, row in data.iterrows():
    # Skip rows with missing symptoms
    if pd.isna(row['indiv_symptoms']):
        row['mapped_symptom'] = None
        expanded_rows.append(row)
        continue

    original = row['indiv_symptoms'].strip().lower()
    base_index = str(row['symptom_index'])
    stay_id = str(row['stay_id'])
    
    # Case 1: if symptom needs to be expanded to multiple terms
    if original in symptom_expand_map:
        mapped_list = symptom_expand_map[original]
        
        if isinstance(mapped_list, list):
            for i, mapped in enumerate(mapped_list, start=1):
                new_row = row.copy()
                new_row['mapped_symptom'] = mapped
                new_row['symptom_index'] = f"{base_index}_{i}"
                new_row['stayed_id_num'] = f"{stay_id}_{base_index}_{i}"
                expanded_rows.append(new_row)
        else:
            new_row = row.copy()
            new_row['mapped_symptom'] = mapped_list
            expanded_rows.append(new_row)
    else:
        # Case 2: keep original symptom if no mapping found
        row['mapped_symptom'] = row['indiv_symptoms']
        expanded_rows.append(row)

# Step 3: Create final DataFrame
data = pd.DataFrame(expanded_rows)


In [None]:
data.head(20)

In [None]:
data.to_csv("dataset/ed/finals/10_mapped_symptoms_dataset.csv", index=False)

In [None]:
# Step 1: Drop missing values just in case
short_symptoms = data['mapped_symptom'].dropna()

# Step 2: Convert to lowercase and remove duplicates
short_symptoms = short_symptoms.astype(str).str.strip().str.lower().unique()

# Step 3: Filter those with fewer than 4 characters
abbreviated_symptoms = [symptom for symptom in short_symptoms if len(symptom) < 4]

# Step 4: Show result
print("Potential Abbreviated Symptoms (< 4 characters):")
print(abbreviated_symptoms)


In [None]:
data.to_csv("dataset/ed/finals/11_mapped_symptoms_dataset.csv", index=False)

In [None]:
snomed_df = pd.read_csv("dataset/ed/finals/snomed.csv")  # Update path if needed
snomed_df.columns = ['count', 'text', 'snomed']
snomed_df['text'] = snomed_df['text'].str.strip().str.lower()
snomed_df = snomed_df.dropna(subset=['snomed'])

# Create lookup dictionary
snomed_lookup = dict(zip(snomed_df['text'], snomed_df['snomed']))


In [None]:
def clean_chief_complaint(text):
    if pd.isna(text):
        return []

    # Lowercase and remove unwanted characters
    text = text.lower()
    text = re.sub(r"[\'\"+\?]", "", text)

    # Remove directional terms
    directional_terms = [
        r"\bllq\s*", r"\brlq\s*", r"\bluq\s*", r"\bruq\s*",
        r"\bl\s*", r"\br\s*", r"\blower\s*", r"\bupper\s*"
    ]
    for pattern in directional_terms:
        text = re.sub(pattern, "", text)

    # Normalize terminology
    text = re.sub(r"\babdominal\b", "abd", text)

    # Split on comma to handle multiple complaints
    return [t.strip() for t in text.split(",") if t.strip()]


In [None]:
def map_to_snomed(complaints):
    return [snomed_lookup[term] for term in complaints if term in snomed_lookup]

# Clean and map
merged_data['chiefcomplaint_cleaned'] = merged_data['chiefcomplaint'].apply(clean_chief_complaint)
merged_data['snomed_codes'] = merged_data['chiefcomplaint_cleaned'].apply(map_to_snomed)


In [None]:
snomed_codes

In [None]:
# Sample random chief complaints to get a sense of variation
random_complaints = data['chiefcomplaint'].sample(20)
print("Random sample of chief complaints:")
print(random_complaints)

# Look at the most common chief complaints
top_complaints = data['chiefcomplaint'].value_counts().head(30)
print("\nTop 30 most common chief complaints:")
print(top_complaints)

# Look at some complaints in the middle of the frequency distribution
mid_freq_complaints = data['chiefcomplaint'].value_counts().iloc[100:120]
print("\nMid-frequency chief complaints (rank 100-120):")
print(mid_freq_complaints)

# Look at some of the least common complaints
rare_complaints = data['chiefcomplaint'].value_counts().tail(20)
print("\nSome of the least common chief complaints:")
print(rare_complaints)

# Check for patterns with specific words
for term in ['pain', 'chest', 'abdominal', 'fall', 'head', 'breath']:
    count = data['chiefcomplaint'].str.contains(term, case=False, na=False).sum()
    print(f"Complaints containing '{term}': {count}")

In [None]:

snomed['text'] = snomed['text'].str.strip().str.lower()
snomed = snomed.dropna(subset=['snomed'])

# Create lookup dictionary
snomed_lookup = dict(zip(snomed['text'], snomed['snomed']))


In [None]:
def clean_chief_complaint(text):
    if pd.isna(text):
        return []

    # Lowercase and remove unwanted characters
    text = text.lower()
    text = re.sub(r"[\'\"+\?]", "", text)

    # Remove directional terms
    directional_terms = [
        r"\bllq\s*", r"\brlq\s*", r"\bluq\s*", r"\bruq\s*",
        r"\bl\s*", r"\br\s*", r"\blower\s*", r"\bupper\s*"
    ]
    for pattern in directional_terms:
        text = re.sub(pattern, "", text)

    # Normalize terminology
    text = re.sub(r"\babdominal\b", "abd", text)

    # Split on comma to handle multiple complaints
    return [t.strip() for t in text.split(",") if t.strip()]


In [None]:
def map_to_snomed(complaints):
    return [snomed_lookup[term] for term in complaints if term in snomed_lookup]

# Clean and map
data['chiefcomplaint_cleaned'] = data['chiefcomplaint'].apply(clean_chief_complaint)
data['snomed_codes'] = data['chiefcomplaint_cleaned'].apply(map_to_snomed)


In [None]:
data['snomed_primary'] = data['snomed_codes'].apply(lambda x: x[0] if x else None)

In [None]:
data[['chiefcomplaint', 'chiefcomplaint_cleaned', 'snomed_codes']].sample(10)

In [None]:
mapped_count = data['snomed_codes'].apply(lambda x: len(x) > 0).sum()
total_count = len(data)
print(f"Mapped SNOMED complaints: {mapped_count} out of {total_count}")
print("*******************************************")

print(f"Mapping success rate: {round(mapped_count / total_count * 100, 2)}%")
print("*******************************************")

data['snomed_primary'] = data['snomed_codes'].apply(lambda x: x[0] if x else None)
print(data['snomed_primary'].value_counts().head(10))

print("**************************************************")
unmapped = data[data['snomed_codes'].apply(len) == 0]
print(unmapped['chiefcomplaint'].value_counts().head(10))



In [None]:
merged_data.to_csv("dataset/ed/finals/dataset/ed/4_merged_data.csv", index=False)