In [1]:
import pandas as pd

# Read each filtered CSV export into its own DataFrame.
allergies = pd.read_csv('csv_filtered/allergies.csv')
careplans = pd.read_csv('csv_filtered/careplans.csv')
conditions = pd.read_csv('csv_filtered/conditions.csv')
encounters = pd.read_csv('csv_filtered/encounters.csv')
immunizations = pd.read_csv('csv_filtered/immunizations.csv')
medications = pd.read_csv('csv_filtered/medications.csv')
observations = pd.read_csv('csv_filtered/observations.csv')
patients = pd.read_csv('csv_filtered/patients.csv')
procedures = pd.read_csv('csv_filtered/procedures.csv')

# Report the (rows, columns) shape of every table, in load order.
for label, frame in (
    ("Shape of allergies DataFrame:", allergies),
    ("Shape of careplans DataFrame:", careplans),
    ("Shape of conditions DataFrame:", conditions),
    ("Shape of encounters DataFrame:", encounters),
    ("Shape of immunizations DataFrame:", immunizations),
    ("Shape of medications DataFrame:", medications),
    ("Shape of observations DataFrame:", observations),
    ("Shape of patients DataFrame:", patients),
    ("Shape of procedures DataFrame:", procedures),
):
    print(label, frame.shape)

Shape of allergies DataFrame: (794, 6)
Shape of careplans DataFrame: (3931, 9)
Shape of conditions DataFrame: (38094, 6)
Shape of encounters DataFrame: (61459, 7)
Shape of immunizations DataFrame: (17009, 5)
Shape of medications DataFrame: (56430, 8)
Shape of observations DataFrame: (531144, 7)
Shape of patients DataFrame: (1163, 21)
Shape of procedures DataFrame: (83823, 6)


# Keep only the data from the latest day.

## 1) Allergies

In [2]:
# Preview the raw allergies table (pandas truncates long frames when rendering).
allergies

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
0,2/17/20,,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,01efcc52-15d6-51e9-faa2-bee069fcbe44,1.110880e+08,Latex (substance)
1,2/17/20,,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,01efcc52-15d6-51e9-faa2-bee069fcbe44,8.448900e+07,Mold (organism)
2,2/17/20,,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,01efcc52-15d6-51e9-faa2-bee069fcbe44,2.601470e+08,House dust mite (organism)
3,2/17/20,,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,01efcc52-15d6-51e9-faa2-bee069fcbe44,2.642870e+08,Animal dander (substance)
4,2/17/20,,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,01efcc52-15d6-51e9-faa2-bee069fcbe44,2.562770e+08,Grass pollen (substance)
...,...,...,...,...,...,...
789,5/20/87,,27cb1c2f-4ec0-6e24-d080-e46178f712d5,04d64fc5-6137-3d7a-06b6-dd6dc2d6a7c1,4.425710e+14,Tree nut (substance)
790,4/9/01,,0cf08145-a634-329f-54d3-9cbd04ba18cc,ef2f78b9-750e-607b-f3f4-3de39ee2eeec,1.191000e+03,Aspirin
791,4/9/01,,0cf08145-a634-329f-54d3-9cbd04ba18cc,ef2f78b9-750e-607b-f3f4-3de39ee2eeec,1.022630e+08,Eggs (edible) (substance)
792,8/15/80,,9c532b25-cd38-f13a-f2de-2f5538326ed6,b3afbc77-ec8c-1cf8-05fe-740c91fa1d0f,1.191000e+03,Aspirin


In [3]:
# Number of missing values in each allergies column.
allergies.isna().sum()

START            0
STOP           794
PATIENT          0
ENCOUNTER        0
CODE             0
DESCRIPTION      0
dtype: int64

In [4]:
# Drop the STOP column (the null count above shows it is missing on every row).
# Rebinding instead of mutating in place, together with errors='ignore', keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because STOP is already gone.
allergies = allergies.drop(columns=['STOP'], errors='ignore')

In [5]:
# Count, for every patient, how many distinct START dates and how many
# distinct ENCOUNTER ids appear in the allergies table.
allergies_by_patient = allergies.groupby('PATIENT')
start_check = allergies_by_patient['START'].nunique()
encounter_check = allergies_by_patient['ENCOUNTER'].nunique()

# Place both counts side by side (one row per patient) for easier reading.
check_df = pd.concat(
    [
        start_check.rename('Unique_START_Dates'),
        encounter_check.rename('Unique_ENCOUNTERs'),
    ],
    axis=1,
)

check_df

Unnamed: 0_level_0,Unique_START_Dates,Unique_ENCOUNTERs
PATIENT,Unnamed: 1_level_1,Unnamed: 2_level_1
0142b69f-57f0-9a08-4e2d-65a2b77fdea7,1,1
01d78eb5-7f50-45e9-f524-921196a3dffe,1,1
01f8bbfd-cfc6-3b97-8bc1-8da6f0b4a9a8,1,1
0288c42c-43a1-9878-4a9d-6b96caa12c40,1,1
02ceca12-357f-981e-dcf3-3d26d3c1ff82,1,1
...,...,...
fc8e4748-86f4-1d49-af95-4c7e48678ca7,1,1
fca626fd-ab99-57fb-ba8d-f0bad053788b,1,1
fcca54e8-5abe-cd71-30a0-464f15368bea,1,1
ff59238a-9508-b0e5-39f4-0d4afcbe6f43,1,1


In [6]:
# Confirm that every patient has exactly one distinct START date and exactly
# one distinct ENCOUNTER, i.e. both count columns contain nothing but 1s.
all_ones_start = check_df['Unique_START_Dates'].eq(1).all()
all_ones_encounter = check_df['Unique_ENCOUNTERs'].eq(1).all()

all_ones_start, all_ones_encounter

(True, True)

No patient has more than one encounter or more than one start date.

In [7]:
# Number of distinct DESCRIPTION values recorded under each CODE.
code_description_counts = allergies.groupby('CODE')['DESCRIPTION'].nunique()

# Keep only the codes that map to more than one description.
codes_with_multiple_descriptions = code_description_counts.loc[code_description_counts.gt(1)]

if not codes_with_multiple_descriptions.empty:
    print("Some codes still have multiple descriptions:")
    print(codes_with_multiple_descriptions)
else:
    print("All codes have exactly one unique description.")

All codes have exactly one unique description.


Flatten the data: one row per (START, PATIENT, ENCOUNTER), with one indicator column per allergy.

In [8]:
# Order the records by CODE and tag each one with a constant presence flag
# that the pivot below spreads into per-allergy columns.
sorted_data = allergies.sort_values(by='CODE').assign(AllergyPresent=1)

# One row per (START, PATIENT, ENCOUNTER) and one column per allergy
# DESCRIPTION; combinations that never occur are filled with 0.
pivot_table = (
    pd.pivot_table(
        sorted_data,
        index=['START', 'PATIENT', 'ENCOUNTER'],
        columns='DESCRIPTION',
        values='AllergyPresent',
        fill_value=0,
    )
    .reset_index()
    # Remove the leftover 'DESCRIPTION' label from the columns axis.
    .rename_axis(None, axis='columns')
)

# The pivot may have ordered the allergy columns alphabetically, so recover
# the CODE ordering: one DESCRIPTION per (CODE, DESCRIPTION) pair, sorted by CODE.
allergy_order = (
    sorted_data[['CODE', 'DESCRIPTION']]
    .drop_duplicates()
    .sort_values('CODE')['DESCRIPTION']
)

# Key columns first, then the allergy columns in CODE order.
ordered_columns = ['START', 'PATIENT', 'ENCOUNTER'] + allergy_order.tolist()
pivot_table = pivot_table.reindex(columns=ordered_columns)

allergies_converted = pivot_table
allergies_converted

Unnamed: 0,START,PATIENT,ENCOUNTER,Aspirin,Ibuprofen,Penicillin V,Sulfamethoxazole / Trimethoprim,cefdinir,Lisinopril,Cow's milk (substance),...,Soya bean (substance),House dust mite (organism),Animal dander (substance),Bee venom (substance),Wheat (substance),Shellfish (substance),Fish (substance),Peanut (substance),Tree pollen (substance),Tree nut (substance)
0,1/1/05,307b6419-5147-4716-10b1-bb458ac191c3,1b7e57b0-cfda-1832-27c6-6f629a05ce17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1/11/03,25e8dac5-c8bc-3dc7-d319-140d1da359c1,92f63e3c-72a7-e521-9b32-b173263794a9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
2,1/11/15,8baeb614-ce03-cedf-2694-92b70f894886,a32d2519-3f50-0009-6082-874495ec845d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1/12/75,ee1a72db-aa92-9377-48fa-f7d9f1f8443b,6d1d01cb-8d4f-6789-ecc7-dc8ed0dbef2a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1/17/07,4ba15182-a126-7f01-a46e-1901c5c5ceae,3ad196bf-1083-eb2e-e70c-24f6ce4efcc0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,9/22/83,94866a7e-4ee3-a5a7-5109-6642068d6a14,27ba8fac-bfc6-df8c-326d-20ac5da690cb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
175,9/28/04,b24f84f8-7adf-2b2e-dfb3-1c1c466d0a74,70b2a3cb-4948-3c0b-c440-b1b8a30f2028,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
176,9/5/76,c987d011-9807-23da-8487-4d7ce3511a86,bcc183fb-036f-afe7-5543-2e5e79f02387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
177,9/6/73,74b7fc31-8223-c477-0ec9-5b2ceea7c28f,9bcef791-5ea9-caf0-5838-7a5ab0468bd5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2) Careplans

In [9]:
# Preview the raw careplans table (pandas truncates long frames when rendering).
careplans

Unnamed: 0,Id,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,6d10e8ad-cdf8-db60-ff71-688eae2861c2,2020-02-05,,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,fb15e123-fea7-cae8-6d49-ee9d2a85fc84,384758001,Self-care interventions (procedure),,
1,73f93524-dc7c-ed80-8c0f-b12605033d9a,2013-06-24,2014-07-07,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,0b2794bd-ec2b-d34f-0610-2523b3b7fcf0,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
2,fd741eee-5295-1185-8307-96cce2d592b4,2016-02-27,2016-03-14,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,a6d818dd-0983-fd1c-eefa-3d2295532c45,225358003,Wound care,283371005.0,Laceration of forearm
3,cb2fead4-d300-c0cb-4b87-c5869ba9ffba,2017-02-22,2017-06-02,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,6474f606-5a1b-48c0-bbbf-ad6dcbc24d4e,385691007,Fracture care,16114001.0,Fracture of ankle
4,7f73e1a4-1bbb-bd5b-c478-09bf05e7fb36,2020-02-06,2020-03-14,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,a6da4c61-bc91-17a7-14a2-fda9023536a3,91251008,Physical therapy procedure,44465007.0,Sprain of ankle
...,...,...,...,...,...,...,...,...,...
3926,51520a8a-c857-451d-b6ca-6b0578f5f4e5,2012-11-16,,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,3af46bfd-6b0f-f636-42fa-cf06d87c944b,698360004,Diabetes self management plan,15777000.0,Prediabetes
3927,cdb12fed-30b0-c8a3-5093-4f2b9781218e,2014-05-12,2014-12-12,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,f323f89a-4a0a-54a3-59b9-2bfd96fad86c,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
3928,41e44caf-d6ee-2183-9ce3-09bc1d1dab0b,2015-02-12,2015-12-18,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,53df8912-f20e-a8b7-f6a0-156d914d74ab,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
3929,3b5a358d-49d3-b81e-3d42-dc29f9773659,2018-06-21,2018-09-11,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,b171a750-d9a0-f87c-f463-ca8a11ce99e8,47387005,Head injury rehabilitation,62564004.0,Concussion with loss of consciousness


In [10]:
# Convert START column to datetime
careplans['START'] = pd.to_datetime(careplans['START'])

# Find the latest START date for each patient
latest_dates = careplans.groupby('PATIENT')['START'].max().reset_index()

# Merge to get all rows that match the latest START date for each patient
careplans_latest = careplans.merge(latest_dates, on=['PATIENT', 'START'])

careplans_latest

Unnamed: 0,Id,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,6d10e8ad-cdf8-db60-ff71-688eae2861c2,2020-02-05,,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,fb15e123-fea7-cae8-6d49-ee9d2a85fc84,384758001,Self-care interventions (procedure),,
1,7f73e1a4-1bbb-bd5b-c478-09bf05e7fb36,2020-02-06,2020-03-14,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,a6da4c61-bc91-17a7-14a2-fda9023536a3,91251008,Physical therapy procedure,44465007.0,Sprain of ankle
2,1c54773c-c0f1-8ca4-4a53-523bf1ca1915,2016-07-04,,339144f8-50e1-633e-a013-f361391c4cff,8195411f-4a65-9245-07d3-1999e1c223df,443402002,Lifestyle education regarding hypertension,59621000.0,Hypertension
3,e5197691-0013-5b28-7a87-81187656a259,2020-04-21,2020-11-17,d488232e-bf14-4bed-08c0-a82f34b6a197,77430c51-d02f-a065-954e-c321b9bb591f,134435003,Routine antenatal care,72892002.0,Normal pregnancy
4,14991c9e-03df-dbe3-f3f1-6842f0aa9c0e,2015-11-22,2015-12-09,217f95a3-4e10-bd5d-fb67-0cfb5e8ba075,a6817963-5790-3ba8-5703-96eccbbfdebc,225358003,Wound care,283385000.0,Laceration of thigh
...,...,...,...,...,...,...,...,...,...
1129,7b994c18-9bb9-3629-6351-17cd81071725,2021-09-28,,41862157-5c14-f706-4a94-d2929be969e7,955e1e31-bfd7-3933-bbab-87aecfa25509,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
1130,c2b2cfcd-8774-216a-d250-8d465551b295,2014-02-11,2014-03-16,cb328021-a854-dc94-e7ae-426580477308,94efd413-4574-760d-9f6c-c7afa593c732,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
1131,00c3b03b-e698-78ce-71d6-baed5a3a14e5,2020-12-29,2020-12-29,d53c57a5-4480-2481-32ee-b2844a991c9d,b756d6aa-2226-ad5b-4609-30ce37b24ead,736376001,Infectious disease care plan (record artifact),840544004.0,Suspected COVID-19
1132,5c8c758d-8610-f6ff-0a2a-94df2d8634e7,2020-12-29,2021-01-15,d53c57a5-4480-2481-32ee-b2844a991c9d,b756d6aa-2226-ad5b-4609-30ce37b24ead,736376001,Infectious disease care plan (record artifact),840539006.0,COVID-19


In [11]:
# Parse STOP as datetimes; missing values become NaT (treated here as an
# ongoing care plan). Re-running this on an already parsed column is harmless.
careplans_latest['STOP'] = pd.to_datetime(careplans_latest['STOP'])

# Latest STOP per patient. max() skips NaT and returns NaT when every STOP of
# the patient is missing, which is exactly what the previous
# groupby.apply(lambda ...) computed, without relying on apply over the
# grouping column.
latest_stop_dates = careplans_latest.groupby('PATIENT')['STOP'].max()

# Look up each row's per-patient latest STOP without merging a helper column
# into careplans_latest. The earlier merge made this cell fail on a second
# run (the helper column came back with _x/_y suffixes).
latest_stop_per_row = careplans_latest['PATIENT'].map(latest_stop_dates)

# Keep rows whose STOP equals the patient's latest STOP, plus every row with
# no STOP at all (ongoing care plan).
keep_row = careplans_latest['STOP'].eq(latest_stop_per_row) | careplans_latest['STOP'].isna()
latest_careplans_with_latest_stop = careplans_latest[keep_row]

latest_careplans_with_latest_stop

Unnamed: 0,Id,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,6d10e8ad-cdf8-db60-ff71-688eae2861c2,2020-02-05,NaT,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,fb15e123-fea7-cae8-6d49-ee9d2a85fc84,384758001,Self-care interventions (procedure),,
1,7f73e1a4-1bbb-bd5b-c478-09bf05e7fb36,2020-02-06,2020-03-14,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,a6da4c61-bc91-17a7-14a2-fda9023536a3,91251008,Physical therapy procedure,44465007.0,Sprain of ankle
2,1c54773c-c0f1-8ca4-4a53-523bf1ca1915,2016-07-04,NaT,339144f8-50e1-633e-a013-f361391c4cff,8195411f-4a65-9245-07d3-1999e1c223df,443402002,Lifestyle education regarding hypertension,59621000.0,Hypertension
3,e5197691-0013-5b28-7a87-81187656a259,2020-04-21,2020-11-17,d488232e-bf14-4bed-08c0-a82f34b6a197,77430c51-d02f-a065-954e-c321b9bb591f,134435003,Routine antenatal care,72892002.0,Normal pregnancy
4,14991c9e-03df-dbe3-f3f1-6842f0aa9c0e,2015-11-22,2015-12-09,217f95a3-4e10-bd5d-fb67-0cfb5e8ba075,a6817963-5790-3ba8-5703-96eccbbfdebc,225358003,Wound care,283385000.0,Laceration of thigh
...,...,...,...,...,...,...,...,...,...
1128,99222d48-7fa9-a527-e678-611c78f2d307,1981-01-09,NaT,db2bac5f-730a-7f2a-a600-50d75bdf16c8,3ef5914f-b7b1-bc96-8cd0-96238159ca86,443402002,Lifestyle education regarding hypertension,59621000.0,Hypertension
1129,7b994c18-9bb9-3629-6351-17cd81071725,2021-09-28,NaT,41862157-5c14-f706-4a94-d2929be969e7,955e1e31-bfd7-3933-bbab-87aecfa25509,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
1130,c2b2cfcd-8774-216a-d250-8d465551b295,2014-02-11,2014-03-16,cb328021-a854-dc94-e7ae-426580477308,94efd413-4574-760d-9f6c-c7afa593c732,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
1132,5c8c758d-8610-f6ff-0a2a-94df2d8634e7,2020-12-29,2021-01-15,d53c57a5-4480-2481-32ee-b2844a991c9d,b756d6aa-2226-ad5b-4609-30ce37b24ead,736376001,Infectious disease care plan (record artifact),840539006.0,COVID-19


In [12]:
# Rows per patient in the filtered care plans.
patient_counts = latest_careplans_with_latest_stop['PATIENT'].value_counts()

# Patients that still have more than one row.
patients_with_multiple_entries = patient_counts.loc[patient_counts.gt(1)]

# How many such patients there are.
patients_with_multiple_entries_count = patients_with_multiple_entries.size

# Show the count together with the per-patient row counts.
patients_with_multiple_entries_count, patients_with_multiple_entries

(3,
 PATIENT
 d362f4e5-244f-cf80-f2d5-25bcd2c97785    2
 80591a09-43a2-3933-605e-d14dd364f2c5    2
 987453c3-b90f-a21e-08b8-3e940923c52b    2
 Name: count, dtype: int64)

In [13]:
# Reduce the filtered care plans to exactly one row per patient.
#
# Rows are ordered by PATIENT, then by STOP descending. sort_values places
# missing values (NaT) last by default, so within a patient a row with a real
# STOP date is preferred over an open-ended one, and keep='first' retains
# that top row.
filtered_df = (
    latest_careplans_with_latest_stop
    .sort_values(by=['PATIENT', 'STOP'], ascending=[True, False])
    .drop_duplicates(subset='PATIENT', keep='first')
)

# NOTE(review): an earlier version of this cell followed up with a tie-break
# on REASONDESCRIPTION rarity. That branch could never execute, because
# drop_duplicates above already leaves one row per patient, and it referenced
# `reason_counts`, which is not defined in any preceding cell. It has been
# removed; the selected rows are unchanged. If a rarity tie-break is wanted,
# it has to be added as an extra sort key BEFORE drop_duplicates.
final_corrected_df = filtered_df

# Guard the invariant the rest of the notebook relies on.
assert final_corrected_df['PATIENT'].is_unique, "expected exactly one care plan row per patient"

# Final verification: rows per patient after de-duplication.
final_verification = final_corrected_df['PATIENT'].value_counts()

# Show the counts for the patients that previously had multiple entries.
final_verification[patients_with_multiple_entries.index]

PATIENT
d362f4e5-244f-cf80-f2d5-25bcd2c97785    1
80591a09-43a2-3933-605e-d14dd364f2c5    1
987453c3-b90f-a21e-08b8-3e940923c52b    1
Name: count, dtype: int64

In [14]:
# From here on, careplans_latest refers to the one-row-per-patient frame.
careplans_latest = final_corrected_df

# True for every CODE that appears with more than one DESCRIPTION.
desc_with_same_code = careplans_latest.groupby('CODE')['DESCRIPTION'].nunique().gt(1)

# True for every REASONCODE that appears with more than one REASONDESCRIPTION.
reason_desc_with_same_reason_code = (
    careplans_latest.groupby('REASONCODE')['REASONDESCRIPTION'].nunique().gt(1)
)

# Collect the offending codes / reason codes as plain lists.
codes_with_multiple_descriptions = desc_with_same_code.index[desc_with_same_code.to_numpy()].tolist()
reason_codes_with_multiple_descriptions = (
    reason_desc_with_same_reason_code.index[reason_desc_with_same_reason_code.to_numpy()].tolist()
)

# Summaries: the offending values plus how many there are.
result_summary_code = {
    "Codes with Multiple Descriptions": codes_with_multiple_descriptions,
    "Number of Codes with Multiple Descriptions": len(codes_with_multiple_descriptions),
}
result_summary_reason_code = {
    "Reason Codes with Multiple Descriptions": reason_codes_with_multiple_descriptions,
    "Number of Reason Codes with Multiple Descriptions": len(reason_codes_with_multiple_descriptions),
}

result_summary_code, result_summary_reason_code

({'Codes with Multiple Descriptions': [734163000],
  'Number of Codes with Multiple Descriptions': 1},
 {'Reason Codes with Multiple Descriptions': [],
  'Number of Reason Codes with Multiple Descriptions': 0})

In [15]:
# Rows carrying CODE 734163000, the only code reported with several descriptions.
is_care_plan_code = careplans_latest['CODE'].eq(734163000)

# Give all of those rows a single canonical DESCRIPTION.
careplans_latest.loc[is_care_plan_code, 'DESCRIPTION'] = 'Care plan (record artifact)'

# Verify: the code should now map to exactly one description.
unique_descriptions_for_734163000 = careplans_latest.loc[is_care_plan_code, 'DESCRIPTION'].unique()

unique_descriptions_for_734163000

array(['Care plan (record artifact)'], dtype=object)

In [16]:
# Re-run the consistency check after the DESCRIPTION correction.

# True for every CODE that appears with more than one DESCRIPTION.
desc_with_same_code = careplans_latest.groupby('CODE')['DESCRIPTION'].nunique().gt(1)

# True for every REASONCODE that appears with more than one REASONDESCRIPTION.
reason_desc_with_same_reason_code = (
    careplans_latest.groupby('REASONCODE')['REASONDESCRIPTION'].nunique().gt(1)
)

# Collect the offending codes / reason codes as plain lists.
codes_with_multiple_descriptions = desc_with_same_code.index[desc_with_same_code.to_numpy()].tolist()
reason_codes_with_multiple_descriptions = (
    reason_desc_with_same_reason_code.index[reason_desc_with_same_reason_code.to_numpy()].tolist()
)

# Summaries: the offending values plus how many there are.
result_summary_code = {
    "Codes with Multiple Descriptions": codes_with_multiple_descriptions,
    "Number of Codes with Multiple Descriptions": len(codes_with_multiple_descriptions),
}
result_summary_reason_code = {
    "Reason Codes with Multiple Descriptions": reason_codes_with_multiple_descriptions,
    "Number of Reason Codes with Multiple Descriptions": len(reason_codes_with_multiple_descriptions),
}

result_summary_code, result_summary_reason_code

({'Codes with Multiple Descriptions': [],
  'Number of Codes with Multiple Descriptions': 0},
 {'Reason Codes with Multiple Descriptions': [],
  'Number of Reason Codes with Multiple Descriptions': 0})

In [17]:
# Final care plans table. This is a second name for the same DataFrame object
# as careplans_latest (plain assignment, no copy is made).
careplans_converted = careplans_latest
careplans_converted

Unnamed: 0,Id,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
674,c8f1514b-987c-fd3c-a773-b911732b576b,2016-10-01,2017-05-13,00126cb9-8460-4747-e302-c3609684531e,da43f1e3-3209-352e-4fcc-a6eb4feb5617,134435003,Routine antenatal care,72892002.0,Normal pregnancy
1027,0b9fd313-d7fa-3f12-4e08-76efc6f55bfb,1992-09-01,NaT,00209bf2-8e4d-06d1-82a4-daad02f25829,c828d63a-412e-efdd-8ae4-063117c09053,698360004,Diabetes self management plan,44054006.0,Diabetes
730,0f712a6a-8ba2-ba51-203e-a86df389c2bb,2021-07-18,2021-08-21,00ae3b00-9500-efc1-2758-a93d3f77e650,23619864-6c53-a820-ee10-e32ccaf69ae9,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
602,acae7499-aeff-f1c4-12ff-22657cb60ded,2021-07-16,2021-08-29,00c9ca99-6b9f-add4-8759-f7dfee6ea1a4,70decf9d-462a-725b-d125-ab0a244ee3bc,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
647,2f36cd63-175d-6d25-c2d1-8f058519ac08,1991-02-01,NaT,0142b69f-57f0-9a08-4e2d-65a2b77fdea7,ae24ca09-a297-8db3-5d3b-cc0e1ad6fd2c,734163000,Care plan (record artifact),,
...,...,...,...,...,...,...,...,...,...
115,bcb47d43-b65c-ee6c-2b63-4701de7223fe,2015-03-28,2015-06-29,fee55adf-498c-5111-2136-e805906a3a74,f4acd9be-831d-320d-6fe4-dbb28b2f4a4b,53950000,Respiratory therapy,10509002.0,Acute bronchitis (disorder)
637,fc58e0d8-618a-c133-608d-df460ae8282d,1986-07-02,1986-08-25,ff06d32e-e1df-b8db-484a-2d0d9e5d8461,917a927d-4dad-2ad5-11a9-a6c092b444c6,385691007,Fracture care,359817006.0,Closed fracture of hip
571,9addec39-59f8-7b45-93c7-c5283b6f11c0,2017-06-22,2021-06-11,ff59238a-9508-b0e5-39f4-0d4afcbe6f43,52de2e19-38fd-39f0-855a-926f7301ca5b,170836005,Allergic disorder monitoring,,
670,558b7956-2e99-2247-e197-384bad3f09fc,1970-06-09,NaT,ff9337d4-4f6c-2182-c1aa-a0811a68a4ab,8111bb92-31ba-36b1-f869-17a4033dbb2e,443402002,Lifestyle education regarding hypertension,59621000.0,Hypertension


## 3) Conditions

In [18]:
# Preview the raw conditions table (pandas truncates long frames when rendering).
conditions

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
0,2013-06-24,2013-07-02,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,0b2794bd-ec2b-d34f-0610-2523b3b7fcf0,10509002,Acute bronchitis (disorder)
1,2016-02-27,2016-03-14,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,a6d818dd-0983-fd1c-eefa-3d2295532c45,283371005,Laceration of forearm
2,2016-08-11,2016-08-22,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,36d2e781-4655-0a11-1f70-c69856e02019,444814009,Viral sinusitis (disorder)
3,2016-11-27,2016-12-17,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,c8eaaf41-958b-31ab-7de5-568cee8751f3,444814009,Viral sinusitis (disorder)
4,2017-02-22,2017-06-02,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,6474f606-5a1b-48c0-bbbf-ad6dcbc24d4e,16114001,Fracture of ankle
...,...,...,...,...,...,...
38089,2020-01-10,2021-01-15,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,090096d3-9404-3cb5-d1eb-583bf4c39180,160903007,Full-time employment (finding)
38090,2020-12-27,2021-01-03,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,db101ad8-66e2-9feb-e0cf-b2618f873c3a,38822007,Cystitis
38091,2021-01-15,,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,160904001,Part-time employment (finding)
38092,2021-01-15,,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,73595000,Stress (finding)


In [19]:
# Convert START column to datetime
conditions['START'] = pd.to_datetime(conditions['START'])

# Find the latest START date for each patient
latest_dates = conditions.groupby('PATIENT')['START'].max().reset_index()

# Merge to get all rows that match the latest START date for each patient
conditions_latest = conditions.merge(latest_dates, on=['PATIENT', 'START'])

conditions_latest

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
0,2020-02-06,2020-03-14,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,a6da4c61-bc91-17a7-14a2-fda9023536a3,44465007,Sprain of ankle
1,2021-08-02,,339144f8-50e1-633e-a013-f361391c4cff,fbf1824f-1ef7-820f-111b-6982ef9a9e5f,160903007,Full-time employment (finding)
2,2019-02-28,,217f95a3-4e10-bd5d-fb67-0cfb5e8ba075,dca904f3-f230-edf5-a5b9-4ba3b1b4077a,160904001,Part-time employment (finding)
3,2019-02-28,,217f95a3-4e10-bd5d-fb67-0cfb5e8ba075,dca904f3-f230-edf5-a5b9-4ba3b1b4077a,73595000,Stress (finding)
4,2021-09-07,2021-09-27,d488232e-bf14-4bed-08c0-a82f34b6a197,2cab235d-5992-a60e-6749-504bf62fe23a,72892002,Normal pregnancy
...,...,...,...,...,...,...
1736,2021-11-07,,cb328021-a854-dc94-e7ae-426580477308,77202e99-c2ae-8d7c-024c-98225f314ef1,80583007,Severe anxiety (panic) (finding
1737,2021-09-10,2021-10-05,d53c57a5-4480-2481-32ee-b2844a991c9d,6354922b-5b98-322e-a90a-278365b153d6,444814009,Viral sinusitis (disorder)
1738,2021-01-15,,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,160904001,Part-time employment (finding)
1739,2021-01-15,,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,73595000,Stress (finding)


In [20]:
# Parse STOP as datetimes; missing values become NaT (treated here as an
# ongoing condition). Re-running this on an already parsed column is harmless.
conditions_latest['STOP'] = pd.to_datetime(conditions_latest['STOP'])

# Latest STOP per patient. max() skips NaT and returns NaT when every STOP of
# the patient is missing, which is exactly what the previous
# groupby.apply(lambda ...) computed, without relying on apply over the
# grouping column.
latest_stop_dates = conditions_latest.groupby('PATIENT')['STOP'].max()

# Look up each row's per-patient latest STOP without merging a helper column
# into conditions_latest. The earlier merge made this cell fail on a second
# run (the helper column came back with _x/_y suffixes).
latest_stop_per_row = conditions_latest['PATIENT'].map(latest_stop_dates)

# Keep rows whose STOP equals the patient's latest STOP, plus every row with
# no STOP at all (ongoing condition).
keep_row = conditions_latest['STOP'].eq(latest_stop_per_row) | conditions_latest['STOP'].isna()
latest_conditions_with_latest_stop = conditions_latest[keep_row]

latest_conditions_with_latest_stop

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
0,2020-02-06,2020-03-14,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,a6da4c61-bc91-17a7-14a2-fda9023536a3,44465007,Sprain of ankle
1,2021-08-02,NaT,339144f8-50e1-633e-a013-f361391c4cff,fbf1824f-1ef7-820f-111b-6982ef9a9e5f,160903007,Full-time employment (finding)
2,2019-02-28,NaT,217f95a3-4e10-bd5d-fb67-0cfb5e8ba075,dca904f3-f230-edf5-a5b9-4ba3b1b4077a,160904001,Part-time employment (finding)
3,2019-02-28,NaT,217f95a3-4e10-bd5d-fb67-0cfb5e8ba075,dca904f3-f230-edf5-a5b9-4ba3b1b4077a,73595000,Stress (finding)
4,2021-09-07,2021-09-27,d488232e-bf14-4bed-08c0-a82f34b6a197,2cab235d-5992-a60e-6749-504bf62fe23a,72892002,Normal pregnancy
...,...,...,...,...,...,...
1736,2021-11-07,NaT,cb328021-a854-dc94-e7ae-426580477308,77202e99-c2ae-8d7c-024c-98225f314ef1,80583007,Severe anxiety (panic) (finding
1737,2021-09-10,2021-10-05,d53c57a5-4480-2481-32ee-b2844a991c9d,6354922b-5b98-322e-a90a-278365b153d6,444814009,Viral sinusitis (disorder)
1738,2021-01-15,NaT,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,160904001,Part-time employment (finding)
1739,2021-01-15,NaT,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,73595000,Stress (finding)


In [21]:
# Rows per patient in the filtered conditions.
patient_counts = latest_conditions_with_latest_stop['PATIENT'].value_counts()

# Patients that still have more than one row.
patients_with_multiple_entries = patient_counts.loc[patient_counts.gt(1)]

# How many such patients there are.
patients_with_multiple_entries_count = patients_with_multiple_entries.size

# Show the count together with the per-patient row counts.
patients_with_multiple_entries_count, patients_with_multiple_entries

(367,
 PATIENT
 d515a84c-6dce-4070-8794-d59b7c5cf2ca    11
 050358a1-fcf7-1182-7ad1-6b663afa3002    10
 49dcfcdb-2e70-3c98-9ce1-7020c1d8850d     8
 6fc7b50b-61ff-9237-bcac-7b7b1d8606ab     8
 e31d10b1-1cfa-e761-c21d-b7fd630f55c3     8
                                         ..
 4898f210-0f92-d5f8-0b8f-29c0899b69db     2
 7049fd33-ca55-6cfd-3dd9-dfcea6228600     2
 8471f747-3d66-ed9b-9b34-277ca99303d2     2
 c75b66ef-356c-8113-5460-1e942419b3cf     2
 055bcb42-de36-4673-6d1a-628d1817dcea     2
 Name: count, Length: 367, dtype: int64)

In [22]:
# Reduce the filtered conditions to exactly one row per patient.
#
# Rows are ordered by PATIENT, then by STOP descending. sort_values places
# missing values (NaT) last by default, so within a patient a row with a real
# STOP date is preferred over an open-ended one, and keep='first' retains
# that top row.
filtered_df = (
    latest_conditions_with_latest_stop
    .sort_values(by=['PATIENT', 'STOP'], ascending=[True, False])
    .drop_duplicates(subset='PATIENT', keep='first')
)

# NOTE(review): an earlier version of this cell followed up with a tie-break
# on DESCRIPTION rarity. That branch could never execute, because
# drop_duplicates above already leaves one row per patient, and it referenced
# `reason_counts`, which is not defined in any preceding cell. It has been
# removed; the selected rows are unchanged. If a rarity tie-break is wanted,
# it has to be added as an extra sort key BEFORE drop_duplicates.
final_corrected_df = filtered_df

# Guard the invariant the rest of the notebook relies on.
assert final_corrected_df['PATIENT'].is_unique, "expected exactly one condition row per patient"

# Final verification: rows per patient after de-duplication.
final_verification = final_corrected_df['PATIENT'].value_counts()

# Show the counts for the patients that previously had multiple entries.
final_verification[patients_with_multiple_entries.index]

PATIENT
d515a84c-6dce-4070-8794-d59b7c5cf2ca    1
050358a1-fcf7-1182-7ad1-6b663afa3002    1
49dcfcdb-2e70-3c98-9ce1-7020c1d8850d    1
6fc7b50b-61ff-9237-bcac-7b7b1d8606ab    1
e31d10b1-1cfa-e761-c21d-b7fd630f55c3    1
                                       ..
4898f210-0f92-d5f8-0b8f-29c0899b69db    1
7049fd33-ca55-6cfd-3dd9-dfcea6228600    1
8471f747-3d66-ed9b-9b34-277ca99303d2    1
c75b66ef-356c-8113-5460-1e942419b3cf    1
055bcb42-de36-4673-6d1a-628d1817dcea    1
Name: count, Length: 367, dtype: int64

In [23]:
conditions_latest = final_corrected_df

# Boolean mask over the rows: True wherever the PATIENT value occurs more than once
# (keep=False marks every copy, not just the repeats)
duplicated_patients = conditions_latest.duplicated(subset=['PATIENT'], keep=False)

# The offending rows themselves, and how many of them there are
duplicated_patient_records = conditions_latest[duplicated_patients]
duplicated_count = duplicated_patients.sum()

print(
    f"Number of duplicated patient entries: {duplicated_count}",
    "Duplicated patient records:",
    duplicated_patient_records,
    sep="\n",
)

Number of duplicated patient entries: 0
Duplicated patient records:
Empty DataFrame
Columns: [START, STOP, PATIENT, ENCOUNTER, CODE, DESCRIPTION]
Index: []


In [24]:
# A CODE is inconsistent when it is paired with more than one distinct DESCRIPTION
distinct_descriptions_per_code = conditions_latest.groupby('CODE')['DESCRIPTION'].nunique()
desc_with_same_code = distinct_descriptions_per_code.gt(1)

# Pull out the CODE values flagged above
codes_with_multiple_descriptions = desc_with_same_code.index[desc_with_same_code.to_numpy()].tolist()

# Summary shown as the cell output
result_summary_code = {
    "Codes with Multiple Descriptions": codes_with_multiple_descriptions,
    "Number of Codes with Multiple Descriptions": len(codes_with_multiple_descriptions),
}

result_summary_code

{'Codes with Multiple Descriptions': [233604007],
 'Number of Codes with Multiple Descriptions': 1}

In [25]:
# Give every row with CODE 233604007 the single description 'Pneumonia (disorder)'
pneumonia_code = 233604007
is_pneumonia_row = conditions_latest['CODE'] == pneumonia_code
conditions_latest.loc[is_pneumonia_row, 'DESCRIPTION'] = 'Pneumonia (disorder)'

# Confirm the fix: CODE 233604007 should now have exactly one distinct description
unique_descriptions_for_233604007 = conditions_latest.loc[is_pneumonia_row, 'DESCRIPTION'].unique()

unique_descriptions_for_233604007

array(['Pneumonia (disorder)'], dtype=object)

In [26]:
# Re-run the consistency check: flag any CODE still paired with several DESCRIPTION values
distinct_descriptions_per_code = conditions_latest.groupby('CODE')['DESCRIPTION'].nunique()
desc_with_same_code = distinct_descriptions_per_code.gt(1)

# CODE values that are still flagged (expected to be none after the correction)
codes_with_multiple_descriptions = desc_with_same_code.index[desc_with_same_code.to_numpy()].tolist()

# Summary shown as the cell output
result_summary_code = {
    "Codes with Multiple Descriptions": codes_with_multiple_descriptions,
    "Number of Codes with Multiple Descriptions": len(codes_with_multiple_descriptions),
}

result_summary_code

{'Codes with Multiple Descriptions': [],
 'Number of Codes with Multiple Descriptions': 0}

In [27]:
# Alias, not a copy: conditions_converted refers to the same DataFrame object as
# conditions_latest, so a later in-place change to either name affects both.
conditions_converted = conditions_latest
# Bare last expression so the notebook renders the frame
conditions_converted

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
1028,2020-10-24,2021-10-30,00126cb9-8460-4747-e302-c3609684531e,36c147d3-c28a-df01-d6e0-b1fad96b99a9,706893006,Victim of intimate partner abuse (finding)
1592,2021-02-09,NaT,00209bf2-8e4d-06d1-82a4-daad02f25829,b3bc4d75-c000-8b48-5200-0cca2c4232c2,160903007,Full-time employment (finding)
1108,2021-08-21,NaT,00ae3b00-9500-efc1-2758-a93d3f77e650,0ebaab7c-63bc-a04b-1fd0-f45adf699563,160903007,Full-time employment (finding)
913,2021-08-29,NaT,00c9ca99-6b9f-add4-8759-f7dfee6ea1a4,45b47ac1-b8c3-36ca-d1d7-372fb40fd4df,160903007,Full-time employment (finding)
987,1991-02-01,NaT,0142b69f-57f0-9a08-4e2d-65a2b77fdea7,ae24ca09-a297-8db3-5d3b-cc0e1ad6fd2c,160904001,Part-time employment (finding)
...,...,...,...,...,...,...
973,1989-04-02,NaT,ff06d32e-e1df-b8db-484a-2d0d9e5d8461,3582a1ed-48c3-6a16-9085-79d28b546b30,22298006,Myocardial Infarction
237,2008-04-04,NaT,ff0e4d0e-6181-e36e-d817-64dbcaecb5d0,9ac93727-a73c-5479-6d06-35999c01756d,22298006,Myocardial Infarction
869,2017-07-13,2021-08-05,ff59238a-9508-b0e5-39f4-0d4afcbe6f43,c270ff26-12a1-08d9-75f6-7d7172705c6f,160968000,Risk activity involvement (finding)
1023,2021-07-18,NaT,ff9337d4-4f6c-2182-c1aa-a0811a68a4ab,56d237d6-418d-195a-78e2-467b2d83bd1f,82423001,Chronic pain


## 4) Encounters

In [28]:
# Display the encounters frame (read from csv_filtered/encounters.csv in the first cell)
encounters

Unnamed: 0,Id,DATE,PATIENT,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,748f8357-6cc7-551d-f31a-32fa2cf84126,2019-02-17,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,410620009,Well child visit (procedure),,
1,5a4735ae-423f-6563-28ab-b3d11b49b2d4,2019-03-24,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,410620009,Well child visit (procedure),,
2,0bee1ce6-3e2c-5506-f71c-a7ba8f64a3d3,2019-05-26,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,410620009,Well child visit (procedure),,
3,6e93bcf9-45a4-8528-0120-1c1eaa930faf,2019-07-28,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,410620009,Well child visit (procedure),,
4,8b6787c3-4316-a0cb-899d-4746525c319f,2019-10-27,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,410620009,Well child visit (procedure),,
...,...,...,...,...,...,...,...
61454,230e2215-38ab-9371-842d-a44d27ae4090,2020-12-18,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,390906007,Follow-up encounter,55822004.0,Hyperlipidemia
61455,db101ad8-66e2-9feb-e0cf-b2618f873c3a,2020-12-28,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,185345009,Encounter for symptom,38822007.0,Cystitis
61456,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,2021-01-15,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,162673000,General examination of patient (procedure),,
61457,b2a4d90b-a2f5-1c88-0fb6-ba49b1487d37,2021-08-13,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,33879002,Administration of vaccine to produce active im...,,


In [29]:
# Parse DATE into pandas datetimes (this overwrites the column on `encounters` itself)
encounters['DATE'] = pd.to_datetime(encounters['DATE'])

# One row per patient: PATIENT together with that patient's most recent DATE
latest_dates = encounters.groupby('PATIENT', as_index=False)['DATE'].max()

# Inner join on both keys keeps only the encounters that fall on each patient's latest DATE
encounters_latest = pd.merge(encounters, latest_dates, on=['PATIENT', 'DATE'])

encounters_latest

Unnamed: 0,Id,DATE,PATIENT,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,53c2bae0-f0ff-7eac-4ca1-3dce0ecebb30,2021-07-25,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,410620009,Well child visit (procedure),,
1,78fdedd9-24e2-2711-15e9-e43a73ade634,2021-10-04,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,308335008,Patient encounter procedure,,
2,fbf1824f-1ef7-820f-111b-6982ef9a9e5f,2021-08-02,339144f8-50e1-633e-a013-f361391c4cff,162673000,General examination of patient (procedure),,
3,dca904f3-f230-edf5-a5b9-4ba3b1b4077a,2019-02-28,217f95a3-4e10-bd5d-fb67-0cfb5e8ba075,162673000,General examination of patient (procedure),,
4,c9fd0ea7-f7f5-7183-5d96-bf760cc2188d,2021-09-27,d488232e-bf14-4bed-08c0-a82f34b6a197,424619006,Prenatal visit,72892002.0,Normal pregnancy
...,...,...,...,...,...,...,...
1180,040dd9f6-be70-cca2-9dbc-312b3e6cdfcb,2021-10-06,41862157-5c14-f706-4a94-d2929be969e7,185349003,Encounter for check up (procedure),,
1181,2cc3f940-a30d-6fdd-c260-2256fde22f5d,1997-09-12,db2bac5f-730a-7f2a-a600-50d75bdf16c8,308646001,Death Certification,410429000.0,Cardiac Arrest
1182,77202e99-c2ae-8d7c-024c-98225f314ef1,2021-11-07,cb328021-a854-dc94-e7ae-426580477308,185349003,Encounter for check up (procedure),,
1183,6354922b-5b98-322e-a90a-278365b153d6,2021-09-10,d53c57a5-4480-2481-32ee-b2844a991c9d,185345009,Encounter for symptom,444814009.0,Viral sinusitis (disorder)


In [30]:
# Number of latest-date encounter rows held by each patient
patient_counts = encounters_latest['PATIENT'].value_counts()

# Patients that still own more than one row, and how many such patients there are
patients_with_multiple_entries = patient_counts.loc[patient_counts.gt(1)]
patients_with_multiple_entries_count = patients_with_multiple_entries.size

# Show the count next to the per-patient row counts
(patients_with_multiple_entries_count, patients_with_multiple_entries)

(19,
 PATIENT
 9cbf47d8-b59a-ef34-c330-77accc084c17    3
 8b831e48-9db7-f037-8b8a-e94be939d2ea    3
 ccd81797-18f3-cf40-4f64-f37b7b50d689    3
 f665d685-1e8b-dc83-a1e4-7ca3f26d5a8c    2
 155e8fa2-a0ac-80e4-23e2-0395cdcf1794    2
 8bf6aa92-645f-3c82-ddcd-5851496a6aa8    2
 58e24016-f3f7-e33d-35cd-333d6dedd5aa    2
 a25e9984-9c96-bcd4-07f9-9f763e077366    2
 c518abbe-9695-1b3a-3173-305ff9bb6150    2
 3685bf21-cf8b-669f-cc82-ba3f467087c8    2
 895c8e87-5ba6-c823-6f40-0e6b6341a09f    2
 a8259641-ea43-ab98-3577-e1027c2dac2b    2
 49dcfcdb-2e70-3c98-9ce1-7020c1d8850d    2
 fa2d28b1-de84-ac96-ee04-a20dc54832c1    2
 1d1af1df-c916-9534-dcb4-b9aaf02e48d5    2
 050358a1-fcf7-1182-7ad1-6b663afa3002    2
 e4035211-964b-b90f-237d-655426c3aa1f    2
 1d4373dc-be5a-dd4a-15c3-0117198d94c0    2
 23cec4fc-145d-f75d-dc70-2a52c97bd9c3    2
 Name: count, dtype: int64)

In [31]:
# How often each CODE appears across the latest-date encounters
code_occurrences = encounters_latest['CODE'].value_counts()

# Swap every row's CODE for its frequency, then take, per patient, the index label
# of the row with the lowest frequency (the first such row when several tie)
code_frequency_per_row = encounters_latest['CODE'].map(code_occurrences)
patient_least_occurred_code = code_frequency_per_row.groupby(encounters_latest['PATIENT']).idxmin()

# Those index labels select exactly one encounter row per patient
filtered_data = encounters_latest.loc[patient_least_occurred_code]

# Render the reduced frame
filtered_data

Unnamed: 0,Id,DATE,PATIENT,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
705,1908e893-ac98-aba5-bef4-f6d4cad0f591,2021-10-30,00126cb9-8460-4747-e302-c3609684531e,162673000,General examination of patient (procedure),,
1078,ea8d3f14-23da-3aef-5474-55961b44d7fb,2021-04-06,00209bf2-8e4d-06d1-82a4-daad02f25829,33879002,Administration of vaccine to produce active im...,,
761,0ebaab7c-63bc-a04b-1fd0-f45adf699563,2021-08-21,00ae3b00-9500-efc1-2758-a93d3f77e650,162673000,General examination of patient (procedure),,
629,cb9f173e-1655-4805-da51-498524bc7ebe,2021-09-12,00c9ca99-6b9f-add4-8759-f7dfee6ea1a4,390906007,Follow-up encounter,55822004.0,Hyperlipidemia
673,551578ee-dcd6-f8cc-9ad4-bacabb3ce2cd,1991-06-21,0142b69f-57f0-9a08-4e2d-65a2b77fdea7,308646001,Death Certification,126906006.0,Neoplasm of prostate
...,...,...,...,...,...,...,...
663,ffb9bbbc-0c08-159d-4dbc-91a81e5a5772,1989-04-09,ff06d32e-e1df-b8db-484a-2d0d9e5d8461,308646001,Death Certification,22298006.0,Myocardial Infarction
157,6cd4fde4-a308-9815-d5eb-e61f9e285104,2008-04-11,ff0e4d0e-6181-e36e-d817-64dbcaecb5d0,308646001,Death Certification,22298006.0,Myocardial Infarction
597,b942f620-6384-0697-3401-7562dc7e0a32,2021-11-05,ff59238a-9508-b0e5-39f4-0d4afcbe6f43,33879002,Administration of vaccine to produce active im...,,
701,dcc8db5d-b189-bbb7-dfbe-66b8217508b6,2021-09-28,ff9337d4-4f6c-2182-c1aa-a0811a68a4ab,394701000,Asthma follow-up,195967001.0,Asthma


In [32]:
# From here on, encounters_latest is the one-row-per-patient selection
encounters_latest = filtered_data

# Number of rows held by each patient after the selection
patient_counts = encounters_latest['PATIENT'].value_counts()

# Patients that still own more than one row, and how many such patients there are
patients_with_multiple_entries = patient_counts.loc[patient_counts.gt(1)]
patients_with_multiple_entries_count = patients_with_multiple_entries.size

# Show the count next to the per-patient row counts
(patients_with_multiple_entries_count, patients_with_multiple_entries)

(0, Series([], Name: count, dtype: int64))

In [33]:
# A CODE is inconsistent when it is paired with more than one distinct DESCRIPTION
distinct_descriptions_per_code = encounters_latest.groupby('CODE')['DESCRIPTION'].nunique()
desc_with_same_code = distinct_descriptions_per_code.gt(1)

# Same test for REASONCODE against REASONDESCRIPTION
distinct_reasons_per_reason_code = encounters_latest.groupby('REASONCODE')['REASONDESCRIPTION'].nunique()
reason_desc_with_same_reason_code = distinct_reasons_per_reason_code.gt(1)

# Pull out the flagged CODE and REASONCODE values
codes_with_multiple_descriptions = desc_with_same_code.index[desc_with_same_code.to_numpy()].tolist()
reason_codes_with_multiple_descriptions = reason_desc_with_same_reason_code.index[
    reason_desc_with_same_reason_code.to_numpy()
].tolist()

# Summaries shown as the cell output
result_summary_code = {
    "Codes with Multiple Descriptions": codes_with_multiple_descriptions,
    "Number of Codes with Multiple Descriptions": len(codes_with_multiple_descriptions),
}
result_summary_reason_code = {
    "Reason Codes with Multiple Descriptions": reason_codes_with_multiple_descriptions,
    "Number of Reason Codes with Multiple Descriptions": len(reason_codes_with_multiple_descriptions),
}

(result_summary_code, result_summary_reason_code)

({'Codes with Multiple Descriptions': [50849002,
   185345009,
   185347001,
   185349003,
   308335008,
   390906007],
  'Number of Codes with Multiple Descriptions': 6},
 {'Reason Codes with Multiple Descriptions': [],
  'Number of Reason Codes with Multiple Descriptions': 0})

In [34]:
# One agreed DESCRIPTION for each encounter CODE that was found with several spellings
canonical_encounter_descriptions = {
    50849002: 'Emergency room admission (procedure)',
    185345009: 'Encounter for symptom',
    185347001: 'Encounter for problem',
    185349003: 'Encounter for check up (procedure)',
    308335008: 'Patient encounter procedure',
    390906007: 'Follow-up encounter',
}

# Overwrite DESCRIPTION on every row carrying one of those codes
for _encounter_code, _canonical_description in canonical_encounter_descriptions.items():
    encounters_latest.loc[encounters_latest['CODE'] == _encounter_code, 'DESCRIPTION'] = _canonical_description

In [35]:
# Re-run the consistency check: flag any CODE still paired with several DESCRIPTION values
distinct_descriptions_per_code = encounters_latest.groupby('CODE')['DESCRIPTION'].nunique()
desc_with_same_code = distinct_descriptions_per_code.gt(1)

# Same test for REASONCODE against REASONDESCRIPTION
distinct_reasons_per_reason_code = encounters_latest.groupby('REASONCODE')['REASONDESCRIPTION'].nunique()
reason_desc_with_same_reason_code = distinct_reasons_per_reason_code.gt(1)

# Pull out whatever is still flagged (expected to be nothing after the corrections)
codes_with_multiple_descriptions = desc_with_same_code.index[desc_with_same_code.to_numpy()].tolist()
reason_codes_with_multiple_descriptions = reason_desc_with_same_reason_code.index[
    reason_desc_with_same_reason_code.to_numpy()
].tolist()

# Summaries shown as the cell output
result_summary_code = {
    "Codes with Multiple Descriptions": codes_with_multiple_descriptions,
    "Number of Codes with Multiple Descriptions": len(codes_with_multiple_descriptions),
}
result_summary_reason_code = {
    "Reason Codes with Multiple Descriptions": reason_codes_with_multiple_descriptions,
    "Number of Reason Codes with Multiple Descriptions": len(reason_codes_with_multiple_descriptions),
}

(result_summary_code, result_summary_reason_code)

({'Codes with Multiple Descriptions': [],
  'Number of Codes with Multiple Descriptions': 0},
 {'Reason Codes with Multiple Descriptions': [],
  'Number of Reason Codes with Multiple Descriptions': 0})

In [36]:
# Alias, not a copy: encounters_converted refers to the same DataFrame object as
# encounters_latest, so a later in-place change to either name affects both.
encounters_converted = encounters_latest
# Bare last expression so the notebook renders the frame
encounters_converted

Unnamed: 0,Id,DATE,PATIENT,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
705,1908e893-ac98-aba5-bef4-f6d4cad0f591,2021-10-30,00126cb9-8460-4747-e302-c3609684531e,162673000,General examination of patient (procedure),,
1078,ea8d3f14-23da-3aef-5474-55961b44d7fb,2021-04-06,00209bf2-8e4d-06d1-82a4-daad02f25829,33879002,Administration of vaccine to produce active im...,,
761,0ebaab7c-63bc-a04b-1fd0-f45adf699563,2021-08-21,00ae3b00-9500-efc1-2758-a93d3f77e650,162673000,General examination of patient (procedure),,
629,cb9f173e-1655-4805-da51-498524bc7ebe,2021-09-12,00c9ca99-6b9f-add4-8759-f7dfee6ea1a4,390906007,Follow-up encounter,55822004.0,Hyperlipidemia
673,551578ee-dcd6-f8cc-9ad4-bacabb3ce2cd,1991-06-21,0142b69f-57f0-9a08-4e2d-65a2b77fdea7,308646001,Death Certification,126906006.0,Neoplasm of prostate
...,...,...,...,...,...,...,...
663,ffb9bbbc-0c08-159d-4dbc-91a81e5a5772,1989-04-09,ff06d32e-e1df-b8db-484a-2d0d9e5d8461,308646001,Death Certification,22298006.0,Myocardial Infarction
157,6cd4fde4-a308-9815-d5eb-e61f9e285104,2008-04-11,ff0e4d0e-6181-e36e-d817-64dbcaecb5d0,308646001,Death Certification,22298006.0,Myocardial Infarction
597,b942f620-6384-0697-3401-7562dc7e0a32,2021-11-05,ff59238a-9508-b0e5-39f4-0d4afcbe6f43,33879002,Administration of vaccine to produce active im...,,
701,dcc8db5d-b189-bbb7-dfbe-66b8217508b6,2021-09-28,ff9337d4-4f6c-2182-c1aa-a0811a68a4ab,394701000,Asthma follow-up,195967001.0,Asthma


## 5) Immunizations

In [37]:
# Display the immunizations frame (read from csv_filtered/immunizations.csv in the first cell)
immunizations

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION
0,2019-02-17T05:07:38Z,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,748f8357-6cc7-551d-f31a-32fa2cf84126,8,Hep B adolescent or pediatric
1,2019-03-24T05:07:38Z,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,5a4735ae-423f-6563-28ab-b3d11b49b2d4,8,Hep B adolescent or pediatric
2,2019-05-26T05:07:38Z,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,0bee1ce6-3e2c-5506-f71c-a7ba8f64a3d3,49,Hib (PRP-OMP)
3,2019-05-26T05:07:38Z,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,0bee1ce6-3e2c-5506-f71c-a7ba8f64a3d3,119,rotavirus monovalent
4,2019-05-26T05:07:38Z,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,0bee1ce6-3e2c-5506-f71c-a7ba8f64a3d3,10,IPV
...,...,...,...,...,...
17004,2020-01-10T05:11:58Z,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,090096d3-9404-3cb5-d1eb-583bf4c39180,140,Influenza seasonal injectable preservative ...
17005,2020-01-10T05:11:58Z,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,090096d3-9404-3cb5-d1eb-583bf4c39180,113,Td (adult) preservative free
17006,2021-01-15T05:11:58Z,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,140,Influenza seasonal injectable preservative ...
17007,2021-08-13T05:11:58Z,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,b2a4d90b-a2f5-1c88-0fb6-ba49b1487d37,208,SARS-COV-2 (COVID-19) vaccine mRNA spike pro...


In [38]:
# Parse DATE into pandas datetimes (this overwrites the column on `immunizations` itself)
immunizations['DATE'] = pd.to_datetime(immunizations['DATE'])

# One row per patient: PATIENT together with that patient's most recent DATE
latest_dates = immunizations.groupby('PATIENT', as_index=False)['DATE'].max()

# Inner join on both keys keeps only the immunizations recorded at each patient's latest DATE
immunizations_latest = pd.merge(immunizations, latest_dates, on=['PATIENT', 'DATE'])

immunizations_latest

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION
0,2021-07-25 05:07:38+00:00,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,53c2bae0-f0ff-7eac-4ca1-3dce0ecebb30,83,Hep A ped/adol 2 dose
1,2021-08-16 08:20:52+00:00,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,9f4acd52-4da4-c7d0-292f-a798b6cc5c69,140,Influenza seasonal injectable preservative ...
2,2021-08-16 08:20:52+00:00,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,9f4acd52-4da4-c7d0-292f-a798b6cc5c69,114,meningococcal MCV4P
3,2021-08-02 05:03:31+00:00,339144f8-50e1-633e-a013-f361391c4cff,fbf1824f-1ef7-820f-111b-6982ef9a9e5f,140,Influenza seasonal injectable preservative ...
4,2019-02-28 14:57:05+00:00,217f95a3-4e10-bd5d-fb67-0cfb5e8ba075,dca904f3-f230-edf5-a5b9-4ba3b1b4077a,140,Influenza seasonal injectable preservative ...
...,...,...,...,...,...
1448,2021-08-04 10:43:27+00:00,41862157-5c14-f706-4a94-d2929be969e7,2994389c-fbae-dc4e-156a-08e8de9aa1b0,140,Influenza seasonal injectable preservative ...
1449,1997-05-30 05:11:58+00:00,db2bac5f-730a-7f2a-a600-50d75bdf16c8,78b23644-87ba-add4-08be-2ad64de92ddd,140,Influenza seasonal injectable preservative ...
1450,2021-02-21 05:40:25+00:00,cb328021-a854-dc94-e7ae-426580477308,71034085-cb15-07a8-4b91-181a3afe72d6,208,SARS-COV-2 (COVID-19) vaccine mRNA spike pro...
1451,2021-07-28 07:04:22+00:00,d53c57a5-4480-2481-32ee-b2844a991c9d,e3df555a-8d35-7463-6e60-ce6284f21398,140,Influenza seasonal injectable preservative ...


In [39]:
# Number of latest-date immunization rows held by each patient
patient_counts = immunizations_latest['PATIENT'].value_counts()

# Patients that own more than one row, and how many such patients there are
patients_with_multiple_entries = patient_counts.loc[patient_counts.gt(1)]
patients_with_multiple_entries_count = patients_with_multiple_entries.size

# Show the count next to the per-patient row counts
(patients_with_multiple_entries_count, patients_with_multiple_entries)

(172,
 PATIENT
 a2f532cc-184b-d28c-b702-7b3fd99c77bd    5
 7d5e31d3-163b-4a77-576b-1ed21adf8c09    5
 dc234609-fb36-d5ba-bf99-c04b4f107790    5
 423a9252-21e3-6141-4207-46a9066fd7f4    5
 b3c96d58-b033-6bc7-a65f-3acd85698fd6    5
                                        ..
 c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8    2
 426cade9-82be-f335-7340-c4413e4bdd6a    2
 e15d5c46-48a2-2364-2b4f-529ecbe0c956    2
 3b6c2a81-32bc-168b-5a45-2b1d3368c6a0    2
 87daa935-0fc8-61cf-9e00-b8540c292903    2
 Name: count, Length: 172, dtype: int64)

In [40]:
# A CODE is inconsistent when it is paired with more than one distinct DESCRIPTION
distinct_descriptions_per_code = immunizations_latest.groupby('CODE')['DESCRIPTION'].nunique()
desc_with_same_code = distinct_descriptions_per_code.gt(1)

# Pull out the CODE values flagged above
codes_with_multiple_descriptions = desc_with_same_code.index[desc_with_same_code.to_numpy()].tolist()

# Summary shown as the cell output
result_summary_code = {
    "Codes with Multiple Descriptions": codes_with_multiple_descriptions,
    "Number of Codes with Multiple Descriptions": len(codes_with_multiple_descriptions),
}

result_summary_code

{'Codes with Multiple Descriptions': [],
 'Number of Codes with Multiple Descriptions': 0}

In [41]:
# Order the rows by vaccine CODE and tag each one with a constant marker to pivot on
sorted_data = immunizations_latest.sort_values(by='CODE').assign(Immunization=1)

# Spread the markers into one column per vaccine DESCRIPTION, one row per
# (DATE, PATIENT, ENCOUNTER); combinations that never occurred are filled with 0
pivot_table = pd.pivot_table(
    sorted_data,
    index=['DATE', 'PATIENT', 'ENCOUNTER'],
    columns='DESCRIPTION',
    values='Immunization',
    fill_value=0,
).reset_index()

# Drop the 'DESCRIPTION' label that the pivot leaves on the columns axis
pivot_table.columns.name = None

# The pivot orders the vaccine columns by description text; recover the order by
# vaccine CODE from the distinct (CODE, DESCRIPTION) pairs instead
code_description_pairs = sorted_data[['CODE', 'DESCRIPTION']].drop_duplicates()
immunization_order = code_description_pairs.sort_values('CODE')['DESCRIPTION']

# Key columns first, then the vaccine columns in CODE order
ordered_columns = ['DATE', 'PATIENT', 'ENCOUNTER', *immunization_order]
pivot_table = pivot_table.reindex(columns=ordered_columns)

pivot_table

Unnamed: 0,DATE,PATIENT,ENCOUNTER,MMR,Hep B adolescent or pediatric,IPV,DTaP,varicella,pneumococcal polysaccharide vaccine 23 valent,Hep B adult,...,Td (adult) preservative free,meningococcal MCV4P,Tdap,rotavirus monovalent,zoster,Pneumococcal conjugate PCV 13,Influenza seasonal injectable preservative free,SARS-COV-2 (COVID-19) vaccine mRNA spike protein LNP preservative free 100 mcg/0.5mL dose,SARS-COV-2 (COVID-19) vaccine mRNA spike protein LNP preservative free 30 mcg/0.3mL dose,SARS-COV-2 (COVID-19) vaccine vector non-replicating recombinant spike protein-Ad26 preservative free 0.5 mL
0,1945-03-31 10:04:43+00:00,ae59c2c9-648d-6f76-2d42-a1b8cac21c8b,80df95b1-a622-8217-3194-c136de7e4b9c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1955-11-22 21:04:17+00:00,be341f37-13a9-e54b-901a-23d59b802bb8,cc8cf2e8-2b9e-84b0-3f18-0ee08a9f72bc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1957-12-12 10:15:32+00:00,db9b9c1c-0a86-773c-ba82-fbb9b71d9993,32811466-bc98-385e-6830-223530c95743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1960-12-10 08:46:25+00:00,3c26a41e-d1a5-45ed-6956-09bcefe3c08c,6e6ca7c8-4d89-0f5a-50cd-ae4f63720b45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1965-06-22 21:04:17+00:00,4fadd99c-4f13-0c0f-2b37-2c3c9c13ff78,1bee99ac-7331-cee8-3a71-c4d1ca6e9ef9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1158,2021-11-15 11:57:18+00:00,3515e99a-c9ef-2628-6949-bf2ba1d74897,49c6f733-7cfe-368a-b7ac-472ca2db304d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1159,2021-11-16 01:29:34+00:00,c2b9dc8a-cfe1-da50-5e72-140b66e43996,6a5385e7-6782-6cd3-f927-9362cf9ff0f9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1160,2021-11-16 05:14:56+00:00,219ddc63-11fe-e940-aa69-0e1bdd131e08,8f807dc1-2fd1-31a5-2bb1-4531a163e631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1161,2021-11-16 07:15:57+00:00,6a6a1b30-9966-bd68-e732-b7ce2c0b6ade,add6329a-9d12-ff4a-9d0e-abc05db90171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [42]:
# Number of pivoted rows held by each patient
patient_counts = pivot_table['PATIENT'].value_counts()

# Patients that still own more than one row, and how many such patients there are
patients_with_multiple_entries = patient_counts.loc[patient_counts.gt(1)]
patients_with_multiple_entries_count = patients_with_multiple_entries.size

# Show the count next to the per-patient row counts
(patients_with_multiple_entries_count, patients_with_multiple_entries)

(5,
 PATIENT
 23cec4fc-145d-f75d-dc70-2a52c97bd9c3    2
 155e8fa2-a0ac-80e4-23e2-0395cdcf1794    2
 0b48bf9b-5582-1253-ab01-b34da54bad15    2
 c518abbe-9695-1b3a-3173-305ff9bb6150    2
 35606b5e-5492-b8b1-25c7-085c406d0dac    2
 Name: count, dtype: int64)

In [43]:
# Same object as pivot_table (no copy), so the DATE assignment below also changes pivot_table
immunizations_data = pivot_table

# Make sure DATE is datetime; values that cannot be parsed become NaT instead of raising
immunizations_data['DATE'] = pd.to_datetime(immunizations_data['DATE'], errors='coerce')

# Chronological order within each patient, so that 'last' below means most recent
immunizations_data_sorted = immunizations_data.sort_values(by=['PATIENT', 'DATE'])

# Every column other than the three key columns is treated as a vaccine indicator
key_columns = ['DATE', 'PATIENT', 'ENCOUNTER']
vaccine_columns = [col for col in immunizations_data.columns if col not in key_columns]

# Per patient: add up each vaccine indicator, keep ENCOUNTER and DATE from the last row
aggregation_by_column = dict.fromkeys(vaccine_columns, 'sum')
aggregation_by_column.update({'ENCOUNTER': 'last', 'DATE': 'last'})
grouped_immunizations = immunizations_data_sorted.groupby('PATIENT', as_index=False).agg(aggregation_by_column)

# Put the key columns back in front of the vaccine columns
desired_order = ['DATE', 'PATIENT', 'ENCOUNTER'] + vaccine_columns
grouped_immunizations = grouped_immunizations[desired_order]

In [44]:
# Alias, not a copy: immunizations_converted refers to the same DataFrame object as
# grouped_immunizations, so a later in-place change to either name affects both.
immunizations_converted = grouped_immunizations
# Bare last expression so the notebook renders the frame
immunizations_converted

Unnamed: 0,DATE,PATIENT,ENCOUNTER,MMR,Hep B adolescent or pediatric,IPV,DTaP,varicella,pneumococcal polysaccharide vaccine 23 valent,Hep B adult,...,Td (adult) preservative free,meningococcal MCV4P,Tdap,rotavirus monovalent,zoster,Pneumococcal conjugate PCV 13,Influenza seasonal injectable preservative free,SARS-COV-2 (COVID-19) vaccine mRNA spike protein LNP preservative free 100 mcg/0.5mL dose,SARS-COV-2 (COVID-19) vaccine mRNA spike protein LNP preservative free 30 mcg/0.3mL dose,SARS-COV-2 (COVID-19) vaccine vector non-replicating recombinant spike protein-Ad26 preservative free 0.5 mL
0,2021-10-30 23:28:56+00:00,00126cb9-8460-4747-e302-c3609684531e,1908e893-ac98-aba5-bef4-f6d4cad0f591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2021-04-06 05:34:46+00:00,00209bf2-8e4d-06d1-82a4-daad02f25829,ea8d3f14-23da-3aef-5474-55961b44d7fb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2021-07-24 10:38:53+00:00,00ae3b00-9500-efc1-2758-a93d3f77e650,8bd075bf-d2a4-c108-7272-10d057f86677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2021-08-08 17:59:09+00:00,00c9ca99-6b9f-add4-8759-f7dfee6ea1a4,3001d937-63b0-8621-9e7a-c62c46d83435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1991-02-01 10:42:42+00:00,0142b69f-57f0-9a08-4e2d-65a2b77fdea7,ae24ca09-a297-8db3-5d3b-cc0e1ad6fd2c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1153,1988-06-12 13:16:42+00:00,ff06d32e-e1df-b8db-484a-2d0d9e5d8461,628807bb-c610-9a89-cd44-1752fefcdb1e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1154,2007-09-07 12:43:40+00:00,ff0e4d0e-6181-e36e-d817-64dbcaecb5d0,89d43601-ab9c-634a-b24f-4ef8222c587d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1155,2021-11-05 01:41:26+00:00,ff59238a-9508-b0e5-39f4-0d4afcbe6f43,b942f620-6384-0697-3401-7562dc7e0a32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1156,2021-03-23 19:41:16+00:00,ff9337d4-4f6c-2182-c1aa-a0811a68a4ab,598bb231-e339-5a24-0b08-e005d5198d4a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## 6) Medications

In [45]:
# Display the medications frame (read from csv_filtered/medications.csv in the first cell)
medications

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,2020-02-17T10:40:32Z,,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,01efcc52-15d6-51e9-faa2-bee069fcbe44,1014676,cetirizine hydrochloride 5 MG Oral Tablet,,
1,2020-02-17T10:40:32Z,,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,01efcc52-15d6-51e9-faa2-bee069fcbe44,1870230,NDA020800 0.3 ML Epinephrine 1 MG/ML Auto-Inje...,,
2,2013-06-24T06:39:19Z,2013-07-02T06:39:19Z,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,0b2794bd-ec2b-d34f-0610-2523b3b7fcf0,1043400,Acetaminophen 21.7 MG/ML / Dextromethorphan Hy...,10509002.0,Acute bronchitis (disorder)
3,2016-02-27T08:52:29Z,2016-03-14T08:52:29Z,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,a6d818dd-0983-fd1c-eefa-3d2295532c45,198405,Ibuprofen 100 MG Oral Tablet,,
4,2017-02-22T09:25:31Z,2017-06-02T09:25:31Z,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,6474f606-5a1b-48c0-bbbf-ad6dcbc24d4e,313820,Acetaminophen 160 MG Chewable Tablet,,
...,...,...,...,...,...,...,...,...
56425,2018-12-19T05:11:58Z,2019-12-19T05:11:58Z,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,a7b3614b-3840-6d16-a23d-37f2f73938cd,314231,Simvastatin 10 MG Oral Tablet,55822004.0,Hyperlipidemia
56426,2019-12-19T05:11:58Z,2020-12-18T05:11:58Z,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,d135bbff-a4f5-653b-5462-68af428138be,314231,Simvastatin 10 MG Oral Tablet,55822004.0,Hyperlipidemia
56427,2020-12-18T05:11:58Z,,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,230e2215-38ab-9371-842d-a44d27ae4090,314231,Simvastatin 10 MG Oral Tablet,55822004.0,Hyperlipidemia
56428,2020-12-28T02:11:58Z,2021-01-04T02:11:58Z,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,db101ad8-66e2-9feb-e0cf-b2618f873c3a,311989,Nitrofurantoin 5 MG/ML Oral Suspension,38822007.0,Cystitis


In [46]:
# Parse START into pandas datetimes (this overwrites the column on `medications` itself)
medications['START'] = pd.to_datetime(medications['START'])

# One row per patient: PATIENT together with that patient's most recent START
latest_dates = medications.groupby('PATIENT', as_index=False)['START'].max()

# Inner join on both keys keeps only the medications begun at each patient's latest START
medications_latest = pd.merge(medications, latest_dates, on=['PATIENT', 'START'])

medications_latest

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,2020-02-17 10:40:32+00:00,,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,01efcc52-15d6-51e9-faa2-bee069fcbe44,1014676,cetirizine hydrochloride 5 MG Oral Tablet,,
1,2020-02-17 10:40:32+00:00,,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,01efcc52-15d6-51e9-faa2-bee069fcbe44,1870230,NDA020800 0.3 ML Epinephrine 1 MG/ML Auto-Inje...,,
2,2021-06-23 08:20:52+00:00,,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,68e737e5-a4a1-0ffd-7234-8fb04a2b0c19,1000126,1 ML medroxyPROGESTERone acetate 150 MG/ML Inj...,,
3,2021-08-02 05:03:31+00:00,,339144f8-50e1-633e-a013-f361391c4cff,fbf1824f-1ef7-820f-111b-6982ef9a9e5f,310798,Hydrochlorothiazide 25 MG Oral Tablet,59621000.0,Hypertension
4,2021-08-02 05:03:31+00:00,,339144f8-50e1-633e-a013-f361391c4cff,fbf1824f-1ef7-820f-111b-6982ef9a9e5f,314076,lisinopril 10 MG Oral Tablet,59621000.0,Hypertension
...,...,...,...,...,...,...,...,...
2028,2021-11-07 05:40:25+00:00,,cb328021-a854-dc94-e7ae-426580477308,77202e99-c2ae-8d7c-024c-98225f314ef1,310798,Hydrochlorothiazide 25 MG Oral Tablet,59621000.0,Hypertension
2029,2021-11-07 05:40:25+00:00,,cb328021-a854-dc94-e7ae-426580477308,77202e99-c2ae-8d7c-024c-98225f314ef1,314076,lisinopril 10 MG Oral Tablet,59621000.0,Hypertension
2030,2021-07-28 07:04:22+00:00,,d53c57a5-4480-2481-32ee-b2844a991c9d,e3df555a-8d35-7463-6e60-ce6284f21398,310798,Hydrochlorothiazide 25 MG Oral Tablet,59621000.0,Hypertension
2031,2020-12-28 02:11:58+00:00,2021-01-04T02:11:58Z,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,db101ad8-66e2-9feb-e0cf-b2618f873c3a,311989,Nitrofurantoin 5 MG/ML Oral Suspension,38822007.0,Cystitis


In [47]:
# Parse STOP so it can be compared as a timestamp; a missing STOP becomes NaT
medications_latest['STOP'] = pd.to_datetime(medications_latest['STOP'])

# Latest STOP per patient. max() skips NaT, so a patient whose rows have no STOP
# at all (only ongoing treatments) gets NaT here.
latest_stop_dates = medications_latest.groupby('PATIENT')['STOP'].max()

# Look the per-patient value up for every row instead of merging it in as an
# auxiliary column. medications_latest keeps its original columns, so this cell
# can be re-run without producing Latest_STOP_x / Latest_STOP_y and then failing
# on the filter below.
latest_stop_per_row = medications_latest['PATIENT'].map(latest_stop_dates)

# Keep rows whose STOP equals the patient's latest STOP, plus rows without a STOP.
# NaT never compares equal to NaT, so ongoing treatments need the explicit isna() test.
has_latest_stop = medications_latest['STOP'].eq(latest_stop_per_row)
is_ongoing = medications_latest['STOP'].isna()
latest_medications_with_latest_stop = medications_latest[has_latest_stop | is_ongoing]

latest_medications_with_latest_stop

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,2020-02-17 10:40:32+00:00,NaT,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,01efcc52-15d6-51e9-faa2-bee069fcbe44,1014676,cetirizine hydrochloride 5 MG Oral Tablet,,
1,2020-02-17 10:40:32+00:00,NaT,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,01efcc52-15d6-51e9-faa2-bee069fcbe44,1870230,NDA020800 0.3 ML Epinephrine 1 MG/ML Auto-Inje...,,
2,2021-06-23 08:20:52+00:00,NaT,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,68e737e5-a4a1-0ffd-7234-8fb04a2b0c19,1000126,1 ML medroxyPROGESTERone acetate 150 MG/ML Inj...,,
3,2021-08-02 05:03:31+00:00,NaT,339144f8-50e1-633e-a013-f361391c4cff,fbf1824f-1ef7-820f-111b-6982ef9a9e5f,310798,Hydrochlorothiazide 25 MG Oral Tablet,59621000.0,Hypertension
4,2021-08-02 05:03:31+00:00,NaT,339144f8-50e1-633e-a013-f361391c4cff,fbf1824f-1ef7-820f-111b-6982ef9a9e5f,314076,lisinopril 10 MG Oral Tablet,59621000.0,Hypertension
...,...,...,...,...,...,...,...,...
2028,2021-11-07 05:40:25+00:00,NaT,cb328021-a854-dc94-e7ae-426580477308,77202e99-c2ae-8d7c-024c-98225f314ef1,310798,Hydrochlorothiazide 25 MG Oral Tablet,59621000.0,Hypertension
2029,2021-11-07 05:40:25+00:00,NaT,cb328021-a854-dc94-e7ae-426580477308,77202e99-c2ae-8d7c-024c-98225f314ef1,314076,lisinopril 10 MG Oral Tablet,59621000.0,Hypertension
2030,2021-07-28 07:04:22+00:00,NaT,d53c57a5-4480-2481-32ee-b2844a991c9d,e3df555a-8d35-7463-6e60-ce6284f21398,310798,Hydrochlorothiazide 25 MG Oral Tablet,59621000.0,Hypertension
2031,2020-12-28 02:11:58+00:00,2021-01-04 02:11:58+00:00,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,db101ad8-66e2-9feb-e0cf-b2618f873c3a,311989,Nitrofurantoin 5 MG/ML Oral Suspension,38822007.0,Cystitis


In [48]:
# Number of remaining medication rows per patient
patient_counts = latest_medications_with_latest_stop['PATIENT'].value_counts()

# Patients that still appear on more than one row
patients_with_multiple_entries = patient_counts.loc[patient_counts.gt(1)]

# How many such patients there are
patients_with_multiple_entries_count = patients_with_multiple_entries.shape[0]

# Show the count together with the per-patient row counts
(patients_with_multiple_entries_count, patients_with_multiple_entries)

(424,
 PATIENT
 a3ccbf19-ad9a-7ae1-b926-ba3b0543fe1b    22
 ac58e6c6-b5d7-8fe7-a0f3-452885f7b560    12
 4c7bbf23-68f4-86e8-c61e-1a1b1e81c4ea    11
 0fc183b2-8b93-ec17-d2fc-e703b373c9d7    10
 e6178711-3936-e332-814d-19d8851e314d     9
                                         ..
 82e25b95-7bfc-a54f-f242-1e7156c27d74     2
 d477c46c-6e56-e32f-0c28-7356caab1e8b     2
 146f9020-c423-72ce-7c96-12e60f3f5910     2
 c98f8278-af04-a31b-ebee-65d34ae0466f     2
 e4a67e2a-4d8d-70e7-b0fb-83480abf443c     2
 Name: count, Length: 424, dtype: int64)

In [49]:
# Discard the reason columns and rebind medications_latest to the START/STOP-filtered table
medications_latest = latest_medications_with_latest_stop.drop(['REASONCODE', 'REASONDESCRIPTION'], axis='columns')

In [50]:
# True for every CODE that is paired with more than one distinct DESCRIPTION
desc_with_same_code = medications_latest.groupby('CODE')['DESCRIPTION'].nunique().gt(1)

# The offending codes, as a plain list
codes_with_multiple_descriptions = desc_with_same_code.loc[desc_with_same_code].index.tolist()

# Summary: which codes are ambiguous and how many of them there are
result_summary_code = dict([
    ("Codes with Multiple Descriptions", codes_with_multiple_descriptions),
    ("Number of Codes with Multiple Descriptions", len(codes_with_multiple_descriptions)),
])

result_summary_code

{'Codes with Multiple Descriptions': [242969, 1000126, 1049221],
 'Number of Codes with Multiple Descriptions': 3}

In [51]:
# One canonical DESCRIPTION for each CODE that the preceding check reported
# as having several descriptions
canonical_medication_descriptions = {
    242969: '4 ML norepinephrine 1 MG/ML Injection',
    1000126: '1 ML medroxyprogesterone acetate 150 MG/ML Injection',
    1049221: 'Acetaminophen 325 MG / Oxycodone Hydrochloride 5 MG Oral Tablet',
}

# Overwrite DESCRIPTION on every row carrying one of those codes
for medication_code, canonical_description in canonical_medication_descriptions.items():
    medications_latest.loc[medications_latest['CODE'].eq(medication_code), 'DESCRIPTION'] = canonical_description

In [52]:
# Re-run the check after the descriptions were overwritten:
# count the distinct DESCRIPTION values per CODE and flag counts above one
distinct_descriptions_per_code = medications_latest.groupby('CODE')['DESCRIPTION'].nunique()
desc_with_same_code = distinct_descriptions_per_code > 1

# Codes that are still ambiguous
codes_with_multiple_descriptions = desc_with_same_code[desc_with_same_code.to_numpy()].index.tolist()

# Summary of the ambiguous codes and their number
result_summary_code = {}
result_summary_code["Codes with Multiple Descriptions"] = codes_with_multiple_descriptions
result_summary_code["Number of Codes with Multiple Descriptions"] = len(codes_with_multiple_descriptions)

result_summary_code

{'Codes with Multiple Descriptions': [],
 'Number of Codes with Multiple Descriptions': 0}

In [53]:
# Order the rows by medication CODE; this ordering later drives the column order
sorted_data = medications_latest.sort_values(by='CODE')

# Marker column: every row stands for one medication the patient has
sorted_data['MedicationPresent'] = 1

# One row per (START, PATIENT, ENCOUNTER) and one column per medication DESCRIPTION.
# The default aggregation (mean) of the constant marker is 1; combinations that do
# not occur are filled with 0.
pivot_table = sorted_data.pivot_table(index=['START', 'PATIENT', 'ENCOUNTER'],
                                      columns='DESCRIPTION',
                                      values='MedicationPresent',
                                      fill_value=0).reset_index()

# Remove the 'DESCRIPTION' label that pivot_table leaves on the columns index
pivot_table.columns.name = None

# pivot_table orders the medication columns by label; put them in medication CODE
# order instead. A description shared by several codes must be listed only once,
# otherwise reindex would emit that column once per code.
medication_order = (sorted_data[['CODE', 'DESCRIPTION']]
                    .drop_duplicates()
                    .sort_values('CODE')['DESCRIPTION']
                    .drop_duplicates())

# Identifier columns first, then the medication columns in CODE order
ordered_columns = ['START', 'PATIENT', 'ENCOUNTER'] + list(medication_order)
pivot_table = pivot_table.reindex(columns=ordered_columns)

pivot_table

Unnamed: 0,START,PATIENT,ENCOUNTER,Penicillin G 375 MG/ML Injectable Solution,insulin human isophane 70 UNT/ML / Regular Insulin Human 30 UNT/ML Injectable Suspension [Humulin],Allopurinol 100 MG Oral Tablet,Amlodipine 5 MG Oral Tablet,Colchicine 0.6 MG Oral Tablet,Diazepam 5 MG Oral Tablet,Digoxin 0.125 MG Oral Tablet,...,Alteplase 100 MG Injection,150 ML vancomycin 5 MG/ML Injection,vancomycin 1000 MG Injection,Kyleena 19.5 MG Intrauterine System,Abuse-Deterrent 12 HR Oxycodone Hydrochloride 15 MG Extended Release Oral Tablet,12 HR Hydrocodone Bitartrate 10 MG Extended Release Oral Capsule,NDA020800 0.3 ML Epinephrine 1 MG/ML Auto-Injector,baricitinib 2 MG Oral Tablet,1 ML Vasopressin (USP) 20 UNT/ML Injection,NDA020503 200 ACTUAT Albuterol 0.09 MG/ACTUAT Metered Dose Inhaler
0,1942-01-05 19:51:58+00:00,590ad7c6-6cb7-7181-bf1f-818e179c3edc,db936bf2-cc3f-d943-826f-1cf5e459df87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1942-02-15 04:58:49+00:00,c4bdbf50-4728-5778-7f4a-ce39e9c58232,6b04b154-f55a-5f26-258d-cf89f0c8fadd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1945-10-20 10:04:43+00:00,ae59c2c9-648d-6f76-2d42-a1b8cac21c8b,d8d18cd2-5548-5ba6-2d38-c0afe37cadff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1955-11-22 21:04:17+00:00,be341f37-13a9-e54b-901a-23d59b802bb8,cc8cf2e8-2b9e-84b0-3f18-0ee08a9f72bc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1955-12-14 05:41:34+00:00,7edc835f-0063-6d39-0c35-112f82a3e131,0308abcd-d826-1d53-21d3-4ea141d6f68b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1133,2021-11-16 05:14:56+00:00,219ddc63-11fe-e940-aa69-0e1bdd131e08,8f807dc1-2fd1-31a5-2bb1-4531a163e631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1134,2021-11-17 18:44:36+00:00,864b2fa0-78e3-88c6-2ce2-ba3aea72236c,6eafb77f-dc52-fab7-b1fa-a1dbe0dd5d2a,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1135,2021-11-18 06:40:58+00:00,9d92805b-c25a-c7ad-60ca-80c93ffb80b1,c18d198e-2200-3937-761a-8f70e1e0e3b8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1136,2021-11-18 10:36:25+00:00,6dcdf9cd-efbe-c8c0-c633-ef255bc05ef5,4b5c8930-6006-c82d-f6c1-777468ad4bf5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
# Rows per patient in the pivoted medication table
patient_counts = pivot_table['PATIENT'].value_counts()

# Patients represented by more than one pivot row
has_several_rows = patient_counts > 1
patients_with_multiple_entries = patient_counts[has_several_rows]

# Number of such patients
patients_with_multiple_entries_count = int(has_several_rows.sum())

# Show the count together with the per-patient row counts
(patients_with_multiple_entries_count, patients_with_multiple_entries)

(20,
 PATIENT
 ac58e6c6-b5d7-8fe7-a0f3-452885f7b560    3
 0fc183b2-8b93-ec17-d2fc-e703b373c9d7    2
 81393886-9f1f-35d5-f000-ea68f238f49f    2
 f0aa92fa-13d6-6e8c-cc01-9986ce990b5e    2
 b68c6e5e-6b47-b721-839b-739ec3400504    2
 e000e5c8-30f7-7a53-a324-7f3e431ba008    2
 a3ccbf19-ad9a-7ae1-b926-ba3b0543fe1b    2
 fbee8d6b-5acf-370c-7edd-af0d84f5288c    2
 c4292552-ab46-dd25-c7ee-61596965f2e2    2
 ba94f61a-0555-03dc-c25a-33b62cec09be    2
 e37d530a-3b67-0a3c-0da4-ed338900d916    2
 20ea49a4-8c7e-5df6-b505-d00366fb774a    2
 c518abbe-9695-1b3a-3173-305ff9bb6150    2
 db0ba46f-9b5e-63cd-cbe4-3c0ea82abbdf    2
 137651e5-3f15-db77-2712-ee0b75724502    2
 0ee91088-5b0d-8cdb-ef22-357b12a26b25    2
 eb7cffc3-28d2-e8b0-72bc-8e86860ed313    2
 d868e198-8ebe-2338-5691-4c1de3421c65    2
 cd6bea41-b268-4c96-0338-3f3d8129e5d8    2
 23cec4fc-145d-f75d-dc70-2a52c97bd9c3    2
 Name: count, dtype: int64)

In [55]:
# Work on a new frame so pivot_table itself is not modified through an alias
data = pivot_table.assign(START=pd.to_datetime(pivot_table['START']))

# Deterministic order: within a patient the last row has the latest START,
# ties being broken by ENCOUNTER
sorted_data = data.sort_values(by=['PATIENT', 'START', 'ENCOUNTER'])

# Rows that sit at their patient's latest START
is_latest_start = sorted_data['START'].eq(sorted_data.groupby('PATIENT')['START'].transform('max'))
latest_rows = sorted_data[is_latest_start]

# One representative row (its START / ENCOUNTER) per patient
latest_records = latest_rows.drop_duplicates(subset=['PATIENT'], keep='last').copy()

# A patient can own several rows with the same latest START that belong to different
# encounters. Keeping just one of them would throw away the medications recorded on
# the others, so fold the indicators of all those rows into the kept row
# (max acts as a logical OR on 0/1 indicators).
medication_columns = [col for col in latest_rows.columns if col not in ('START', 'PATIENT', 'ENCOUNTER')]
combined_flags = latest_rows.groupby('PATIENT')[medication_columns].max()
latest_records[medication_columns] = combined_flags.loc[latest_records['PATIENT']].to_numpy()

In [56]:
# Rows per patient after reducing to one record each
patient_counts = latest_records['PATIENT'].value_counts()

# Any patient still present more than once
patients_with_multiple_entries = patient_counts.loc[patient_counts.gt(1)]

# Number of such patients
patients_with_multiple_entries_count = patients_with_multiple_entries.shape[0]

# Show the count together with the per-patient row counts
(patients_with_multiple_entries_count, patients_with_multiple_entries)

(0, Series([], Name: count, dtype: int64))

In [57]:
# Bind the reduced medication table to its final name for this section.
# This is a second name for the same object as latest_records, not a copy.
medications_converted = latest_records
medications_converted

Unnamed: 0,START,PATIENT,ENCOUNTER,Penicillin G 375 MG/ML Injectable Solution,insulin human isophane 70 UNT/ML / Regular Insulin Human 30 UNT/ML Injectable Suspension [Humulin],Allopurinol 100 MG Oral Tablet,Amlodipine 5 MG Oral Tablet,Colchicine 0.6 MG Oral Tablet,Diazepam 5 MG Oral Tablet,Digoxin 0.125 MG Oral Tablet,...,Alteplase 100 MG Injection,150 ML vancomycin 5 MG/ML Injection,vancomycin 1000 MG Injection,Kyleena 19.5 MG Intrauterine System,Abuse-Deterrent 12 HR Oxycodone Hydrochloride 15 MG Extended Release Oral Tablet,12 HR Hydrocodone Bitartrate 10 MG Extended Release Oral Capsule,NDA020800 0.3 ML Epinephrine 1 MG/ML Auto-Injector,baricitinib 2 MG Oral Tablet,1 ML Vasopressin (USP) 20 UNT/ML Injection,NDA020503 200 ACTUAT Albuterol 0.09 MG/ACTUAT Metered Dose Inhaler
1091,2021-10-30 23:28:56+00:00,00126cb9-8460-4747-e302-c3609684531e,1908e893-ac98-aba5-bef4-f6d4cad0f591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581,2021-02-09 05:34:46+00:00,00209bf2-8e4d-06d1-82a4-daad02f25829,b3bc4d75-c000-8b48-5200-0cca2c4232c2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
881,2021-07-18 07:38:53+00:00,00ae3b00-9500-efc1-2758-a93d3f77e650,23619864-6c53-a820-ee10-e32ccaf69ae9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
988,2021-09-12 17:59:09+00:00,00c9ca99-6b9f-add4-8759-f7dfee6ea1a4,cb9f173e-1655-4805-da51-498524bc7ebe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50,1991-02-01 10:42:42+00:00,0142b69f-57f0-9a08-4e2d-65a2b77fdea7,ae24ca09-a297-8db3-5d3b-cc0e1ad6fd2c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37,1989-04-02 13:16:42+00:00,ff06d32e-e1df-b8db-484a-2d0d9e5d8461,3582a1ed-48c3-6a16-9085-79d28b546b30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
108,2008-04-04 12:43:40+00:00,ff0e4d0e-6181-e36e-d817-64dbcaecb5d0,9ac93727-a73c-5479-6d06-35999c01756d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
253,2016-09-24 01:41:26+00:00,ff59238a-9508-b0e5-39f4-0d4afcbe6f43,8297fca9-c37c-9ac9-4e27-7275dfb069bd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
884,2021-07-18 19:41:16+00:00,ff9337d4-4f6c-2182-c1aa-a0811a68a4ab,56d237d6-418d-195a-78e2-467b2d83bd1f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## 7) Observations

In [58]:
# Preview the observations table as loaded from csv_filtered/observations.csv
observations

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION,VALUE,UNITS
0,2019-02-17,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,748f8357-6cc7-551d-f31a-32fa2cf84126,8302-2,Body Height,51.4,cm
1,2019-02-17,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,748f8357-6cc7-551d-f31a-32fa2cf84126,72514-3,Pain severity - 0-10 verbal numeric rating [Sc...,1,{score}
2,2019-02-17,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,748f8357-6cc7-551d-f31a-32fa2cf84126,29463-7,Body Weight,3.8,kg
3,2019-02-17,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,748f8357-6cc7-551d-f31a-32fa2cf84126,77606-2,Weight-for-length Per age and sex,57.9,%
4,2019-02-17,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,748f8357-6cc7-551d-f31a-32fa2cf84126,9843-4,Head Occipital-frontal circumference,34.8,cm
...,...,...,...,...,...,...,...
531139,2017-11-07,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,,QOLS,QOLS,1,{score}
531140,2018-11-07,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,,QOLS,QOLS,1,{score}
531141,2019-11-07,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,,QOLS,QOLS,1,{score}
531142,2020-11-07,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,,QOLS,QOLS,1,{score}


In [59]:
# Parse DATE in place so the values compare as dates rather than as text
observations['DATE'] = pd.to_datetime(observations['DATE'])

# One row per patient holding that patient's most recent observation DATE
latest_dates = observations.groupby('PATIENT', as_index=False)['DATE'].max()

# Inner join on (PATIENT, DATE): keeps every observation taken on the
# patient's latest DATE and nothing else
observations_latest = pd.merge(observations, latest_dates, on=['PATIENT', 'DATE'])

observations_latest

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION,VALUE,UNITS
0,2021-07-25,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,53c2bae0-f0ff-7eac-4ca1-3dce0ecebb30,8302-2,Body Height,88.8,cm
1,2021-07-25,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,53c2bae0-f0ff-7eac-4ca1-3dce0ecebb30,72514-3,Pain severity - 0-10 verbal numeric rating [Sc...,2,{score}
2,2021-07-25,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,53c2bae0-f0ff-7eac-4ca1-3dce0ecebb30,29463-7,Body Weight,12.9,kg
3,2021-07-25,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,53c2bae0-f0ff-7eac-4ca1-3dce0ecebb30,77606-2,Weight-for-length Per age and sex,49,%
4,2021-07-25,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,53c2bae0-f0ff-7eac-4ca1-3dce0ecebb30,9843-4,Head Occipital-frontal circumference,48.5,cm
...,...,...,...,...,...,...,...
18118,2021-07-28,d53c57a5-4480-2481-32ee-b2844a991c9d,,DALY,DALY,0.2,a
18119,2021-07-28,d53c57a5-4480-2481-32ee-b2844a991c9d,,QOLS,QOLS,1,{score}
18120,2021-11-07,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,,QALY,QALY,61.6,a
18121,2021-11-07,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,,DALY,DALY,0.4,a


In [60]:
# Rows per patient among the latest-date observations
patient_counts = observations_latest['PATIENT'].value_counts()

# Patients with more than one observation row on their latest date
has_several_rows = patient_counts > 1
patients_with_multiple_entries = patient_counts[has_several_rows]

# Number of such patients
patients_with_multiple_entries_count = int(has_several_rows.sum())

# Show the count together with the per-patient row counts
(patients_with_multiple_entries_count, patients_with_multiple_entries)

(992,
 PATIENT
 81d85a6d-8dc1-a067-0d33-832e589a20fa    59
 e3d999ee-c61a-18cd-6045-7e958c87773b    58
 e4035211-964b-b90f-237d-655426c3aa1f    58
 3685bf21-cf8b-669f-cc82-ba3f467087c8    57
 546ab430-0ad8-54a3-9247-85caadb762d1    57
                                         ..
 312d3dd7-d005-5d59-078e-2dc5615a466c     2
 050358a1-fcf7-1182-7ad1-6b663afa3002     2
 76b289fd-e825-734c-8446-316f59643593     2
 4c7bbf23-68f4-86e8-c61e-1a1b1e81c4ea     2
 03777c32-ed98-50e2-f75d-cbcad532c610     2
 Name: count, Length: 992, dtype: int64)

In [61]:
# True for every observation CODE paired with more than one distinct DESCRIPTION
desc_with_same_code = observations_latest.groupby('CODE')['DESCRIPTION'].nunique().gt(1)

# The offending codes, as a plain list
codes_with_multiple_descriptions = desc_with_same_code.loc[desc_with_same_code].index.tolist()

# Summary: which codes are ambiguous and how many of them there are
result_summary_code = dict([
    ("Codes with Multiple Descriptions", codes_with_multiple_descriptions),
    ("Number of Codes with Multiple Descriptions", len(codes_with_multiple_descriptions)),
])

result_summary_code

{'Codes with Multiple Descriptions': ['21000-5',
  '33914-3',
  '5767-9',
  '6690-2',
  '789-8'],
 'Number of Codes with Multiple Descriptions': 5}

In [62]:
# Boolean mask selecting the rows recorded under code 33914-3
code_33914_3_rows = observations_latest['CODE'].eq('33914-3')

# Give all of those rows one DESCRIPTION and one UNITS value
for column_name, unified_value in (
    ('DESCRIPTION', 'Glomerular filtration rate/1.73 sq M.predicted'),
    ('UNITS', 'mL/min'),
):
    observations_latest.loc[code_33914_3_rows, column_name] = unified_value

In [63]:
# Chosen DESCRIPTION for each remaining code that had several descriptions
code_updates = dict([
    ('21000-5', 'Erythrocyte distribution width [Entitic volume] by Automated count'),
    ('5767-9', 'Appearance of Urine'),
    ('6690-2', 'Leukocytes [#/volume] in Blood by Automated count'),
    ('789-8', 'Erythrocytes [#/volume] in Blood by Automated count'),
])

# Look up the chosen description per row; rows whose CODE is not listed get NaN
# from the lookup and therefore keep their current DESCRIPTION
chosen_description = observations_latest['CODE'].map(code_updates)
observations_latest['DESCRIPTION'] = chosen_description.where(
    chosen_description.notna(), observations_latest['DESCRIPTION']
)

In [64]:
# Re-run the check after the descriptions were unified:
# count the distinct DESCRIPTION values per CODE and flag counts above one
distinct_descriptions_per_code = observations_latest.groupby('CODE')['DESCRIPTION'].nunique()
desc_with_same_code = distinct_descriptions_per_code > 1

# Codes that are still ambiguous
codes_with_multiple_descriptions = desc_with_same_code[desc_with_same_code.to_numpy()].index.tolist()

# Summary of the ambiguous codes and their number
result_summary_code = {}
result_summary_code["Codes with Multiple Descriptions"] = codes_with_multiple_descriptions
result_summary_code["Number of Codes with Multiple Descriptions"] = len(codes_with_multiple_descriptions)

result_summary_code

{'Codes with Multiple Descriptions': [],
 'Number of Codes with Multiple Descriptions': 0}

In [65]:
# Drop CODE and UNITS; DESCRIPTION supplies the column labels and VALUE the cell contents
data = observations_latest.drop(['CODE', 'UNITS'], axis='columns')

# One row per (PATIENT, DATE) and one column per DESCRIPTION. When a description
# occurs several times for a row, 'first' keeps the first non-null VALUE.
flattened_data = data.pivot_table(
    index=['PATIENT', 'DATE'],
    columns='DESCRIPTION',
    values='VALUE',
    aggfunc='first',
).reset_index()

# pivot_table leaves the label 'DESCRIPTION' on the columns index (the columns are a
# plain Index, not a MultiIndex); clear that label
flattened_data.columns.name = None

flattened_data

Unnamed: 0,PATIENT,DATE,Adenovirus A+B+C+D+E DNA [Presence] in Respiratory specimen by NAA with probe detection,Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma,Albumin [Mass/volume] in Serum or Plasma,Alkaline phosphatase [Enzymatic activity/volume] in Serum or Plasma,Appearance of Urine,Are you Hispanic or Latino?,Are you a refugee?,Are you worried about losing your housing?,...,What is the highest level of school that you have finished?,What is your current work situation?,What is your housing situation today?,What is your main insurance?,What language are you most comfortable speaking?,What number best describes how during the past week pain has interfered with your enjoyment of life?,What number best describes how during the past week pain has interfered with your general activity?,What number best describes your pain on average in the past week?,Which race(s) are you?,pH of Urine by Test strip
0,00126cb9-8460-4747-e302-c3609684531e,2021-10-31,,,,,,No,No,No,...,More than high school,Unemployed (finding),I have housing,Medicaid,English,,,,White,
1,00209bf2-8e4d-06d1-82a4-daad02f25829,2021-02-09,,,,,,No,No,No,...,More than high school,Full-time work,I have housing,Medicare,English,,,,Other Please write,
2,00ae3b00-9500-efc1-2758-a93d3f77e650,2021-08-21,,,,,,No,No,No,...,More than high school,Full-time work,I have housing,Medicare,English,,,,Black/African American,
3,00c9ca99-6b9f-add4-8759-f7dfee6ea1a4,2021-09-12,,57.1,4.3,111.1,,,,,...,,,,,,,,,,
4,0142b69f-57f0-9a08-4e2d-65a2b77fdea7,1991-06-21,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1158,ff06d32e-e1df-b8db-484a-2d0d9e5d8461,1989-04-09,,,,,,,,,...,,,,,,,,,,
1159,ff0e4d0e-6181-e36e-d817-64dbcaecb5d0,2008-04-11,,,,,,,,,...,,,,,,,,,,
1160,ff59238a-9508-b0e5-39f4-0d4afcbe6f43,2021-08-06,,,,,,,,,...,,,,,,,,,,
1161,ff9337d4-4f6c-2182-c1aa-a0811a68a4ab,2021-07-18,,,,,,,,,...,,,,,,,,,,


In [66]:
# Bind the pivoted observation table to its final name for this section.
# This is a second name for the same object as flattened_data, not a copy.
observations_converted = flattened_data
observations_converted

Unnamed: 0,PATIENT,DATE,Adenovirus A+B+C+D+E DNA [Presence] in Respiratory specimen by NAA with probe detection,Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma,Albumin [Mass/volume] in Serum or Plasma,Alkaline phosphatase [Enzymatic activity/volume] in Serum or Plasma,Appearance of Urine,Are you Hispanic or Latino?,Are you a refugee?,Are you worried about losing your housing?,...,What is the highest level of school that you have finished?,What is your current work situation?,What is your housing situation today?,What is your main insurance?,What language are you most comfortable speaking?,What number best describes how during the past week pain has interfered with your enjoyment of life?,What number best describes how during the past week pain has interfered with your general activity?,What number best describes your pain on average in the past week?,Which race(s) are you?,pH of Urine by Test strip
0,00126cb9-8460-4747-e302-c3609684531e,2021-10-31,,,,,,No,No,No,...,More than high school,Unemployed (finding),I have housing,Medicaid,English,,,,White,
1,00209bf2-8e4d-06d1-82a4-daad02f25829,2021-02-09,,,,,,No,No,No,...,More than high school,Full-time work,I have housing,Medicare,English,,,,Other Please write,
2,00ae3b00-9500-efc1-2758-a93d3f77e650,2021-08-21,,,,,,No,No,No,...,More than high school,Full-time work,I have housing,Medicare,English,,,,Black/African American,
3,00c9ca99-6b9f-add4-8759-f7dfee6ea1a4,2021-09-12,,57.1,4.3,111.1,,,,,...,,,,,,,,,,
4,0142b69f-57f0-9a08-4e2d-65a2b77fdea7,1991-06-21,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1158,ff06d32e-e1df-b8db-484a-2d0d9e5d8461,1989-04-09,,,,,,,,,...,,,,,,,,,,
1159,ff0e4d0e-6181-e36e-d817-64dbcaecb5d0,2008-04-11,,,,,,,,,...,,,,,,,,,,
1160,ff59238a-9508-b0e5-39f4-0d4afcbe6f43,2021-08-06,,,,,,,,,...,,,,,,,,,,
1161,ff9337d4-4f6c-2182-c1aa-a0811a68a4ab,2021-07-18,,,,,,,,,...,,,,,,,,,,


## 8) Patients

In [67]:
# Preview the patients table as loaded from csv_filtered/patients.csv
patients

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,LAST,SUFFIX,...,MARITAL,RACE,ETHNICITY,GENDER,BIRTHPLACE,ADDRESS,CITY,STATE,COUNTY,ZIP
0,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,2/17/19,,999-65-3251,,,,Damon455,Langosh790,,...,,white,nonhispanic,M,Middleborough Massachusetts US,620 Lynch Tunnel Apt 0,Springfield,Massachusetts,Hampden County,1104.0
1,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,7/4/05,,999-49-3323,S99941126,,,Thi53,Wunsch504,,...,,white,nonhispanic,F,Danvers Massachusetts US,972 Tillman Branch Suite 48,Bellingham,Massachusetts,Norfolk County,
2,339144f8-50e1-633e-a013-f361391c4cff,5/11/98,,999-10-8743,S99996708,X75063318X,Mr.,Chi716,Greenfelder433,,...,,white,nonhispanic,M,Athens Athens Prefecture GR,1060 Bernhard Crossroad Suite 15,Boston,Massachusetts,Suffolk County,2131.0
3,d488232e-bf14-4bed-08c0-a82f34b6a197,1/28/03,,999-56-6057,S99929424,,Ms.,Phillis443,Walter473,,...,,white,nonhispanic,F,Boston Massachusetts US,677 Ritchie Terrace,Hingham,Massachusetts,Plymouth County,2043.0
4,217f95a3-4e10-bd5d-fb67-0cfb5e8ba075,12/23/93,,999-91-4320,S99991143,X44132498X,Mr.,Jerrold404,Herzog843,,...,M,black,nonhispanic,M,Boston Massachusetts US,276 Bernier Branch,Revere,Massachusetts,Suffolk County,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1158,409330fa-7ffd-dbfb-4eba-2349d58a6324,2/28/79,,999-68-5445,S99991649,X72884711X,Mr.,Coy949,Schaden604,,...,M,white,nonhispanic,M,Fitchburg Massachusetts US,434 Hickle Throughway Suite 15,Amherst,Massachusetts,Hampshire County,
1159,cb328021-a854-dc94-e7ae-426580477308,5/31/64,,999-10-6445,S99986790,X86596484X,Mrs.,Sherry479,Barrows492,,...,M,white,nonhispanic,F,Boston Massachusetts US,976 Ortiz Orchard,Stoughton,Massachusetts,Norfolk County,
1160,41862157-5c14-f706-4a94-d2929be969e7,7/12/67,,999-63-2407,S99976335,X68999803X,Mr.,Vance413,Jakubowski832,,...,M,asian,nonhispanic,M,Brockton Massachusetts US,534 Strosin Corner,Gardner,Massachusetts,Worcester County,1440.0
1161,d53c57a5-4480-2481-32ee-b2844a991c9d,7/28/48,,999-37-8036,S99939062,X79037907X,Mr.,Cody889,Hilll811,,...,M,white,nonhispanic,M,Stoneham Massachusetts US,568 Ryan Stravenue,Mashpee,Massachusetts,Barnstable County,


In [68]:
# Number of missing values in each patients column
patients.isna().sum()

Id               0
BIRTHDATE        0
DEATHDATE     1000
SSN              0
DRIVERS        215
PASSPORT       276
PREFIX         245
FIRST            0
LAST             0
SUFFIX        1147
MAIDEN         832
MARITAL        384
RACE             0
ETHNICITY        0
GENDER           0
BIRTHPLACE       0
ADDRESS          0
CITY             0
STATE            0
COUNTY           0
ZIP            545
dtype: int64

In [69]:
# Rename the 'Id' column to 'PATIENT'.
# NOTE: the rename is done in place and patients_converted is bound to the very same
# DataFrame as patients, so the column is renamed under both names.
patients.rename(columns={'Id': 'PATIENT'}, inplace=True)
patients_converted = patients
patients_converted

Unnamed: 0,PATIENT,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,LAST,SUFFIX,...,MARITAL,RACE,ETHNICITY,GENDER,BIRTHPLACE,ADDRESS,CITY,STATE,COUNTY,ZIP
0,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,2/17/19,,999-65-3251,,,,Damon455,Langosh790,,...,,white,nonhispanic,M,Middleborough Massachusetts US,620 Lynch Tunnel Apt 0,Springfield,Massachusetts,Hampden County,1104.0
1,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,7/4/05,,999-49-3323,S99941126,,,Thi53,Wunsch504,,...,,white,nonhispanic,F,Danvers Massachusetts US,972 Tillman Branch Suite 48,Bellingham,Massachusetts,Norfolk County,
2,339144f8-50e1-633e-a013-f361391c4cff,5/11/98,,999-10-8743,S99996708,X75063318X,Mr.,Chi716,Greenfelder433,,...,,white,nonhispanic,M,Athens Athens Prefecture GR,1060 Bernhard Crossroad Suite 15,Boston,Massachusetts,Suffolk County,2131.0
3,d488232e-bf14-4bed-08c0-a82f34b6a197,1/28/03,,999-56-6057,S99929424,,Ms.,Phillis443,Walter473,,...,,white,nonhispanic,F,Boston Massachusetts US,677 Ritchie Terrace,Hingham,Massachusetts,Plymouth County,2043.0
4,217f95a3-4e10-bd5d-fb67-0cfb5e8ba075,12/23/93,,999-91-4320,S99991143,X44132498X,Mr.,Jerrold404,Herzog843,,...,M,black,nonhispanic,M,Boston Massachusetts US,276 Bernier Branch,Revere,Massachusetts,Suffolk County,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1158,409330fa-7ffd-dbfb-4eba-2349d58a6324,2/28/79,,999-68-5445,S99991649,X72884711X,Mr.,Coy949,Schaden604,,...,M,white,nonhispanic,M,Fitchburg Massachusetts US,434 Hickle Throughway Suite 15,Amherst,Massachusetts,Hampshire County,
1159,cb328021-a854-dc94-e7ae-426580477308,5/31/64,,999-10-6445,S99986790,X86596484X,Mrs.,Sherry479,Barrows492,,...,M,white,nonhispanic,F,Boston Massachusetts US,976 Ortiz Orchard,Stoughton,Massachusetts,Norfolk County,
1160,41862157-5c14-f706-4a94-d2929be969e7,7/12/67,,999-63-2407,S99976335,X68999803X,Mr.,Vance413,Jakubowski832,,...,M,asian,nonhispanic,M,Brockton Massachusetts US,534 Strosin Corner,Gardner,Massachusetts,Worcester County,1440.0
1161,d53c57a5-4480-2481-32ee-b2844a991c9d,7/28/48,,999-37-8036,S99939062,X79037907X,Mr.,Cody889,Hilll811,,...,M,white,nonhispanic,M,Stoneham Massachusetts US,568 Ryan Stravenue,Mashpee,Massachusetts,Barnstable County,


## 9) Procedures

In [70]:
# Preview the procedures table as loaded from csv_filtered/procedures.csv
procedures

Unnamed: 0,PATIENT,ENCOUNTER,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,748f8357-6cc7-551d-f31a-32fa2cf84126,4.301930e+08,Medication Reconciliation (procedure),,
1,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,0bee1ce6-3e2c-5506-f71c-a7ba8f64a3d3,4.301930e+08,Medication Reconciliation (procedure),,
2,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,6e93bcf9-45a4-8528-0120-1c1eaa930faf,4.301930e+08,Medication Reconciliation (procedure),,
3,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,01efcc52-15d6-51e9-faa2-bee069fcbe44,3.951420e+08,Allergy screening test,,
4,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,1a7debfc-9582-7f23-a109-4f154a182ee2,4.301930e+08,Medication Reconciliation (procedure),,
...,...,...,...,...,...,...
83818,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,8.661480e+08,Screening for domestic abuse (procedure),,
83819,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,1.712070e+08,Depression screening (procedure),,
83820,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,4.547110e+14,Depression screening using Patient Health Ques...,,
83821,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,4.282110e+14,Assessment of substance use (procedure),,


In [71]:
# Preview the encounters table as loaded from csv_filtered/encounters.csv
encounters

Unnamed: 0,Id,DATE,PATIENT,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,748f8357-6cc7-551d-f31a-32fa2cf84126,2019-02-17,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,410620009,Well child visit (procedure),,
1,5a4735ae-423f-6563-28ab-b3d11b49b2d4,2019-03-24,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,410620009,Well child visit (procedure),,
2,0bee1ce6-3e2c-5506-f71c-a7ba8f64a3d3,2019-05-26,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,410620009,Well child visit (procedure),,
3,6e93bcf9-45a4-8528-0120-1c1eaa930faf,2019-07-28,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,410620009,Well child visit (procedure),,
4,8b6787c3-4316-a0cb-899d-4746525c319f,2019-10-27,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,410620009,Well child visit (procedure),,
...,...,...,...,...,...,...,...
61454,230e2215-38ab-9371-842d-a44d27ae4090,2020-12-18,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,390906007,Follow-up encounter,55822004.0,Hyperlipidemia
61455,db101ad8-66e2-9feb-e0cf-b2618f873c3a,2020-12-28,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,185345009,Encounter for symptom,38822007.0,Cystitis
61456,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,2021-01-15,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,162673000,General examination of patient (procedure),,
61457,b2a4d90b-a2f5-1c88-0fb6-ba49b1487d37,2021-08-13,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,33879002,Administration of vaccine to produce active im...,,


In [72]:
# Prefix every encounters column with 'encounter_' so that none of them
# collides with a procedures column of the same name after the join.
encounters_renamed = encounters.add_prefix('encounter_')

# Left join: every procedure row is kept and receives the columns of the
# encounter whose Id equals the procedure's ENCOUNTER value.
merged_df = pd.merge(
    procedures,
    encounters_renamed,
    how='left',
    left_on='ENCOUNTER',
    right_on='encounter_Id',
)
merged_df

Unnamed: 0,PATIENT,ENCOUNTER,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION,encounter_Id,encounter_DATE,encounter_PATIENT,encounter_CODE,encounter_DESCRIPTION,encounter_REASONCODE,encounter_REASONDESCRIPTION
0,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,748f8357-6cc7-551d-f31a-32fa2cf84126,4.301930e+08,Medication Reconciliation (procedure),,,748f8357-6cc7-551d-f31a-32fa2cf84126,2019-02-17,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,410620009,Well child visit (procedure),,
1,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,0bee1ce6-3e2c-5506-f71c-a7ba8f64a3d3,4.301930e+08,Medication Reconciliation (procedure),,,0bee1ce6-3e2c-5506-f71c-a7ba8f64a3d3,2019-05-26,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,410620009,Well child visit (procedure),,
2,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,6e93bcf9-45a4-8528-0120-1c1eaa930faf,4.301930e+08,Medication Reconciliation (procedure),,,6e93bcf9-45a4-8528-0120-1c1eaa930faf,2019-07-28,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,410620009,Well child visit (procedure),,
3,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,01efcc52-15d6-51e9-faa2-bee069fcbe44,3.951420e+08,Allergy screening test,,,01efcc52-15d6-51e9-faa2-bee069fcbe44,2020-02-17,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,185347001,Encounter for problem,,
4,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,1a7debfc-9582-7f23-a109-4f154a182ee2,4.301930e+08,Medication Reconciliation (procedure),,,1a7debfc-9582-7f23-a109-4f154a182ee2,2020-04-26,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,410620009,Well child visit (procedure),,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
83818,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,8.661480e+08,Screening for domestic abuse (procedure),,,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,2021-01-15,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,162673000,General examination of patient (procedure),,
83819,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,1.712070e+08,Depression screening (procedure),,,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,2021-01-15,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,162673000,General examination of patient (procedure),,
83820,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,4.547110e+14,Depression screening using Patient Health Ques...,,,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,2021-01-15,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,162673000,General examination of patient (procedure),,
83821,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,4.282110e+14,Assessment of substance use (procedure),,,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,2021-01-15,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,162673000,General examination of patient (procedure),,


In [73]:
# Count the missing values in each column of the joined frame.
merged_df.isnull().sum()

PATIENT                            0
ENCOUNTER                          0
CODE                               0
DESCRIPTION                        0
REASONCODE                     63226
REASONDESCRIPTION              63226
encounter_Id                       0
encounter_DATE                     0
encounter_PATIENT                  0
encounter_CODE                     0
encounter_DESCRIPTION              0
encounter_REASONCODE           61974
encounter_REASONDESCRIPTION    61974
dtype: int64

In [74]:
# Expose the encounter's date under the plain name 'DATE' and keep only the
# procedure-level columns, with the date first.
merged_df = merged_df.rename(columns={'encounter_DATE': 'DATE'})[
    ['DATE', 'PATIENT', 'ENCOUNTER', 'CODE', 'DESCRIPTION', 'REASONCODE', 'REASONDESCRIPTION']
]
merged_df

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,2019-02-17,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,748f8357-6cc7-551d-f31a-32fa2cf84126,4.301930e+08,Medication Reconciliation (procedure),,
1,2019-05-26,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,0bee1ce6-3e2c-5506-f71c-a7ba8f64a3d3,4.301930e+08,Medication Reconciliation (procedure),,
2,2019-07-28,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,6e93bcf9-45a4-8528-0120-1c1eaa930faf,4.301930e+08,Medication Reconciliation (procedure),,
3,2020-02-17,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,01efcc52-15d6-51e9-faa2-bee069fcbe44,3.951420e+08,Allergy screening test,,
4,2020-04-26,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,1a7debfc-9582-7f23-a109-4f154a182ee2,4.301930e+08,Medication Reconciliation (procedure),,
...,...,...,...,...,...,...,...
83818,2021-01-15,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,8.661480e+08,Screening for domestic abuse (procedure),,
83819,2021-01-15,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,1.712070e+08,Depression screening (procedure),,
83820,2021-01-15,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,4.547110e+14,Depression screening using Patient Health Ques...,,
83821,2021-01-15,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,4.282110e+14,Assessment of substance use (procedure),,


In [75]:
# assign() hands back a new frame, so merged_df stops being a slice of the
# joined frame and gets a real datetime DATE column in the same step.
merged_df = merged_df.assign(DATE=pd.to_datetime(merged_df['DATE']))

# Most recent DATE per patient, as a two-column frame (PATIENT, DATE).
latest_dates = merged_df.groupby('PATIENT', as_index=False)['DATE'].max()

# Inner join on both keys: only rows dated on a patient's latest day survive.
procedures_latest = merged_df.merge(latest_dates, how='inner', on=['PATIENT', 'DATE'])

procedures_latest

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,2021-01-24,b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85,7bb78da2-31b8-497d-bc08-25eca8a904f1,4.301930e+08,Medication Reconciliation (procedure),,
1,2021-10-04,c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8,78fdedd9-24e2-2711-15e9-e43a73ade634,7.660100e+07,Intramuscular injection,,
2,2021-08-02,339144f8-50e1-633e-a013-f361391c4cff,fbf1824f-1ef7-820f-111b-6982ef9a9e5f,7.108240e+08,Assessment of health and social care needs (pr...,,
3,2021-08-02,339144f8-50e1-633e-a013-f361391c4cff,fbf1824f-1ef7-820f-111b-6982ef9a9e5f,7.108410e+08,Assessment of anxiety (procedure),,
4,2021-08-02,339144f8-50e1-633e-a013-f361391c4cff,fbf1824f-1ef7-820f-111b-6982ef9a9e5f,4.282110e+14,Assessment of substance use (procedure),,
...,...,...,...,...,...,...,...
4829,2021-01-15,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,8.661480e+08,Screening for domestic abuse (procedure),,
4830,2021-01-15,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,1.712070e+08,Depression screening (procedure),,
4831,2021-01-15,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,4.547110e+14,Depression screening using Patient Health Ques...,,
4832,2021-01-15,cb1b2c74-d1c5-997c-6f8b-20ca9f332eef,1516d2e6-4846-5f1e-fe27-c1ebb9a39f72,4.282110e+14,Assessment of substance use (procedure),,


In [76]:
# Number of rows each patient has in procedures_latest.
patient_counts = procedures_latest['PATIENT'].value_counts()

# Keep the patients that occur more than once, and count how many there are.
patients_with_multiple_entries = patient_counts.loc[patient_counts > 1]
patients_with_multiple_entries_count = patients_with_multiple_entries.shape[0]

# Show the count followed by the per-patient tallies.
(patients_with_multiple_entries_count, patients_with_multiple_entries)

(863,
 PATIENT
 050358a1-fcf7-1182-7ad1-6b663afa3002    26
 6fc7b50b-61ff-9237-bcac-7b7b1d8606ab    25
 86dbfa87-dc1c-e529-f39b-4910e4deb3fb    23
 78da7c78-d491-32b2-7ea2-aebb2517d27e    22
 76b289fd-e825-734c-8446-316f59643593    22
                                         ..
 964576b0-7527-b34a-d7c8-37cee9f6e396     2
 3515e99a-c9ef-2628-6949-bf2ba1d74897     2
 297c1550-317d-aac0-b529-042721fed414     2
 346a1435-2455-914f-c287-7b88052d05db     2
 73678386-4129-73b5-024b-dd923dc59095     2
 Name: count, Length: 863, dtype: int64)

In [77]:
# Drop the reason columns by rebinding instead of mutating in place.
# errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
procedures_latest = procedures_latest.drop(
    columns=['REASONCODE', 'REASONDESCRIPTION'],
    errors='ignore',
)

In [78]:
# How many distinct DESCRIPTION values are recorded under each CODE?
descriptions_per_code = procedures_latest.groupby('CODE')['DESCRIPTION'].nunique()

# Boolean mask (indexed by CODE): True where a code has more than one description.
desc_with_same_code = descriptions_per_code > 1

# The codes flagged by the mask, as a plain list.
codes_with_multiple_descriptions = desc_with_same_code.loc[desc_with_same_code].index.tolist()

# Summary: the offending codes and how many there are.
result_summary_code = {}
result_summary_code["Codes with Multiple Descriptions"] = codes_with_multiple_descriptions
result_summary_code["Number of Codes with Multiple Descriptions"] = len(codes_with_multiple_descriptions)

result_summary_code

{'Codes with Multiple Descriptions': [5880005.0,
  23426006.0,
  90226004.0,
  171207006.0],
 'Number of Codes with Multiple Descriptions': 4}

In [79]:
# One canonical description for each code that the previous check reported
# as having several descriptions.
canonical_descriptions = {
    5880005: 'Physical examination',
    23426006: 'Measurement of respiratory function (procedure)',
    90226004: 'Cytopathology procedure  preparation of smear  genital source (procedure)',
    171207006: 'Depression screening (procedure)',
}

# Overwrite DESCRIPTION on every row carrying one of those codes.
for procedure_code, canonical_description in canonical_descriptions.items():
    procedures_latest.loc[procedures_latest['CODE'] == procedure_code, 'DESCRIPTION'] = canonical_description

In [80]:
# Repeat the consistency check after the descriptions were overwritten:
# count the distinct DESCRIPTION values recorded under each CODE.
descriptions_per_code = procedures_latest.groupby('CODE')['DESCRIPTION'].nunique()

# Boolean mask (indexed by CODE): True where a code has more than one description.
desc_with_same_code = descriptions_per_code > 1

# The codes flagged by the mask, as a plain list.
codes_with_multiple_descriptions = desc_with_same_code.loc[desc_with_same_code].index.tolist()

# Summary: the offending codes and how many there are.
result_summary_code = {}
result_summary_code["Codes with Multiple Descriptions"] = codes_with_multiple_descriptions
result_summary_code["Number of Codes with Multiple Descriptions"] = len(codes_with_multiple_descriptions)

result_summary_code

{'Codes with Multiple Descriptions': [],
 'Number of Codes with Multiple Descriptions': 0}

In [81]:
# Sort by procedure code and flag every row, so the pivot below can turn the
# rows into per-procedure indicator columns.
sorted_data = procedures_latest.sort_values(by='CODE').assign(ProcedurePresent=1)

# One row per (DATE, PATIENT, ENCOUNTER) and one column per procedure
# description; a cell is 1 where that procedure was recorded and 0 otherwise.
pivot_table = sorted_data.pivot_table(
    index=['DATE', 'PATIENT', 'ENCOUNTER'],
    columns='DESCRIPTION',
    values='ProcedurePresent',
    fill_value=0,
).reset_index()

# Remove the 'DESCRIPTION' label that the pivot leaves on the columns index.
pivot_table.columns.name = None

# The pivot sorts the new columns by label; restore procedure-code order.
# Descriptions are de-duplicated (first occurrence wins) because two different
# codes may share one description, and passing a repeated label to reindex
# would emit that column more than once.
procedures_order = (
    sorted_data[['CODE', 'DESCRIPTION']]
    .drop_duplicates()
    .sort_values('CODE')['DESCRIPTION']
    .drop_duplicates()
)

# Identifier columns first, then the procedure columns in code order.
ordered_columns = ['DATE', 'PATIENT', 'ENCOUNTER'] + list(procedures_order)
pivot_table = pivot_table.reindex(columns=ordered_columns)

pivot_table

Unnamed: 0,DATE,PATIENT,ENCOUNTER,Upper arm X-ray,Physical examination,Counseling for termination of pregnancy,Peripheral blood smear interpretation,Pulmonary rehabilitation (regime/therapy),Catheter ablation of tissue of heart,Epidural anesthesia,...,Screening for drug abuse (procedure),Induced termination of pregnancy,Depression screening using Patient Health Questionnaire Nine Item score (procedure),Assessment using Morse Fall Scale (procedure),Assessment using Alcohol Use Disorders Identification Test - Consumption (procedure),Screening for domestic abuse (procedure),Assessment using Car Relax Alone Forget Friends Trouble Screening Test (procedure),Assessment of substance use (procedure),Depression screening using Patient Health Questionnaire Two-Item score (procedure),High resolution computed tomography of chest without contrast (procedure)
0,1942-01-05,590ad7c6-6cb7-7181-bf1f-818e179c3edc,db936bf2-cc3f-d943-826f-1cf5e459df87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1942-02-05,c4bdbf50-4728-5778-7f4a-ce39e9c58232,6b04b154-f55a-5f26-258d-cf89f0c8fadd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1945-10-20,ae59c2c9-648d-6f76-2d42-a1b8cac21c8b,d8d18cd2-5548-5ba6-2d38-c0afe37cadff,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1955-11-22,be341f37-13a9-e54b-901a-23d59b802bb8,cc8cf2e8-2b9e-84b0-3f18-0ee08a9f72bc,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1958-01-09,db9b9c1c-0a86-773c-ba82-fbb9b71d9993,f6f931e4-4e31-53a4-243c-f643315f2787,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1215,2021-11-18,0e78dfee-9af0-046c-39bb-33fdcb910976,15ff860d-b347-8cfb-3502-89bcdd3738cb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1216,2021-11-18,1ef1309d-cd3a-f6b2-2706-10f9ecfa06ed,8538378f-424b-4738-a9ba-ff4ca26a88ad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1217,2021-11-18,55c5b8d3-99d0-58ad-8444-e42bb81bd5c7,64ac894d-86d8-f6a3-7098-1a2bff037b3f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1218,2021-11-18,6dcdf9cd-efbe-c8c0-c633-ef255bc05ef5,4b5c8930-6006-c82d-f6c1-777468ad4bf5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
# Number of rows each patient has in the pivoted table.
patient_counts = pivot_table['PATIENT'].value_counts()

# Keep the patients that occur more than once, and count how many there are.
patients_with_multiple_entries = patient_counts.loc[patient_counts > 1]
patients_with_multiple_entries_count = patients_with_multiple_entries.shape[0]

# Show the count followed by the per-patient tallies.
(patients_with_multiple_entries_count, patients_with_multiple_entries)

(58,
 PATIENT
 e000e5c8-30f7-7a53-a324-7f3e431ba008    2
 58e24016-f3f7-e33d-35cd-333d6dedd5aa    2
 4ba15182-a126-7f01-a46e-1901c5c5ceae    2
 db0ba46f-9b5e-63cd-cbe4-3c0ea82abbdf    2
 a57215c7-228c-a1f3-ac96-c5fbe7f3e3f6    2
 23cec4fc-145d-f75d-dc70-2a52c97bd9c3    2
 87cecc0c-9c64-cc50-5370-d7c881ccbd61    2
 436a7fef-6640-96ab-2557-77e40e947bc9    2
 cc0b85a3-ca8d-45fb-c9d6-5b69dd505d41    2
 edf82168-489f-1b3e-c1da-5956c19f4520    2
 76b289fd-e825-734c-8446-316f59643593    2
 e1183fd3-ffb9-5078-31ea-2ab0e5e7db90    2
 c518abbe-9695-1b3a-3173-305ff9bb6150    2
 965ecf4b-40d6-02e3-fe08-acd9eafc68fe    2
 6bdd9c1a-17be-8543-322f-91786c6af444    2
 137651e5-3f15-db77-2712-ee0b75724502    2
 155e8fa2-a0ac-80e4-23e2-0395cdcf1794    2
 f41e89ff-c2ff-9998-ca68-ad438f67447d    2
 30d4add3-4d26-ab22-5a3d-53363f33332f    2
 f99e1c5e-40c9-aa06-3e00-f2d944dc3ae0    2
 1d1af1df-c916-9534-dcb4-b9aaf02e48d5    2
 289f0cd9-e5d2-d575-b227-51874433bb33    2
 e4035211-964b-b90f-237d-655426c3aa1f   

In [83]:
# procedures_data is another name for pivot_table (no copy is made).
procedures_data = pivot_table

# Coerce DATE to datetime; values that cannot be parsed become NaT.
procedures_data['DATE'] = pd.to_datetime(procedures_data['DATE'], errors='coerce')

# Every column other than the three identifiers is a procedure indicator.
procedure_columns = [
    col for col in procedures_data.columns
    if col not in ('DATE', 'PATIENT', 'ENCOUNTER')
]

# Per patient: add up each indicator, and keep ENCOUNTER and DATE from the
# last row once the rows are ordered by PATIENT and DATE.
aggregation = dict.fromkeys(procedure_columns, 'sum')
aggregation.update({'ENCOUNTER': 'last', 'DATE': 'last'})

procedures_data_sorted = procedures_data.sort_values(by=['PATIENT', 'DATE'])
grouped_procedures = procedures_data_sorted.groupby('PATIENT', as_index=False).agg(aggregation)

# Put the identifier columns back in front of the indicator columns.
desired_order = ['DATE', 'PATIENT', 'ENCOUNTER'] + procedure_columns
grouped_procedures = grouped_procedures[desired_order]

grouped_procedures

Unnamed: 0,DATE,PATIENT,ENCOUNTER,Upper arm X-ray,Physical examination,Counseling for termination of pregnancy,Peripheral blood smear interpretation,Pulmonary rehabilitation (regime/therapy),Catheter ablation of tissue of heart,Epidural anesthesia,...,Screening for drug abuse (procedure),Induced termination of pregnancy,Depression screening using Patient Health Questionnaire Nine Item score (procedure),Assessment using Morse Fall Scale (procedure),Assessment using Alcohol Use Disorders Identification Test - Consumption (procedure),Screening for domestic abuse (procedure),Assessment using Car Relax Alone Forget Friends Trouble Screening Test (procedure),Assessment of substance use (procedure),Depression screening using Patient Health Questionnaire Two-Item score (procedure),High resolution computed tomography of chest without contrast (procedure)
0,2021-10-30,00126cb9-8460-4747-e302-c3609684531e,1908e893-ac98-aba5-bef4-f6d4cad0f591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2021-02-09,00209bf2-8e4d-06d1-82a4-daad02f25829,b3bc4d75-c000-8b48-5200-0cca2c4232c2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
2,2021-08-21,00ae3b00-9500-efc1-2758-a93d3f77e650,0ebaab7c-63bc-a04b-1fd0-f45adf699563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2021-08-29,00c9ca99-6b9f-add4-8759-f7dfee6ea1a4,45b47ac1-b8c3-36ca-d1d7-372fb40fd4df,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1991-02-01,0142b69f-57f0-9a08-4e2d-65a2b77fdea7,ae24ca09-a297-8db3-5d3b-cc0e1ad6fd2c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1157,1989-04-02,ff06d32e-e1df-b8db-484a-2d0d9e5d8461,3582a1ed-48c3-6a16-9085-79d28b546b30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1158,2008-04-04,ff0e4d0e-6181-e36e-d817-64dbcaecb5d0,9ac93727-a73c-5479-6d06-35999c01756d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1159,2021-08-06,ff59238a-9508-b0e5-39f4-0d4afcbe6f43,15e0aa7f-52a3-6428-2bdc-e1b86a24b722,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1160,2021-03-23,ff9337d4-4f6c-2182-c1aa-a0811a68a4ab,598bb231-e339-5a24-0b08-e005d5198d4a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0


In [84]:
# Number of rows each patient has after the per-patient aggregation.
patient_counts = grouped_procedures['PATIENT'].value_counts()

# Keep the patients that occur more than once, and count how many there are.
patients_with_multiple_entries = patient_counts.loc[patient_counts > 1]
patients_with_multiple_entries_count = patients_with_multiple_entries.shape[0]

# Show the count followed by the per-patient tallies.
(patients_with_multiple_entries_count, patients_with_multiple_entries)

(0, Series([], Name: count, dtype: int64))

In [85]:
# df is another name for grouped_procedures (no copy), so the capping below
# changes grouped_procedures as well.
df = grouped_procedures

import numpy as np

# Numeric columns of df; only these are capped.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Cap every numeric value at 1. Series.clip is vectorized, which avoids
# calling a Python lambda once per cell, and it leaves values <= 1 and NaN
# untouched exactly like the element-wise version did.
for col in numeric_cols:
    df[col] = df[col].clip(upper=1)

In [86]:
# Numeric columns of df, detected again for the verification pass.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Becomes True as soon as one column is found to hold a value above 1.
values_greater_than_one = False

# Report each numeric column that still contains a value above 1.
for col in numeric_cols:
    if not (df[col] > 1).any():
        continue
    print(f"Column '{col}' contains values greater than 1.")
    values_greater_than_one = True

# Overall verdict.
if values_greater_than_one:
    print("There are values greater than 1 in the DataFrame.")
else:
    print("No values greater than 1 found in the DataFrame.")

No values greater than 1 found in the DataFrame.


In [87]:
# Bind the per-patient procedures frame to the name that the renaming cell
# further down reads (procedures_converted); this is an alias of df, not a copy.
procedures_converted = df
procedures_converted

Unnamed: 0,DATE,PATIENT,ENCOUNTER,Upper arm X-ray,Physical examination,Counseling for termination of pregnancy,Peripheral blood smear interpretation,Pulmonary rehabilitation (regime/therapy),Catheter ablation of tissue of heart,Epidural anesthesia,...,Screening for drug abuse (procedure),Induced termination of pregnancy,Depression screening using Patient Health Questionnaire Nine Item score (procedure),Assessment using Morse Fall Scale (procedure),Assessment using Alcohol Use Disorders Identification Test - Consumption (procedure),Screening for domestic abuse (procedure),Assessment using Car Relax Alone Forget Friends Trouble Screening Test (procedure),Assessment of substance use (procedure),Depression screening using Patient Health Questionnaire Two-Item score (procedure),High resolution computed tomography of chest without contrast (procedure)
0,2021-10-30,00126cb9-8460-4747-e302-c3609684531e,1908e893-ac98-aba5-bef4-f6d4cad0f591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2021-02-09,00209bf2-8e4d-06d1-82a4-daad02f25829,b3bc4d75-c000-8b48-5200-0cca2c4232c2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
2,2021-08-21,00ae3b00-9500-efc1-2758-a93d3f77e650,0ebaab7c-63bc-a04b-1fd0-f45adf699563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2021-08-29,00c9ca99-6b9f-add4-8759-f7dfee6ea1a4,45b47ac1-b8c3-36ca-d1d7-372fb40fd4df,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1991-02-01,0142b69f-57f0-9a08-4e2d-65a2b77fdea7,ae24ca09-a297-8db3-5d3b-cc0e1ad6fd2c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1157,1989-04-02,ff06d32e-e1df-b8db-484a-2d0d9e5d8461,3582a1ed-48c3-6a16-9085-79d28b546b30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1158,2008-04-04,ff0e4d0e-6181-e36e-d817-64dbcaecb5d0,9ac93727-a73c-5479-6d06-35999c01756d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1159,2021-08-06,ff59238a-9508-b0e5-39f4-0d4afcbe6f43,15e0aa7f-52a3-6428-2bdc-e1b86a24b722,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1160,2021-03-23,ff9337d4-4f6c-2182-c1aa-a0811a68a4ab,598bb231-e339-5a24-0b08-e005d5198d4a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0


## Rename columns and export the converted tables

In [88]:
# allergies_converted
# careplans_converted
# conditions_converted
# encounters_converted
# immunizations_converted
# medications_converted
# observations_converted
# patients_converted
# procedures_converted

In [89]:
def _prefix_non_key_columns(frame, prefix, key='PATIENT'):
    """Return a new frame whose columns, except ``key``, are named ``{prefix}_{column}``.

    ``frame`` itself is not modified, so calling this repeatedly on the same
    input always produces the same column names.
    """
    return frame.rename(columns=lambda col: col if col == key else f'{prefix}_{col}')


# Allergies: START becomes allergy_DATE; ENCOUNTER is dropped.
allergies_converted_new = (
    allergies_converted
    .rename(columns={'START': 'allergy_DATE'})
    .drop(columns=['ENCOUNTER'])
)

# Care plans: ENCOUNTER is dropped; every column except PATIENT gets a 'careplan_' prefix.
careplans_converted_new = _prefix_non_key_columns(
    careplans_converted.drop(columns=['ENCOUNTER']), 'careplan'
)

# Conditions: ENCOUNTER is dropped; every column except PATIENT gets a 'condition_' prefix.
conditions_converted_new = _prefix_non_key_columns(
    conditions_converted.drop(columns=['ENCOUNTER']), 'condition'
)

# Encounters: every column except PATIENT gets an 'encounter_' prefix.
# The helper returns a new frame instead of assigning to .columns on an alias,
# so encounters_converted is left untouched and re-running this cell can no
# longer stack a second 'encounter_' prefix.
encounters_converted_new = _prefix_non_key_columns(encounters_converted, 'encounter')

# Immunizations: DATE becomes immunization_DATE; ENCOUNTER is dropped.
immunizations_converted_new = (
    immunizations_converted
    .rename(columns={'DATE': 'immunization_DATE'})
    .drop(columns=['ENCOUNTER'])
)

# Medications: START becomes medication_DATE; ENCOUNTER is dropped.
medications_converted_new = (
    medications_converted
    .rename(columns={'START': 'medication_DATE'})
    .drop(columns=['ENCOUNTER'])
)

# Observations: only DATE is renamed (no column is dropped here).
observations_converted_new = observations_converted.rename(columns={'DATE': 'observation_DATE'})

# Patients: exported as-is (same object, not a copy).
patients_converted_new = patients_converted

# Procedures: DATE becomes procedure_DATE; ENCOUNTER is dropped.
procedures_converted_new = (
    procedures_converted
    .rename(columns={'DATE': 'procedure_DATE'})
    .drop(columns=['ENCOUNTER'])
)

In [90]:
import os

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a no-op when it is already there).
os.makedirs('converted_csv', exist_ok=True)

# Each converted table paired with the CSV file it is written to.
export_targets = [
    (allergies_converted_new, 'converted_csv/allergies_converted_new.csv'),
    (careplans_converted_new, 'converted_csv/careplans_converted_new.csv'),
    (conditions_converted_new, 'converted_csv/conditions_converted_new.csv'),
    (encounters_converted_new, 'converted_csv/encounters_converted_new.csv'),
    (immunizations_converted_new, 'converted_csv/immunizations_converted_new.csv'),
    (medications_converted_new, 'converted_csv/medications_converted_new.csv'),
    (observations_converted_new, 'converted_csv/observations_converted_new.csv'),
    (patients_converted_new, 'converted_csv/patients_converted_new.csv'),
    (procedures_converted_new, 'converted_csv/procedures_converted_new.csv'),
]

# Write every table without its index column.
for export_frame, csv_path in export_targets:
    export_frame.to_csv(csv_path, index=False)