In [1]:
import pandas as pd

In [2]:
# Load the dataset from the CSV file
df = pd.read_csv("healthcare_dataset.csv")

# Display the first 5 rows to confirm it loaded correctly
df.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


In [3]:
# This will display a list of all column names
df.columns

Index(['Name', 'Age', 'Gender', 'Blood Type', 'Medical Condition',
       'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider',
       'Billing Amount', 'Room Number', 'Admission Type', 'Discharge Date',
       'Medication', 'Test Results'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  object 
 1   Age                 55500 non-null  int64  
 2   Gender              55500 non-null  object 
 3   Blood Type          55500 non-null  object 
 4   Medical Condition   55500 non-null  object 
 5   Date of Admission   55500 non-null  object 
 6   Doctor              55500 non-null  object 
 7   Hospital            55500 non-null  object 
 8   Insurance Provider  55500 non-null  object 
 9   Billing Amount      55500 non-null  float64
 10  Room Number         55500 non-null  int64  
 11  Admission Type      55500 non-null  object 
 12  Discharge Date      55500 non-null  object 
 13  Medication          55500 non-null  object 
 14  Test Results        55500 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 6.4

In [5]:
# Remove the 'Room Number' column permanently
df.drop(columns=['Room Number'], inplace=True)

# Now, display the first few rows to confirm it's gone
df.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,Urgent,2022-10-09,Penicillin,Abnormal


In [6]:
# This will display a list of all column names
df.columns

Index(['Name', 'Age', 'Gender', 'Blood Type', 'Medical Condition',
       'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider',
       'Billing Amount', 'Admission Type', 'Discharge Date', 'Medication',
       'Test Results'],
      dtype='object')

In [7]:
# Trim spaces, fix headers
df.columns = df.columns.str.strip()

# Fix weird capitalizations and spacing
df['Name'] = df['Name'].str.title().str.strip()
df['Gender'] = df['Gender'].str.capitalize()
df['Blood Type'] = df['Blood Type'].str.upper().str.strip()
df['Medical Condition'] = df['Medical Condition'].str.title().str.strip()
df['Doctor'] = df['Doctor'].str.title().str.strip()
df['Hospital'] = df['Hospital'].str.title().str.strip()
df['Insurance Provider'] = df['Insurance Provider'].str.title().str.strip()
df['Medication'] = df['Medication'].str.title().str.strip()
df['Test Results'] = df['Test Results'].str.capitalize().str.strip()

In [8]:
df['Date of Admission'] = pd.to_datetime(df['Date of Admission'], errors='coerce')
df['Discharge Date'] = pd.to_datetime(df['Discharge Date'], errors='coerce')

df['Stay Duration (days)'] = (df['Discharge Date'] - df['Date of Admission']).dt.days


In [9]:
df = df.dropna(subset=['Name', 'Medical Condition', 'Doctor', 'Hospital'])
df['Billing Amount'] = df['Billing Amount'].fillna(df['Billing Amount'].median())
df['Insurance Provider'] = df['Insurance Provider'].fillna('Unknown')


In [10]:
df.info()
df.describe(include='all').T
df.head(10)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Name                  55500 non-null  object        
 1   Age                   55500 non-null  int64         
 2   Gender                55500 non-null  object        
 3   Blood Type            55500 non-null  object        
 4   Medical Condition     55500 non-null  object        
 5   Date of Admission     55500 non-null  datetime64[ns]
 6   Doctor                55500 non-null  object        
 7   Hospital              55500 non-null  object        
 8   Insurance Provider    55500 non-null  object        
 9   Billing Amount        55500 non-null  float64       
 10  Admission Type        55500 non-null  object        
 11  Discharge Date        55500 non-null  datetime64[ns]
 12  Medication            55500 non-null  object        
 13  Test Results    

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Admission Type,Discharge Date,Medication,Test Results,Stay Duration (days)
0,Bobby Jackson,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons And Miller,Blue Cross,18856.281306,Urgent,2024-02-02,Paracetamol,Normal,2
1,Leslie Terry,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,Emergency,2019-08-26,Ibuprofen,Inconclusive,6
2,Danny Smith,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook Plc,Aetna,27955.096079,Emergency,2022-10-07,Aspirin,Normal,15
3,Andrew Watts,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers And Vang,",Medicare,37909.78241,Elective,2020-12-18,Ibuprofen,Abnormal,30
4,Adrienne Bell,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,Urgent,2022-10-09,Penicillin,Abnormal,20
5,Emily Johnson,36,Male,A+,Asthma,2023-12-20,Taylor Newton,Nunez-Humphrey,Unitedhealthcare,48145.110951,Urgent,2023-12-24,Ibuprofen,Normal,4
6,Edward Edwards,21,Female,AB-,Diabetes,2020-11-03,Kelly Olson,Group Middleton,Medicare,19580.872345,Emergency,2020-11-15,Paracetamol,Inconclusive,12
7,Christina Martinez,20,Female,A+,Cancer,2021-12-28,Suzanne Thomas,"Powell Robinson And Valdez,",Cigna,45820.462722,Emergency,2022-01-07,Paracetamol,Inconclusive,10
8,Jasmine Aguilar,82,Male,AB+,Asthma,2020-07-01,Daniel Ferguson,Sons Rich And,Cigna,50119.222792,Elective,2020-07-14,Aspirin,Abnormal,13
9,Christopher Berg,58,Female,AB-,Cancer,2021-05-23,Heather Day,Padilla-Walker,Unitedhealthcare,19784.631062,Elective,2021-06-22,Paracetamol,Inconclusive,30


In [11]:
# Save the cleaned DataFrame to a new file
df.to_csv("healthcare_dataset_cleaned.csv", index=False)

In [13]:

df = pd.read_csv("healthcare_dataset_cleaned.csv")

def make_summary(row):
    try:
        return (
            f"Patient {row['Name']} ({row['Gender']}, {row['Age']} years old, blood type {row['Blood Type']}) "
            f"was admitted to {row['Hospital']} on {pd.to_datetime(row['Date of Admission']).date()} "
            f"under Dr. {row['Doctor']} as an {row['Admission Type'].lower()} case. "
            f"They were diagnosed with {row['Medical Condition']} and treated with {row['Medication']}. "
            f"Insurance provider: {row['Insurance Provider']}. "
            f"The total billing amount was ${row['Billing Amount']:.2f}. "
            f"Test results were {row['Test Results'].lower()}. "
            f"Discharged on {pd.to_datetime(row['Discharge Date']).date()} "
            f"after {row.get('Stay Duration (days)', 'unknown')} days of hospitalization."
        )
    except Exception as e:
        return f"Error in row: {e}"

df['Summary'] = df.apply(make_summary, axis=1)
df[['Summary']].to_csv("patient_summaries.csv", index=False)

print(df['Summary'].head(5))


0    Patient Bobby Jackson (Male, 30 years old, blo...
1    Patient Leslie Terry (Male, 62 years old, bloo...
2    Patient Danny Smith (Female, 76 years old, blo...
3    Patient Andrew Watts (Female, 28 years old, bl...
4    Patient Adrienne Bell (Female, 43 years old, b...
Name: Summary, dtype: object
