# Intro:

### Context
This synthetic healthcare dataset has been generated to mimic real-world healthcare data, providing a resource for data science and machine learning enthusiasts to practice their skills in a healthcare context.

### Inspiration
The dataset aims to fill the gap in practical and diverse healthcare data for educational purposes, created using Python's Faker library.

### Dataset Information
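The dataset contains various columns representing patient information and healthcare services provided. Below is a brief explanation of each column:

- **Name**: the patient's name (letter casing is inconsistent in the raw data).
- **Age**: the patient's age.
- **Gender**: the patient's gender (`Male` or `Female`).
- **Blood Type**: the patient's blood type (e.g. `A+`, `O-`).
- **Medical Condition**: the condition recorded for the admission (e.g. `Cancer`, `Diabetes`, `Obesity`).
- **Date of Admission**: the date the patient was admitted.
- **Doctor**: the name of the doctor associated with the admission.
- **Hospital**: the name of the hospital or healthcare facility.
- **Insurance Provider**: the patient's insurance provider (e.g. `Aetna`, `Blue Cross`, `Medicare`).
- **Billing Amount**: the amount billed for the patient's healthcare services.
- **Room Number**: the room the patient was assigned to.
- **Admission Type**: the type of admission (`Emergency`, `Elective`, or `Urgent`).
- **Discharge Date**: the date the patient was discharged.
- **Medication**: a medication recorded for the patient (e.g. `Aspirin`, `Ibuprofen`).
- **Test Results**: the outcome of a medical test (`Normal`, `Abnormal`, or `Inconclusive`).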

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's white-grid look to every figure drawn in this notebook.
sns.set(style='whitegrid')

# Read the healthcare records from the CSV next to the notebook; `data` is the
# frame that all later cells analyse.
data = pd.read_csv(filepath_or_buffer='healthcare_dataset.csv')

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


In [2]:
# Dataset dimensions: rows first, then columns.
num_rows, num_cols = data.shape
print(f"Number of rows: {num_rows}, Number of columns: {num_cols}")

# Two-column table listing every column name next to its pandas dtype.
data_info = (
    data.dtypes
    .rename_axis('Column Name')
    .reset_index(name='Data Type')
)
data_info


Number of rows: 55500, Number of columns: 15


Unnamed: 0,Column Name,Data Type
0,Name,object
1,Age,int64
2,Gender,object
3,Blood Type,object
4,Medical Condition,object
5,Date of Admission,object
6,Doctor,object
7,Hospital,object
8,Insurance Provider,object
9,Billing Amount,float64


In [3]:
# Numeric columns to profile.
columns_to_analyze = ['Age', 'Billing Amount']


def _describe_numeric(column):
    """Build one summary record (centre, spread, quartiles) for data[column]."""
    values = data[column]
    q25, q50, q75 = values.quantile([0.25, 0.5, 0.75])
    return {
        'Column': column,
        'Mean': values.mean(),
        'Median': values.median(),
        # mode() can return several values; only the first one is kept.
        # NOTE(review): for a continuous column such as 'Billing Amount' the
        # mode is rarely informative - confirm it is wanted here.
        'Mode': values.mode()[0],
        'Standard Deviation': values.std(),
        '25th Percentile': q25,
        '50th Percentile': q50,
        '75th Percentile': q75,
    }


# One record per column, collected into a single summary frame.
statistics_list = [_describe_numeric(column) for column in columns_to_analyze]
statistics_df = pd.DataFrame(statistics_list)

print(statistics_df)


           Column          Mean        Median         Mode  \
0             Age     51.539459     52.000000    38.000000   
1  Billing Amount  25539.316097  25538.069376 -1316.618581   

   Standard Deviation  25th Percentile  50th Percentile  75th Percentile  
0           19.602454        35.000000        52.000000        68.000000  
1        14211.454431     13241.224652     25538.069376     37820.508436  


In [7]:
# Profile every object-typed (text) column: how many distinct values it has,
# which value is most frequent, and how often that value occurs.
object_columns = data.select_dtypes(include=[object]).columns

categorical_summary = {}
for column in object_columns:
    values = data[column]
    # mode() can return several values; only the first one is kept.
    top_value = values.mode()[0]
    categorical_summary[column] = {
        'Unique Values': values.nunique(),
        'Mode': top_value,
        'Frequency of Mode': values.value_counts()[top_value],
    }

# The dict is keyed by column name, so transpose to get one row per column.
categorical_summary_df = pd.DataFrame(categorical_summary).T
categorical_summary_df


Unnamed: 0,Unique Values,Mode,Frequency of Mode
Name,49992,DAvId muNoZ,3
Gender,2,Male,27774
Blood Type,8,A-,6969
Medical Condition,6,Arthritis,9308
Date of Admission,1827,2024-03-16,50
Doctor,40341,Michael Smith,27
Hospital,39876,LLC Smith,44
Insurance Provider,5,Cigna,11249
Admission Type,3,Elective,18655
Discharge Date,1856,2020-03-15,53


In [None]:
# Missing-value audit: per-column counts and percentages, plus a heatmap that
# shows where in the frame the gaps are.

# Boolean frame, True wherever a value is missing; reused for the table and
# for the heatmap so the scan over `data` happens once.
null_mask = data.isnull()

# Calculate missing values for each column
missing_values = null_mask.sum()
# Divide by len(data) rather than a row count defined in another cell, so this
# cell only depends on `data` being loaded.
missing_percentage = (missing_values / len(data)) * 100

missing_summary = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing Percentage': missing_percentage
})

# Keep only the columns that actually contain missing values; the result is
# empty when the dataset has no gaps.
missing_summary = missing_summary[missing_summary['Missing Count'] > 0]

# Heatmap of the null mask: one cell per value, rows on the y-axis and
# columns on the x-axis.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(null_mask, cbar=False, cmap='viridis', ax=ax)
ax.set_title('Missing Values Heatmap')
ax.set_xlabel('Columns')
ax.set_ylabel('Rows')
plt.show()

# Jupyter renders only a cell's final expression, so the summary table has to
# come last; placed before the plotting code it was never displayed.
missing_summary

# Potential Questions: