<a href="https://colab.research.google.com/github/paulkel229/docs/blob/main/analysis_healthcare_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming each download.
CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'healthcare-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3934836%2F8356547%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240913%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240913T130119Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Dac122c2e09a017e4a2a7ef91b47eac5381ea3155cc9b95b6c8152d367bae27883a92d547976c68113ef7e2a107a12b28a07480948ba64d92f00e1f0bcce0e80631163eb4f05b0c0288762474df743bb6eb3990cecafdb427a18d622bb72c1a0d41ac1e525d9b0a051e632a9e0b86092caf410d8b220b6a88a570f0323230253fc41fc90c25c08b44f19f10e39f1b6fa2139ec55a82febc03c1a482ce50e4cc8e72f73d96a20ea747c0b2547f4170adb2c58e756acd48470b2b083a7721eb33b222b9036511ff5c6f10eb6622abf12dcba52794dfd851b6d0e1b5d4430fdeefcb70e98108e0076f3d6655959e6b0fc052c95b734e281fc0aa495d8ba46da968cf'

# Root directory under which each downloaded data source is unpacked.
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created alongside the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible code - confirm it is still needed.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Start from an empty input directory, then make sure both Kaggle-style
# directories exist. The path constant is used instead of a repeated literal
# so the cleanup and the creation can never drift apart.
shutil.rmtree(KAGGLE_INPUT_PATH, ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the directories as ../input and ../working relative to the current
# directory. A link left over from an earlier run is fine, so an existing
# entry is deliberately ignored.
for link_target, link_name in (
    (KAGGLE_INPUT_PATH, 'input'),
    (KAGGLE_WORKING_PATH, 'working'),
):
    try:
        os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
    except FileExistsError:
        pass

# Download every "<directory>:<percent-encoded URL>" entry listed in
# DATA_SOURCE_MAPPING and unpack it into KAGGLE_INPUT_PATH/<directory>.
# A failed download is reported and skipped so the remaining sources still load.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so the URL part is never cut in two.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it the progress bar simply
            # stays empty instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                if total_length > 0:
                    done = min(50, int(50 * dl / total_length))
                else:
                    done = 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk and rewind so the archive readers
            # see the complete file from its first byte.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Read from the already-open handle (re-opening a
                # NamedTemporaryFile by name fails on Windows) and bind the
                # archive to its own name so the `tarfile` module is not
                # shadowed for later iterations.
                # NOTE(review): extractall() trusts member paths; only use
                # with archives from a trusted source.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the healthcare dataset CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv("/kaggle/input/healthcare-dataset/healthcare_dataset.csv")

In [None]:
# Display the DataFrame as the cell output.
df

In [None]:
# Print a summary of the DataFrame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Count missing values in each column.
df.isna().sum()

In [None]:
# Count rows that are exact repeats of an earlier row.
df.duplicated().sum()

In [None]:
# Collect the rows flagged as repeats and order them by patient name so that
# identical records end up next to each other.
is_duplicate = df.duplicated()
duplicate_rows = df[is_duplicate].sort_values(by="Name")
duplicate_rows

In [None]:
# Remove repeated rows in place, keeping the first occurrence of each.
df.drop_duplicates(inplace=True)

In [None]:
# Print the DataFrame summary again to check the row count.
df.info()

# object description

In [None]:
# Summarise the object-dtype columns (count, unique, top, freq).
df.describe(include=['O'])

# Statistical description

In [None]:
# Summary statistics for the columns that describe() selects by default.
df.describe()

# histogram of Ages

In [None]:
# Histogram of patient ages, drawn without grid lines.
df["Age"].hist(
    edgecolor="White",
    figsize=(5, 3),
    grid=False,
    color="#80C4E9",
)

# What is the distribution of test results in the data ?
## (or) What is the status of the patients based on the test results?

In [None]:
# Pie chart of how often each test result occurs.
colors = plt.get_cmap('Pastel1_r').colors
test_result_counts = df["Test Results"].value_counts()
# Pull out only the first slice (value_counts puts the most frequent first).
# The offsets are built from the actual number of categories because the pie
# plot requires one offset per slice; a fixed 3-tuple fails for any other count.
explode = [0.11 if position == 0 else 0 for position in range(len(test_result_counts))]
test_result_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    figsize=(7, 5),
    title='Distribution of Test Results',
    explode=explode,
    shadow=True,
    colors=colors,
)
plt.show()

# What is the percentage of each gender in the data?

In [None]:
# Share of each gender, expressed as a percentage of all rows.
df["Gender"].value_counts(normalize=True)*100

# Distribution of blood types

In [None]:
# Number of rows per blood type, most frequent first.
df["Blood Type"].value_counts()

# What is the gender distribution of blood types?
## (or) What is the relation between blood type and Gender?

In [None]:
# Number of rows for each blood type within each gender.
df.groupby(["Gender"])["Blood Type"].value_counts()

## ❄️❄️ A+ , O+ , O- high for women ❄️❄️

# What is the distribution of medical conditions ?

In [None]:
# Number of rows per medical condition, most frequent first.
df["Medical Condition"].value_counts()

# Which gender is more affected by the disease?

In [None]:
# Percentage split between genders within each medical condition.
df.groupby(["Medical Condition"])["Gender"].value_counts(normalize=True)*100

## What is the number of patients broken down by sex, medical condition, and blood type?
## (or) How many male and female patients have a given disease and a given blood type?

In [None]:
# Count non-null names per (Gender, Medical Condition, Blood Type), spread the
# blood types across columns, and order the rows by medical condition.
grouped_df = df.groupby(["Gender", "Medical Condition", "Blood Type"])["Name"].count()
blood_type_table = grouped_df.unstack()
sorted_df = blood_type_table.sort_values(by=["Medical Condition"], ascending=True)
print(sorted_df)


![iiiii](C:\Users\UAS\Desktop\Image.png)

# Distribution of medications

In [None]:
# Number of rows per medication, most frequent first.
medication_counts = df['Medication'].value_counts()
print("Medication Counts:\n", medication_counts)

# What is the distribution of medical conditions for each type of medication?
# (or) How many patients with each medical condition use each medication, and which medication is used most for each medical condition?

In [None]:
# Number of rows for each medical condition within each medication.
df.groupby(['Medication'])["Medical Condition"].value_counts()

____
## 1. Aspirin
- **Arthritis:** Used to manage pain and inflammation.
- **Cancer:** Some studies suggest it may lower the risk of certain cancers (e.g., colorectal cancer) due to its anti-inflammatory effects.
- **Diabetes:** Often prescribed to individuals with diabetes to prevent cardiovascular complications.
- **Obesity:** No direct link, but used to manage related pain or inflammation.
- **Asthma:** Not typically recommended; can sometimes worsen asthma in sensitive individuals.
- **Hypertension:** Used with caution as it can affect blood pressure.

## 2. Ibuprofen
- **Arthritis:** Commonly used to manage arthritis symptoms due to its anti-inflammatory properties.
- **Cancer:** No direct evidence linking it to cancer risk, but its anti-inflammatory effects may have some indirect benefits.
- **Diabetes:** Generally safe but should be used cautiously in people with kidney issues.
- **Obesity:** No direct link; used for managing related pain.
- **Asthma:** Can sometimes worsen asthma symptoms.
- **Hypertension:** Should be used cautiously as it can increase blood pressure.

## 3. Lipitor (Atorvastatin)
- **Arthritis:** No direct link, but may help manage cardiovascular risk associated with arthritis.
- **Cancer:** Some studies suggest it may have a protective effect against certain cancers, but evidence is inconclusive.
- **Diabetes:** Can increase the risk of developing diabetes in some individuals.
- **Obesity:** Not directly linked but may be used to manage cholesterol levels in obese individuals.
- **Asthma:** No direct link.
- **Hypertension:** Used to manage cardiovascular risk factors associated with hypertension.

## 4. Paracetamol (Acetaminophen)
- **Arthritis:** Used to manage pain, but does not have anti-inflammatory properties.
- **Cancer:** No direct link; used for pain management.
- **Diabetes:** Generally safe but should be used cautiously in high doses.
- **Obesity:** No direct link; used for managing pain related to obesity.
- **Asthma:** Safe for most people with asthma.
- **Hypertension:** Safe for most people with hypertension.

## 5. Penicillin
- **Arthritis:** No direct link; used if an infection complicates arthritis.
- **Cancer:** No direct link; used to treat infections.
- **Diabetes:** Safe to use, but diabetic patients need to monitor for potential side effects.
- **Obesity:** No direct link; used for infections.
- **Asthma:** Generally safe but can cause allergic reactions in some individuals.
- **Hypertension:** Safe for most individuals with hypertension.
____


In [None]:
# Pie chart of how often each admission type occurs.
colors = plt.get_cmap('Pastel1').colors
admission_type_counts = df["Admission Type"].value_counts()
# Pull out only the first slice (value_counts puts the most frequent first).
# The offsets are built from the actual number of categories because the pie
# plot requires one offset per slice; a fixed 3-tuple fails for any other count.
explode = [0.11 if position == 0 else 0 for position in range(len(admission_type_counts))]
admission_type_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    figsize=(10, 5),
    title='Distribution of Admission Type',
    explode=explode,
    shadow=True,
    colors=colors,
)
# Render explicitly, as the other plotting cells do.
plt.show()

In [None]:
# Cross-tabulate gender (rows) against admission type (columns).
admission_by_gender = df.groupby("Gender")["Admission Type"].value_counts()
admission_by_gender.unstack()

In [None]:
# Number of rows for each medication within each admission type.
df.groupby(["Admission Type"])["Medication"].value_counts()

## What medications are most commonly used for each medical condition within each type of admission?

In [None]:
# Count non-null names per (Admission Type, Medication, Medical Condition) and
# spread the medical conditions across columns.
grouped_df = (
    df.groupby(["Admission Type", "Medication", "Medical Condition"])["Name"]
    .count()
    .unstack()
)
grouped_df

![image1](C:\Users\UAS\Desktop\image2.png)

# Statistical description of the Billing Amount column

In [None]:
# Summary statistics for the Billing Amount column.
df["Billing Amount"].describe()

## Row --> Max value of "Billing Amount"

In [None]:
# Show every row whose billing amount equals the column maximum.
billing_amount = df["Billing Amount"]
df[billing_amount == billing_amount.max()]

## Which rows have negative values in the Billing Amount column?

In [None]:
# Keep only the rows whose billing amount is below zero.
has_negative_billing = df["Billing Amount"] < 0
new_df = df[has_negative_billing]
new_df

# 👉 *The reason Billing Amount has negative value may be:*
## 1. Refund or Return:
#### A negative number may indicate a refund to the customer. For example, if a customer returns a product and gets a refund, the invoice amount may be recorded as a negative value to indicate that the amount has been deducted from the invoice or refunded.
## 2. Adjustment:
#### Negative values may be used to correct past billing errors. If there is an error in a previous amount and it has been corrected, the correction amount may appear as a negative value.
## 3. Discounts:
#### In some cases, discounts or rebates given to the customer are recorded as negative amounts in the "Billing Amount" column.
## 4. Credit:
#### If the system allows accounts payable, a negative amount may be recorded to indicate that there is a credit balance in the customer's account, which means that they have prepaid funds or are owed a certain amount.
## 5. Void or Cancellation:
#### The negative amount can be the result of canceling an invoice or transaction after it has been issued, reversing the original amount.
## 6. Administrative error:
#### Sometimes, negative values can be the result of a data entry error or a system problem.

____
# Add new column Duration of Stay (Days)

In [None]:
# Parse both date columns, then derive the stay length in whole days as
# discharge date minus admission date.
for date_column in ('Date of Admission', 'Discharge Date'):
    df[date_column] = pd.to_datetime(df[date_column])

stay_length = df['Discharge Date'] - df['Date of Admission']
df['Duration of Stay (Days)'] = stay_length.dt.days

# Preview the two dates next to the derived column.
df[['Date of Admission', 'Discharge Date', 'Duration of Stay (Days)']].head(10)

In [None]:
# Display the DataFrame as the cell output.
df

# Statistical description after modification

In [None]:
# Summary statistics again; the numeric Duration of Stay (Days) column is now part of df.
df.describe()

## ❄️❄️ We observe that the data covers the period from 2019-05-08 to 2024-06-06 ❄️❄️

## Data type of the newly added column ---> Duration of Stay (Days)

In [None]:
# Inspect the dtype of the Duration of Stay (Days) column.
df["Duration of Stay (Days)"].dtype

# Max duration

In [None]:
# Longest stay, in days.
df["Duration of Stay (Days)"].max()

# New dataframe for patients who stayed the maximum duration (30 days)

In [None]:
# Select the rows whose stay equals the longest stay in the data.
stay_days = df["Duration of Stay (Days)"]
new_df = df[stay_days == stay_days.max()]
new_df

# object description

In [None]:
# Summarise the object-dtype columns of new_df (count, unique, top, freq).
new_df.describe(include=['O'])

# Number of people who stayed the maximum duration

In [None]:
# Report how many rows are in new_df together with the longest stay in df.
longest_stay = df["Duration of Stay (Days)"].max()
row_count = new_df.shape[0]
print("There is", row_count, "person stay", longest_stay)

In [None]:
# Histogram of the ages in new_df, with each bar shaded by its position
# along the x axis.
plt.figure(figsize=(17, 8))
n, bins, patches = plt.hist(new_df["Age"], bins=30, edgecolor='black')
cmap = plt.get_cmap('Blues')
norm = plt.Normalize(vmin=min(bins), vmax=max(bins))

# zip() stops at the shorter sequence, so every bar is paired with the edge
# at the same index in `bins`.
for bar, edge in zip(patches, bins):
    bar.set_facecolor(cmap(norm(edge)))

# One tick every 5 years, from 0 up to (not including) the maximum age.
age_ticks = np.arange(0, new_df["Age"].max(), step=5)
plt.xticks(ticks=age_ticks, fontsize=12)
plt.xlabel("Age", fontsize=14)
plt.ylabel("Frequency", fontsize=14)


plt.show()


In [None]:
# Histogram of the billing amounts in new_df, with each bar shaded by its
# position between the smallest and largest amount.
plt.figure(figsize=(17, 8))
cmap = plt.get_cmap('vlag_r')
billing_amount = new_df["Billing Amount"]
n, bins, patches = plt.hist(billing_amount, bins=30, edgecolor='black')
# Use the Series reductions rather than the builtins: they run vectorised
# and skip missing values.
norm = plt.Normalize(vmin=billing_amount.min(), vmax=billing_amount.max())
# The loop variable is not called `bin`, so the builtin of that name stays
# usable for the rest of the session.
for bin_edge, patch in zip(bins, patches):
    patch.set_facecolor(cmap(norm(bin_edge)))
# Start the ticks at 0, or at the nearest lower multiple of the step when the
# data contains negative amounts, so that part of the axis is labelled too.
tick_step = 5000
tick_start = min(0, np.floor(billing_amount.min() / tick_step) * tick_step)
ticks = np.arange(tick_start, billing_amount.max(), step=tick_step)
plt.xticks(ticks=ticks, fontsize=12)
plt.xlabel("Billing Amount", fontsize=14)
plt.ylabel("Frequency", fontsize=14)


plt.show()


In [None]:
# Pie chart of how often each test result occurs within new_df.
colors = plt.get_cmap('Pastel1_r').colors
subset_result_counts = new_df["Test Results"].value_counts()
# Pull out only the first slice (value_counts puts the most frequent first).
# new_df is a filtered subset, so the number of categories is taken from the
# data: the pie plot requires exactly one offset per slice.
explode = [0.11 if position == 0 else 0 for position in range(len(subset_result_counts))]
subset_result_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    figsize=(15, 5),
    title='Distribution of Test Results',
    explode=explode,
    shadow=True,
    colors=colors,
)
plt.show()

In [None]:
# Shortest stay, in days.
df["Duration of Stay (Days)"].min()

In [None]:
# Select the rows whose stay equals the shortest stay in the data.
stay_days = df["Duration of Stay (Days)"]
new_df2 = df[stay_days == stay_days.min()]
new_df2

In [None]:
# Report how many rows are in new_df2 together with the shortest stay in df.
shortest_stay = df["Duration of Stay (Days)"].min()
row_count = new_df2.shape[0]
print("There is", row_count, "person stay", shortest_stay)