<a href="https://colab.research.google.com/github/olabisiojo/pfx/blob/main/cbcdiabetes2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Location of the raw dataset.
url = 'https://github.com/olabisiojo/pfx/raw/main/bcDiabetes2024.csv'

# File the cleaned data is written to (and offered for download in Colab).
CLEANED_CSV = 'cleaned_bcDiabetes2024.csv'

# Columns cast to int once missing values have been imputed.
INTEGER_COLS = ['encounter_id', 'patient_nbr', 'admission_type_id',
                'discharge_disposition_id', 'admission_source_id', 'time_in_hospital']

# Columns parsed as numbers; entries that cannot be parsed become missing.
COERCED_NUMERIC_COLS = ['a1cresult_num']

# Medication columns, converted to categorical.
medication_cols = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide',
                   'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol',
                   'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburidemetformin',
                   'glipizidemetformin', 'glimepiridepioglitazone', 'metforminrosiglitazone', 'metforminpioglitazone']

# Every column converted to the pandas 'category' dtype.
CATEGORY_COLS = ['age', 'gender', 'race', 'readmitted', *medication_cols, 'change', 'diabetesmed']


def clean_diabetes_data(
    raw: pd.DataFrame,
    *,
    missing_marker: str = '?',
    min_present_fraction: float = 0.5,
) -> pd.DataFrame:
    """Return a cleaned copy of *raw*; the input frame is not modified.

    Steps, in order:

    1. Replace ``missing_marker`` with NaN.
    2. Parse the known numeric columns as numbers *before* imputing, so that
       they are median-filled like other numeric columns and so that values
       coerced to NaN are imputed as well.
    3. Drop columns whose share of non-missing values is below
       ``min_present_fraction``.
    4. Fill numeric columns with their median and all other columns with
       their (first) mode.
    5. Drop duplicate rows.
    6. Cast the known integer and categorical columns.

    Known columns that are absent (for example because step 3 removed them)
    are skipped rather than raising ``KeyError``.

    Args:
        raw: The frame to clean.
        missing_marker: Cell value that denotes a missing entry.
        min_present_fraction: Minimum fraction of non-missing values a column
            needs in order to be kept.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        ValueError: If a column in ``INTEGER_COLS`` holds a value that cannot
            be parsed as a number.
    """
    frame = raw.replace(missing_marker, float('nan'))

    # Numeric parsing comes first: a column that was read as text only because
    # of the missing marker must be numeric by the time imputation runs.
    for col in INTEGER_COLS:
        if col in frame.columns:
            frame[col] = pd.to_numeric(frame[col])
    for col in COERCED_NUMERIC_COLS:
        if col in frame.columns:
            frame[col] = pd.to_numeric(frame[col], errors='coerce')

    # Keep only columns with enough non-missing values. The copy detaches the
    # result from the selection so the column assignments below are safe.
    enough_data = frame.notna().sum() >= len(frame) * min_present_fraction
    frame = frame.loc[:, enough_data.to_numpy()].copy()

    # Numeric columns: fill with the median.
    numeric_cols = frame.select_dtypes(include=['number']).columns
    if len(numeric_cols) > 0:
        frame[numeric_cols] = frame[numeric_cols].fillna(frame[numeric_cols].median())

    # Remaining columns: fill with the mode. mode() has no rows when there is
    # nothing to count, so guard the .iloc[0] lookup.
    other_cols = frame.select_dtypes(exclude=['number']).columns
    if len(other_cols) > 0:
        modes = frame[other_cols].mode()
        if not modes.empty:
            frame[other_cols] = frame[other_cols].fillna(modes.iloc[0])

    frame = frame.drop_duplicates()

    # Standardize data types for the known columns that are still present.
    # NOTE: astype(int) truncates, so a fractional median used as a fill value
    # loses its fractional part here.
    target_dtypes = {col: int for col in INTEGER_COLS if col in frame.columns}
    target_dtypes.update({col: 'category' for col in CATEGORY_COLS if col in frame.columns})
    if target_dtypes:
        frame = frame.astype(target_dtypes)

    return frame


if __name__ == "__main__":
    # Load the dataset
    df = pd.read_csv(url)

    # Display the first few rows of the dataframe
    print("Initial DataFrame:")
    print(df.head())

    df = clean_diabetes_data(df)

    # Display the cleaned DataFrame
    print("Cleaned DataFrame:")
    print(df.head())

    # Save the cleaned data to a new CSV file
    df.to_csv(CLEANED_CSV, index=False)

    # Save (download) the cleaned data to a new CSV file to computer.
    # google.colab only exists inside Colab; elsewhere the file saved above
    # is the result and the download step is skipped.
    try:
        from google.colab import files
    except ImportError:
        print(f"google.colab is not available; cleaned data was saved to {CLEANED_CSV}")
    else:
        files.download(CLEANED_CSV)