In [15]:
#Importing all the Necessary Libraries
# -------------------------------
import pandas as pd
import glob
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport

In [None]:
**This section defines the file paths for the HUPA-UC Diabetes Dataset and the various CSV outputs.**

In [22]:
# Base folder that holds the HUPA-UC Diabetes Dataset.
# (Alternative location: Downloads/PyQuesters_Python_Hackathon/HUPA-UC Diabetes Dataset)
path = r"C:\RemyaProjects2025\Python Hackathon2025\HUPA-UC Diabetes Dataset"

# Every file path is built with os.path.join so that a separator is always
# placed between the folder and the file name. Concatenating a bare file name
# onto `path` would glue the two together and put the file next to the dataset
# folder instead of inside it.

# Input: raw demographic data.
demographic_path = os.path.join(path, "T1DM_patient_sleep_demographics_with_race.csv")
# Output: all per-patient CSVs combined into one file.
all_patients_path = os.path.join(path, "ALL_PATIENTS.csv")
# Output: demographic data with standardized column names.
modified_demographic_path = os.path.join(path, "DEMOGRAPHIC.csv")
# Output: combined patient data merged with the demographic data.
patients_demographic_path = os.path.join(path, "PATIENTS_WITH_DEMOGRAPHIC.csv")

In [None]:
This code reads all CSV files containing patient data, adds a patient_id column based on the filename, concatenates them into a single DataFrame, and saves it as a new CSV file.

Each patient has their own CSV. To analyze across all patients, you must combine them into one unified DataFrame.
How does it help?
Easier comparisons across patients

Standardizes your dataset for ML/EDA

Adds patient_id so individual data isn’t lost after merging

In [23]:
# Combine every per-patient CSV (files matching "H*.csv" in the dataset folder)
# into a single DataFrame, tagging each row with the patient it came from.
# sorted() keeps the row order reproducible: glob returns its matches in an
# arbitrary, filesystem-dependent order.
csv_files = sorted(glob.glob(os.path.join(path, "H*.csv")))
print(csv_files)

# Fail early with a message that names the folder and pattern; otherwise
# pd.concat raises a generic "No objects to concatenate" error on an empty list.
if not csv_files:
    raise FileNotFoundError(f"No patient CSV files matching 'H*.csv' found in {path!r}")

df_list = []
for file in csv_files:
    # The patient id is the file name up to its first dot,
    # e.g. "HUPA0001P.csv" -> "HUPA0001P".
    patientId = os.path.basename(file).split(".")[0]
    # The per-patient files are read as semicolon-delimited.
    df = pd.read_csv(file, sep=";")
    df["patient_id"] = patientId
    df_list.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each file's own row numbers.
patients_df = pd.concat(df_list, ignore_index=True)
print(patients_df.columns)

# Save the combined data; to_csv uses its default comma delimiter here, and the
# index is left out of the file.
patients_df.to_csv(all_patients_path, index=False)

['C:\\RemyaProjects2025\\Python Hackathon2025\\HUPA-UC Diabetes Dataset\\HUPA0001P.csv', 'C:\\RemyaProjects2025\\Python Hackathon2025\\HUPA-UC Diabetes Dataset\\HUPA0002P.csv', 'C:\\RemyaProjects2025\\Python Hackathon2025\\HUPA-UC Diabetes Dataset\\HUPA0003P.csv', 'C:\\RemyaProjects2025\\Python Hackathon2025\\HUPA-UC Diabetes Dataset\\HUPA0004P.csv', 'C:\\RemyaProjects2025\\Python Hackathon2025\\HUPA-UC Diabetes Dataset\\HUPA0005P.csv', 'C:\\RemyaProjects2025\\Python Hackathon2025\\HUPA-UC Diabetes Dataset\\HUPA0006P.csv', 'C:\\RemyaProjects2025\\Python Hackathon2025\\HUPA-UC Diabetes Dataset\\HUPA0007P.csv', 'C:\\RemyaProjects2025\\Python Hackathon2025\\HUPA-UC Diabetes Dataset\\HUPA0009P.csv', 'C:\\RemyaProjects2025\\Python Hackathon2025\\HUPA-UC Diabetes Dataset\\HUPA0010P.csv', 'C:\\RemyaProjects2025\\Python Hackathon2025\\HUPA-UC Diabetes Dataset\\HUPA0011P.csv', 'C:\\RemyaProjects2025\\Python Hackathon2025\\HUPA-UC Diabetes Dataset\\HUPA0014P.csv', 'C:\\RemyaProjects2025\\Python 

In [None]:
Data Cleaning and Standardization
Why is this needed?
It standardizes column names to the HUPA convention (lowercase, underscores).
How does it help?
Consistency across datasets
Prevents merge errors (e.g., Patient_ID vs patient_id)
Easier coding (no spaces or special chars in column names)

In [24]:
# Read the demographic CSV and rename its columns to the HUPA standard naming
# convention (lowercase with underscores), then save the result as a new CSV.
demographic_df = pd.read_csv(demographic_path).rename(
    columns={
        "Patient_ID": "patient_id",
        "Age": "age",
        "Gender": "gender",
        "Race": "race",
        "Average Sleep Duration (hrs)": "avg_sleep_duration_hrs",
        "Sleep Quality (1-10)": "sleep_quality_score",
        "% with Sleep Disturbances": "sleep_disturbances_percentage",
    }
)
print(demographic_df.columns)

# Write the renamed frame out without the index column.
demographic_df.to_csv(modified_demographic_path, index=False)

Index(['patient_id', 'age', 'gender', 'race', 'avg_sleep_duration_hrs',
       'sleep_quality_score', 'sleep_disturbances_percentage'],
      dtype='object')


In [None]:
This code merges the combined patient dataset with the demographic dataset on patient_id and saves the resulting DataFrame as a new CSV file.

Key Steps:

Merge DataFrames: Use pd.merge() with how="inner" to combine patient and demographic datasets based on patient_id.
Save CSV: Export the merged DataFrame to a CSV file without including the index column.
Verify Columns: Check the resulting columns to ensure all patient and demographic data are included.

In [25]:
# Join the combined patient data with the demographic data on patient_id and
# save the merged table as a CSV file.
# how="inner" keeps only the patient_id values that appear in both frames.
patients_demo_df = patients_df.merge(demographic_df, on="patient_id", how="inner")

# Write the merged frame out without the index column.
patients_demo_df.to_csv(patients_demographic_path, index=False)

# Show the merged column labels as the cell's output.
patients_demo_df.columns

Index(['time', 'glucose', 'calories', 'heart_rate', 'steps', 'basal_rate',
       'bolus_volume_delivered', 'carb_input', 'patient_id', 'age', 'gender',
       'race', 'avg_sleep_duration_hrs', 'sleep_quality_score',
       'sleep_disturbances_percentage'],
      dtype='object')