In [283]:
#Importing all the Necessary Libraries
import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

warnings.simplefilter("ignore", UserWarning)

In [285]:
# Load the patient sleep/demographics table from the dataset folder
demo = pd.read_csv(
    filepath_or_buffer="HUPA-UC Diabetes Dataset/T1DM_patient_sleep_demographics_with_race.csv",
)


Rename columns to standardize the format

In [287]:
# Raw demographic header -> snake_case column name
rename_map = dict([
    ('Patient_ID', 'patient_id'),
    ('Age', 'age'),
    ('Gender', 'gender'),
    ('Race', 'race'),
    ('Average Sleep Duration (hrs)', 'avg_sleep_hours'),
    ('Sleep Quality (1-10)', 'sleep_quality'),
    ('% with Sleep Disturbances', 'sleep_disturbances_pct'),
])

# Headers that are not listed in the map are left untouched
demo = demo.rename(columns=rename_map)

In [289]:

# Persist the renamed demographics table without the row index
demo.to_csv(path_or_buf="cleaned_demographics.csv", index=False)

Combine the patient files and add a `patient_id` column

In [291]:
folder = "HUPA-UC Diabetes Dataset/PatientData/"

# os.listdir returns entries in arbitrary order; sort so the row order of the
# combined CSV is identical on every run and on every machine.
files = sorted(f for f in os.listdir(folder) if f.endswith('.csv'))
if not files:
    # Fail here with a clear message instead of pd.concat's
    # "No objects to concatenate" further down.
    raise FileNotFoundError(f"No patient .csv files found in {folder!r}")

patient_list = []

for f in files:
    # Patient files are semicolon-separated
    df = pd.read_csv(os.path.join(folder, f), sep=';')

    # Standardize column names: trim, lower-case, spaces -> underscores
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)

    # Use the filename without its extension as patient_id (e.g., HUPA0001P);
    # splitext removes only the trailing ".csv", never one inside the name.
    pid = os.path.splitext(f)[0]
    df['patient_id'] = pid

    patient_list.append(df)

# Stack every patient's rows into one frame with a fresh 0..n-1 index
combined = pd.concat(patient_list, ignore_index=True)

# Move patient_id to the first column, keeping the other columns in order
cols = ['patient_id'] + [c for c in combined.columns if c != 'patient_id']
combined = combined[cols]

# Save to CSV with patient_id first
combined.to_csv("combined_patient_data.csv", index=False)

Split the datetime into separate date and time columns to make the dataset more flexible

In [293]:
# Parse the timestamp column once; values that cannot be parsed become NaT
parsed_time = pd.to_datetime(combined['time'], errors='coerce')

# Append the date and time-of-day parts, then drop the original column
combined = (
    combined
    .assign(date=parsed_time.dt.date, time_only=parsed_time.dt.time)
    .drop(columns=['time'])
)

combined.to_csv("cleaned_patient_data.csv", index=False)

In [295]:
# Loop through each patient file; diagnostics are disabled by default
for f in files:
    # The patient files are semicolon-delimited (the combine cell reads them
    # with sep=';'); without it pandas splits on commas and the columns are
    # not parsed correctly.
    df = pd.read_csv(os.path.join(folder, f), sep=';')
    # Uncomment to print each file's schema:
    # print(f"--- Info for {f} ---")
    # df.info()

In [297]:
# Reload both cleaned tables from the CSVs written by the earlier cells
combined, demo = (
    pd.read_csv(csv_name)
    for csv_name in ("cleaned_patient_data.csv", "cleaned_demographics.csv")
)

In [299]:
# Left-join the demographics onto every patient reading via patient_id
merged = combined.merge(demo, how="left", on="patient_id")

# Normalize the text columns: trim whitespace, then fix the casing
merged = merged.assign(
    gender=merged['gender'].str.strip().str.capitalize(),
    race=merged['race'].str.strip().str.title(),
)

# Persist the merged table without the row index
merged.to_csv("cleaned_patient_with_demo.csv", index=False)

In [303]:
# Count missing values per column
merged.isna().sum()

patient_id                0
glucose                   0
calories                  0
heart_rate                0
steps                     0
basal_rate                0
bolus_volume_delivered    0
carb_input                0
date                      0
time_only                 0
age                       0
gender                    0
race                      0
avg_sleep_hours           0
sleep_quality             0
sleep_disturbances_pct    0
dtype: int64

In [305]:
# Remove rows that are exact duplicates across all columns (first one is kept)
merged = merged.drop_duplicates()

In [307]:
# The date column is re-parsed after the CSV reload; bad values become NaT
merged = merged.assign(date=pd.to_datetime(merged['date'], errors='coerce'))

In [309]:
# Columns that must hold numbers; anything unparseable is coerced to NaN
num_cols = [
    'glucose',
    'calories',
    'heart_rate',
    'steps',
    'basal_rate',
    'bolus_volume_delivered',
    'carb_input',
    'age',
]
merged = merged.assign(
    **{name: pd.to_numeric(merged[name], errors='coerce') for name in num_cols}
)

In [315]:
# Number of decimal places to keep for each measurement column
round_decimals = {
    'glucose': 1,
    'calories': 0,
    'heart_rate': 0,
    'steps': 0,
    'basal_rate': 4,
    'bolus_volume_delivered': 3,
    'carb_input': 1,
    'avg_sleep_hours': 1,
    'sleep_quality': 1,
    'sleep_disturbances_pct': 0,
}

# Round each listed column on its own with Series.round
merged = merged.assign(
    **{name: merged[name].round(places) for name, places in round_decimals.items()}
)

In [317]:
# Negative bolus_volume_delivered values are raised to 0
merged = merged.assign(
    bolus_volume_delivered=merged['bolus_volume_delivered'].clip(lower=0)
)

In [319]:
# Write the fully cleaned, merged table to disk without the row index
merged.to_csv(path_or_buf="cleaned_final_dataset.csv", index=False)