In [None]:
pip install --upgrade ydata-profiling


In [270]:
pip install streamlit


python(19492) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import glob
import os
import numpy as np
from ydata_profiling import ProfileReport

In [None]:
# Path where the 25 CSVs are stored
path_to_patient_files = '/Users/maya/Downloads/PyQuesters_Python_Hackathon/HUPA-UC Diabetes Dataset'
# Read and concatenate all the patient CSVs
all_files = glob.glob(os.path.join(path_to_patient_files, "*.csv"))
list_df = []
for filename in all_files:
    df = pd.read_csv(filename,sep=';')
    # Patient_ID is the filename without ".csv"
    df['Patient_ID'] = os.path.splitext(os.path.basename(filename))[0]
    list_df.append(df)

patient_df = pd.concat(list_df, ignore_index=True)
non_cleaned_df = pd.DataFrame(patient_df)
non_cleaned_df.to_csv("non_cleaned_df_output.csv")

'''convert the time from type object to datetime.
errors='coerce' means
--> If a value can be converted to a datetime → convert it.
--> If a value cannot be converted (e.g., "abc", "2020-13-45") →
instead of raising an error, replace it with NaT (Not-a-Time, the datetime version of NaN).'''
patient_df['time'] = pd.to_datetime(patient_df['time'], errors='coerce')
#No Negative Values Expected for insulin dose administered 
patient_df.loc[patient_df['bolus_volume_delivered'] < 0, 'bolus_volume_delivered'] = 0
## Round heart_rate to nearest integer for Clinical/medical analysis 
patient_df['heart_rate'] = patient_df['heart_rate'].round(0).astype(int)
# changing to  realistic basal_rate for modeling/analysis.
patient_df['basal_rate_cleaned'] = (patient_df['basal_rate'] / 0.025).round() * 0.025

# clip tiny noise values
patient_df.loc[patient_df['basal_rate_cleaned'] < 0.025, 'basal_rate_cleaned'] = 0
#Round calories to 1 decimal place
patient_df['calories_cleaned'] = patient_df['calories'].round(1)
#Round glucose to 1 decimal place
patient_df['glucose_cleaned'] = patient_df['glucose'].round(1)


cleaned_df = pd.DataFrame(patient_df)
cleaned_df.to_csv("cleaned_df_output.csv")


In [None]:
from ydata_profiling import ProfileReport
import pandas as pd

df = pd.read_csv("non_cleaned_df_output.csv")

profile = ProfileReport(df, title="Data Quality Report", explorative=True)

# Option 1: Display as iframe in notebook (works more reliably)
profile.to_notebook_iframe()

# Option 2: Save to HTML and open in browser
profile.to_file("data_quality_report.html")


In [None]:
from ydata_profiling import ProfileReport
import pandas as pd

df = pd.read_csv("cleaned_df_output.csv")

profile = ProfileReport(df, title="Data Quality Report for cleaned data", explorative=True)

# Option 1: Display as iframe in notebook (works more reliably)
profile.to_notebook_iframe()

# Option 2: Save to HTML and open in browser
profile.to_file("data_quality_report_cleaned_data.html")


In [None]:
print(patient_df)

In [None]:
import pandas as pd

def data_quality_report(df):
    print("DATA QUALITY REPORT")
    print("="*50)
    
    # Shape
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print("="*50)
    
    # Missing values
    print("*"*50)
    print("\n Check for Missing Values:")
    print("*"*50)
    print("\n Number of missing column values are:\n", df.isnull().sum())
    
    # Duplicates
    print("*"*50)
    print("\n Check for Duplicate Rows:")
    print("*"*50)
    print("\n Number of duplicate rows  are :", df.duplicated().sum())
    
    # Data types
    print("*"*50)
    print("\n Check for Column Data Types:")
    print("*"*50)
    print(df.dtypes)
    
    # Unique Patient IDs
    if "Patient_ID" in df.columns:
        print("*"*50)
        print("\n Check for Patient IDs uniqueness and number of records:")
        print("*"*50)
        print("Number of Unique Patient IDs:", df['Patient_ID'].nunique())
        print("Total patient records are  :", len(df))
    
    # Numeric ranges
    print("*"*50)
    print("\n Check for Numeric Column Summary:")
    print("*"*50)
    print(df.describe().T[['min', 'max', 'mean']])
    
    # Time range
    if "time" in df.columns:
        print("*"*50)
        print("\n Check for Time Range to check if time is continuous :")
        print("*"*50)
        print("Earliest time :", df['time'].min())
        print("Latest time  :", df['time'].max())
        
    
    #Check for impossible or outlier values
    print("*"*50)
    print("\n Check for Impossible or outlier values:")
    print("*"*50)
    print("Negative glucose values:", (df['glucose'] < 0).sum())
    print("Negative steps values:", (df['steps'] < 0).sum())
    print("Negative calories values:", (df['calories'] < 0).sum())
    print("Negative carb_input values:", (df['carb_input'] < 0).sum())
    print("Negative heart_rate values:", (df['heart_rate'] < 0).sum())
    print("Negative steps values:", (df['steps'] < 0).sum())
    print("Unrealistic heart_rate values <20 BPM:", (df['heart_rate'] < 20).sum())
    print("Unrealistic heart_rate values >250 BPM:", (df['heart_rate'] > 250).sum())
    print("Unrealistic glucose <30 :", (df['glucose'] < 30).sum())
    print("Unrealistic glucose > 600:", (df['glucose'] > 600).sum())
    print("Negative Insulin dose administered(bolus_volume_delivered) values:", (df['bolus_volume_delivered'] < 0).sum())

    

    #check for range of values
    print("*"*50)
    print("\n Check for range of  values for Glucose Hear rate ,Calories and Steps:")
    print("*"*50)
    print("Glucose range:", patient_df['glucose_cleaned'].min(), "-", patient_df['glucose_cleaned'].max())
    print("Heart rate range:", patient_df['heart_rate'].min(), "-", patient_df['heart_rate'].max())
    print("Calories range:", patient_df['calories_cleaned'].min(), "-", patient_df['calories_cleaned'].max())
    print("Steps range:", patient_df['steps'].min(), "-", patient_df['steps'].max())
    print("="*50)

    print("DATA QUALITY REPORT Completed")

# Example usage
data_quality_report(patient_df)


In [274]:
#Load demographics / sleep Excel and merge
demo_df = pd.read_excel('/Users/maya/Downloads/PyQuesters_Python_Hackathon/DemographicsDetails/T1DM_patient_sleep_demographics_with_race.xlsx')
merged_df = patient_df.merge(demo_df, on="Patient_ID", how="left")
#print(merged_df)
print(merged_df.head)
cleaned_df_patient_race = pd.DataFrame(merged_df)
cleaned_df_patient_race.to_csv("cleaned_df_patient_race.csv")


<bound method NDFrame.head of                       time     glucose  calories  heart_rate  steps  \
0      2020-01-17 00:00:00   40.000000  15.04290          96    8.0   
1      2020-01-17 00:05:00   41.333333   8.31640          91    0.0   
2      2020-01-17 00:10:00   42.666667   7.58260          86    0.0   
3      2020-01-17 00:15:00   44.000000   7.33800          82    0.0   
4      2020-01-17 00:20:00   50.000000   7.58260          79    0.0   
...                    ...         ...       ...         ...    ...   
309387 2019-07-13 18:20:00   70.000000  13.58784          85   61.0   
309388 2019-07-13 18:25:00   80.000000   6.57824          82    0.0   
309389 2019-07-13 18:30:00   90.000000   6.90176          84    0.0   
309390 2019-07-13 18:35:00  108.666667   6.47040          82    0.0   
309391 2019-07-13 18:40:00  127.333333   5.60768          86    0.0   

        basal_rate  bolus_volume_delivered  carb_input Patient_ID  \
0            0.035                     0.0      

In [None]:
import pandas as pd

def data_quality_report(df):
    print("DATA QUALITY REPORT FOR MERGED DATASET(PATIENT AND DEMOGRAPHICS")
    print("="*50)
    
    # Shape
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print("="*50)
    
    # Missing values
    print("*"*50)
    print("\n Check for Missing Values:")
    print("*"*50)
    print("\n Number of missing column values are:\n", df.isnull().sum())
    
    # Duplicates
    print("*"*50)
    print("\n Check for Duplicate Rows:")
    print("*"*50)
    print("\n Number of duplicate rows  are :", df.duplicated().sum())
    
    # Data types
    print("*"*50)
    print("\n Check for Column Data Types:")
    print("*"*50)
    print(df.dtypes)
    
    # Unique Patient IDs
    if "Patient_ID" in df.columns:
        print("*"*50)
        print("\n Check for Patient IDs uniqueness and number of records:")
        print("*"*50)
        print("Number of Unique Patient IDs:", df['Patient_ID'].nunique())
        print("Total patient records are  :", len(df))
    
    # Numeric ranges
    print("*"*50)
    print("\n Check for Numeric Column Summary:")
    print("*"*50)
    print(df.describe().T[['min', 'max', 'mean']])
    
    # Time range
    if "time" in df.columns:
        print("*"*50)
        print("\n Check for Time Range to check if time is continuous :")
        print("*"*50)
        print("Earliest time :", df['time'].min())
        print("Latest time  :", df['time'].max())
        
    
    #Check for impossible or outlier values
    print("*"*50)
    print("\n Check for Impossible or outlier values:")
    print("*"*50)
    print("Negative glucose values:", (df['glucose'] < 0).sum())
    print("Negative steps values:", (df['steps'] < 0).sum())
    print("Negative calories values:", (df['calories'] < 0).sum())
    print("Negative carb_input values:", (df['carb_input'] < 0).sum())
    print("Negative heart_rate values:", (df['heart_rate'] < 0).sum())
    print("Negative steps values:", (df['steps'] < 0).sum())
    print("Unrealistic heart_rate values <20 BPM:", (df['heart_rate'] < 20).sum())
    print("Unrealistic heart_rate values >250 BPM:", (df['heart_rate'] > 250).sum())
    print("Unrealistic glucose <30 :", (df['glucose'] < 30).sum())
    print("Unrealistic glucose > 600:", (df['glucose'] > 600).sum())
    print("Negative Insulin dose administered(bolus_volume_delivered) values:", (df['bolus_volume_delivered'] < 0).sum())

    

    #check for range of values
    print("*"*50)
    print("\n Check for range of  values for Glucose Hear rate ,Calories and Steps:")
    print("*"*50)
    print("Glucose range:", patient_df['glucose_cleaned'].min(), "-", patient_df['glucose_cleaned'].max())
    print("Heart rate range:", patient_df['heart_rate'].min(), "-", patient_df['heart_rate'].max())
    print("Calories range:", patient_df['calories_cleaned'].min(), "-", patient_df['calories_cleaned'].max())
    print("Steps range:", patient_df['steps'].min(), "-", patient_df['steps'].max())
    print("="*50)

    print("DATA QUALITY REPORT FOR MERGED DATASET(PATIENT AND DEMOGRAPHICS Completed")

# Example usage
data_quality_report(merged_df)


In [287]:
#descriptive analysis 
'''Q1. How many participants in the dataset have diabetes, and what is their percentage?
Reasoning:
Since understanding the prevalence of diabetes is key to this analysis, examining the proportion of 
participants with the condition helps contextualize the population.
'''
import pandas as pd

# Load dataframe
df = pd.read_csv('cleaned_df_patient_race.csv')

# Count unique patients by gender
gender_counts = df.groupby('Patient_ID')['Gender'].first().value_counts()
gender_percent = gender_counts / gender_counts.sum() * 100

print("Number of participants by gender:")
print(gender_counts)
print("\nPercentage of participants by gender:")
print(gender_percent.round(2))
# Print which gender is higher
if gender_counts['Female'] > gender_counts['Male']:
    print("\nFemale participants are higher in number.")
elif gender_counts['Male'] > gender_counts['Female']:
    print("\nMale participants are higher in number.")
else:
    print("\nNumber of male and female participants are equal.")


Number of participants by gender:
Gender
Male      16
Female     9
Name: count, dtype: int64

Percentage of participants by gender:
Gender
Male      64.0
Female    36.0
Name: count, dtype: float64

Male participants are higher in number.


In [291]:
'''Q2. What is the average glucose for females and males?
Reasoning:
Comparing glucose by gender can highlight physiological differences or help identify at-risk groups.'''
# Average glucose by gender
avg_glucose_gender = df.groupby('Gender')['glucose_cleaned'].mean()
print(avg_glucose_gender.round(2))
#print which gender has more average glucose
if avg_glucose_gender['Female']>avg_glucose_gender['Male']:
    print("\nFemales in this dataset may have slightly higher average glucose readings.")
elif avg_glucose_gender['Female']< avg_glucose_gender['Male']:
     print("\nMales in this dataset may have slightly higher average glucose readings.")
elif avg_glucose_gender['Female']== avg_glucose_gender['Male']:
    print("\nMales and Females in this dataset  have equal average glucose readings.")


Gender
Female    135.77
Male      150.90
Name: glucose_cleaned, dtype: float64

Males in this dataset may have slightly higher average glucose readings.


In [301]:
'''Q3. What is the average sleep quality and average sleep duration of participants?

Reasoning:
Sleep affects glucose metabolism and insulin sensitivity. Understanding baseline sleep metrics helps contextualize glucose trends.'''
# Average sleep metrics
sleep_avg = df.groupby('Patient_ID')[['Average Sleep Duration (hrs)', 'Sleep Quality (1-10)']].first().mean()
print(sleep_avg.round(2))

if sleep_avg['Average Sleep Duration (hrs)']<7:
    print("\n Participants are sleep-deprived, which may affect glucose control, insulin sensitivity, heart health, and overall well-being")
elif sleep_avg['Average Sleep Duration (hrs)']>7 and sleep_avg['Average Sleep Duration (hrs)']<9:
    print("\n Participants are getting adequate sleep")
if sleep_avg['Sleep Quality (1-10)']<6:
    print("\n Participants sleep quality is moderate. Poor sleep quality can contribute to higher glucose variability, fatigue, and worse metabolic outcomes.")
elif sleep_avg['Sleep Quality (1-10)']>6:
    print ("\n Particants sleep quality is good")

Average Sleep Duration (hrs)    5.97
Sleep Quality (1-10)            5.95
dtype: float64

 Participants are sleep-deprived, which may affect glucose control, insulin sensitivity, heart health, and overall well-being

 Participants sleep quality is moderate. Poor sleep quality can contribute to higher glucose variability, fatigue, and worse metabolic outcomes.


In [305]:
'''Q5. How many glucose readings are above 180 mg/dL, and what percentage of total readings do they represent?

Reasoning:
High glucose readings indicate hyperglycemia risk. Quantifying them helps assess dataset severity and identify participants at risk.'''
high_glucose_count = (df['glucose_cleaned'] > 180).sum()
total_readings = df.shape[0]
high_glucose_percent = high_glucose_count / total_readings * 100

print(f"Number of high glucose readings (>180 mg/dL): {high_glucose_count}")
print(f"Percentage of total readings: {high_glucose_percent:.2f}%")
if high_glucose_percent>50:
    print("\n glucose readings for most participants are not within normal/moderate range")
elif high_glucose_percent<50:
    print("\n Most participants glucose readings are within normal or moderately elevated ranges, with a small fraction at hyperglycemic levels")


Number of high glucose readings (>180 mg/dL): 67117
Percentage of total readings: 21.69%

 Most participants glucose readings are within normal or moderately elevated ranges, with a small fraction at hyperglycemic levels


In [315]:
'''Q6. What is the age distribution of participants?

Reasoning:
Age influences glucose metabolism, insulin sensitivity, and basal rate.'''
age_stats = df.groupby('Patient_ID')['Age'].first().describe()
print(age_stats.round(1))
if age_stats['mean']>45 and age_stats['mean']<70:
    print("\n Participants are mostly middle-aged adults")
elif age_stats['mean']<45:
     print("\n Participants are mostly younger adults")
elif age_stats['mean']>70:
    print("\n Participants are mostly older adults")
if age_stats['std']>15 and age_stats['mean']<30:
     print("\n Participants are  spread from young adults to older adults with moderate variation of age")
elif age_stats['mean']>30:
    print("\n Participants are mostly from a specific age group with little variation in age")


count    25.0
mean     46.7
std      15.5
min      20.0
25%      34.0
50%      46.0
75%      62.0
max      74.0
Name: Age, dtype: float64

 Participants are mostly middle-aged adults
