In [2]:
import numpy as np
import pandas as pd

def generate_data(sz, seed=30):
    """Build a synthetic, diabetes-style patient table with ``sz`` rows.

    All random draws come from a private ``numpy.random.RandomState`` seeded
    with ``seed``, so the function is reproducible without reseeding (and
    thereby clobbering) the global NumPy random state. With the default seed
    the generated values are the same as those produced by calling
    ``np.random.seed(30)`` followed by the module-level ``np.random.*``
    functions in the same order.

    Parameters
    ----------
    sz : int
        Number of patients (rows) to generate.
    seed : int, optional
        Seed for the private random generator. Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        One row per patient with the columns ``PatientID``, ``PatientGender``,
        ``Pregnancies``, ``Glucose``, ``BloodPressure``, ``SkinThickness``,
        ``Insulin``, ``BMI``, ``DiabetesPedigreeFunction``, ``Age`` and
        ``Outcome``.

    Notes
    -----
    * The normally distributed columns are not clipped, so negative values
      are possible, and ``Age`` is a float rather than a whole number.
    * ``Pregnancies`` is drawn independently of ``PatientGender``.
    """
    # Private generator: keeps the draw sequence reproducible while leaving
    # the process-wide NumPy RNG untouched. The order of the draws below
    # determines the generated values, so do not reorder them.
    rng = np.random.RandomState(seed)

    # PatientID: consecutive integers 1..sz
    PatientID = np.arange(1, sz + 1)

    # PatientGender: 'Male' or 'Female', chosen uniformly at random
    PatientGender = rng.choice(['Male', 'Female'], sz)

    # Pregnancies: Binomial(n=10, p=0.5), i.e. integers in 0..10
    Pregnancies = rng.binomial(10, 0.5, sz)

    # Glucose: Normal(mean=120, sd=20)
    Glucose = rng.normal(120, 20, sz)

    # BloodPressure: Normal(mean=70, sd=12)
    BloodPressure = rng.normal(70, 12, sz)

    # SkinThickness: Normal(mean=25, sd=10)
    SkinThickness = rng.normal(25, 10, sz)

    # Insulin: Gamma(shape=2, scale=50), positive and right-skewed
    Insulin = rng.gamma(2, 50, sz)

    # BMI: Normal(mean=30, sd=5)
    BMI = rng.normal(30, 5, sz)

    # DiabetesPedigreeFunction: Beta(2, 2) stretched onto the range 0..2
    DiabetesPedigreeFunction = rng.beta(2, 2, sz) * 2

    # Age: Normal(mean=35, sd=10)
    Age = rng.normal(35, 10, sz)

    # Outcome: Bernoulli draw whose probability is a logistic function of
    # glucose, centred at 120 (probability 0.5 when Glucose == 120)
    outcome_probability = 1 / (1 + np.exp(-0.05 * (Glucose - 120)))
    Outcome = rng.binomial(1, outcome_probability)

    # Assemble all features into a single DataFrame
    df = pd.DataFrame({
        'PatientID': PatientID,
        'PatientGender': PatientGender,
        'Pregnancies': Pregnancies,
        'Glucose': Glucose,
        'BloodPressure': BloodPressure,
        'SkinThickness': SkinThickness,
        'Insulin': Insulin,
        'BMI': BMI,
        'DiabetesPedigreeFunction': DiabetesPedigreeFunction,
        'Age': Age,
        'Outcome': Outcome
    })

    return df

# Build the 100-row synthetic dataset
df = generate_data(sz=100)

# Print the describe() summary (count, mean, std, quartiles) of the new frame
print(df.describe())

# Persist the generated rows as CSV, leaving out the index column
df.to_csv(path_or_buf='synthetic_diabetes_data.csv', index=False)

# Preview the first five rows
print(df.head(n=5))

        PatientID  Pregnancies     Glucose  BloodPressure  SkinThickness  \
count  100.000000   100.000000  100.000000     100.000000     100.000000   
mean    50.500000     4.880000  121.171400      69.759678      22.608808   
std     29.011492     1.659256   20.057919      12.181358      10.015776   
min      1.000000     2.000000   80.105797      42.797134       0.430455   
25%     25.750000     4.000000  107.456069      61.120041      16.162872   
50%     50.500000     5.000000  121.387167      69.747231      22.356281   
75%     75.250000     6.000000  130.307805      76.422673      27.279985   
max    100.000000     9.000000  180.409919     111.938601      48.023477   

          Insulin         BMI  DiabetesPedigreeFunction         Age  \
count  100.000000  100.000000                100.000000  100.000000   
mean   100.919124   30.487864                  1.016800   34.085870   
std     64.541339    4.952201                  0.432367   10.075993   
min      3.486649   17.728838  

# Preview the generated data

In [3]:
# Rich-display up to the first 100 rows of the generated frame
df.head(n=100)

Unnamed: 0,PatientID,PatientGender,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,Female,6,111.806443,73.473185,18.086828,185.948632,35.603819,0.368932,13.078798,0
1,2,Female,4,131.547615,69.675359,41.703432,113.544109,38.225351,1.630908,20.037802,0
2,3,Female,4,128.331600,80.017664,21.505127,64.068443,28.724706,0.959410,48.561053,0
3,4,Female,6,130.429035,59.687417,46.852958,286.750067,36.138918,1.204599,41.452991,0
4,5,Male,6,90.526870,57.149703,24.640784,86.605099,36.087633,0.497341,35.840749,0
...,...,...,...,...,...,...,...,...,...,...,...
95,96,Female,4,107.711347,90.968826,29.555189,122.487358,30.238389,1.342516,42.457858,0
96,97,Female,2,128.242787,57.013179,7.241990,169.496526,33.051702,1.769116,53.633918,1
97,98,Female,2,103.574166,60.980793,22.270612,104.535544,40.049379,1.513327,35.071782,0
98,99,Male,6,110.186307,42.797134,26.345577,167.307425,24.945340,1.672139,40.519178,1


In [None]:
# Clean the data by dropping rows that have a negative value in any measurement column

def clean_data(df, columns=('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI')):
    """Drop rows that hold a negative value in any of the checked columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    columns : iterable of str, optional
        Columns in which a negative value makes no sense. Defaults to the
        five measurement columns Glucose, BloodPressure, SkinThickness,
        Insulin and BMI.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows in which every checked column is
        ``>= 0``. The original index labels are kept. Rows with a missing
        value in a checked column are dropped too, because ``NaN >= 0`` is
        False.
    """
    checked = list(columns)

    # True for rows in which every checked column is non-negative
    keep = (df[checked] >= 0).all(axis=1)

    # Explicit copy: callers add columns to the result, and writing to a
    # filtered slice of another frame can raise SettingWithCopyWarning.
    return df.loc[keep].copy()

# Filter the generated frame down to rows without negative measurements
df_cleaned = clean_data(df=df)

# Write the filtered rows to their own CSV file (index column omitted)
df_cleaned.to_csv(path_or_buf='cleaned_synthetic_diabetes_data.csv', index=False)

# Heading plus the first five cleaned rows
print("Cleaned Data:", df_cleaned.head(n=5), sep="\n")

# Blank line, heading, then the describe() summary of the cleaned rows
print("\nSummary Statistics:", df_cleaned.describe(), sep="\n")

Cleaned Data:
   Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
0            6  114.801303      75.222807      33.955235   72.822372   
1            5  131.325869      77.026977      12.421831   39.239014   
2            6  139.810449      62.001792      13.679228   15.039004   
3            3  108.104734      70.065115      18.995506  337.802301   
4            8  126.869260      49.352141      16.896466  160.059608   

         BMI  DiabetesPedigreeFunction        Age  Outcome  
0  32.646058                  1.917104  33.039557        0  
1  37.045618                  0.578914  32.451038        1  
2  33.642247                  1.479619  23.876136        1  
3  22.611539                  1.039705  36.743826        0  
4  32.649616                  1.520452  20.781325        0  

Summary Statistics:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count    100.00000  100.000000     100.000000     100.000000  100.000000   
mean       4.8

In [None]:
# Glucose (Fasting Blood Glucose)
# Normal range: 70–99 mg/dL
# Prediabetes: 100–125 mg/dL
# Diabetes: 126 mg/dL or higher

def categorize_glucose(df):
    """Return a copy of ``df`` with a ``Glucose_Category`` column added.

    The categories use left-closed intervals so the cut-offs fall exactly on
    the documented thresholds:

    * ``'Normal'``      -- 0 <= Glucose < 100
    * ``'Prediabetes'`` -- 100 <= Glucose < 126
    * ``'Diabetes'``    -- Glucose >= 126

    Values below 70 are still labelled ``'Normal'``. Negative or missing
    values get no category (NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``Glucose`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame containing every column of ``df`` plus the categorical
        ``Glucose_Category`` column.
    """
    glucose_category = pd.cut(
        df['Glucose'],
        bins=[0, 100, 126, float('inf')],
        labels=['Normal', 'Prediabetes', 'Diabetes'],
        # Left-closed bins: exactly 100 is Prediabetes, exactly 126 is
        # Diabetes, and a reading of 0 is categorised instead of becoming NaN.
        right=False,
    )
    # assign() builds a new frame, so the caller's frame is left untouched
    return df.assign(Glucose_Category=glucose_category)

# Derive the glucose category for every cleaned row
df_glucose_categorized = categorize_glucose(df=df_cleaned)

# Heading, then up to 100 rows showing each reading beside its category
print(
    "Glucose Categorized Data:",
    df_glucose_categorized.loc[:, ['Glucose', 'Glucose_Category']].head(n=100),
    sep="\n",
)


Glucose Categorized Data:
       Glucose Glucose_Category
0   114.801303      Prediabetes
1   131.325869         Diabetes
2   139.810449         Diabetes
3   108.104734      Prediabetes
4   126.869260         Diabetes
..         ...              ...
95  140.599907         Diabetes
96  103.101623      Prediabetes
97   90.472173           Normal
98  121.176809      Prediabetes
99  159.864505         Diabetes

[100 rows x 2 columns]


In [None]:
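# Blood Pressure (Systolic/Diastolic)
# Normal: Less than 120/80 mmHg
# Stage 1 Hypertension: Systolic 130–139 mmHg or diastolic 80–89 mmHg
# Stage 2 Hypertension: Systolic 140 mmHg or higher or diastolic 90 mmHg or higher
# Note: this dataset has a single BloodPressure column (generated around 70 mmHg),
# so only the diastolic cut-offs are applied below.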

def categorize_blood_pressure(df):
    """Return a copy of ``df`` with a ``BloodPressure_Category`` column added.

    The single ``BloodPressure`` column is compared against the diastolic
    thresholds, using left-closed intervals:

    * ``'Normal'``               -- 0 <= BloodPressure < 80
    * ``'Stage 1 Hypertension'`` -- 80 <= BloodPressure < 90
    * ``'Stage 2 Hypertension'`` -- BloodPressure >= 90

    Negative or missing values get no category (NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``BloodPressure`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame containing every column of ``df`` plus the categorical
        ``BloodPressure_Category`` column.
    """
    bp_category = pd.cut(
        df['BloodPressure'],
        bins=[0, 80, 90, float('inf')],
        labels=['Normal', 'Stage 1 Hypertension', 'Stage 2 Hypertension'],
        # Left-closed bins: exactly 80 is Stage 1, exactly 90 is Stage 2,
        # and readings between 89 and 90 stay in Stage 1.
        right=False,
    )
    # assign() builds a new frame, so the caller's frame is left untouched
    return df.assign(BloodPressure_Category=bp_category)

# Extend the glucose-categorised frame with the blood-pressure category
df_bp_categorized = categorize_blood_pressure(df=df_glucose_categorized)

# Blank line, heading, then up to 100 rows of reading and category side by side
print(
    "\nBlood Pressure Categorized Data:",
    df_bp_categorized.loc[:, ['BloodPressure', 'BloodPressure_Category']].head(n=100),
    sep="\n",
)



Blood Pressure Categorized Data:
    BloodPressure BloodPressure_Category
0       75.222807                 Normal
1       77.026977                 Normal
2       62.001792                 Normal
3       70.065115                 Normal
4       49.352141                 Normal
..            ...                    ...
95      91.525299   Stage 2 Hypertension
96      76.721249                 Normal
97      77.604149                 Normal
98      72.896243                 Normal
99      62.648791                 Normal

[100 rows x 2 columns]


In [None]:
# Skin Thickness (Triceps Skinfold)
# Normal range: 10–25 mm for adults
# Higher values indicate excess body fat

def categorize_skin_thickness(df):
    """Return a copy of ``df`` with a ``SkinThickness_Category`` column added.

    * ``'Normal'`` -- 0 <= SkinThickness <= 25
    * ``'High'``   -- SkinThickness > 25

    Values under 10 are still labelled ``'Normal'``; there is no separate
    low category. Negative or missing values get no category (NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``SkinThickness`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame containing every column of ``df`` plus the categorical
        ``SkinThickness_Category`` column.
    """
    skin_category = pd.cut(
        df['SkinThickness'],
        bins=[0, 25, float('inf')],
        labels=['Normal', 'High'],
        # Without this, a value of exactly 0 falls outside (0, 25] and
        # becomes NaN.
        include_lowest=True,
    )
    # assign() builds a new frame, so the caller's frame is left untouched
    return df.assign(SkinThickness_Category=skin_category)

# Extend the blood-pressure-categorised frame with the skin-thickness category
df_skin_categorized = categorize_skin_thickness(df=df_bp_categorized)

# Blank line, heading, then up to 100 rows of measurement and category
print(
    "\nSkin Thickness Categorized Data:",
    df_skin_categorized.loc[:, ['SkinThickness', 'SkinThickness_Category']].head(n=100),
    sep="\n",
)



Skin Thickness Categorized Data:
    SkinThickness SkinThickness_Category
0       33.955235                   High
1       12.421831                 Normal
2       13.679228                 Normal
3       18.995506                 Normal
4       16.896466                 Normal
..            ...                    ...
95      29.057589                   High
96      20.417033                 Normal
97      20.689842                 Normal
98      30.621563                   High
99       3.417662                 Normal

[100 rows x 2 columns]


In [None]:
# Insulin (Fasting Insulin)
# Normal range: 2–25 µIU/mL
# Higher insulin levels indicate insulin resistance

def categorize_insulin(df):
    """Return a copy of ``df`` with an ``Insulin_Category`` column added.

    * ``'Normal'`` -- 0 <= Insulin <= 25
    * ``'High'``   -- Insulin > 25

    Values under 2 are still labelled ``'Normal'``; there is no separate low
    category. Negative or missing values get no category (NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``Insulin`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame containing every column of ``df`` plus the categorical
        ``Insulin_Category`` column.
    """
    insulin_category = pd.cut(
        df['Insulin'],
        bins=[0, 25, float('inf')],
        labels=['Normal', 'High'],
        # Without this, a value of exactly 0 falls outside (0, 25] and
        # becomes NaN.
        include_lowest=True,
    )
    # assign() builds a new frame, so the caller's frame is left untouched
    return df.assign(Insulin_Category=insulin_category)

# Extend the skin-thickness-categorised frame with the insulin category
df_insulin_categorized = categorize_insulin(df=df_skin_categorized)

# Blank line, heading, then up to 100 rows of measurement and category
print(
    "\nInsulin Categorized Data:",
    df_insulin_categorized.loc[:, ['Insulin', 'Insulin_Category']].head(n=100),
    sep="\n",
)



Insulin Categorized Data:
       Insulin Insulin_Category
0    72.822372             High
1    39.239014             High
2    15.039004           Normal
3   337.802301             High
4   160.059608             High
..         ...              ...
95  105.013765             High
96  236.888273             High
97  118.198257             High
98   44.348788             High
99   73.458859             High

[100 rows x 2 columns]


In [None]:
# Body Mass Index (BMI)
# Underweight: Less than 18.5
# Normal: 18.5–24.9
# Overweight: 25–29.9
# Obesity: 30 or greater

def categorize_bmi(df):
    """Return a copy of ``df`` with a ``BMI_Category`` column added.

    The categories use left-closed intervals so each boundary value belongs
    to the higher category:

    * ``'Underweight'`` -- 0 <= BMI < 18.5
    * ``'Normal'``      -- 18.5 <= BMI < 25
    * ``'Overweight'``  -- 25 <= BMI < 30
    * ``'Obesity'``     -- BMI >= 30

    Negative or missing values get no category (NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``BMI`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame containing every column of ``df`` plus the categorical
        ``BMI_Category`` column.
    """
    bmi_category = pd.cut(
        df['BMI'],
        bins=[0, 18.5, 25, 30, float('inf')],
        labels=['Underweight', 'Normal', 'Overweight', 'Obesity'],
        # Left-closed bins: 18.5 is Normal (not Underweight), and values
        # such as 24.95 or 29.95 stay in the lower category.
        right=False,
    )
    # assign() builds a new frame, so the caller's frame is left untouched
    return df.assign(BMI_Category=bmi_category)

# Extend the insulin-categorised frame with the BMI category
df_bmi_categorized = categorize_bmi(df=df_insulin_categorized)

# Blank line, heading, then up to 100 rows of BMI value and category
print(
    "\nBMI Categorized Data:",
    df_bmi_categorized.loc[:, ['BMI', 'BMI_Category']].head(n=100),
    sep="\n",
)



BMI Categorized Data:
          BMI BMI_Category
0   32.646058      Obesity
1   37.045618      Obesity
2   33.642247      Obesity
3   22.611539       Normal
4   32.649616      Obesity
..        ...          ...
95  34.047852      Obesity
96  34.604909      Obesity
97  34.523769      Obesity
98  34.408470      Obesity
99  24.078637       Normal

[100 rows x 2 columns]


In [None]:
# Diabetes Pedigree Function (DPF)
# Values close to or higher than 1 suggest a stronger hereditary risk

def categorize_dpf(df):
    """Return a copy of ``df`` with a ``DPF_Category`` column added.

    * ``'High'`` -- DiabetesPedigreeFunction > 1
    * ``'Low'``  -- everything else (including exactly 1 and missing values,
      because ``NaN > 1`` is False)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``DiabetesPedigreeFunction`` column. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        New frame containing every column of ``df`` plus the string-valued
        ``DPF_Category`` column.
    """
    # One vectorised comparison instead of a Python-level lambda per row
    is_high = df['DiabetesPedigreeFunction'] > 1
    dpf_category = is_high.map({True: 'High', False: 'Low'})

    # assign() builds a new frame, so the caller's frame is left untouched
    return df.assign(DPF_Category=dpf_category)

# Extend the BMI-categorised frame with the pedigree-function category
df_dpf_categorized = categorize_dpf(df=df_bmi_categorized)

# Blank line, heading, then up to 100 rows of DPF value and category
print(
    "\nDiabetes Pedigree Function (DPF) Categorized Data:",
    df_dpf_categorized.loc[:, ['DiabetesPedigreeFunction', 'DPF_Category']].head(n=100),
    sep="\n",
)



Diabetes Pedigree Function (DPF) Categorized Data:
    DiabetesPedigreeFunction DPF_Category
0                   1.917104         High
1                   0.578914          Low
2                   1.479619         High
3                   1.039705         High
4                   1.520452         High
..                       ...          ...
95                  0.910761          Low
96                  1.125659         High
97                  1.957878         High
98                  0.817578          Low
99                  0.944623          Low

[100 rows x 2 columns]


In [None]:
# Blank line, heading, then up to 100 rows of the fully categorised frame
print("\nFinal Categorized Data:", df_dpf_categorized.head(n=100), sep="\n")

# Persist the fully categorised frame as CSV, leaving out the index column
df_dpf_categorized.to_csv(
    path_or_buf='final_categorized_synthetic_diabetes_data.csv',
    index=False,
)



Final Categorized Data:
    Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
0             6  114.801303      75.222807      33.955235   72.822372   
1             5  131.325869      77.026977      12.421831   39.239014   
2             6  139.810449      62.001792      13.679228   15.039004   
3             3  108.104734      70.065115      18.995506  337.802301   
4             8  126.869260      49.352141      16.896466  160.059608   
..          ...         ...            ...            ...         ...   
95            5  140.599907      91.525299      29.057589  105.013765   
96            3  103.101623      76.721249      20.417033  236.888273   
97            6   90.472173      77.604149      20.689842  118.198257   
98            3  121.176809      72.896243      30.621563   44.348788   
99            4  159.864505      62.648791       3.417662   73.458859   

          BMI  DiabetesPedigreeFunction        Age  Outcome Glucose_Category  \
0   32.646058     

In [None]:
import pandas as pd

# Let pandas print up to 100 rows before it truncates the output
pd.options.display.max_rows = 100

# Plain-text dump of up to the first 100 rows of the generated frame
print(df.head(n=100))

    Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
0             6  114.801303      75.222807      33.955235   72.822372   
1             5  131.325869      77.026977      12.421831   39.239014   
2             6  139.810449      62.001792      13.679228   15.039004   
3             3  108.104734      70.065115      18.995506  337.802301   
4             8  126.869260      49.352141      16.896466  160.059608   
5             4   68.213549      90.042856      17.638714  189.166682   
6             9  119.669954      67.538347      14.665431   88.695072   
7             4  118.678186      69.452792      25.003346   38.434560   
8             5  117.432686      62.520696      33.779049   50.618874   
9             5  102.088454      74.418946      19.289063   80.592237   
10            3  131.501954      95.235915      30.269289   96.827526   
11            5  110.031066      63.513801      32.062486   24.398045   
12            5   96.893998      96.224090      21.

In [None]:
# Rich-display up to the first 100 rows of the generated frame
df.head(n=100)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,114.801303,75.222807,33.955235,72.822372,32.646058,1.917104,33.039557,0
1,5,131.325869,77.026977,12.421831,39.239014,37.045618,0.578914,32.451038,1
2,6,139.810449,62.001792,13.679228,15.039004,33.642247,1.479619,23.876136,1
3,3,108.104734,70.065115,18.995506,337.802301,22.611539,1.039705,36.743826,0
4,8,126.86926,49.352141,16.896466,160.059608,32.649616,1.520452,20.781325,0
5,4,68.213549,90.042856,17.638714,189.166682,29.089005,1.140934,15.763108,0
6,9,119.669954,67.538347,14.665431,88.695072,40.068879,0.847629,39.61008,0
7,4,118.678186,69.452792,25.003346,38.43456,30.605479,0.194937,32.887751,0
8,5,117.432686,62.520696,33.779049,50.618874,29.749435,0.995799,37.98328,0
9,5,102.088454,74.418946,19.289063,80.592237,34.045926,1.43988,33.320477,0


In [None]:
# Write the generated frame to CSV (index column omitted)
df.to_csv('synthetic_diabetic_data.csv', index=False)

# The browser download helper only exists on Google Colab. Guard the import so
# that the cell still runs on other kernels, where the CSV simply stays on disk.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; 'synthetic_diabetic_data.csv' was saved locally only.")
else:
    files.download('synthetic_diabetic_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>