In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the raw CSV; the file name really does contain a space before ".csv".
filepath = "/Dataset of Diabetes .csv"
# Read the raw file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=filepath)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder

# Load the data
data = pd.read_csv("/Dataset of Diabetes .csv")

# 1. Data Cleaning

# Report how many missing entries each column contains before any cleaning
print("Missing values in each column:")
print(data.isna().sum())


Missing values in each column:
ID           0
No_Pation    0
Gender       0
AGE          0
Urea         0
Cr           0
HbA1c        0
Chol         0
TG           0
HDL          0
LDL          0
VLDL         0
BMI          0
CLASS        0
dtype: int64


In [None]:
# Handle missing values
# Every numerical feature is imputed with the median of its own column.
numerical_columns = [
    'Urea', 'Cr', 'HbA1c', 'Chol', 'TG',
    'HDL', 'LDL', 'VLDL', 'BMI',
]

# Impute one column at a time
for col in numerical_columns:
    data[col] = data[col].fillna(value=data[col].median())

# Drop rows that exactly repeat an earlier row (this modifies `data` in place)
data.drop_duplicates(inplace=True)

# Verify the changes
print("Missing values after handling:")
print(data[numerical_columns].isna().sum())
print("\nShape after removing duplicates:", data.shape)



Missing values after handling:
Urea     0
Cr       0
HbA1c    0
Chol     0
TG       0
HDL      0
LDL      0
VLDL     0
BMI      0
dtype: int64

Shape after removing duplicates: (1000, 14)


In [None]:
# Handling categorical data
# Identify categorical columns
categorical_columns = ['Gender', 'CLASS']

# Encode categorical variables.
# A separate encoder is fitted per column and stored in `label_encoders`, so the
# mapping of every column (not just the last one) can be inspected or inverted
# later. `le` still ends up bound to the encoder of the last column.
label_encoders = {}
for col in categorical_columns:
    # NOTE(review): the recorded output shows 3 Gender codes and 5 CLASS codes,
    # which looks like case/whitespace variants of the same label being treated
    # as distinct categories - confirm against the raw file. Normalising the
    # text first collapses such variants. The step is skipped when the column
    # is already numeric (e.g. this cell is run a second time).
    if not pd.api.types.is_numeric_dtype(data[col]):
        data[col] = data[col].str.strip().str.upper()
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
    print(f"Encoded {col}:", dict(zip(le.classes_, le.transform(le.classes_))))

Encoded Gender: {0: 0, 1: 1, 2: 2}
Encoded CLASS: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}


In [None]:

# Handle Outliers using IQR method
def remove_outliers(df, column, factor=1.5, verbose=True):
    """Drop the rows whose value in ``column`` lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.
    Values exactly on a fence are kept. Rows whose value is NaN do not satisfy
    the range check and are therefore dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column the fences are computed on.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.
    verbose : bool, default True
        When true, print the quartiles, the IQR and both fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` inside the fences, with their original index labels.
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR

    if verbose:
        print("Quantile1:",Q1)
        print("Quantile3:",Q3)
        print("IQR:",IQR)
        print("Lower Bound:",lower_bound)
        print("Upper Bound:",upper_bound)

    # `between` is inclusive on both ends by default, matching `>=` / `<=`.
    within_fences = df[column].between(lower_bound, upper_bound)
    return df[within_fences]

# Filter the dataset one numerical feature at a time. `data` is reassigned on
# every pass, so each column's quartiles are computed on the rows that survived
# the filters of the columns before it.
for col in numerical_columns:
    data = remove_outliers(df=data, column=col)
    print(f"Shape after removing outliers in {col}:", data.shape)


Quantile1: 3.4
Quantile3: 5.4
IQR: 2.0000000000000004
Lower Bound: 0.399999999999999
Upper Bound: 8.400000000000002
Shape after removing outliers in Urea: (721, 14)
Quantile1: 46.0
Quantile3: 70.0
IQR: 24.0
Lower Bound: 10.0
Upper Bound: 106.0
Shape after removing outliers in Cr: (721, 14)
Quantile1: 6.3
Quantile3: 10.0
IQR: 3.7
Lower Bound: 0.7499999999999991
Upper Bound: 15.55
Shape after removing outliers in HbA1c: (721, 14)
Quantile1: 4.1
Quantile3: 5.5
IQR: 1.4000000000000004
Lower Bound: 1.9999999999999991
Upper Bound: 7.6000000000000005
Shape after removing outliers in Chol: (721, 14)
Quantile1: 1.3
Quantile3: 2.6
IQR: 1.3
Lower Bound: -0.6500000000000001
Upper Bound: 4.550000000000001
Shape after removing outliers in TG: (721, 14)
Quantile1: 0.9
Quantile3: 1.3
IQR: 0.4
Lower Bound: 0.29999999999999993
Upper Bound: 1.9000000000000001
Shape after removing outliers in HDL: (721, 14)
Quantile1: 1.8
Quantile3: 3.3
IQR: 1.4999999999999998
Lower Bound: -0.4499999999999995
Upper Bound:

In [None]:
# 2. Data Transformations

# Each scaler works on its own copy, so the cleaned `data` frame stays unscaled.

# Min-Max Scaling: fit on the numerical columns, then overwrite them in the copy
data_minmax = data.copy()
minmax_scaler = MinMaxScaler().fit(data_minmax[numerical_columns])
data_minmax[numerical_columns] = minmax_scaler.transform(data_minmax[numerical_columns])

# Standard Scaling: same procedure on a second copy
data_standard = data.copy()
standard_scaler = StandardScaler().fit(data_standard[numerical_columns])
data_standard[numerical_columns] = standard_scaler.transform(data_standard[numerical_columns])

# Save processed datasets (written without the index column)
data_standard.to_csv('diabetes_standard_scaled.csv', index=False)
data_minmax.to_csv('diabetes_minmax_scaled.csv', index=False)
data.to_csv('diabetes_cleaned.csv', index=False)

# Print basic statistics of transformed data
minmax_summary = data_minmax[numerical_columns].describe()
standard_summary = data_standard[numerical_columns].describe()

print("\nMin-Max Scaled Data Statistics:")
print(minmax_summary)

print("\nStandard Scaled Data Statistics:")
print(standard_summary)


Min-Max Scaled Data Statistics:
             Urea          Cr       HbA1c        Chol          TG         HDL  \
count  721.000000  721.000000  721.000000  721.000000  721.000000  721.000000   
mean     0.473130    0.443393    0.483103    0.496012    0.421769    0.474877   
std      0.193842    0.200551    0.197510    0.193497    0.224379    0.209791   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      0.323944    0.302326    0.335938    0.375000    0.238095    0.333333   
50%      0.464789    0.418605    0.468750    0.500000    0.404762    0.466667   
75%      0.605634    0.581395    0.625000    0.625000    0.547619    0.600000   
max      1.000000    1.000000    1.000000    1.000000    1.000000    1.000000   

              LDL        VLDL         BMI  
count  721.000000  721.000000  721.000000  
mean     0.452509    0.437864    0.500985  
std      0.221685    0.220152    0.235918  
min      0.000000    0.000000    0.000000  
25%      0.272727

In [None]:
# Print the current contents of `data` as plain text
print(data)

      ID  No_Pation  Gender  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL  \
1    735      34221       1   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6   
4    504      34223       1   33   7.1  46    4.9   4.9  1.0  0.8  2.0   0.4   
5    634      34224       0   45   2.3  24    4.0   2.9  1.0  1.0  1.5   0.4   
6    721      34225       0   50   2.0  50    4.0   3.6  1.3  0.9  2.1   0.6   
7    421      34227       1   48   4.7  47    4.0   2.9  0.8  0.9  1.6   0.4   
..   ...        ...     ...  ...   ...  ..    ...   ...  ...  ...  ...   ...   
990  194     454316       0   57   4.1  70    9.3   5.3  3.3  1.0  1.4   1.3   
991  195       4543       2   55   4.1  34   13.9   5.4  1.6  1.6  3.1   0.7   
992  196     454316       1   55   3.1  39    8.5   5.0  2.5  1.9  2.9   0.7   
993  198     454316       1   28   3.5  61    8.5   4.5  1.9  1.1  2.6   0.8   
999  248      24054       1   54   5.0  67    6.9   3.8  1.7  1.1  3.0   0.7   

      BMI  CLASS  
1    23.0      0  
4