In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load datasets
diabetes_data = pd.read_csv('/content/Dataset of Diabetes .csv')
adult_income_data = pd.read_csv('/content/adult.csv')

# Diabetes Data Preprocessing

#Handle missing values for numeric columns: Use median for numerical columns
numeric_cols = diabetes_data.select_dtypes(include=['float64', 'int64']).columns
imputer_numeric = SimpleImputer(strategy='median')
diabetes_data[numeric_cols] = imputer_numeric.fit_transform(diabetes_data[numeric_cols])

#Handle missing values for categorical columns: Use most frequent value for categorical columns
categorical_cols = diabetes_data.select_dtypes(include=['object']).columns
imputer_categorical = SimpleImputer(strategy='most_frequent')
diabetes_data[categorical_cols] = imputer_categorical.fit_transform(diabetes_data[categorical_cols])

# Handle categorical data: Encode 'Gender' and 'CLASS' columns
label_encoder = LabelEncoder()
diabetes_data['Gender'] = label_encoder.fit_transform(diabetes_data['Gender'])
diabetes_data['CLASS'] = label_encoder.fit_transform(diabetes_data['CLASS'])

# Handle outliers: Cap values for 'Urea' column
lower, upper = diabetes_data['Urea'].quantile([0.01, 0.99])
diabetes_data['Urea'] = np.clip(diabetes_data['Urea'], lower, upper)

# Apply Min-Max scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
diabetes_scaled_minmax = pd.DataFrame(scaler.fit_transform(diabetes_data), columns=diabetes_data.columns)


# Adult Income Data Preprocessing

# Handle missing values for numeric columns: Use median for numerical columns
numeric_cols_adult = adult_income_data.select_dtypes(include=['float64', 'int64']).columns
imputer_numeric_adult = SimpleImputer(strategy='median')
adult_income_data[numeric_cols_adult] = imputer_numeric_adult.fit_transform(adult_income_data[numeric_cols_adult])

# Handle missing values for categorical columns: Use most frequent value for categorical columns
categorical_cols_adult = adult_income_data.select_dtypes(include=['object']).columns
imputer_categorical_adult = SimpleImputer(strategy='most_frequent')
adult_income_data[categorical_cols_adult] = imputer_categorical_adult.fit_transform(adult_income_data[categorical_cols_adult])

# Handle categorical data: Encode categorical columns
for col in categorical_cols_adult:
    adult_income_data[col] = label_encoder.fit_transform(adult_income_data[col])

# Handle outliers: Cap values for 'age' column
lower, upper = adult_income_data['age'].quantile([0.01, 0.99])
adult_income_data['age'] = np.clip(adult_income_data['age'], lower, upper)

# Apply Min-Max scaling
adult_income_scaled_minmax = pd.DataFrame(scaler.fit_transform(adult_income_data), columns=adult_income_data.columns)

# Display the first few rows of the scaled data
print("Diabetes dataset after Min-Max scaling:")
print(diabetes_scaled_minmax.head())

print("Adult Income dataset after Min-Max scaling:")
print(adult_income_scaled_minmax.head())

Diabetes dataset after Min-Max scaling:
         ID  No_Pation  Gender       AGE      Urea        Cr     HbA1c  \
0  0.627034   0.000237     0.0  0.508475  0.143617  0.050378  0.264901   
1  0.918648   0.000452     0.5  0.101695  0.132979  0.070529  0.264901   
2  0.524406   0.000634     0.0  0.508475  0.143617  0.050378  0.264901   
3  0.849812   0.001160     0.0  0.508475  0.143617  0.050378  0.264901   
4  0.629537   0.000452     0.5  0.220339  0.271277  0.050378  0.264901   

       Chol        TG       HDL       LDL      VLDL       BMI  CLASS  
0  0.407767  0.044444  0.226804  0.114583  0.011461  0.173913    0.0  
1  0.359223  0.081481  0.092784  0.187500  0.014327  0.139130    0.0  
2  0.407767  0.044444  0.226804  0.114583  0.011461  0.173913    0.0  
3  0.407767  0.044444  0.226804  0.114583  0.011461  0.173913    0.0  
4  0.475728  0.051852  0.061856  0.177083  0.008596  0.069565    0.0  
Adult Income dataset after Min-Max scaling:
        age  workclass    fnlwgt  education  