In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

def preprocess_diabetes(file_path="/content/Dataset of Diabetes .csv"):
    # Load dataset
    df = pd.read_csv(file_path)

    # Show missing values before cleaning
    print("Missing Values Before Cleaning:\n", df.isnull().sum())

    # Handle categorical columns
    df_processed = df.copy()
    df_processed["Gender"] = df_processed["Gender"].map({"F": 0, "M": 1})
    df_processed["CLASS"] = df_processed["CLASS"].map({"N": 0, "Y": 1})

    # Define numerical columns
    num_cols = ["AGE", "Urea", "Cr", "HbA1c", "Chol", "TG", "HDL", "LDL", "VLDL", "BMI"]

    # Show unique values before encoding
    print("Unique values before encoding:", df["Gender"].unique(), df["CLASS"].unique())
    print("Unique values after encoding:", df_processed["Gender"].unique(), df_processed["CLASS"].unique())

    # Min-Max Scaling
    minmax_scaler = MinMaxScaler()
    df_minmax = df_processed.copy()
    df_minmax[num_cols] = minmax_scaler.fit_transform(df_processed[num_cols])

    # Standardization
    standard_scaler = StandardScaler()
    df_standard = df_processed.copy()
    df_standard[num_cols] = standard_scaler.fit_transform(df_processed[num_cols])

    return df_minmax, df_standard

def preprocess_adult_income(file_path="/content/adult.csv"):
    # Load dataset
    df = pd.read_csv(file_path)

    # Show missing values before cleaning
    print("Missing Values Before Cleaning:\n", df.isnull().sum())

    # Handle missing values
    df_processed = df.copy()
    df_processed.replace('?', np.nan, inplace=True)
    for col in ["workclass", "occupation", "native-country"]:
        df_processed[col].fillna(df_processed[col].mode()[0], inplace=True)

    # Show missing values after cleaning
    print("Missing Values After Cleaning:\n", df_processed.isnull().sum())

    # Encode categorical columns
    df_processed["income"] = df_processed["income"].map({"<=50K": 0, ">50K": 1})
    categorical_columns = ["workclass", "education", "marital-status", "occupation",
                           "relationship", "race", "gender", "native-country"]
    df_encoded = pd.get_dummies(df_processed, columns=categorical_columns, drop_first=True)

    # Define numerical columns
    num_cols = ["age", "fnlwgt", "educational-num", "capital-gain", "capital-loss", "hours-per-week"]

    # Min-Max Scaling
    minmax_scaler = MinMaxScaler()
    df_minmax = df_encoded.copy()
    df_minmax[num_cols] = minmax_scaler.fit_transform(df_encoded[num_cols])

    # Standardization
    standard_scaler = StandardScaler()
    df_standard = df_encoded.copy()
    df_standard[num_cols] = standard_scaler.fit_transform(df_encoded[num_cols])

    return df_minmax, df_standard

def check_scaling_effect(df, num_cols):
    print("\nBefore Scaling:\n", df[num_cols].describe())

    # Apply Min-Max Scaling
    minmax_scaler = MinMaxScaler()
    df_minmax = df.copy()
    df_minmax[num_cols] = minmax_scaler.fit_transform(df[num_cols])

    # Apply Standardization
    standard_scaler = StandardScaler()
    df_standard = df.copy()
    df_standard[num_cols] = standard_scaler.fit_transform(df[num_cols])

    print("\nAfter Min-Max Scaling:\n", df_minmax[num_cols].describe())
    print("\nAfter Standardization:\n", df_standard[num_cols].describe())

    return df_minmax, df_standard
# Run preprocessing for Diabetes dataset
print("Processing Diabetes Dataset...\n")
diabetes_minmax, diabetes_standard = preprocess_diabetes()

# Run preprocessing for Adult Income dataset
print("\nProcessing Adult Income Dataset...\n")
adult_income_minmax, adult_income_standard = preprocess_adult_income()

# Example Usage
# diabetes_minmax, diabetes_standard = preprocess_diabetes()
# adult_income_minmax, adult_income_standard = preprocess_adult_income()

Processing Diabetes Dataset...

Missing Values Before Cleaning:
 ID           0
No_Pation    0
Gender       0
AGE          0
Urea         0
Cr           0
HbA1c        0
Chol         0
TG           0
HDL          0
LDL          0
VLDL         0
BMI          0
CLASS        0
dtype: int64
Unique values before encoding: ['F' 'M' 'f'] ['N' 'N ' 'P' 'Y' 'Y ']
Unique values after encoding: [ 0.  1. nan] [ 0. nan  1.]

Processing Adult Income Dataset...

Missing Values Before Cleaning:
 age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64
Missing Values After Cleaning:
 age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relation

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed[col].fillna(df_processed[col].mode()[0], inplace=True)


In [None]:
# For Diabetes dataset
df_diabetes = pd.read_csv("/content/Dataset of Diabetes .csv")
print("Missing Values in Diabetes Dataset:\n", df_diabetes.isnull().sum())

# For Adult Income dataset
df_adult = pd.read_csv("/content/adult.csv")
df_adult.replace('?', np.nan, inplace=True)  # Replace '?' with NaN
print("\nMissing Values in Adult Income Dataset Before Handling:\n", df_adult.isnull().sum())

# Handling missing values in Adult Income dataset
for col in ["workclass", "occupation", "native-country"]:
    df_adult[col].fillna(df_adult[col].mode()[0], inplace=True)

print("\nMissing Values in Adult Income Dataset After Handling:\n", df_adult.isnull().sum())


Missing Values in Diabetes Dataset:
 ID           0
No_Pation    0
Gender       0
AGE          0
Urea         0
Cr           0
HbA1c        0
Chol         0
TG           0
HDL          0
LDL          0
VLDL         0
BMI          0
CLASS        0
dtype: int64

Missing Values in Adult Income Dataset Before Handling:
 age                   0
workclass          2799
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64

Missing Values in Adult Income Dataset After Handling:
 age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_adult[col].fillna(df_adult[col].mode()[0], inplace=True)


In [None]:
# For Diabetes dataset
print("Unique values in categorical columns of Diabetes Dataset:")
print(df_diabetes["Gender"].unique(), df_diabetes["CLASS"].unique())

df_diabetes["Gender"] = df_diabetes["Gender"].map({"F": 0, "M": 1})
df_diabetes["CLASS"] = df_diabetes["CLASS"].map({"N": 0, "Y": 1})

print("\nAfter Encoding:")
print(df_diabetes["Gender"].unique(), df_diabetes["CLASS"].unique())

# For Adult Income dataset
categorical_columns = ["workclass", "education", "marital-status", "occupation",
                        "relationship", "race", "gender", "native-country"]
print("\nCategorical columns in Adult Income Dataset:", categorical_columns)

df_adult_encoded = pd.get_dummies(df_adult, columns=categorical_columns, drop_first=True)
print("\nAdult Income dataset after encoding:", df_adult_encoded.head())


Unique values in categorical columns of Diabetes Dataset:
['F' 'M' 'f'] ['N' 'N ' 'P' 'Y' 'Y ']

After Encoding:
[ 0.  1. nan] [ 0. nan  1.]

Categorical columns in Adult Income Dataset: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']

Adult Income dataset after encoding:    age  fnlwgt  educational-num  capital-gain  capital-loss  hours-per-week  \
0   25  226802                7             0             0              40   
1   38   89814                9             0             0              50   
2   28  336951               12             0             0              40   
3   44  160323               10          7688             0              40   
4   18  103497               10             0             0              30   

  income  workclass_Local-gov  workclass_Never-worked  workclass_Private  ...  \
0  <=50K                False                   False               True  ...   
1  <=50K                Fal

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Selecting numerical columns
num_cols = ["age", "fnlwgt", "educational-num", "capital-gain", "capital-loss", "hours-per-week"]

# Before Scaling
print("Before Scaling:\n", df_adult[num_cols].describe())

# Min-Max Scaling
minmax_scaler = MinMaxScaler()
df_adult_minmax = df_adult.copy()
df_adult_minmax[num_cols] = minmax_scaler.fit_transform(df_adult[num_cols])
print("\nAfter Min-Max Scaling:\n", df_adult_minmax[num_cols].describe())

# Standardization
standard_scaler = StandardScaler()
df_adult_standard = df_adult.copy()
df_adult_standard[num_cols] = standard_scaler.fit_transform(df_adult[num_cols])
print("\nAfter Standardization:\n", df_adult_standard[num_cols].describe())


Before Scaling:
                 age        fnlwgt  educational-num  capital-gain  \
count  48842.000000  4.884200e+04     48842.000000  48842.000000   
mean      38.643585  1.896641e+05        10.078089   1079.067626   
std       13.710510  1.056040e+05         2.570973   7452.019058   
min       17.000000  1.228500e+04         1.000000      0.000000   
25%       28.000000  1.175505e+05         9.000000      0.000000   
50%       37.000000  1.781445e+05        10.000000      0.000000   
75%       48.000000  2.376420e+05        12.000000      0.000000   
max       90.000000  1.490400e+06        16.000000  99999.000000   

       capital-loss  hours-per-week  
count  48842.000000    48842.000000  
mean      87.502314       40.422382  
std      403.004552       12.391444  
min        0.000000        1.000000  
25%        0.000000       40.000000  
50%        0.000000       40.000000  
75%        0.000000       45.000000  
max     4356.000000       99.000000  

After Min-Max Scaling:
    