In [8]:
import pandas as pd
df = pd.read_csv('housing.csv')

print("Information of all columns:")
print(df.info())

print("\nStatistical information of all numerical columns:")
print(df.describe())

print("\nCount of unique labels for 'Ocean Proximity' column:")
print(df['ocean_proximity'].value_counts())

print("\nColumns with missing values:")
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])


Information of all columns:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None

Statistical information of all numerical columns:
          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.569704     3

In [10]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("wenruliu/adult-income-dataset")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/wenruliu/adult-income-dataset/versions/2


In [13]:
import os

# List all files in the directory to find the actual CSV file
directory_path = '/root/.cache/kagglehub/datasets/wenruliu/adult-income-dataset/versions/2'
files = os.listdir(directory_path)
print(files)


['adult.csv']


In [17]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load datasets
diab = pd.read_csv('diadata.csv')
adult = pd.read_csv('/root/.cache/kagglehub/datasets/wenruliu/adult-income-dataset/versions/2/adult.csv')

# ===========================================================
# Diabetes Data Preprocessing

# 1. Handle missing values for numeric columns: Use median for numerical columns
ncol = diab.select_dtypes(include=['float64', 'int64']).columns
imputer_numeric = SimpleImputer(strategy='median')
diab[ncol] = imputer_numeric.fit_transform(diab[ncol])

# 2. Handle missing values for categorical columns: Use most frequent value for categorical columns
catcol = diab.select_dtypes(include=['object']).columns
imputer_categorical = SimpleImputer(strategy='most_frequent')
diab[catcol] = imputer_categorical.fit_transform(diab[catcol])

# 3. Handle categorical data: Encode 'Gender' and 'CLASS' columns
label_encoder = LabelEncoder()
diab['Gender'] = label_encoder.fit_transform(diab['Gender'])
diab['CLASS'] = label_encoder.fit_transform(diab['CLASS'])

# 4. Handle outliers: Cap values for 'Urea' column
lower, upper = diab['Urea'].quantile([0.01, 0.99])
diab['Urea'] = np.clip(diab['Urea'], lower, upper)

# 5. Apply Min-Max scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
diabetes_scaled_minmax = pd.DataFrame(scaler.fit_transform(diab), columns=diab.columns)

# ===========================================================
# Adult Income Data Preprocessing

# 1. Handle missing values for numeric columns: Use median for numerical columns
ncol_adult = adult.select_dtypes(include=['float64', 'int64']).columns
imputer_numeric_adult = SimpleImputer(strategy='median')
adult[ncol_adult] = imputer_numeric_adult.fit_transform(adult[ncol_adult])

# 2. Handle missing values for categorical columns: Use most frequent value for categorical columns
catcol_adult = adult.select_dtypes(include=['object']).columns
imputer_categorical_adult = SimpleImputer(strategy='most_frequent')
adult[catcol_adult] = imputer_categorical_adult.fit_transform(adult[catcol_adult])

# 3. Handle categorical data: Encode categorical columns
for col in catcol_adult:
    adult[col] = label_encoder.fit_transform(adult[col])

# 4. Handle outliers: Cap values for 'age' column
lower, upper = adult['age'].quantile([0.01, 0.99])
adult['age'] = np.clip(adult['age'], lower, upper)

# 5. Apply Min-Max scaling
adult_income_scaled_minmax = pd.DataFrame(scaler.fit_transform(adult), columns=adult.columns)

# Display the first few rows of the scaled data
print("Diabetes dataset after Min-Max scaling:")
print(diabetes_scaled_minmax.head())

print("Adult Income dataset after Min-Max scaling:")
print(adult_income_scaled_minmax.head())

Diabetes dataset after Min-Max scaling:
         ID  No_Pation  Gender       AGE      Urea        Cr     HbA1c  \
0  0.627034   0.000237     0.0  0.508475  0.143617  0.050378  0.264901   
1  0.918648   0.000452     0.5  0.101695  0.132979  0.070529  0.264901   
2  0.524406   0.000634     0.0  0.508475  0.143617  0.050378  0.264901   
3  0.849812   0.001160     0.0  0.508475  0.143617  0.050378  0.264901   
4  0.629537   0.000452     0.5  0.220339  0.271277  0.050378  0.264901   

       Chol        TG       HDL       LDL      VLDL       BMI  CLASS  
0  0.407767  0.044444  0.226804  0.114583  0.011461  0.173913    0.0  
1  0.359223  0.081481  0.092784  0.187500  0.014327  0.139130    0.0  
2  0.407767  0.044444  0.226804  0.114583  0.011461  0.173913    0.0  
3  0.407767  0.044444  0.226804  0.114583  0.011461  0.173913    0.0  
4  0.475728  0.051852  0.061856  0.177083  0.008596  0.069565    0.0  
Adult Income dataset after Min-Max scaling:
        age  workclass    fnlwgt  education  

Min-Max Scaling scales the data to a fixed range, typically [0, 1], by subtracting the minimum and dividing by the range. It is sensitive to outliers since they can affect the min and max values. It’s ideal for algorithms that require data in a specific range, such as neural networks and k-nearest neighbors (KNN).

Standardization, on the other hand, transforms the data by subtracting the mean and dividing by the standard deviation, resulting in a distribution with a mean of 0 and a standard deviation of 1. It’s less sensitive to outliers compared to Min-Max scaling and is preferred for algorithms that assume normally distributed data, like linear regression, logistic regression, and PCA. Standardization works better when the data has outliers or is not bounded.