In [2]:
import pandas as pd

# Load the diabetes prediction dataset from CSV into a DataFrame.
# The path is relative, so the file must be present in the directory the
# notebook kernel is running from.
file_path = 'diabetes_prediction_dataset.csv'
diabetes_data = pd.read_csv(file_path)

{'Number of Rows': 100000,
 'Number of Columns': 9,
 'Column Names': ['gender',
  'age',
  'hypertension',
  'heart_disease',
  'smoking_history',
  'bmi',
  'HbA1c_level',
  'blood_glucose_level',
  'diabetes'],
 'First 5 Rows':    gender   age  hypertension  heart_disease smoking_history    bmi  \
 0  Female  80.0             0              1           never  25.19   
 1  Female  54.0             0              0         No Info  27.32   
 2    Male  28.0             0              0           never  27.32   
 3  Female  36.0             0              0         current  23.45   
 4    Male  76.0             1              1         current  20.14   
 
    HbA1c_level  blood_glucose_level  diabetes  
 0          6.6                  140         0  
 1          6.6                   80         0  
 2          5.7                  158         0  
 3          5.0                  155         0  
 4          4.8                  155         0  ,
 'Data Summary':                  age  hyp

In [3]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Feature groups handled by the preprocessor: categorical columns are one-hot
# encoded, continuous columns are standardised (zero mean, unit variance).
categorical_cols = ['gender', 'smoking_history']
numerical_cols = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

# Define the preprocessing steps.
#
# NOTE(review): ColumnTransformer defaults to remainder='drop', so every column
# not listed above -- 'hypertension', 'heart_disease' and the 'diabetes'
# target -- is absent from the transformed output. Confirm this is intended;
# if the binary features are needed downstream, add them via
# remainder='passthrough' or an explicit passthrough transformer.
#
# sparse_threshold=0 forces a dense ndarray regardless of how sparse the
# one-hot block is. With the default threshold (0.3) the transformer may return
# a scipy sparse matrix, which the DataFrame construction below cannot consume.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ],
    sparse_threshold=0)

# Fit the scaler/encoder on the dataset and transform it in one step.
# NOTE(review): this fits on the full dataset; if a train/test split happens
# later, fit on the training portion only to avoid leaking test statistics.
diabetes_data_transformed = preprocessor.fit_transform(diabetes_data)

# The transformer returns a bare array, so rebuild the column labels in the
# same order the transformers were declared: numeric columns first, then the
# one-hot columns generated by the fitted encoder.
columns_transformed = (numerical_cols +
                       list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)))

# Reuse the source index so rows stay aligned with `diabetes_data` even if it
# does not carry a default 0..n-1 index.
diabetes_data_preprocessed = pd.DataFrame(
    diabetes_data_transformed,
    columns=columns_transformed,
    index=diabetes_data.index,
)

# Display the first few rows of the preprocessed data
diabetes_data_preprocessed.head()

Unnamed: 0,age,bmi,HbA1c_level,blood_glucose_level,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,1.692704,-0.321056,1.001706,0.047704,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.538006,-0.000116,1.001706,-1.42621,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-0.616691,-0.000116,0.161108,0.489878,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-0.261399,-0.583232,-0.49269,0.416183,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.515058,-1.08197,-0.67949,0.416183,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [4]:
# Gather a quick structural overview of the preprocessed frame: its
# dimensions, column labels, a five-row preview and descriptive statistics.
# Keys are inserted in the order they should appear when the dict is shown.
diabetes_data_preprocessed_info = dict(
    zip(
        ("Number of Rows", "Number of Columns"),
        diabetes_data_preprocessed.shape,
    )
)
diabetes_data_preprocessed_info["Column Names"] = list(diabetes_data_preprocessed.columns)
diabetes_data_preprocessed_info["First 5 Rows"] = diabetes_data_preprocessed.head(5)
diabetes_data_preprocessed_info["Data Summary"] = diabetes_data_preprocessed.describe()

# Last expression of the cell, so the notebook renders the overview.
diabetes_data_preprocessed_info

{'Number of Rows': 100000,
 'Number of Columns': 13,
 'Column Names': ['age',
  'bmi',
  'HbA1c_level',
  'blood_glucose_level',
  'gender_Female',
  'gender_Male',
  'gender_Other',
  'smoking_history_No Info',
  'smoking_history_current',
  'smoking_history_ever',
  'smoking_history_former',
  'smoking_history_never',
  'smoking_history_not current'],
 'First 5 Rows':         age       bmi  HbA1c_level  blood_glucose_level  gender_Female  \
 0  1.692704 -0.321056     1.001706             0.047704            1.0   
 1  0.538006 -0.000116     1.001706            -1.426210            1.0   
 2 -0.616691 -0.000116     0.161108             0.489878            0.0   
 3 -0.261399 -0.583232    -0.492690             0.416183            1.0   
 4  1.515058 -1.081970    -0.679490             0.416183            0.0   
 
    gender_Male  gender_Other  smoking_history_No Info  \
 0          0.0           0.0                      0.0   
 1          0.0           0.0                      1.0   
 2