In [9]:
import pandas as pd

# Location of the raw diabetes-prediction CSV, relative to the working directory.
file_path = 'data/diabetes_prediction_dataset.csv'

# Parse the CSV into a DataFrame; later cells read from `diabetes_data`.
diabetes_data = pd.read_csv(filepath_or_buffer=file_path)

In [13]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Feature groups: categorical columns are one-hot encoded, numerical columns
# are standardized with StandardScaler.
categorical_cols = ['gender', 'smoking_history']
numerical_cols = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

# Define the preprocessing steps.
# sparse_threshold=0 forces the combined output to be a dense array. With the
# default threshold the result type depends on how sparse the data happens to
# be, and a sparse matrix cannot be wrapped in the labelled DataFrame below.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ],
    sparse_threshold=0)

# Fit the transformers on the full dataset and apply them in one pass.
diabetes_data_transformed = preprocessor.fit_transform(diabetes_data)

# The transformed data is a plain array, so rebuild the column labels.
# Order matters: it must follow the transformer order above ('num' then 'cat').
columns_transformed = (numerical_cols +
                       list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)))

# Reuse the source index so the target column assigned below lines up row for
# row even if `diabetes_data` does not carry a default 0..n-1 index.
diabetes_data_preprocessed = pd.DataFrame(
    diabetes_data_transformed,
    columns=columns_transformed,
    index=diabetes_data.index)
diabetes_data_preprocessed['diabetes'] = diabetes_data['diabetes']

# Saving Preprocessed data
diabetes_data_preprocessed.to_csv('data/preprocessed_dataset.csv', index=False)

# Display the first few rows of the preprocessed data.
# This has to be the last expression of the cell for the notebook to render it.
diabetes_data_preprocessed.head()




In [14]:
# Collect a quick overview of the preprocessed frame: its dimensions, column
# labels, a five-row preview and descriptive statistics.
diabetes_data_preprocessed_info = {
    "Number of Rows": len(diabetes_data_preprocessed),
    "Number of Columns": len(diabetes_data_preprocessed.columns),
    "Column Names": list(diabetes_data_preprocessed.columns),
    "First 5 Rows": diabetes_data_preprocessed.head(5),
    "Data Summary": diabetes_data_preprocessed.describe(),
}

# Last expression of the cell, so the notebook echoes the dictionary.
diabetes_data_preprocessed_info

{'Number of Rows': 100000,
 'Number of Columns': 14,
 'Column Names': ['age',
  'bmi',
  'HbA1c_level',
  'blood_glucose_level',
  'gender_Female',
  'gender_Male',
  'gender_Other',
  'smoking_history_No Info',
  'smoking_history_current',
  'smoking_history_ever',
  'smoking_history_former',
  'smoking_history_never',
  'smoking_history_not current',
  'diabetes'],
 'First 5 Rows':         age       bmi  HbA1c_level  blood_glucose_level  gender_Female  \
 0  1.692704 -0.321056     1.001706             0.047704            1.0   
 1  0.538006 -0.000116     1.001706            -1.426210            1.0   
 2 -0.616691 -0.000116     0.161108             0.489878            0.0   
 3 -0.261399 -0.583232    -0.492690             0.416183            1.0   
 4  1.515058 -1.081970    -0.679490             0.416183            0.0   
 
    gender_Male  gender_Other  smoking_history_No Info  \
 0          0.0           0.0                      0.0   
 1          0.0           0.0                 