 data visualization

## Data Manipulation

### Import Data

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

In [30]:
df = pd.read_csv('diabetes_prediction_dataset.csv')
print(f"Shape: {df.shape}")
df.head()

Shape: (100000, 9)


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


### Basic Stats

In [31]:
df.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [33]:
(
    df
    .isna()
    .sum(axis=1)
 )

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Length: 100000, dtype: int64

In [34]:
print(f"Total missing values: {df.isnull().sum().sum()}")

Total missing values: 0


In [35]:
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    print(f"\n{col}:")
    print(df[col].value_counts())
    print(f"Unique values: {df[col].nunique()}")


gender:
gender
Female    58552
Male      41430
Other        18
Name: count, dtype: int64
Unique values: 3

smoking_history:
smoking_history
No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: count, dtype: int64
Unique values: 6


In [36]:
df['gender'].unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [37]:
df['smoking_history'].unique()

array(['never', 'No Info', 'current', 'former', 'ever', 'not current'],
      dtype=object)

In [38]:
# Check for missing values
print("=== MISSING VALUES CHECK ===")
missing_count = df.isnull().sum()
missing_percent = (df.isnull().sum() / len(df)) * 100

missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': missing_count,
    'Missing_Percent': missing_percent
}).sort_values('Missing_Count', ascending=False)

print(missing_data)
print(f"\nTotal missing values in dataset: {df.isnull().sum().sum()}")

=== MISSING VALUES CHECK ===
                                  Column  Missing_Count  Missing_Percent
gender                            gender              0              0.0
age                                  age              0              0.0
hypertension                hypertension              0              0.0
heart_disease              heart_disease              0              0.0
smoking_history          smoking_history              0              0.0
bmi                                  bmi              0              0.0
HbA1c_level                  HbA1c_level              0              0.0
blood_glucose_level  blood_glucose_level              0              0.0
diabetes                        diabetes              0              0.0

Total missing values in dataset: 0


In [39]:
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    print(f"\n{col}:")
    print(df[col].value_counts())
    print(f"Unique values: {df[col].nunique()}")


gender:
gender
Female    58552
Male      41430
Other        18
Name: count, dtype: int64
Unique values: 3

smoking_history:
smoking_history
No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: count, dtype: int64
Unique values: 6


- no missing values
- 2 categorial columns

### Scaling

In [40]:
# First, let's identify numerical columns that need scaling
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print("Numerical columns:", numerical_cols)

# Show current ranges
print("\n=== CURRENT RANGES ===")
for col in numerical_cols:
    print(f"{col}: {df[col].min():.2f} to {df[col].max():.2f} (range: {df[col].max() - df[col].min():.2f})")

Numerical columns: ['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes']

=== CURRENT RANGES ===
age: 0.08 to 80.00 (range: 79.92)
hypertension: 0.00 to 1.00 (range: 1.00)
heart_disease: 0.00 to 1.00 (range: 1.00)
bmi: 10.01 to 95.69 (range: 85.68)
HbA1c_level: 3.50 to 9.00 (range: 5.50)
blood_glucose_level: 80.00 to 300.00 (range: 220.00)
diabetes: 0.00 to 1.00 (range: 1.00)


In [41]:
from sklearn.preprocessing import StandardScaler

# Create a copy for scaling (exclude target variable)
df_for_scaling = df.copy()
target_col = df.columns[-1]  # Assuming last column is target
features_to_scale = [col for col in numerical_cols if col != target_col]

print(f"Features to scale: {features_to_scale}")
print(f"Target column (not scaled): {target_col}")

# Apply StandardScaler
scaler_standard = StandardScaler()
df_standard = df_for_scaling.copy()
df_standard[features_to_scale] = scaler_standard.fit_transform(df_for_scaling[features_to_scale])

Features to scale: ['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level']
Target column (not scaled): diabetes


In [42]:
df_scaled = df_standard.copy()
print("\nFinal scaled ranges:")
for col in features_to_scale:
    print(f"{col}: {df_scaled[col].min():.2f} to {df_scaled[col].max():.2f} (mean: {df_scaled[col].mean():.2f})")


Final scaled ranges:
age: -1.86 to 1.69 (mean: 0.00)
hypertension: -0.28 to 3.52 (mean: 0.00)
heart_disease: -0.20 to 4.94 (mean: 0.00)
bmi: -2.61 to 10.30 (mean: -0.00)
HbA1c_level: -1.89 to 3.24 (mean: -0.00)
blood_glucose_level: -1.43 to 3.98 (mean: -0.00)


### One Hot Encoding

In [44]:
categorical_cols

Index(['gender', 'smoking_history'], dtype='object')

In [None]:
df_encoded = df_scaled.copy()

print("=== APPLYING ONE HOT ENCODING ===")

# Using pandas get_dummies
df_encoded = pd.get_dummies(df_scaled, columns=categorical_cols, drop_first=True)

print(f"Original shape: {df_scaled.shape}")
print(f"After encoding: {df_encoded.shape}")
print(f"New columns added: {df_encoded.shape[1] - df_scaled.shape[1]}")

# Show new column names
new_cols = [col for col in df_encoded.columns if col not in df_scaled.columns]
print(f"New encoded columns: {new_cols}")
    

=== APPLYING ONE HOT ENCODING ===
Original shape: (100000, 9)
After encoding: (100000, 14)
New columns added: 5
New encoded columns: ['gender_Male', 'gender_Other', 'smoking_history_current', 'smoking_history_ever', 'smoking_history_former', 'smoking_history_never', 'smoking_history_not current']


In [48]:
df_final = df_encoded

In [49]:
print("=== FINAL DATASET READY ===")
print(f"Shape: {df_final.shape}")
print(f"All columns: {list(df_final.columns)}")
print(f"Data types: {df_final.dtypes.value_counts()}")

=== FINAL DATASET READY ===
Shape: (100000, 14)
All columns: ['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes', 'gender_Male', 'gender_Other', 'smoking_history_current', 'smoking_history_ever', 'smoking_history_former', 'smoking_history_never', 'smoking_history_not current']
Data types: bool       7
float64    6
int64      1
Name: count, dtype: int64


correlation