In [None]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's white-grid theme to every figure drawn later in the notebook.
# `set_theme` is the preferred interface; `sns.set` is only an alias for it
# and may be removed in a future seaborn release.
sns.set_theme(style="whitegrid")


In [None]:
# Read the diabetes CSV; the path is relative to the notebook's working directory
data_path = "data/diabetes_generated.csv"
data = pd.read_csv(filepath_or_buffer=data_path)

# Preview the top of the frame (5 rows, which is the `head` default)
data.head(n=5)


In [None]:
# Number of null entries in each column
null_counts = data.isna().sum()
print("Missing values per column:", null_counts, sep="\n")

# Summary statistics as produced by `DataFrame.describe()`
summary_stats = data.describe()
print("\nDataset summary:", summary_stats, sep="\n")


In [None]:
# Plot the distribution of the target variable.
#
# Passing `palette` without `hue` is deprecated in seaborn >= 0.13 (FutureWarning,
# slated for removal), so the x variable is also mapped to `hue`. The hue vector is
# cast to a categorical so the palette is sampled as discrete colours (as it was
# when only `palette` was given) rather than as a continuous numeric colour scale.
outcome_classes = data['Outcome'].astype('category')

fig, ax = plt.subplots()
sns.countplot(
    x='Outcome',
    hue=outcome_classes,
    data=data,
    palette='viridis',
    dodge=False,  # keep one full-width bar per class instead of side-by-side hue slots
    ax=ax,
)

# The hue legend would only repeat the x tick labels; drop it if one was drawn.
# (Done via the Axes rather than `legend=False`, which older seaborn versions reject.)
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title("Distribution of Diabetes Outcome")
ax.set_xlabel("Outcome (0 = No, 1 = Yes)")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Grid of pairwise feature plots, coloured by `Outcome`, with KDE curves on the diagonal
pairplot_options = {
    "hue": "Outcome",
    "diag_kind": "kde",
    "palette": "viridis",
}
sns.pairplot(data, **pairplot_options)

# y is set just above 1 so the figure title sits above the top row of panels
plt.suptitle("Feature Relationships by Outcome", y=1.02)
plt.show()


In [None]:
# Compute the correlation matrix over numeric (and boolean) columns only.
# pandas >= 2.0 no longer silently drops non-numeric columns in `DataFrame.corr`
# and raises instead; selecting the columns explicitly behaves the same on
# older and newer pandas versions.
numeric_data = data.select_dtypes(include=["number", "bool"])
correlation_matrix = numeric_data.corr()

# Plot the heatmap.
# Correlations lie in [-1, 1]; pinning the colour scale to that range and centring
# it on 0 makes the diverging colormap encode sign (cool = negative, warm = positive)
# instead of stretching to whatever min/max the data happens to have.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    center=0,
    ax=ax,
)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# Draw one histogram (with a KDE overlay) per listed feature, each in its own figure
numerical_features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'Age',
]

for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data[feature], kde=True, color="blue", ax=ax)
    # Title and axis labels are set together; the y label replaces seaborn's default
    ax.set(
        title=f"Distribution of {feature}",
        xlabel=feature,
        ylabel="Frequency",
    )
    plt.show()


In [None]:
# Insights and Observations

1. **Missing Values:**
   - Check for missing or null values and handle them during preprocessing.

2. **Imbalanced Dataset:**
   - Check whether the `Outcome` classes are imbalanced (see the count plot of `Outcome`). If they are, consider resampling techniques such as oversampling the minority class or undersampling the majority class.

3. **Correlations:**
   - Features such as `Glucose`, `BMI`, and `Age` may be strongly correlated with `Outcome`; confirm this against the correlation heatmap.

4. **Feature Distributions:**
   - Several features, such as `Insulin`, appear skewed and may need a transformation (for example, a log transform) or scaling before modeling.
