In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from pathlib import Path
import sys

# Add src to path
# The path is relative, so it is resolved against the kernel's working directory
# (presumably the folder this notebook lives in, next to src/ — confirm before
# running from elsewhere).
sys.path.append('../src')

# Set up plotting style
# NOTE(review): 'seaborn-v0_8' looks like the style name used by newer matplotlib
# releases — confirm it exists in the pinned version.
# NOTE(review): the style call and set_palette may both affect the colour cycle,
# so keep them in this order unless verified otherwise.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# Render figures inline in the cell output.
%matplotlib inline

print("‚úÖ Libraries imported successfully")

‚úÖ Libraries imported successfully


In [3]:
# Load the diabetes CSV (raises if the file is missing; returns None if reading fails)
def load_diabetes_data(data_path='../data/raw/Diabetes Missing Data.csv'):
    """Load the diabetes dataset from a CSV file.

    Parameters
    ----------
    data_path : str or pathlib.Path, optional
        Location of the CSV file. Defaults to the raw dataset path this
        notebook has always used, so zero-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame. ``None`` is returned when the file exists but
        reading it fails (the error is printed rather than raised), so
        callers should check the result before using it.

    Raises
    ------
    FileNotFoundError
        If nothing exists at ``data_path``.
    """
    # Accept plain strings as well as Path objects.
    data_path = Path(data_path)

    if not data_path.exists():
        raise FileNotFoundError(f"‚ùå Data file not found at {data_path}")

    try:
        df = pd.read_csv(data_path)
    except Exception as e:
        # Existing contract: report the read/parse failure and signal it with None.
        print(f"‚ùå Error loading data: {e}")
        return None
    else:
        print(f"‚úÖ Data loaded successfully: {df.shape[0]} rows, {df.shape[1]} columns")
        return df

# Load the dataset from its default location; the result is None if the file
# exists but could not be read (see the except branch of load_diabetes_data).
df = load_diabetes_data()

‚úÖ Data loaded successfully: 768 rows, 9 columns


In [4]:
# Basic dataset information
def basic_data_assessment(df):
    """Print an overview of ``df`` and return a per-column summary table.

    The printed overview covers the frame's shape and its deep memory
    footprint in MB. The returned DataFrame is indexed by column name and
    holds each column's dtype, non-null count, null count and null
    percentage (null count divided by the number of rows, times 100).
    """
    print("üìä DATASET BASIC INFORMATION")
    print("=" * 50)

    # Shape and memory footprint
    print(f"Dataset shape: {df.shape}")
    total_bytes = df.memory_usage(deep=True).sum()
    print(f"Memory usage: {total_bytes / 1024**2:.2f} MB")

    print("\nüìã DATA TYPES AND MISSING VALUES:")
    print("=" * 40)

    # Count nulls once and reuse the result for both the count and the percentage.
    row_total = len(df)
    missing_per_column = df.isnull().sum()
    column_report = {
        'dtype': df.dtypes,
        'non_null_count': df.count(),
        'null_count': missing_per_column,
        'null_percentage': (missing_per_column / row_total) * 100,
    }
    info_df = pd.DataFrame(column_report)
    print(info_df)

    return info_df

# Print the overview for the loaded frame and keep the per-column table in info_df.
info_df = basic_data_assessment(df)

üìä DATASET BASIC INFORMATION
Dataset shape: (768, 9)
Memory usage: 0.05 MB

üìã DATA TYPES AND MISSING VALUES:
                            dtype  non_null_count  null_count  null_percentage
Pregnancies                 int64             768           0              0.0
Glucose                     int64             768           0              0.0
BloodPressure               int64             768           0              0.0
SkinThickness               int64             768           0              0.0
Insulin                     int64             768           0              0.0
BMI                       float64             768           0              0.0
DiabetesPedigreeFunction  float64             768           0              0.0
Age                         int64             768           0              0.0
Outcome                     int64             768           0              0.0


In [5]:
# Statistical summary
def statistical_summary(df):
    """Print and return descriptive statistics, one row per described column.

    Starts from ``df.describe().T`` and appends four columns: ``median``,
    ``variance``, ``skewness`` and ``kurtosis``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It may contain non-numeric columns.

    Returns
    -------
    pandas.DataFrame
        The transposed ``describe()`` table with the four extra columns.
        The unrounded values are returned; only the printed copy is rounded
        to three decimals.
    """
    print("üìà STATISTICAL SUMMARY")
    print("=" * 40)

    # Descriptive statistics
    stats = df.describe().T

    # numeric_only=True keeps these reducers from raising TypeError on frames
    # that also hold text columns (the pandas >= 2.0 default is False).
    # Assignment aligns on the index, so only rows already in `stats` are filled.
    stats['median'] = df.median(numeric_only=True)
    stats['variance'] = df.var(numeric_only=True)
    stats['skewness'] = df.skew(numeric_only=True)
    stats['kurtosis'] = df.kurtosis(numeric_only=True)

    print(stats.round(3))

    return stats

# Print the rounded summary for the loaded frame and keep the full-precision table in stats_df.
stats_df = statistical_summary(df)

üìà STATISTICAL SUMMARY
                          count     mean      std     min     25%      50%  \
Pregnancies               768.0    3.845    3.370   0.000   1.000    3.000   
Glucose                   768.0  120.895   31.973   0.000  99.000  117.000   
BloodPressure             768.0   69.105   19.356   0.000  62.000   72.000   
SkinThickness             768.0   20.536   15.952   0.000   0.000   23.000   
Insulin                   768.0   79.799  115.244   0.000   0.000   30.500   
BMI                       768.0   31.993    7.884   0.000  27.300   32.000   
DiabetesPedigreeFunction  768.0    0.472    0.331   0.078   0.244    0.372   
Age                       768.0   33.241   11.760  21.000  24.000   29.000   
Outcome                   768.0    0.349    0.477   0.000   0.000    0.000   

                              75%     max   median   variance  skewness  \
Pregnancies                 6.000   17.00    3.000     11.354     0.902   
Glucose                   140.250  199.00  1