In [15]:
# Import the essential libraries
import pandas as pd
import numpy as np

# Display settings for better output
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print("Libraries imported successfully!")
print(f"pandas version: {pd.__version__}")
print(f"numpy version: {np.__version__}")

Libraries imported successfully!
pandas version: 2.3.3
numpy version: 2.4.1


In [16]:
df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [17]:
# Load the Diabetes dataset
df = pd.read_csv('diabetes.csv')

print("Dataset loaded successfully!")

Dataset loaded successfully!


In [18]:
# Point 1: Shape
print("Dataset Shape:")
print(df.shape)

print(f"\nüìä We have {df.shape[0]:,} observations and {df.shape[1]} features")
print(f"üìä Total data points: {df.shape[0] * df.shape[1]:,}")

Dataset Shape:
(768, 9)

üìä We have 768 observations and 9 features
üìä Total data points: 6,912


In [19]:
# Point 2: Column Names
print("Column Names:")
print(df.columns.tolist())

print(f"\nüìã Total columns: {len(df.columns)}")

Column Names:
['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

üìã Total columns: 9


In [20]:
# Point 3: Data Types
print("Data Types:")
print(df.dtypes)

print("\n" + "="*50)
print("Data Type Summary:")
print(df.dtypes.value_counts())

Data Types:
Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

Data Type Summary:
int64      7
float64    2
Name: count, dtype: int64


In [21]:
# Point 4: First Look
print("First 5 Rows:")
df.head()

First 5 Rows:


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [22]:
# Point 5: Last Look
print("Last 5 Rows:")
df.tail()

Last 5 Rows:


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [23]:
# Point 6: Memory Usage
print("Memory Usage by Column:")
print(df.memory_usage(deep=True))

# Total memory in MB
total_memory_mb = df.memory_usage(deep=True).sum() / 1e6
print(f"\nüíæ Total Memory Usage: {total_memory_mb:.2f} MB")

Memory Usage by Column:
Index                        132
Pregnancies                 6144
Glucose                     6144
BloodPressure               6144
SkinThickness               6144
Insulin                     6144
BMI                         6144
DiabetesPedigreeFunction    6144
Age                         6144
Outcome                     6144
dtype: int64

üíæ Total Memory Usage: 0.06 MB


In [24]:
# Point 7: Missing Values
print("Missing Values by Column:")
missing = df.isnull().sum()
print(missing)

print("\n" + "="*50)
total_missing = missing.sum()
print(f"‚ùì Total Missing Values: {total_missing}")

if total_missing == 0:
    print("‚úÖ Great! No missing values - complete dataset!")
else:
    print(f"‚ö†Ô∏è {total_missing} missing values need attention")

Missing Values by Column:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

‚ùì Total Missing Values: 0
‚úÖ Great! No missing values - complete dataset!


In [25]:
# Point 8: Duplicates
duplicate_count = df.duplicated().sum()
duplicate_pct = (duplicate_count / len(df)) * 100

print(f"üìë Duplicate Rows: {duplicate_count:,}")
print(f"üìë Percentage: {duplicate_pct:.2f}%")

if duplicate_count > 0:
    print(f"\n‚ö†Ô∏è Warning: {duplicate_pct:.2f}% of rows are duplicates!")
    print("   This needs investigation in Week 4 (Data Cleaning)")

üìë Duplicate Rows: 0
üìë Percentage: 0.00%


In [26]:
# Let's look at some duplicate examples
print("Example of duplicate rows:")
duplicates = df[df.duplicated(keep=False)]
duplicates.head(10)

Example of duplicate rows:


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome


In [27]:
# Point 9: Descriptive Statistics
print("Descriptive Statistics:")
df.describe()

Descriptive Statistics:


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [28]:
# Transpose for easier reading
print("Descriptive Statistics (Transposed for readability):")
df.describe().T

Descriptive Statistics (Transposed for readability):


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [29]:
# Point 10: Unique Values
print("Unique Values per Column:")
unique_counts = df.nunique().sort_values()
print(unique_counts)

Unique Values per Column:
Outcome                       2
Pregnancies                  17
BloodPressure                47
SkinThickness                51
Age                          52
Glucose                     136
Insulin                     186
BMI                         248
DiabetesPedigreeFunction    517
dtype: int64


In [30]:
# Categorize columns by unique count
print("\n" + "="*50)
print("Feature Classification by Unique Values:")
print("="*50)

binary = unique_counts[unique_counts == 2].index.tolist()
low_cardinality = unique_counts[(unique_counts > 2) & (unique_counts <= 5)].index.tolist()
high_cardinality = unique_counts[unique_counts > 5].index.tolist()

print(f"\nüü¢ BINARY (2 values): {binary}")
print(f"\nüîµ LOW CARDINALITY (3-5 values): {low_cardinality}")
print(f"\nüü£ HIGH CARDINALITY (>5 values): {high_cardinality}")


Feature Classification by Unique Values:

üü¢ BINARY (2 values): ['Outcome']

üîµ LOW CARDINALITY (3-5 values): []

üü£ HIGH CARDINALITY (>5 values): ['Pregnancies', 'BloodPressure', 'SkinThickness', 'Age', 'Glucose', 'Insulin', 'BMI', 'DiabetesPedigreeFunction']


In [31]:
# Bonus: df.info() provides a quick summary
print("DataFrame Info:")
df.info()

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [32]:
def ten_point_inspection(df, name="Dataset"):
    """
    Perform a comprehensive 10-point inspection on a DataFrame.
    
    Parameters:
    -----------
    df : pandas DataFrame
        The dataset to inspect
    name : str
        Name of the dataset for display purposes
    """
    print("="*60)
    print(f"üìä 10-POINT INSPECTION: {name}")
    print("="*60)
    
    # 1. Shape
    print(f"\n1Ô∏è‚É£  SHAPE: {df.shape[0]:,} rows √ó {df.shape[1]} columns")
    
    # 2. Columns
    print(f"\n2Ô∏è‚É£  COLUMNS: {list(df.columns)}")
    
    # 3. Data Types
    print(f"\n3Ô∏è‚É£  DATA TYPES:")
    print(df.dtypes.value_counts().to_string())
    
    # 4 & 5. First and Last rows (just noting)
    print(f"\n4Ô∏è‚É£  FIRST ROW: {dict(df.iloc[0])}")
    print(f"\n5Ô∏è‚É£  LAST ROW: {dict(df.iloc[-1])}")
    
    # 6. Memory
    memory_mb = df.memory_usage(deep=True).sum() / 1e6
    print(f"\n6Ô∏è‚É£  MEMORY: {memory_mb:.2f} MB")
    
    # 7. Missing Values
    missing = df.isnull().sum().sum()
    print(f"\n7Ô∏è‚É£  MISSING VALUES: {missing:,}")
    
    # 8. Duplicates
    dupes = df.duplicated().sum()
    dupe_pct = (dupes / len(df)) * 100
    print(f"\n8Ô∏è‚É£  DUPLICATES: {dupes:,} ({dupe_pct:.2f}%)")
    
    # 9. Key Statistics
    print(f"\n9Ô∏è‚É£  KEY STATISTICS:")
    numeric_cols = df.select_dtypes(include=[np.number]).columns[:5]  # First 5 numeric
    for col in numeric_cols:
        print(f"    {col}: min={df[col].min()}, max={df[col].max()}, mean={df[col].mean():.2f}")
    
    # 10. Unique Values
    print(f"\nüîü UNIQUE VALUE RANGES:")
    unique = df.nunique()
    print(f"    Binary (2): {list(unique[unique == 2].index)}")
    print(f"    Low (3-5): {list(unique[(unique > 2) & (unique <= 5)].index)}")
    print(f"    High (>5): {list(unique[unique > 5].index)}")
    
    print("\n" + "="*60)
    print("‚úÖ 10-Point Inspection Complete!")
    print("="*60)

In [33]:
# Run the inspection function
ten_point_inspection(df, "Pima Indian Diabetes")

üìä 10-POINT INSPECTION: Pima Indian Diabetes

1Ô∏è‚É£  SHAPE: 768 rows √ó 9 columns

2Ô∏è‚É£  COLUMNS: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

3Ô∏è‚É£  DATA TYPES:
int64      7
float64    2

4Ô∏è‚É£  FIRST ROW: {'Pregnancies': np.float64(6.0), 'Glucose': np.float64(148.0), 'BloodPressure': np.float64(72.0), 'SkinThickness': np.float64(35.0), 'Insulin': np.float64(0.0), 'BMI': np.float64(33.6), 'DiabetesPedigreeFunction': np.float64(0.627), 'Age': np.float64(50.0), 'Outcome': np.float64(1.0)}

5Ô∏è‚É£  LAST ROW: {'Pregnancies': np.float64(1.0), 'Glucose': np.float64(93.0), 'BloodPressure': np.float64(70.0), 'SkinThickness': np.float64(31.0), 'Insulin': np.float64(0.0), 'BMI': np.float64(30.4), 'DiabetesPedigreeFunction': np.float64(0.315), 'Age': np.float64(23.0), 'Outcome': np.float64(0.0)}

6Ô∏è‚É£  MEMORY: 0.06 MB

7Ô∏è‚É£  MISSING VALUES: 0

8Ô∏è‚É£  DUPLICATES: 0 (0.00%)

9Ô∏è‚É£  KEY STATISTICS