In [4]:
import pandas as pd 


In [5]:
# Read the MymensingUniversity CSV into `df` (the frame the cells below inspect)
# and report its dimensions and column names as a quick sanity check.
df = pd.read_csv("MymensingUniversity.csv")
print(
    "Dataset loaded successfully!",
    f"Shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    sep="\n",
)

Dataset loaded successfully!
Shape: (1529, 22)
Columns: ['Sex', 'Age', 'Weight (kg)', 'Height (m)', 'BMI', 'Abdominal Circumference (cm)', 'Blood Pressure (mmHg)', 'Total Cholesterol (mg/dL)', 'HDL (mg/dL)', 'Fasting Blood Sugar (mg/dL)', 'Smoking Status', 'Diabetes Status', 'Physical Activity Level', 'Family History of CVD', 'CVD Risk Level', 'Height (cm)', 'Waist-to-Height Ratio', 'Systolic BP', 'Diastolic BP', 'Blood Pressure Category', 'Estimated LDL (mg/dL)', 'CVD Risk Score']


In [6]:
# Dataset overview: dimensions, memory footprint, column dtypes,
# per-column missing-value summary, and the distribution of the target column.
print("=== BASIC DATASET INFO ===")
print(f"Shape: {df.shape}")
# deep=True makes memory_usage account for the contents of object columns.
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB", end="\n\n")

print("=== DATA TYPES ===")
print(df.dtypes, end="\n\n")

print("=== MISSING VALUES ===")
# Null count per column, and the same count as a percentage of all rows.
missing_values = df.isna().sum()
missing_percent = missing_values / len(df) * 100
missing_df = pd.DataFrame(
    {
        'Missing Count': missing_values,
        'Missing Percentage': missing_percent,
    }
).sort_values('Missing Count', ascending=False)
# Only columns that actually contain nulls are listed.
print(missing_df.loc[missing_df['Missing Count'] > 0], end="\n\n")

print("=== TARGET VARIABLE DISTRIBUTION ===")
print("CVD Risk Level distribution:")
print(df['CVD Risk Level'].value_counts(), end="\n\n")
print("CVD Risk Level percentages:")
print(df['CVD Risk Level'].value_counts(normalize=True).mul(100))

=== BASIC DATASET INFO ===
Shape: (1529, 22)
Memory usage: 0.88 MB

=== DATA TYPES ===
Sex                              object
Age                             float64
Weight (kg)                     float64
Height (m)                      float64
BMI                             float64
Abdominal Circumference (cm)    float64
Blood Pressure (mmHg)            object
Total Cholesterol (mg/dL)       float64
HDL (mg/dL)                     float64
Fasting Blood Sugar (mg/dL)     float64
Smoking Status                   object
Diabetes Status                  object
Physical Activity Level          object
Family History of CVD            object
CVD Risk Level                   object
Height (cm)                     float64
Waist-to-Height Ratio           float64
Systolic BP                     float64
Diastolic BP                    float64
Blood Pressure Category          object
Estimated LDL (mg/dL)           float64
CVD Risk Score                  float64
dtype: object

=== MISSING VALUES

In [7]:
# Frequency tables, distinct-value counts and null counts for the categorical features.
print("=== CATEGORICAL COLUMNS ANALYSIS ===")

categorical_cols = [
    'Sex',
    'Smoking Status',
    'Diabetes Status',
    'Physical Activity Level',
    'Family History of CVD',
    'Blood Pressure Category',
]

for col in categorical_cols:
    # Guard: report an absent column and move on instead of raising KeyError.
    if col not in df.columns:
        print(f"\n{col}: Column not found in dataset")
        continue

    print(f"\n{col}:")
    # dropna=False keeps NaN as its own row in the frequency table.
    print(df[col].value_counts(dropna=False))
    # nunique() ignores NaN by default, so nulls are reported separately below.
    print(f"Unique values: {df[col].nunique()}")
    print(f"Missing: {df[col].isnull().sum()}")


=== CATEGORICAL COLUMNS ANALYSIS ===

Sex:
Sex
F    773
M    756
Name: count, dtype: int64
Unique values: 2
Missing: 0

Smoking Status:
Smoking Status
Y    789
N    740
Name: count, dtype: int64
Unique values: 2
Missing: 0

Diabetes Status:
Diabetes Status
Y    777
N    752
Name: count, dtype: int64
Unique values: 2
Missing: 0

Physical Activity Level:
Physical Activity Level
High        521
Moderate    512
Low         496
Name: count, dtype: int64
Unique values: 3
Missing: 0

Family History of CVD:
Family History of CVD
N    780
Y    749
Name: count, dtype: int64
Unique values: 2
Missing: 0

Blood Pressure Category:
Blood Pressure Category
Hypertension Stage 2    632
Hypertension Stage 1    497
Normal                  300
Elevated                100
Name: count, dtype: int64
Unique values: 4
Missing: 0


In [8]:
# Per-column profile of the expected numeric features: dtype, missing count
# and share, then range / mean / standard deviation.
print("=== NUMERICAL COLUMNS ANALYSIS ===")

numerical_cols = ['Age', 'Weight (kg)', 'Height (m)', 'BMI', 'Abdominal Circumference (cm)',
                 'Total Cholesterol (mg/dL)', 'HDL (mg/dL)', 'Fasting Blood Sugar (mg/dL)',
                 'Height (cm)', 'Waist-to-Height Ratio', 'Systolic BP', 'Diastolic BP',
                 'Estimated LDL (mg/dL)', 'CVD Risk Score']

for col in numerical_cols:
    # Report an absent column and move on instead of raising KeyError.
    if col not in df.columns:
        print(f"\n{col}: Column not found in dataset")
        continue

    # Count nulls once; guard the percentage against an empty frame.
    n_missing = df[col].isnull().sum()
    pct_missing = n_missing / len(df) * 100 if len(df) else 0.0

    print(f"\n{col}:")
    print(f"  Data type: {df[col].dtype}")
    print(f"  Missing: {n_missing} ({pct_missing:.1f}%)")

    # is_numeric_dtype covers every integer/float width and the nullable
    # numeric dtypes. Anything else (object, string, category, datetime) would
    # fail in mean()/std() or the ':.2f' formatting, so it gets a preview of
    # its distinct values instead.
    if pd.api.types.is_numeric_dtype(df[col]):
        print(f"  Min: {df[col].min()}, Max: {df[col].max()}")
        print(f"  Mean: {df[col].mean():.2f}, Std: {df[col].std():.2f}")
    else:
        print(f"  Unique values: {df[col].unique()[:10]}...")  # First 10 unique values


=== NUMERICAL COLUMNS ANALYSIS ===

Age:
  Data type: float64
  Missing: 78 (5.1%)
  Min: 25.0, Max: 79.0
  Mean: 47.03, Std: 12.42

Weight (kg):
  Data type: float64
  Missing: 81 (5.3%)
  Min: 50.1, Max: 120.0
  Mean: 85.92, Std: 21.01

Height (m):
  Data type: float64
  Missing: 67 (4.4%)
  Min: 1.502, Max: 2.0
  Mean: 1.75, Std: 0.11

BMI:
  Data type: float64
  Missing: 64 (4.2%)
  Min: 15.0, Max: 46.2
  Mean: 28.47, Std: 7.04

Abdominal Circumference (cm):
  Data type: float64
  Missing: 67 (4.4%)
  Min: 70.0, Max: 119.996
  Mean: 91.77, Std: 12.82

Total Cholesterol (mg/dL):
  Data type: float64
  Missing: 73 (4.8%)
  Min: 100.0, Max: 300.0
  Mean: 198.54, Std: 57.79

HDL (mg/dL):
  Data type: float64
  Missing: 80 (5.2%)
  Min: 30.0, Max: 89.0
  Mean: 56.20, Std: 16.07

Fasting Blood Sugar (mg/dL):
  Data type: float64
  Missing: 67 (4.4%)
  Min: 70.0, Max: 198.0
  Mean: 117.49, Std: 30.29

Height (cm):
  Data type: float64
  Missing: 74 (4.8%)
  Min: 150.0, Max: 199.96
  Mean:

In [9]:
# Data-quality audit of `df`:
#   1. format of the raw blood-pressure strings
#   2. agreement between the two height columns
#   3. 3-sigma outlier scan on selected numeric columns
#   4. implausible values (age, BMI, systolic <= diastolic)
print("=== DATA QUALITY ISSUES ===")

# 1. Check for inconsistent data formats in the raw blood-pressure column
print("\n1. Blood Pressure column analysis:")
if 'Blood Pressure (mmHg)' in df.columns:
    bp_raw = df['Blood Pressure (mmHg)'].dropna()
    print("Sample values:", bp_raw.head(10).tolist())
    print("Unique pattern count:", bp_raw.nunique())
    # Count entries that are not of the "<digits>/<digits>" form seen in the samples.
    bp_well_formed = bp_raw.astype(str).str.fullmatch(r"\d+/\d+")
    print("Values not matching '<number>/<number>':", len(bp_raw) - int(bp_well_formed.sum()))
else:
    print("Blood Pressure (mmHg): Column not found in dataset")

# 2. Check for duplicate columns (Height in meters vs cm)
print("\n2. Height columns comparison:")
if 'Height (m)' in df.columns and 'Height (cm)' in df.columns:
    # Convert height(m) to cm and compare; rows missing either value yield NaN
    # and are excluded from describe()'s count.
    height_m_to_cm = df['Height (m)'] * 100
    height_cm = pd.to_numeric(df['Height (cm)'], errors='coerce')
    diff = (height_m_to_cm - height_cm).abs()
    print(f"Height difference (converted m to cm vs cm column): {diff.describe()}")

# 3. Check for outliers in numerical columns
print("\n3. Potential outliers (values outside 3 standard deviations):")
outliers_found = False
for col in ['Age', 'BMI', 'Total Cholesterol (mg/dL)', 'HDL (mg/dL)', 'Fasting Blood Sugar (mg/dL)']:
    # is_numeric_dtype accepts every numeric dtype, not only int64/float64,
    # so columns stored as e.g. int32 or nullable Int64 are no longer skipped.
    if col not in df.columns or not pd.api.types.is_numeric_dtype(df[col]):
        continue
    mean_val = df[col].mean()
    std_val = df[col].std()
    outliers = df.loc[(df[col] < mean_val - 3*std_val) | (df[col] > mean_val + 3*std_val), col]
    if len(outliers) > 0:
        outliers_found = True
        print(f"  {col}: {len(outliers)} outliers")
if not outliers_found:
    # Say so explicitly; an empty section is indistinguishable from a check that never ran.
    print("  None detected")

# 4. Check for impossible values
# NaN compares False in every test below, so rows with missing values are not counted.
print("\n4. Impossible/Invalid values:")
if 'Age' in df.columns:
    invalid_age = df[(df['Age'] < 0) | (df['Age'] > 120)]
    print(f"  Invalid Age values: {len(invalid_age)}")

if 'BMI' in df.columns:
    invalid_bmi = df[(df['BMI'] < 10) | (df['BMI'] > 80)]
    print(f"  Invalid BMI values: {len(invalid_bmi)}")

if 'Systolic BP' in df.columns and 'Diastolic BP' in df.columns:
    invalid_bp = df[df['Systolic BP'] <= df['Diastolic BP']]
    print(f"  Invalid BP (systolic <= diastolic): {len(invalid_bp)}")


=== DATA QUALITY ISSUES ===

1. Blood Pressure column analysis:
Sample values: ['125/79', '139/70', '104/77', '140/83', '144/83', '142/90', '96/63', '115/73', '107/95', '133/83']
Unique pattern count: 1254

2. Height columns comparison:
Height difference (converted m to cm vs cm column): count    1394.000000
mean        0.008785
std         0.014777
min         0.000000
25%         0.000000
50%         0.000000
75%         0.014000
max         0.050000
dtype: float64

3. Potential outliers (values outside 3 standard deviations):

4. Impossible/Invalid values:
  Invalid Age values: 0
  Invalid BMI values: 0
  Invalid BP (systolic <= diastolic): 65


In [10]:
# Sample data for manual inspection: head, tail and a reproducible random draw.
print("=== SAMPLE DATA INSPECTION ===")
print("\nFirst 5 rows:")
print(df.head())
print("\nLast 5 rows:")
print(df.tail())
print("\nRandom 5 rows:")
# A fixed random_state returns the same rows on every Restart & Run All.
# n is capped because sample() raises ValueError when n exceeds the row count
# (sampling without replacement).
print(df.sample(n=min(5, len(df)), random_state=42))


=== SAMPLE DATA INSPECTION ===

First 5 rows:
  Sex   Age  Weight (kg)  Height (m)   BMI  Abdominal Circumference (cm)  \
0   F  32.0         69.1        1.71  23.6                          86.2   
1   F  55.0        118.7        1.69  41.6                          82.5   
2   M   NaN          NaN        1.83  26.9                         106.7   
3   M  44.0        108.3        1.80  33.4                          96.6   
4   F  32.0         99.5        1.86  28.8                         102.7   

  Blood Pressure (mmHg)  Total Cholesterol (mg/dL)  HDL (mg/dL)  \
0                125/79                      248.0         78.0   
1                139/70                      162.0         50.0   
2                104/77                      103.0         73.0   
3                140/83                      134.0         46.0   
4                144/83                      146.0         64.0   

   Fasting Blood Sugar (mg/dL)  ... Physical Activity Level  \
0                        111.0 

In [11]:
# Closing summary: print the cleaning steps suggested by the exploration above.
print("=== CLEANING RECOMMENDATIONS ===")
print("\nBased on the analysis, the following cleaning steps are needed:")

# One recommendation per line, emitted in order.
for recommendation in (
    "1. Handle missing values (imputation or removal)",
    "2. Convert categorical variables to numerical (Sex: F/M -> 0/1, etc.)",
    "3. Parse Blood Pressure column to extract Systolic and Diastolic values",
    "4. Handle duplicate Height columns (choose one or verify consistency)",
    "5. Convert CVD Risk Level to numerical (LOW=0, INTERMEDIARY=1, HIGH=2)",
    "6. Handle outliers and invalid values",
    "7. Standardize data types",
    "8. Create additional engineered features if needed",
):
    print(recommendation)

print("\nNext step: Create a comprehensive cleaning script!")


=== CLEANING RECOMMENDATIONS ===

Based on the analysis, the following cleaning steps are needed:
1. Handle missing values (imputation or removal)
2. Convert categorical variables to numerical (Sex: F/M -> 0/1, etc.)
3. Parse Blood Pressure column to extract Systolic and Diastolic values
4. Handle duplicate Height columns (choose one or verify consistency)
5. Convert CVD Risk Level to numerical (LOW=0, INTERMEDIARY=1, HIGH=2)
6. Handle outliers and invalid values
7. Standardize data types
8. Create additional engineered features if needed

Next step: Create a comprehensive cleaning script!
