In [1]:
import os
import pandas as pd

# Directory holding the exercise CSVs, relative to the notebook's working
# directory. Built with os.path.join so that no backslash appears in a string
# literal ("\M" is an invalid escape sequence and triggers a warning) and the
# path resolves on POSIX systems as well as Windows.
FOLDER_PATH = os.path.join("dataset", "MIA_SDG_Exercise")

# synthetic_data1.csv .. synthetic_data4.csv
SYNTHETIC_FILES = [f"synthetic_data{i}.csv" for i in range(1, 5)]

# Test sets; the schema-comparison cell drops an "is_member" label column from these.
TEST_FILES = ["test_data_with_outliers.csv", "test_data_wto_outliers.csv"]

# Every file profiled by the exploratory pass, synthetic files first.
ALL_FILES = SYNTHETIC_FILES + TEST_FILES


def _print_describe(df, include, empty_message):
    """Print ``df.describe(include=include)``, or a placeholder if nothing matches.

    ``DataFrame.describe`` raises ``ValueError`` when the dtype filter selects
    no columns, so the selection is checked first.
    """
    if df.select_dtypes(include=include).shape[1] == 0:
        print(empty_message)
        return
    print(df.describe(include=include))


def analyze_dataset(df, name):
    """Print an exploratory profile of ``df`` to stdout.

    Sections, in order: shape, columns that contain missing values, dtypes,
    distinct-value counts, the five most frequent values of every column
    (NaN counted as a value), and ``describe()`` summaries for the numerical
    and the categorical columns.

    Args:
        df: The pandas DataFrame to profile.
        name: Label printed in the banner; ``main`` passes the file name.

    Returns:
        None. Everything is printed.
    """
    banner = "=" * 40
    print(f"\n{banner}\nDataset: {name}\n{banner}")

    # 1. Shape
    print(f"Shape: {df.shape}")

    # 2. Missing values (only columns with at least one missing entry are shown)
    null_mask = df.isnull()
    missing_summary = pd.DataFrame({
        'MissingCount': null_mask.sum(),
        'MissingPct': null_mask.mean() * 100
    }).sort_values('MissingCount', ascending=False)
    print("\nMissing Values:")
    print(missing_summary[missing_summary.MissingCount > 0])

    # 3. Data types
    print("\nData Types:")
    print(df.dtypes)

    # 4. Number of unique values (NaN is not counted by nunique)
    unique_counts = df.nunique().sort_values(ascending=False)
    print("\nDistinct Values per Column:")
    print(unique_counts)

    # 5. Top 5 frequent values per column (dropna=False keeps NaN as a bucket)
    print("\nTop-5 Frequent Values per Column:")
    for col in df.columns:
        print(f"\n[{col}]")
        print(df[col].value_counts(dropna=False).head(5))

    # 6. Summary statistics for numerical data
    print("\nNumerical Summary:")
    _print_describe(df, [float, int], "(no numerical columns)")

    # 7. Summary statistics for categorical data
    print("\nCategorical Summary:")
    _print_describe(df, [object, "category"], "(no categorical columns)")


def main():
    """Profile every file in ``ALL_FILES`` found under ``FOLDER_PATH``.

    Processing is best-effort: a file that cannot be read or analysed is
    reported and skipped so the remaining files are still profiled. Read
    failures and analysis failures are reported separately, so a bug in
    ``analyze_dataset`` is not mislabelled as an I/O problem.
    """
    for file_name in ALL_FILES:
        path = os.path.join(FOLDER_PATH, file_name)

        try:
            df = pd.read_csv(path)
        except Exception as e:
            print(f"Error reading {file_name}: {e}")
            continue

        try:
            analyze_dataset(df, file_name)
        except Exception as e:
            print(f"Error analyzing {file_name}: {e}")


if __name__ == "__main__":
    main()


  FOLDER_PATH = "dataset\MIA_SDG_Exercise"



Dataset: synthetic_data1.csv
Shape: (4800, 31)

Missing Values:
                     MissingCount  MissingPct
body_weight                  4640   96.666667
hba1c_result                 3928   81.833333
provider_specialty           2417   50.354167
insurance_type_code          1885   39.270833
ethnic_group                  115    2.395833

Data Types:
lab_test_count          int64
primary_diagnosis      object
secondary_diagnosis    object
tertiary_diagnosis     object
medication_count        int64
inpatient_visits        int64
stay_duration_days      int64
age_range              object
provider_specialty     object
insurance_type_code    object
diagnosis_count         int64
exit_status_code        int64
procedure_count         int64
entry_type_code         int64
drug_013               object
entry_origin_code       int64
ethnic_group           object
outpatient_visits       int64
emergency_visits        int64
sex                    object
drug_014               object
hba1c_result    

In [2]:
import os
import pandas as pd

# Name of the label column that load_columns can exclude from a schema.
LABEL_COLUMN = "is_member"

def load_columns(filepath, drop_label=False):
    """Return the set of column names found in a CSV file's header.

    Args:
        filepath: Path of the CSV file.
        drop_label: When True, ``LABEL_COLUMN`` is left out of the result
            (no error if the file does not contain it).

    Returns:
        A ``set`` of column-name strings.
    """
    # nrows=0 parses the header only; no data row is read or type-inferred.
    header = pd.read_csv(filepath, nrows=0)
    columns = set(header.columns)
    if drop_label:
        columns.discard(LABEL_COLUMN)
    return columns

def _collect_columns(file_names, drop_label=False):
    """Map each readable file under ``FOLDER_PATH`` to its column-name set.

    Files that fail to load are reported and left out of the mapping.
    """
    columns_by_file = {}
    for fname in file_names:
        path = os.path.join(FOLDER_PATH, fname)
        try:
            columns_by_file[fname] = load_columns(path, drop_label=drop_label)
        except Exception as e:
            print(f"Error reading {fname}: {e}")
    return columns_by_file


def main():
    """Compare column schemas across the synthetic and test CSV files.

    The first synthetic file that loads successfully is the reference schema.
    Every synthetic file, and every test file with its label column removed,
    is compared against it. Mismatches print the symmetric difference of the
    two column sets. If no synthetic file can be read there is no reference,
    so the comparison is skipped with a message instead of raising.
    """
    # Load synthetic dataset columns
    synthetic_columns = _collect_columns(SYNTHETIC_FILES)

    # Load test dataset columns (excluding label)
    test_columns = _collect_columns(TEST_FILES, drop_label=True)

    print("\n=== Checking Synthetic Dataset Column Consistency ===")
    if not synthetic_columns:
        # next(iter(...)) on an empty mapping would raise StopIteration.
        print("No synthetic dataset could be read; skipping schema comparison.")
        return

    ref_cols = next(iter(synthetic_columns.values()))
    for fname, cols in synthetic_columns.items():
        if cols != ref_cols:
            diff = sorted(ref_cols.symmetric_difference(cols))
            print(f"❌ {fname} differs from others. Difference: {diff}")
        else:
            print(f"✅ {fname} matches reference synthetic schema.")

    print("\n=== Comparing Test Datasets to Synthetic Columns ===")
    for fname, cols in test_columns.items():
        if cols != ref_cols:
            diff = sorted(ref_cols.symmetric_difference(cols))
            print(f"❌ {fname} differs from synthetic schema. Difference: {diff}")
        else:
            print(f"✅ {fname} matches synthetic schema.")

if __name__ == "__main__":
    main()



=== Checking Synthetic Dataset Column Consistency ===
✅ synthetic_data1.csv matches reference synthetic schema.
❌ synthetic_data2.csv differs from others. Difference: ['body_weight', 'max_glucose_level']
✅ synthetic_data3.csv matches reference synthetic schema.
✅ synthetic_data4.csv matches reference synthetic schema.

=== Comparing Test Datasets to Synthetic Columns ===
❌ test_data_with_outliers.csv differs from synthetic schema. Difference: ['body_weight', 'max_glucose_level']
❌ test_data_wto_outliers.csv differs from synthetic schema. Difference: ['body_weight', 'max_glucose_level']
