In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [17]:
# === 1) Set your file paths here ===
CSV_PATHS = [
    "resources/dataset_project_eHealth20252026.csv",
    "resources/questionnaire_codebook_eHealth20252026.csv",
]
 
# (Optional) If you know the ID column name, set it here; otherwise the code will try to guess.
KNOWN_ID_COL = None  # e.g., "ID" or "RespondentID" or "participant_id"


In [18]:
# === 2) Helper: robust CSV loader (handles common encodings/delimiters) 
def load_csv_robust(path: str) -> pd.DataFrame:
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")
    # Try default; if it fails, try common alternatives
    trials = [
        dict(encoding=None, sep=None, engine="python"),          # auto-detect sep
        dict(encoding="utf-8", sep=None, engine="python"),
        dict(encoding="latin-1", sep=None, engine="python"),
        dict(encoding=None, sep=",", engine="python"),
        dict(encoding=None, sep=";", engine="python"),
    ]
    last_err = None
    for kw in trials:
        try:
            return pd.read_csv(path, **kw)
        except Exception as e:
            last_err = e
    raise RuntimeError(f"Could not read {path} – last error:\n{last_err}")
 

In [24]:
# === 3) Load both CSVs and show basic info ===
dfs = []
for p in CSV_PATHS:
    df = load_csv_robust(p)
    print(f"\n=== Loaded: {p} ===")
    print(f"Shape: {df.shape[0]} rows × {df.shape[1]} cols")
    print("First 20 column names:", list(df.columns[:20]))
    print("\nDtypes:")
    print(df.dtypes.head(15))
    dfs.append(df)


=== Loaded: resources/dataset_project_eHealth20252026.csv ===
Shape: 221 rows × 96 cols
First 20 column names: ['age', 'gender', 'education', 'marital', 'income', 'audit_1', 'audit_2', 'audit_3', 'audit_4', 'audit_5', 'audit_6', 'audit_7', 'audit_8', 'audit_9', 'audit_10', 'dast_1', 'dast_2', 'dast_3', 'dast_4', 'dast_5']

Dtypes:
age          float64
gender         int64
education    float64
marital      float64
income         int64
audit_1        int64
audit_2        int64
audit_3      float64
audit_4        int64
audit_5      float64
audit_6        int64
audit_7      float64
audit_8      float64
audit_9      float64
audit_10     float64
dtype: object

=== Loaded: resources/questionnaire_codebook_eHealth20252026.csv ===
Shape: 96 rows × 4 cols
First 20 column names: ['\ufeffquestion', 'type', 'name of column', '[codification] options (if applicable)']

Dtypes:
﻿question                                 object
type                                      object
name of column            

In [12]:
#Clean data

data.dropna(how='all', inplace=True)