In [17]:
import os
import pandas as pd
import numpy as np


# Define folder paths relative to this notebook
raw_dir = './data/raw'
processed_dir = './data/processed'

# Create folders if they don't exist
os.makedirs(raw_dir, exist_ok=True)
os.makedirs(processed_dir, exist_ok=True)

# Define the modified sample data
data = {
    'age': [34, 45, 29, 50, 38, np.nan, 41, 28, np.nan, 60],
    'income': [55000, np.nan, 42000, 58000, np.nan, np.nan, 49000, 61000, 72000, np.nan],
    'credit_score': [0.82, 0.91, np.nan, 0.76, 0.88, 0.65, 0.79, np.nan, 0.93, 0.55],
    'zipcode': ['90210', '10001', '60614', '94103', '73301', '12345', '94105', '33101', '73301', np.nan],
    'city': ['Beverly', 'New York', 'Chicago', 'SF', 'Austin', 'Unknown', 'San Francisco', 'Miami', 'Austin', 'Unknown'],
    'extra_data': [np.nan, 42, np.nan, np.nan, np.nan, 5, np.nan, 99, np.nan, np.nan],
    'category': ['A', 'B', 'A', np.nan, 'C', 'B', 'C', 'A', np.nan, 'B']  # new categorical col
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV in raw data folder
csv_path = os.path.join(raw_dir, 'sample_data_modified.csv')
df.to_csv(csv_path, index=False)

print(f"✅ Modified sample dataset created and saved to {csv_path}")
print(df.head(10))


✅ Modified sample dataset created and saved to ./data/raw/sample_data_modified.csv
    age   income  credit_score zipcode           city  extra_data category
0  34.0  55000.0          0.82   90210        Beverly         NaN        A
1  45.0      NaN          0.91   10001       New York        42.0        B
2  29.0  42000.0           NaN   60614        Chicago         NaN        A
3  50.0  58000.0          0.76   94103             SF         NaN      NaN
4  38.0      NaN          0.88   73301         Austin         NaN        C
5   NaN      NaN          0.65   12345        Unknown         5.0        B
6  41.0  49000.0          0.79   94105  San Francisco         NaN        C
7  28.0  61000.0           NaN   33101          Miami        99.0        A
8   NaN  72000.0          0.93   73301         Austin         NaN      NaN
9  60.0      NaN          0.55     NaN        Unknown         NaN        B


In [18]:
import os, sqlite3, pandas as pd

raw_dir = './data/raw'
processed_dir = './data/processed'
os.makedirs(processed_dir, exist_ok=True)

CSV_PATH = os.path.join(raw_dir, 'sample_data_modified.csv')   # you just created this
DB_PATH  = os.path.join(processed_dir, 'project.db')
TABLE_RAW = 'raw_sample'

# Create (or replace) table from CSV
df_raw = pd.read_csv(CSV_PATH)
with sqlite3.connect(DB_PATH) as con:
    df_raw.to_sql(TABLE_RAW, con, if_exists='replace', index=False)

print(f"✅ Wrote {len(df_raw)} rows to '{TABLE_RAW}' in {DB_PATH}")


✅ Wrote 10 rows to 'raw_sample' in ./data/processed/project.db


In [19]:
import pandas as pd, sqlite3

with sqlite3.connect(DB_PATH) as con:
    tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", con)
    preview = pd.read_sql_query(f"SELECT * FROM {TABLE_RAW} LIMIT 5;", con)

display(tables)
display(preview)


Unnamed: 0,name
0,raw_sample


Unnamed: 0,age,income,credit_score,zipcode,city,extra_data,category
0,34.0,55000.0,0.82,90210.0,Beverly,,A
1,45.0,,0.91,10001.0,New York,42.0,B
2,29.0,42000.0,,60614.0,Chicago,,A
3,50.0,58000.0,0.76,94103.0,SF,,
4,38.0,,0.88,73301.0,Austin,,C


In [20]:
import numpy as np
import pandas as pd
import sqlite3
import sys, os

# ensure we can import from src/
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

try:
    from src.cleaning import drop_missing, fill_missing_median, normalize_data
except Exception as e:
    # minimal fallbacks so you can run even if src/cleaning.py isn't ready yet
    import numpy as np
    def drop_missing(df, thresh=0.95):
        keep = (1 - df.isna().mean()) >= thresh
        return df.loc[:, keep.index[keep]]
    def fill_missing_median(df, cols=None):
        out = df.copy()
        cols = cols or out.select_dtypes(include=[np.number]).columns
        for c in cols:
            out[c] = out[c].fillna(out[c].median())
        return out
    def normalize_data(df, cols=None, method='zscore'):
        out = df.copy()
        cols = cols or out.select_dtypes(include=[np.number]).columns
        for c in cols:
            s = out[c].astype(float)
            if method == 'zscore':
                mu, sd = s.mean(), s.std(ddof=0)
                out[c] = (s - mu) / sd if sd != 0 else 0.0
            else:
                mn, mx = s.min(), s.max()
                out[c] = (s - mn) / (mx - mn) if mx != mn else 0.0
        return out

NUMERIC_ONLY_LATER = ['age', 'income', 'credit_score']  # will normalize only these

with sqlite3.connect(DB_PATH) as con:
    df = pd.read_sql_query(f"SELECT * FROM {TABLE_RAW};", con)

print("Before:", df.shape)
df1 = drop_missing(df, thresh=0.90)                 # drop columns with too many NaNs
df2 = fill_missing_median(df1)                      # numeric NaNs → median
df_clean = normalize_data(df2, cols=NUMERIC_ONLY_LATER, method='zscore')  # scale numeric features

print("After :", df_clean.shape)
df_clean.head()


Before: (10, 7)
After : (10, 2)


Unnamed: 0,zipcode,city
0,90210.0,Beverly
1,10001.0,New York
2,60614.0,Chicago
3,94103.0,SF
4,73301.0,Austin


In [21]:
compare = pd.DataFrame({
    'missing_before': df.isna().mean(),
    'missing_after' : df_clean.isna().mean()
}).sort_values('missing_before', ascending=False)

display(compare)


Unnamed: 0,missing_before,missing_after
extra_data,0.7,
income,0.4,
age,0.2,
category,0.2,
credit_score,0.2,
zipcode,0.1,0.0
city,0.0,0.0


In [22]:
OUT_CSV = os.path.join(processed_dir, 'sample_data_cleaned.csv')
OUT_PQ  = os.path.join(processed_dir, 'sample_data_cleaned.parquet')

df_clean.to_csv(OUT_CSV, index=False)
df_clean.to_parquet(OUT_PQ, index=False)

with sqlite3.connect(DB_PATH) as con:
    df_clean.to_sql('cleaned_sample', con, if_exists='replace', index=False)

print(f"✅ Saved cleaned CSV → {OUT_CSV}")
print(f"✅ Saved cleaned Parquet → {OUT_PQ}")
print(f"✅ Wrote cleaned table 'cleaned_sample' to {DB_PATH}")


✅ Saved cleaned CSV → ./data/processed/sample_data_cleaned.csv
✅ Saved cleaned Parquet → ./data/processed/sample_data_cleaned.parquet
✅ Wrote cleaned table 'cleaned_sample' to ./data/processed/project.db


Assumptions & Cleaning Strategy (Stage 06)

Dropped columns where non-null ratio < 90% (thresh=0.90).

Imputed numeric NaNs with column median (robust to outliers).

Left categorical NaNs (city, category) for later encoding; avoids fabricating categories.

Normalized only numeric features (age, income, credit_score) with z-score.

Outputs saved to data/processed/ (CSV & Parquet) and to SQLite (cleaned_sample).

Risks: if distributions are extremely skewed, consider log/Box-Cox; if Unknown is a real class, encode it explicitly.