## 1. Setup and Imports

In [1]:
from pathlib import Path
import pandas as pd
import sys

# Project root is the parent of the directory the notebook runs from.
# Resolve it once and reuse it for both the import path and the data paths.
root = Path().resolve().parent
sys.path.insert(0, str(root / "src"))

# Project-local imports: these only resolve after the sys.path tweak above.
from bootcamp_data.io import read_orders_csv
from bootcamp_data.config import make_paths

paths = make_paths(root)

print(f"Working from: {root}")

Working from: C:\Users\koolg\week 2


## 2. Expected Schema (BEFORE Reading)

In [3]:
# Schema we expect the orders file to have, written down before loading it.
# Values are dtype names as pandas would print them (or a prefix, e.g. "float").
expected_schema = dict(
    order_id='string',
    user_id='string',
    amount='float',
    quantity='float',
    created_at='object',
    status='object',
)

print("=== EXPECTED SCHEMA (BEFORE READING) ===")
print("\n".join(f"  {name}: {kind}" for name, kind in expected_schema.items()))

=== EXPECTED SCHEMA (BEFORE READING) ===
  order_id: string
  user_id: string
  amount: float
  quantity: float
  created_at: object
  status: object


## 3. Read CSV File

In [5]:
# Load the raw orders file through the project's reader
csv_path = paths.raw / "orders.csv"
df = read_orders_csv(csv_path)

# One print call, one line per item (the leading "\n" keeps the blank separator line)
print(
    f"✓ Loaded CSV from: {csv_path}",
    f"Shape: {df.shape}",
    "\nFirst few rows:",
    sep="\n",
)
df.head()

✓ Loaded CSV from: C:\Users\koolg\week 2\data\raw\orders.csv
Shape: (5, 6)

First few rows:


Unnamed: 0,order_id,user_id,amount,quantity,created_at,status
0,A0001,1,12.5,1.0,2025-12-01T10:05:00Z,Paid
1,A0002,2,8.0,2.0,2025-12-01T11:10:00Z,paid
2,A0003,3,,1.0,2025-12-02T09:00:00Z,Refund
3,A0004,1,25.0,,2025-12-03T14:30:00Z,PAID
4,A0005,4,100.0,1.0,not_a_date,paid


## 4. Actual Schema (AFTER Reading)

In [6]:
# Snapshot the dtypes pandas assigned on load, keyed by column name
actual_schema = df.dtypes.to_dict()

print("=== ACTUAL SCHEMA (AFTER READING) ===")
# Each entry carries its own newline so nothing is printed for an empty schema
print("".join(f"  {name}: {kind}\n" for name, kind in actual_schema.items()), end="")

=== ACTUAL SCHEMA (AFTER READING) ===
  order_id: string
  user_id: string
  amount: float64
  quantity: float64
  created_at: object
  status: object


## 5. Schema Comparison

In [7]:
# Compare expected vs actual
#
# An expected entry may name an exact dtype ("float64"), a dtype without its
# parameters ("datetime64"), or just the dtype family ("float"). The actual
# dtype is reduced to those three forms and compared for equality. A plain
# substring test would also accept false positives such as expected "int"
# against actual "uint8" or "interval[int64]".
print("=== SCHEMA COMPARISON ===")
print(f"{'Column':<15} {'Expected':<20} {'Actual':<20} {'Match':<10}")
print("-" * 65)

for col, expected in expected_schema.items():
    # Columns absent from the data are shown as MISSING and never match
    actual = str(actual_schema.get(col, 'MISSING'))
    exact = actual.lower()
    base = exact.split('[')[0]            # "datetime64[ns]" -> "datetime64"
    family = base.rstrip('0123456789')    # "float64" -> "float"
    match = '✓' if expected.lower() in (exact, base, family) else '✗'
    print(f"{col:<15} {expected:<20} {actual:<20} {match:<10}")

# Surface columns that exist in the data but were not anticipated in the
# expected schema; previously these were silently ignored.
unexpected = [name for name in actual_schema if name not in expected_schema]
if unexpected:
    print(f"\nUnexpected columns: {unexpected}")

=== SCHEMA COMPARISON ===
Column          Expected             Actual               Match     
-----------------------------------------------------------------
order_id        string               string               ✓         
user_id         string               string               ✓         
amount          float                float64              ✓         
quantity        float                float64              ✓         
created_at      object               object               ✓         
status          object               object               ✓         


## 6. Data Quality Summary

In [8]:
# Headline quality figures: size, per-column null counts, and dtypes
n_rows, n_cols = df.shape

print("=== DATA QUALITY SUMMARY ===")
print(f"Total rows: {n_rows}")
print(f"Total columns: {n_cols}")

print("\nMissing values:")
print(df.isna().sum())

print("\nData types:")
print(df.dtypes)

=== DATA QUALITY SUMMARY ===
Total rows: 5
Total columns: 6

Missing values:
order_id      0
user_id       0
amount        1
quantity      1
created_at    0
status        0
dtype: int64

Data types:
order_id      string[python]
user_id       string[python]
amount               float64
quantity             float64
created_at            object
status                object
dtype: object


## 7. Read Users CSV and Compare

In [9]:
# Load the raw users file
# NOTE(review): users.csv is read with the *orders* reader; confirm this is
# intended, or switch to a dedicated users reader if the project provides one.
users_path = paths.raw / "users.csv"
users_df = read_orders_csv(users_path)

print("=== USERS CSV ===")
print(f"Shape: {users_df.shape}")
# The leading "\n" on the labels keeps the blank separator lines
print("\nSchema:", users_df.dtypes, "\nData:", sep="\n")
users_df.head()

=== USERS CSV ===
Shape: (4, 3)

Schema:
user_id        string[python]
country                object
signup_date            object
dtype: object

Data:


Unnamed: 0,user_id,country,signup_date
0,1,SA,2025-11-15
1,2,SA,2025-11-20
2,3,AE,2025-11-22
3,4,SA,2025-11-25
