# Imports


In [1]:
import pandas as pd

# Read all data 

In [15]:
# Data loading configuration
# Every table is a CSV stored under <bronze_root>/<snapshot>/. The mapping
# below goes snapshot folder -> {logical table name -> CSV file name}.
bronze_root = "../data/bronze"

snapshot_files = {
    # 2013-2014 data
    "2013-2014": {
        "cities": "application.cities.csv",
        "countries": "application.countries.csv",
        "deliverymethods": "application.deliverymethods.csv",
        "paymentmethod": "application.paymentmethod.csv",
        "people": "application.people.csv",
        "province": "application.province.csv",
        "transactiontypes": "application.transactiontypes.csv",
        "purchase_order": "purchase.order.csv",
        "purchase_orderline": "purchase.orderline.csv",
        "suppliers": "purchasing.suppliers.csv",
        "supplierstransactions": "purchasing.supplierstransactions.csv",
        "customer": "sales.customer.csv",
        # NOTE(review): "incvoiceslines" spelling is kept exactly as configured;
        # presumably it matches the file name on disk -- verify.
        "invoiceslines": "sales.incvoiceslines.csv",
        "invoices": "sales.invoices.csv",
        "sales_order": "sales.order.csv",
        "sales_orderline": "sales.orderline.csv",
        "colors": "warehouse.colors.csv",
        "packagetypes": "warehouse.packagetypes.csv",
        "stockgroups": "warehouse.stockgroups.csv",
        "stockitemholdings": "warehouse.stockitemholdings.csv",
        "stockitems": "warehouse.stockitems.csv",
    },
    # Upcoming data (table names carry a "_next" suffix)
    "upcoming": {
        "people_next": "application.people.csv",
        "purchase_order_next": "purchase.order.csv",
        "purchase_orderline_next": "purchase.orderline.csv",
        "customer_next": "sales.customer.csv",
        "invoiceslines_next": "sales.incvoiceslines.csv",
        "invoices_next": "sales.invoices.csv",
        "sales_order_next": "sales.order.csv",
        "sales_orderline_next": "sales.orderline.csv",
    },
}

# Flatten to {table name: relative CSV path}, preserving declaration order.
data_files = {
    table: f"{bronze_root}/{snapshot}/{csv_name}"
    for snapshot, tables in snapshot_files.items()
    for table, csv_name in tables.items()
}

# Load every CSV and expose it as a notebook-level variable named df_<table>.
for name, filepath in data_files.items():
    globals()[f"df_{name}"] = pd.read_csv(filepath)

# Functions

In [29]:
def dataframe_health_check(df, name="DataFrame"):
    """
    Print a quality and structure overview of a DataFrame.

    The report is written to stdout in four sections: dimensions and deep
    memory usage, number of columns per dtype, duplicate-row statistics, and
    one line per column showing its dtype, its number of distinct non-null
    values and the percentage of null values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.
    name : str, default "DataFrame"
        Label shown in the report header.

    Returns
    -------
    None
    """
    banner = "=" * 80
    n_rows, n_cols = df.shape

    def pct_of_rows(count):
        # A frame with zero rows has nothing to divide by: report 0% instead
        # of letting 0/0 produce "nan" (or a division error).
        return count / n_rows * 100 if n_rows else 0.0

    print(f"\n{banner}")
    print(f"DATAFRAME HEALTH CHECK: {name}")
    print(f"{banner}\n")

    # 1. DIMENSIONS & STRUCTURE
    print("DIMENSIONS:")
    print(f"Rows: {n_rows:,} | Columns: {n_cols}")
    # deep=True also measures the contents of object/string columns.
    print(f"Memory: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB\n")

    # 2. COLUMN DATA TYPES
    print("COLUMN TYPES:")
    for dtype, count in df.dtypes.value_counts().items():
        print(f"   {str(dtype):15} : {count:3} columns")
    print()

    # 3. DUPLICATES
    print("DUPLICATES:")
    # duplicated() flags each repeat after the first occurrence, i.e. the
    # rows that would disappear with drop_duplicates().
    duplicates = int(df.duplicated().sum())
    print(f"Rows: {duplicates:,} ({pct_of_rows(duplicates):.2f}%)")
    # keep=False flags every row that has at least one identical twin,
    # including the first occurrence.
    print(f"Complete duplicates: {int(df.duplicated(keep=False).sum()):,}\n")

    # 4. COLUMN DETAILS
    print("COLUMN DETAILS:")
    print(f"{'Column':<25} {'Type':<12} {'Unique':<10} {'Nulls %':<10}")
    print("-" * 65)
    for col in df.columns:
        series = df[col]
        unique = series.nunique()  # distinct values, nulls excluded
        null_pct = pct_of_rows(series.isnull().sum())
        # str(col) keeps the alignment spec valid for non-string labels.
        print(f"{str(col):<25} {str(series.dtype):<12} {unique:<10} {null_pct:>6.2f}%")

    print(f"{banner}\n")

# Example usage: print the health report for the people table.
dataframe_health_check(df_people, name="application.people")


DATAFRAME HEALTH CHECK: application.people

DIMENSIONS:
Rows: 906 | Columns: 22
Memory: 0.31 MB

COLUMN TYPES:
   str             :  10 columns
   int64           :   7 columns
   float64         :   5 columns

DUPLICATES:
Rows: 0 (0.00%)
Complete duplicates: 0

COLUMN DETAILS:
Column                    Type         Unique     Nulls %   
-----------------------------------------------------------------
PersonID                  int64        906          0.00%
FullName                  str          906          0.00%
PreferredName             str          750          0.00%
SearchName                str          906          0.00%
IsPermittedToLogon        int64        1            0.00%
LogonName                 str          1            0.00%
IsExternalLogonProvider   int64        1            0.00%
HashedPassword            float64      0          100.00%
IsSystemUser              int64        1            0.00%
IsEmployee                int64        1            0.00%
IsSalesperson

In [None]:
# Describe loop for all DataFrames (2013-2014 only)
# Collect every notebook-level DataFrame named df_<table>, skipping the
# variables ending in "_next". The namespace is snapshotted with list() so
# the scan never iterates a dict that is changing underneath it.
bronze_2013_2014_tables = {
    var_name: obj
    for var_name, obj in list(globals().items())
    if var_name.startswith('df_')
    and not var_name.endswith('_next')
    and isinstance(obj, pd.DataFrame)
}

print(f"\n{'='*80}")
print(f"📊 DESCRIBE FOR ALL TABLES ({len(bronze_2013_2014_tables)} tables)")
print(f"{'='*80}\n")

for table_name, df in sorted(bronze_2013_2014_tables.items()):
    # Drop only the leading "df_" (every key starts with it); replace() would
    # also remove a "df_" occurring later in the name.
    table_label = table_name[len('df_'):]
    print(f"\n{'─'*80}")
    print(f"📈 {table_label.upper()}")
    print(f"{'─'*80}")
    print(df.describe().round(2).to_string())

In [None]:
# Head(5) loop for all DataFrames (2013-2014 only)
# Relies on bronze_2013_2014_tables ({variable name: DataFrame}) built in the
# describe cell, so that cell must run first.
print(f"\n{'='*80}")
print(f"👀 FIRST 5 ROWS FOR ALL TABLES ({len(bronze_2013_2014_tables)} tables)")
print(f"{'='*80}\n")

for table_name, df in sorted(bronze_2013_2014_tables.items()):
    # Drop only the leading "df_" (every key starts with it); replace() would
    # also remove a "df_" occurring later in the name.
    table_label = table_name[len('df_'):]
    print(f"\n{'─'*80}")
    print(f"📋 {table_label.upper()} - First 5 rows")
    print(f"{'─'*80}")
    print(df.head(5).to_string())
    print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")

In [24]:
# Preview the first rows of the people table (head() defaults to 5 rows).
df_people.head()

Unnamed: 0,PersonID,FullName,PreferredName,SearchName,IsPermittedToLogon,LogonName,IsExternalLogonProvider,HashedPassword,IsSystemUser,IsEmployee,...,PhoneNumber,FaxNumber,EmailAddress,Photo,CustomFields,OtherLanguages,LastEditedBy,ValidFrom,ValidTo,ValidFrom_parsed
0,1002,Laimonis Berzins,Laimonis,Laimonis Laimonis Berzins,0,NO LOGON,0,,0,0,...,(308) 555-0100,(308) 555-0101,laimonis@tailspintoys.com,,,,1,2013-01-01 00:00:00.0000000,9999-12-31 23:59:59.9999999,2013-01-01T00:00:00.000000
1,1003,Lorena Cindric,Lorena,Lorena Lorena Cindric,0,NO LOGON,0,,0,0,...,(406) 555-0100,(406) 555-0101,lorena@tailspintoys.com,,,,1,2013-01-01 00:00:00.0000000,9999-12-31 23:59:59.9999999,2013-01-01T00:00:00.000000
2,1004,Hung Van Groesen,Hung,Hung Hung Van Groesen,0,NO LOGON,0,,0,0,...,(406) 555-0100,(406) 555-0101,hung@tailspintoys.com,,,,1,2013-01-01 00:00:00.0000000,9999-12-31 23:59:59.9999999,2013-01-01T00:00:00.000000
3,1005,Bhaargav Rambhatla,Bhaargav,Bhaargav Bhaargav Rambhatla,0,NO LOGON,0,,0,0,...,(480) 555-0100,(480) 555-0101,bhaargav@tailspintoys.com,,,,1,2013-01-01 00:00:00.0000000,9999-12-31 23:59:59.9999999,2013-01-01T00:00:00.000000
4,1006,Mudar Jevtic,Mudar,Mudar Mudar Jevtic,0,NO LOGON,0,,0,0,...,(480) 555-0100,(480) 555-0101,mudar@tailspintoys.com,,,,1,2013-01-01 00:00:00.0000000,9999-12-31 23:59:59.9999999,2013-01-01T00:00:00.000000


In [25]:
# Summary statistics of the people table, using describe() defaults.
df_people.describe()

Unnamed: 0,PersonID,IsPermittedToLogon,IsExternalLogonProvider,HashedPassword,IsSystemUser,IsEmployee,IsSalesperson,UserPreferences,Photo,CustomFields,OtherLanguages,LastEditedBy
count,906.0,906.0,906.0,0.0,906.0,906.0,906.0,0.0,0.0,0.0,0.0,906.0
mean,2020.357616,0.0,0.0,,0.0,0.0,0.0,,,,,1.0
std,744.497738,0.0,0.0,,0.0,0.0,0.0,,,,,0.0
min,1002.0,0.0,0.0,,0.0,0.0,0.0,,,,,1.0
25%,1259.25,0.0,0.0,,0.0,0.0,0.0,,,,,1.0
50%,2116.5,0.0,0.0,,0.0,0.0,0.0,,,,,1.0
75%,2376.75,0.0,0.0,,0.0,0.0,0.0,,,,,1.0
max,3238.0,0.0,0.0,,0.0,0.0,0.0,,,,,1.0


In [28]:
# Column names, non-null counts and dtypes of the people table.
df_people.info()

<class 'pandas.DataFrame'>
RangeIndex: 906 entries, 0 to 905
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   PersonID                 906 non-null    int64  
 1   FullName                 906 non-null    str    
 2   PreferredName            906 non-null    str    
 3   SearchName               906 non-null    str    
 4   IsPermittedToLogon       906 non-null    int64  
 5   LogonName                906 non-null    str    
 6   IsExternalLogonProvider  906 non-null    int64  
 7   HashedPassword           0 non-null      float64
 8   IsSystemUser             906 non-null    int64  
 9   IsEmployee               906 non-null    int64  
 10  IsSalesperson            906 non-null    int64  
 11  UserPreferences          0 non-null      float64
 12  PhoneNumber              906 non-null    str    
 13  FaxNumber                906 non-null    str    
 14  EmailAddress             906 non-null