## Detect Schema Mismatches in Data Pipelines
**Objective**: Identify and resolve schema mismatches that commonly occur in data pipelines.

**Task**: Missing Column

1. Load the source DataFrame with the following schema:
    - id : Integer
    - email : String
    - signup_date : Date
2. Load the target DataFrame with the following schema:
    - id : Integer
    - email : String
3. Implement a check to identify any columns that are present in the source DataFrame but missing in the target.
4. Add the missing `signup_date` column to the target DataFrame.

In [2]:
# write your code from here

import pandas as pd
from datetime import datetime

def detect_missing_columns(source_df: pd.DataFrame, target_df: pd.DataFrame) -> set:
    """Report which source columns the target lacks.

    Args:
        source_df: DataFrame whose columns define the expected schema.
        target_df: DataFrame being checked against that schema.

    Returns:
        The set of column labels found in ``source_df`` but not in
        ``target_df``; empty when the target already has every source column.
    """
    # Build the lookup once so each membership test is a hash lookup.
    target_names = set(target_df.columns)
    return {name for name in source_df.columns if name not in target_names}

def add_missing_columns(target_df: pd.DataFrame, missing_cols: set, source_df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of target_df extended with the columns named in missing_cols.

    The input ``target_df`` is left untouched; the extended copy is returned.
    Columns that exist in ``source_df`` are appended in source column order
    (so the result does not depend on set iteration order) and are created
    with the source column's dtype and a placeholder value:

    - datetime-like dtype -> ``NaT``
    - numeric dtype       -> ``0``
    - anything else       -> ``''``

    Names in ``missing_cols`` that are not columns of ``source_df`` are
    appended last, sorted by their string form, and filled with ``None``.

    Args:
        target_df: DataFrame that lacks some columns.
        missing_cols: Labels of the columns to add.
        source_df: DataFrame used to look up each missing column's dtype.

    Returns:
        A new DataFrame containing every column of ``target_df`` plus the
        requested missing columns.
    """
    # Work on a copy so the caller's DataFrame is never modified in place.
    result = target_df.copy()

    # Follow the source's column order; dict.fromkeys drops repeated labels
    # while keeping first-seen order.
    from_source = [col for col in dict.fromkeys(source_df.columns) if col in missing_cols]
    known = set(from_source)
    # Names with no source counterpart get a stable order of their own.
    not_in_source = sorted((col for col in missing_cols if col not in known), key=str)

    for col in from_source:
        dtype = source_df[col].dtype
        if pd.api.types.is_datetime64_any_dtype(dtype):
            # Passing dtype keeps the source's timezone/unit on the new column.
            result[col] = pd.Series(pd.NaT, index=result.index, dtype=dtype)
        elif pd.api.types.is_numeric_dtype(dtype):
            # Passing dtype keeps e.g. float columns float instead of int 0.
            result[col] = pd.Series(0, index=result.index, dtype=dtype)
        else:
            result[col] = ''

    for col in not_in_source:
        # No dtype information available, so add an empty column.
        result[col] = None

    return result

def main():
    """Run the missing-column demo on two small in-memory DataFrames.

    Builds a source frame (id, email, signup_date) and a target frame
    (id, email), prints the columns the target lacks, and, if there are any,
    prints the target after those columns have been added.
    """
    # Source side: full schema, with signup_date parsed to datetimes.
    source_df = pd.DataFrame({
        'id': [1, 2, 3],
        'email': ['a@example.com', 'b@example.com', 'c@example.com'],
        'signup_date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03']),
    })

    # Target side: same rows, but signup_date is absent.
    target_df = pd.DataFrame({
        'id': [1, 2, 3],
        'email': ['a@example.com', 'b@example.com', 'c@example.com'],
    })

    missing_cols = detect_missing_columns(source_df, target_df)
    print(f"Missing columns in target DataFrame: {missing_cols}")

    # Nothing to repair: report it and stop.
    if not missing_cols:
        print("\nNo missing columns detected.")
        return

    target_df = add_missing_columns(target_df, missing_cols, source_df)
    print("\nTarget DataFrame after adding missing columns:")
    print(target_df)

# Basic tests
def test_detect_missing_columns():
    """Check that a column present only in the first frame is reported."""
    with_both = pd.DataFrame({'a': [1], 'b': [2]})
    only_a = pd.DataFrame({'a': [1]})
    assert detect_missing_columns(with_both, only_a) == {'b'}

def test_add_missing_columns():
    """Check that missing string and datetime columns are added with defaults."""
    source = pd.DataFrame({
        'num_col': [1, 2],
        'str_col': ['x', 'y'],
        'date_col': pd.to_datetime(['2020-01-01', '2020-01-02']),
    })
    target = pd.DataFrame({'num_col': [10, 20]})

    to_add = detect_missing_columns(source, target)
    target = add_missing_columns(target, to_add, source)

    # Both absent columns must now exist on the target.
    for expected in ('str_col', 'date_col'):
        assert expected in target.columns

    # String columns default to '', datetime columns to a missing value.
    first_str = target['str_col'].iloc[0]
    first_date = target['date_col'].iloc[0]
    assert first_str == ''
    assert pd.isna(first_date)

if __name__ == "__main__":
    # Run the self-checks first so the demo only prints if they pass.
    for check in (test_detect_missing_columns, test_add_missing_columns):
        check()
    main()

Missing columns in target DataFrame: {'signup_date'}

Target DataFrame after adding missing columns:
   id          email signup_date
0   1  a@example.com         NaT
1   2  b@example.com         NaT
2   3  c@example.com         NaT
