## Detect Schema Mismatches in Data Pipelines
**Objective**: Identify and resolve schema mismatches that commonly occur in data pipelines.

**Task**: Column Name Mismatch

**Steps**:
1. Load the source DataFrame with the following schema:
    - id : Integer
    - name : String
    - age : Integer
2. Load the target DataFrame with the following schema:
    - id : Integer
    - fullname : String
    - age : Integer
3. Use a schema comparison tool or write a simple function to detect mismatches in column names.
4. Resolve the mismatch by renaming the `fullname` column in the target DataFrame to `name`.

In [2]:
# Write your code here

import pandas as pd

def detect_schema_mismatch(source_df: pd.DataFrame, target_df: pd.DataFrame) -> dict:
    """
    Compare the column names of a source and a target DataFrame.

    Only the column labels are compared; dtypes and column order play no
    part in the result.

    Returns a dict of sets with keys:
        - 'missing_in_target': labels present in source_df but not in target_df
        - 'extra_in_target': labels present in target_df but not in source_df
        - 'common_columns': labels present in both DataFrames
    """
    in_source = set(source_df.columns)
    in_target = set(target_df.columns)

    return {
        'missing_in_target': in_source.difference(in_target),
        'extra_in_target': in_target.difference(in_source),
        'common_columns': in_source & in_target,
    }

def resolve_column_rename(target_df: pd.DataFrame, rename_map: dict) -> pd.DataFrame:
    """
    Rename columns in target_df according to rename_map {old_name: new_name}.

    The renamed DataFrame is returned; the column labels of target_df itself
    are not modified.

    Raises:
        ValueError: if a key of rename_map is not a column of target_df, or
            if the rename would give two differently named columns the same
            name (for example renaming 'fullname' to 'name' while a 'name'
            column already exists).
    """
    # Every column that is to be renamed must actually exist in the target.
    missing_cols = [col for col in rename_map if col not in target_df.columns]
    if missing_cols:
        raise ValueError(f"Columns {missing_cols} not found in target DataFrame for renaming.")

    # Group the original labels by the label they would carry after the
    # rename. A resulting label fed by more than one distinct original label
    # means the rename would silently create duplicate column names. Swaps
    # such as {'a': 'b', 'b': 'a'} remain valid, and duplicates that already
    # exist in target_df are not attributed to the rename.
    sources_by_label = {}
    for col in target_df.columns:
        sources_by_label.setdefault(rename_map.get(col, col), set()).add(col)
    colliding = [label for label, sources in sources_by_label.items() if len(sources) > 1]
    if colliding:
        raise ValueError(f"Renaming would create duplicate column names: {colliding}.")

    return target_df.rename(columns=rename_map)

# Example usage with sample DataFrames
def main():
    """Demonstrate detecting and fixing the name/fullname column mismatch."""
    # Source schema: id (int), name (str), age (int)
    source_df = pd.DataFrame({
        'id': [1, 2, 3],
        'name': ['Alice', 'Bob', 'Charlie'],
        'age': [25, 30, 35],
    })

    # Target schema: id (int), fullname (str), age (int)
    target_df = pd.DataFrame({
        'id': [1, 2, 3],
        'fullname': ['Alice', 'Bob', 'Charlie'],
        'age': [25, 30, 35],
    })

    # Report how the two sets of column names differ.
    report = detect_schema_mismatch(source_df, target_df)
    print("Schema Mismatches:")
    print(f"Missing in target: {report['missing_in_target']}")
    print(f"Extra in target: {report['extra_in_target']}")
    print(f"Common columns: {report['common_columns']}")

    # The rename only applies when 'fullname' is the surplus column and
    # 'name' is the one the target lacks; otherwise there is nothing to do.
    has_fullname = 'fullname' in report['extra_in_target']
    lacks_name = 'name' in report['missing_in_target']
    if not (has_fullname and lacks_name):
        print("\nNo rename needed.")
        return

    target_df_fixed = resolve_column_rename(target_df, {'fullname': 'name'})
    print("\nRenamed target DataFrame columns:")
    print(target_df_fixed.columns)

# Basic tests
def test_detect_schema_mismatch():
    """Check all three result sets for two frames that share only column 'a'."""
    source = pd.DataFrame({'a': [1], 'b': [2]})
    target = pd.DataFrame({'a': [1], 'c': [3]})
    report = detect_schema_mismatch(source, target)

    expected = {
        'missing_in_target': {'b'},
        'extra_in_target': {'c'},
        'common_columns': {'a'},
    }
    for key, columns in expected.items():
        assert report[key] == columns

def test_resolve_column_rename():
    """Check that resolve_column_rename renames columns and rejects unknown ones."""
    df = pd.DataFrame({'x': [1], 'y': [2]})

    # Happy path: 'y' is replaced by 'z'.
    renamed_df = resolve_column_rename(df, {'y': 'z'})
    assert 'z' in renamed_df.columns
    assert 'y' not in renamed_df.columns

    # Error path: renaming a column that does not exist must raise ValueError.
    try:
        resolve_column_rename(df, {'a': 'b'})
    except ValueError as e:
        assert str(e) == "Columns ['a'] not found in target DataFrame for renaming."
    else:
        # Without this branch the test would pass silently if no error were raised.
        raise AssertionError("Expected ValueError for a column missing from the DataFrame.")

if __name__ == "__main__":
    # Run the self-checks first, then the demo, in that order.
    for step in (test_detect_schema_mismatch, test_resolve_column_rename, main):
        step()

Schema Mismatches:
Missing in target: {'name'}
Extra in target: {'fullname'}
Common columns: {'age', 'id'}

Renamed target DataFrame columns:
Index(['id', 'name', 'age'], dtype='object')
