In [1]:
#%pip install pandas numpy
import pandas as pd
import numpy as np
import os
from datetime import datetime

# Source directories for the raw churn exports, one per upstream origin.
# Both are relative paths, so they resolve against the current working
# directory at run time.
data_dir = "../3.RawDataStorage/hagging/"
data_dirk = "../3.RawDataStorage/kaggle/"

# Full CSV paths, built from the directories above.
file_path_h = os.path.join(data_dir, "h-bank_churn.csv")
file_path_k = os.path.join(data_dirk, "k-bank_churn.csv")

def _findings_to_markdown(report_df):
    """Render the findings table as Markdown text.

    Uses ``DataFrame.to_markdown`` when its optional ``tabulate`` dependency
    can be imported; otherwise builds a plain pipe table by hand so the
    report can still be written.
    """
    try:
        return report_df.to_markdown(index=False)
    except ImportError:
        def cell(value):
            # Keep each cell on one line and stop '|' from splitting columns.
            return str(value).replace("|", "\\|").replace("\n", " ")

        lines = [
            "| " + " | ".join(cell(c) for c in report_df.columns) + " |",
            "| " + " | ".join("---" for _ in report_df.columns) + " |",
        ]
        lines.extend(
            "| " + " | ".join(cell(v) for v in row) + " |"
            for row in report_df.itertuples(index=False, name=None)
        )
        return "\n".join(lines)


def _write_quality_report(report_data, df=None):
    """Write the findings to ``data_quality_report.md`` and return its path.

    Args:
        report_data: List of ``{"Category": ..., "Description": ...}`` dicts.
        df: The combined dataset, or ``None`` when it could not be loaded;
            in that case the overview section states that no counts exist.

    Returns:
        The path of the written report (inside the current working
        directory).
    """
    report_df = pd.DataFrame(report_data)
    report_file = os.path.join(os.getcwd(), "data_quality_report.md")

    # Explicit encoding so the file does not depend on the platform default.
    with open(report_file, "w", encoding="utf-8") as f:
        f.write("# Data Quality Report\n\n")
        f.write(f"**Report Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write("This report summarizes the data quality findings from the `bank_churn` dataset.\n\n")
        f.write("## Dataset Overview\n")
        if df is None:
            f.write("- Dataset could not be loaded; no row or column counts are available.\n\n")
        else:
            f.write(f"- Total Rows: {len(df)}\n")
            f.write(f"- Total Columns: {len(df.columns)}\n\n")

        f.write("## Validation Findings Summary\n")
        f.write(_findings_to_markdown(report_df))

    return report_file


def run_data_validation():
    """
    Combines two datasets and performs a series of data quality checks,
    then generates a summary report.

    The CSV files at the module-level ``file_path_h`` and ``file_path_k``
    are concatenated row-wise. The combined frame is checked for missing
    values, IQR-based outliers in the expected numeric columns, non-0/1
    values in the expected flag columns, and fully duplicated rows.
    Progress is printed to stdout and the findings are written to
    ``data_quality_report.md`` in the current working directory.

    If either input file is missing, the ingestion error is recorded, the
    report is still written (containing only that finding), and the
    function returns early.

    Returns:
        None
    """

    # Initialize a list to store report findings
    report_data = []

    def add_report_finding(category, description):
        """Helper function to add findings to the report."""
        report_data.append({"Category": category, "Description": description})

    print(f"Starting data validation with time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("-" * 50)
    print("Step 1: Ingesting and Merging Data")

    try:
        # Load the two CSV files into pandas DataFrames
        df_h = pd.read_csv(file_path_h)
        df_k = pd.read_csv(file_path_k)
    except FileNotFoundError as e:
        print(f"Error: One of the files was not found. Please ensure both '{file_path_h}' and "
              f"'{file_path_k}' are in the correct directory.")
        add_report_finding("File Ingestion Error", f"Could not find required data file: {e}")
        # Persist the finding; returning without writing would discard it.
        report_file = _write_quality_report(report_data)
        print(f"\nData quality report saved to {report_file}")
        print("-" * 50)
        return

    # Stack the two DataFrames row-wise with a fresh 0..n-1 index
    df = pd.concat([df_h, df_k], ignore_index=True)

    print(f"Successfully merged {len(df_h)} rows from 'h-bank_churn.csv' and "
          f"{len(df_k)} rows from 'k-bank_churn.csv'.")
    print(f"Total rows in combined dataset: {len(df)}")
    print("-" * 50)

    print("Step 2: Checking for Missing Data (NaN values)")
    # Count missing values per column and keep only the affected columns
    missing_data = df.isnull().sum()
    columns_with_missing = missing_data[missing_data > 0]
    if not columns_with_missing.empty:
        print("Missing values found:")
        print(columns_with_missing)
        add_report_finding("Missing Data", "Missing values found in the following columns: "
                           f"{columns_with_missing.to_dict()}")
    else:
        print("No missing values found. Data is complete.")
        add_report_finding("Missing Data", "No missing values found.")
    print("-" * 50)

    print("Step 3: Validating Data Types and Initial Structure")
    # DataFrame.info() prints its summary itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    df.info()
    add_report_finding("Data Types", "Initial data types and structure verified. See log output for details.")
    print("-" * 50)

    print("Step 4: Checking for Inconsistent and Outlier Data")
    # Validate numerical columns for logical ranges and outliers
    numeric_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

    for col in numeric_cols:
        if col not in df.columns:
            # Report a schema problem instead of aborting with a KeyError.
            print(f"\nExpected numeric column '{col}' is missing; skipping outlier check.")
            add_report_finding("Schema", f"Expected numeric column '{col}' is missing from the dataset.")
            continue

        desc = df[col].describe()
        print(f"\nDescriptive statistics for '{col}':\n{desc}")
        if '25%' not in desc.index:
            # describe() only yields quartiles for numeric data.
            print(f"Column '{col}' is not numeric; skipping outlier check.")
            add_report_finding("Inconsistent Data",
                               f"Column '{col}' is not numeric (dtype: {df[col].dtype}).")
            continue

        # Identify outliers using IQR
        Q1 = desc['25%']
        Q3 = desc['75%']
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # NaN compares False on both sides, so missing values are never
        # counted as outliers here.
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        if not outliers.empty:
            print(f"Found {len(outliers)} outliers in '{col}' based on IQR method.")
            add_report_finding("Outliers", f"Found {len(outliers)} outliers in column '{col}'. "
                               f"Example: {outliers[col].head().tolist()}")

    # Check for inconsistent values in categorical columns
    categorical_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'Exited']
    binary_cols = {'HasCrCard', 'IsActiveMember', 'Exited'}
    print("\nValue counts for categorical columns:")
    for col in categorical_cols:
        if col not in df.columns:
            print(f"\nExpected categorical column '{col}' is missing; skipping.")
            add_report_finding("Schema", f"Expected categorical column '{col}' is missing from the dataset.")
            continue

        # value_counts() drops NaN by default, so missing entries do not
        # trip the 0/1 check below (they are reported in Step 2 instead).
        counts = df[col].value_counts()
        print(f"\n'{col}':\n{counts}")
        if col in binary_cols and not set(counts.index).issubset({0, 1}):
            add_report_finding("Inconsistent Data", f"Column '{col}' contains values other than 0 or 1.")

    print("-" * 50)

    print("Step 5: Identifying Duplicate Rows")
    # Count rows that exactly repeat an earlier row across all columns
    duplicate_rows = df.duplicated().sum()
    if duplicate_rows > 0:
        print(f"Found {duplicate_rows} duplicate rows.")
        add_report_finding("Duplicates", f"Found {duplicate_rows} duplicate rows in the dataset.")
    else:
        print("No duplicate rows found.")
        add_report_finding("Duplicates", "No duplicate rows found.")
    print("-" * 50)

    print("Validation complete. Generating data quality report...")

    # Save the comprehensive data quality report to a Markdown file
    report_file = _write_quality_report(report_data, df)

    print(f"\nData quality report saved to {report_file}")
    print("-" * 50)


# Execute the validation process only when this code runs as the main
# program (a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    run_data_validation()



Starting data validation with time: 2025-08-23 20:46:29
--------------------------------------------------
Step 1: Ingesting and Merging Data
Successfully merged 10000 rows from 'h-bank_churn.csv' and 5010 rows from 'k-bank_churn.csv'.
Total rows in combined dataset: 15010
--------------------------------------------------
Step 2: Checking for Missing Data (NaN values)
Missing values found:
Surname             50
Tenure             101
Balance            100
EstimatedSalary    100
dtype: int64
--------------------------------------------------
Step 3: Validating Data Types and Initial Structure
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15010 entries, 0 to 15009
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        15010 non-null  int64  
 1   CustomerId       15010 non-null  int64  
 2   Surname          14960 non-null  object 
 3   CreditScore      15010 non-null  int64  
 4   Geograph