In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

# ============================
# Helper Functions for Reusability
# ============================

# Function to handle missing data
def handle_missing_data(df, column_name):
    """Print how many values in one column are missing, as a count and a percentage.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Label of the column to inspect.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    column = df[column_name]
    total_data = len(column)
    # int() turns the NumPy integer into a plain int before dividing.
    missing_data = int(column.isnull().sum())
    # An empty column has nothing missing; report 0% rather than dividing
    # zero by zero, which would print "nan%".
    missing_percentage = (missing_data / total_data) * 100 if total_data else 0.0
    print(f"Missing data in {column_name}: {missing_data} records ({missing_percentage:.2f}%)")

# Function to detect duplicate data
def handle_duplicates(df):
    """Print the number of rows that repeat an earlier row of ``df``.

    Args:
        df: DataFrame to scan; rows are compared across all columns.
    """
    # duplicated() flags every repeat after the first occurrence, so the
    # number of True entries is the number of redundant rows.
    repeat_mask = df.duplicated()
    print(f"Duplicate rows: {repeat_mask.sum()}")

# Function for checking and fixing inconsistent formats (e.g., date formats)
def fix_inconsistent_date_formats(df, column_name, target_format='%Y-%m-%d'):
    """Rewrite a column of mixed-format dates as strings in one target format.

    The column is modified in place on ``df``. Values that cannot be parsed
    as dates become missing (NaN) in the result, and a warning line reports
    how many such values there were.

    Ambiguous numeric dates such as ``'01/12/2021'`` are read with pandas'
    default month-first interpretation.

    Args:
        df: DataFrame that contains ``column_name``; mutated in place.
        column_name: Label of the column holding the date values.
        target_format: ``strftime`` pattern used for the output strings.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    original = df[column_name]
    # Parse each value on its own. A single vectorised to_datetime() call
    # infers one format from the first value (pandas >= 2.0) and, with
    # errors='coerce', turns every value in a different format into NaT,
    # which defeats the purpose of this function.
    parsed = pd.to_datetime(
        original.map(lambda value: pd.to_datetime(value, errors='coerce')),
        errors='coerce',
    )
    # Values that were present before parsing but are missing afterwards.
    unparsed_count = int((parsed.isna() & original.notna()).sum())
    df[column_name] = parsed.dt.strftime(target_format)
    if unparsed_count:
        print(f"Warning: {unparsed_count} value(s) in {column_name} could not be parsed as dates")
    print(f"Inconsistent formats fixed in {column_name}")

# Function to detect data drift using T-test
def detect_data_drift(data_1, data_2):
    """Compare two samples with SciPy's independent two-sample t-test.

    Args:
        data_1: First sample of numeric observations.
        data_2: Second sample of numeric observations.

    Returns:
        A ``(t_stat, p_value)`` tuple taken from ``ttest_ind``.
    """
    outcome = ttest_ind(data_1, data_2)
    return outcome.statistic, outcome.pvalue

# ============================
# 1. Identifying Missing Data
# ============================
# Example DataFrame with missing data: each column holds exactly one None
# among four rows, so both reports below show 1 record (25.00%) missing.
# TransactionDate is stored as plain strings here, not as datetimes.
df = pd.DataFrame({
    'CustomerEmail': ['email1@example.com', 'email2@example.com', None, 'email4@example.com'],
    'TransactionDate': [None, '2021-01-15', '2021-01-16', '2021-01-17'],
})

# Print the missing-value count and percentage for each column.
handle_missing_data(df, 'CustomerEmail')
handle_missing_data(df, 'TransactionDate')

# ============================
# 2. Duplicate Data
# ============================
# Example DataFrame with duplicate data: the second and third rows are
# identical (CustomerID 2, OrderAmount 200), so one duplicate row is reported.
df_duplicates = pd.DataFrame({
    'CustomerID': [1, 2, 2, 3, 4],
    'OrderAmount': [100, 200, 200, 150, 250]
})

# Print the number of fully duplicated rows.
handle_duplicates(df_duplicates)

# ============================
# 3. Inconsistent Formats
# ============================
# Example DataFrame with inconsistent date formats: the four values use four
# different layouts (slashes vs. dashes, year first vs. year last).
# NOTE(review): '01/12/2021' and '12-03-2021' are ambiguous between
# day-first and month-first readings; confirm which one is intended.
df_dates = pd.DataFrame({
    'OrderDate': ['01/12/2021', '2021-12-02', '12-03-2021', '2021/12/04']
})
# Rewrites df_dates['OrderDate'] in place using the default '%Y-%m-%d' format.
fix_inconsistent_date_formats(df_dates, 'OrderDate')

# ============================
# 4. Data Drift Detection
# ============================

# Example Data for Data Drift
# The two revenue series are identical except for the last (June) value.
# `months` labels the six observations; it is not passed to the test below.
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']
revenue = [1000, 1100, 1050, 1200, 1250, 1300]  # Original revenue data
revenue_with_drift = [1000, 1100, 1050, 1200, 1250, 1500]  # Drift introduced in June

# Run the two-sample t-test on the two series and print the statistic and
# p-value to four decimal places.
t_stat, p_value = detect_data_drift(revenue, revenue_with_drift)
print(f"T-Test for Revenue Drift - Statistic: {t_stat:.4f}, p-value: {p_value:.4f}")

# ============================
# Additional Robust Error Handling Example
# ============================

# Function to ensure valid data types before processing
def validate_data_types(df, column_name, expected_type):
    """Check whether a column's dtype is a subtype of ``expected_type``.

    Prints an error message when the dtype does not match.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Label of the column whose dtype is checked.
        expected_type: NumPy type or abstract type to test against,
            e.g. ``np.datetime64`` or ``np.number``.

    Returns:
        True if the column dtype is a subtype of ``expected_type``,
        otherwise False.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    column_dtype = df[column_name].dtype
    try:
        matches = np.issubdtype(column_dtype, expected_type)
    except TypeError:
        # NumPy cannot interpret some pandas dtypes (e.g. categorical) and
        # raises TypeError; a validator should report a mismatch, not crash.
        matches = False
    if not matches:
        print(f"Error: {column_name} is not of type {expected_type}")
        return False
    return True

# Check for valid data types
# `df` is the missing-data example frame, whose TransactionDate column was
# built from strings and None rather than datetimes. The success message
# below is printed only if the column has a datetime64 dtype.
if validate_data_types(df, 'TransactionDate', np.datetime64):
    print("Data type for TransactionDate is valid")

Missing data in CustomerEmail: 1 records (25.00%)
Missing data in TransactionDate: 1 records (25.00%)
Duplicate rows: 1
Inconsistent formats fixed in OrderDate
T-Test for Revenue Drift - Statistic: -0.3780, p-value: 0.7134
Error: TransactionDate is not of type <class 'numpy.datetime64'>
