### Machine Learning for Data Quality Prediction
**Description**: Use a machine learning model to predict data quality issues.

**Steps**:
1. Create a mock dataset with features and label (quality issue/label: 0: good, 1: issue).
2. Train a machine learning model.
3. Evaluate the model performance.

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def train_and_evaluate(data, target_column):
    """
    Trains a RandomForestClassifier model on the given data and evaluates its performance.

    Args:
        data (pd.DataFrame): The input data.
        target_column (str): The name of the target column.

    Returns:
        tuple: A tuple containing the classification report and confusion matrix.  Returns None, None if an error occurs.
    """

    try:
        # Data validation
        if not isinstance(data, pd.DataFrame):
            raise TypeError("Input data must be a pandas DataFrame.")
        if data.empty:
            raise ValueError("Input data cannot be empty.")
        if target_column not in data.columns:
            raise ValueError(f"Target column '{target_column}' not found in the data.")
        if data[target_column].isnull().any():
            raise ValueError(f"Target column '{target_column}' contains missing values.")


        # Prepare data
        X = data.drop(target_column, axis=1)
        y = data[target_column]

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

        # Model training
        model = RandomForestClassifier(random_state=42) #Add random_state for reproducibility
        model.fit(X_train, y_train)

        # Model prediction
        y_pred = model.predict(X_test)

        # Evaluation
        classification_rep = classification_report(y_test, y_pred)
        confusion_mat = confusion_matrix(y_test, y_pred)

        logging.info("Model training and evaluation completed successfully.")
        return classification_rep, confusion_mat

    except (TypeError, ValueError) as e:
        logging.error(f"Data validation error: {e}")
        return None, None  # Indicate failure

    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        return None, None # Indicate failure


if __name__ == '__main__':
    # Example usage
    # Create a sample dataset
    data = pd.DataFrame({
        'feature1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'feature2': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
        'target': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    })

    # Train and evaluate the model
    classification_report_result, confusion_matrix_result = train_and_evaluate(data, 'target')

    if classification_report_result and confusion_matrix_result:
        print("Classification Report:\n", classification_report_result)
        print("Confusion Matrix:\n", confusion_matrix_result)
    else:
        print("Model training and evaluation failed. See logs for details.")

2025-05-15 06:29:01,943 - INFO - Model training and evaluation completed successfully.


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()