## AI & Machine Learning for Data Quality
**Description**: AI and machine learning can automate and enhance data quality checks by learning patterns and identifying anomalies more effectively than static rules.

**Task 1**: Training a model to predict and flag unusual trend patterns in sales data that
deviate from historical norms.

**Task 2**: Using clustering algorithms to detect duplicate records where entries are not
exactly identical.

In [17]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import logging
import unittest

# Configure logging for error tracking
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


# Data preparation with error handling
def prepare_data(df, column_name):
    try:
        if df.empty:
            raise ValueError("Input DataFrame is empty.")
        if column_name not in df.columns:
            raise KeyError(f"Column '{column_name}' not found in DataFrame.")
        if df[column_name].isnull().any():
            df[column_name] = df[column_name].fillna(df[column_name].mean())
            logging.warning(f"Missing values imputed in '{column_name}'.")
        return df[column_name].values.reshape(-1, 1)
    except (KeyError, TypeError, ValueError) as e:
        logging.error(f"prepare_data error: {e}")
        raise


# Model training with error handling
def train_anomaly_model(data, contamination=0.1):
    try:
        model = IsolationForest(contamination=contamination, random_state=42)
        model.fit(data)
        return model
    except Exception as e:
        logging.error(f"train_anomaly_model error: {e}")
        raise


# Detect anomalies with error handling
def detect_anomalies(model, data):
    try:
        scores = model.decision_function(data)
        predictions = model.predict(data)
        return scores, predictions
    except Exception as e:
        logging.error(f"detect_anomalies error: {e}")
        raise


# Complete anomaly detection pipeline
def anomaly_detection(df, column_name):
    try:
        data = prepare_data(df, column_name)
        model = train_anomaly_model(data)
        scores, predictions = detect_anomalies(model, data)
        return scores, predictions
    except Exception as e:
        logging.error(f"anomaly_detection error: {e}")
        return None, None


# Example usage
if __name__ == '__main__':
    # Example data with an outlier and missing value
    data = {'value': [1, 2, 3, 4, 5, 1000, None]}
    df = pd.DataFrame(data)

    scores, preds = anomaly_detection(df, 'value')

    if scores is not None and preds is not None:
        print("Anomaly Scores:", scores)
        print("Anomaly Predictions:", preds)
    else:
        print("Anomaly detection failed. Check logs for details.")


# Unit tests
class TestAnomalyDetection(unittest.TestCase):

    def test_prepare_data_valid(self):
        df = pd.DataFrame({'val': [1, 2, 3]})
        result = prepare_data(df, 'val')
        self.assertEqual(result.shape, (3, 1))

    def test_prepare_data_missing_column(self):
        df = pd.DataFrame({'a': [1, 2]})
        with self.assertRaises(KeyError):
            prepare_data(df, 'missing')

    def test_prepare_data_empty(self):
        df = pd.DataFrame()
        with self.assertRaises(ValueError):
            prepare_data(df, 'any')

    def test_train_anomaly_model(self):
        data = np.array([[1], [2], [3]])
        model = train_anomaly_model(data)
        self.assertIsNotNone(model)

    def test_detect_anomalies(self):
        data = np.array([[1], [2], [3]])
        model = train_anomaly_model(data)
        scores, preds = detect_anomalies(model, data)
        self.assertEqual(len(scores), len(data))
        self.assertEqual(len(preds), len(data))

    def test_full_pipeline(self):
        df = pd.DataFrame({'value': [1, 2, 3, 100]})
        scores, preds = anomaly_detection(df, 'value')
        self.assertIsNotNone(scores)
        self.assertIsNotNone(preds)
        self.assertEqual(len(scores), len(df))
        self.assertEqual(len(preds), len(df))


if __name__ == '__main__':
    unittest.main()



Anomaly Scores: [ 0.25798967  0.3036381   0.31073271  0.30141526  0.258531   -0.07669632
  0.05113088]
Anomaly Predictions: [ 1  1  1  1  1 -1  1]


usage: ipykernel_launcher.py [-h] [-v] [-q] [--locals] [-f] [-c] [-b]
                             [-k TESTNAMEPATTERNS]
                             [tests ...]
ipykernel_launcher.py: error: argument -f/--failfast: ignored explicit argument '/home/vscode/.local/share/jupyter/runtime/kernel-v331ad0828149bc81b57206b41355fca79a661f26a.json'


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [18]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

def load_data(file_path):
    """Loads data from a CSV file."""
    try:
        data = pd.read_csv(file_path)
        return data
    except FileNotFoundError as e:
        logging.error(f"File not found: {file_path}")
        raise e
    except pd.errors.ParserError as e:
        logging.error(f"Parsing error while reading the file: {file_path}")
        raise e

def validate_columns(data, required_columns):
    """Checks if required columns exist in the DataFrame."""
    missing_cols = [col for col in required_columns if col not in data.columns]
    if missing_cols:
        raise ValueError(f"Missing columns: {missing_cols}")

def handle_duplicates(data):
    """Removes duplicate rows from the DataFrame."""
    before = len(data)
    data_deduped = data.drop_duplicates()
    after = len(data_deduped)
    logging.info(f"Removed {before - after} duplicates.")
    return data_deduped

def detect_anomalies(data, feature_cols):
    """Detects anomalies using Isolation Forest."""
    try:
        clf = IsolationForest(contamination=0.05, random_state=42)
        clf.fit(data[feature_cols])
        data['anomaly'] = clf.predict(data[feature_cols])
        # Convert -1 to True for anomalies
        data['is_anomaly'] = data['anomaly'] == -1
        return data
    except Exception as e:
        logging.error("Error during anomaly detection")
        raise e

def train_classifier(data, feature_cols, target_col):
    """Trains a classifier to validate data labels."""
    try:
        X = data[feature_cols]
        y = data[target_col]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        # Example classifier
        from sklearn.ensemble import RandomForestClassifier
        clf = RandomForestClassifier(random_state=42)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        report = classification_report(y_test, y_pred)
        logging.info(f"Classification report:\n{report}")
        return clf
    except Exception as e:
        logging.error("Error during classifier training")
        raise e

def main(file_path):
    """Main pipeline for data quality assessment."""
    try:
        data = load_data(file_path)
        required_columns = ['feature1', 'feature2', 'label']
        validate_columns(data, required_columns)
        data = handle_duplicates(data)
        data_with_anomalies = detect_anomalies(data, ['feature1', 'feature2'])
        classifier = train_classifier(data_with_anomalies, ['feature1', 'feature2'], 'label')
        print("Data quality assessment completed successfully.")
    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    # Example usage
    file_path = 'your_data.csv'
    main(file_path)

2025-05-15 06:39:07,151 - ERROR - File not found: your_data.csv


An error occurred: [Errno 2] No such file or directory: 'your_data.csv'


In [15]:
# write your code from here

import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack

# 1. Sample dataset simulating slightly different duplicate records
data = {
    'Name': [
        'John Doe', 'Jon Doe', 'Jane Smith', 'Janet Smith', 'Jake Long',
        'Jack Long', 'John Doe', 'J0hn Do', 'Jane Smyth', 'Jake L0ng'
    ],
    'Email': [
        'john@example.com', 'jon@example.com', 'jane.smith@example.com', 'janet@example.com',
        'jake.long@example.com', 'jack.long@example.com', 'john.doe@example.com',
        'john@example.com', 'jane.s@example.com', 'jake@example.com'
    ],
    'Phone': [
        '555-1234', '555-1234', '555-5678', '555-5678', '555-8765',
        '555-8766', '555-1234', '555-1234', '555-5678', '555-8765'
    ]
}

df = pd.DataFrame(data)

# 2. Convert categorical/text fields to numeric features
# Vectorize names and emails with TF-IDF
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,4))  # char ngrams for fuzzy similarity
name_vec = vectorizer.fit_transform(df['Name'])
email_vec = vectorizer.fit_transform(df['Email'])

# 3. Encode phone numbers (simple numeric encoding after removing non-digits)
df['Phone_Num'] = df['Phone'].str.replace(r'\D', '', regex=True).astype(int).values.reshape(-1, 1)

# 4. Combine all features into one feature matrix
from scipy.sparse import csr_matrix
phone_scaled = StandardScaler().fit_transform(df[['Phone_Num']])
phone_sparse = csr_matrix(phone_scaled)
features = hstack([name_vec, email_vec, phone_sparse])

# 5. Apply DBSCAN clustering to group similar records (eps controls similarity threshold)
dbscan = DBSCAN(eps=0.5, min_samples=1, metric='cosine')
clusters = dbscan.fit_predict(features)

df['Cluster'] = clusters

# 6. Display clusters with more than one member (potential duplicates)
duplicates = df.groupby('Cluster').filter(lambda x: len(x) > 1).sort_values('Cluster')

print("🔍 Potential duplicate groups detected:")
for cluster_id, group in duplicates.groupby('Cluster'):
    print(f"\nCluster {cluster_id}:")
    print(group[['Name', 'Email', 'Phone']].to_string(index=False))

🔍 Potential duplicate groups detected:

Cluster 0:
    Name                Email    Phone
John Doe     john@example.com 555-1234
 Jon Doe      jon@example.com 555-1234
John Doe john.doe@example.com 555-1234
 J0hn Do     john@example.com 555-1234

Cluster 1:
       Name                  Email    Phone
 Jane Smith jane.smith@example.com 555-5678
Janet Smith      janet@example.com 555-5678
 Jane Smyth     jane.s@example.com 555-5678

Cluster 2:
     Name                 Email    Phone
Jake Long jake.long@example.com 555-8765
Jack Long jack.long@example.com 555-8766
Jake L0ng      jake@example.com 555-8765


**Task 3**: Implementing classification models to validate data based on learned
characteristics from labeled datasets.

In [16]:
# write your code from here
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 1. Mock labeled dataset (features + label)
data = {
    'feature1': [10, 15, 14, 10, 8, 50, 55, 52, 48, 60],
    'feature2': [100, 110, 105, 95, 90, 300, 310, 305, 290, 320],
    'feature3': [5, 7, 6, 5, 4, 20, 21, 19, 22, 25],
    'label':    [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]  # 0=Valid, 1=Invalid
}
df = pd.DataFrame(data)

# 2. Split dataset
X = df.drop('label', axis=1)
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 3. Train classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# 4. Evaluate model
y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 5. Validate new data
new_data = pd.DataFrame({
    'feature1': [12, 53],
    'feature2': [102, 295],
    'feature3': [6, 18]
})

predictions = clf.predict(new_data)
print("\nNew data validation predictions (0=Valid, 1=Invalid):", predictions)

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

Confusion Matrix:
 [[1 0]
 [0 2]]

New data validation predictions (0=Valid, 1=Invalid): [0 1]
