In [None]:
"""
Data Validation Utilities for Fake News Detection

This notebook applies the project's data validation utilities (`DataValidator`) to validate and clean all columns in the fake news dataset.
It handles null values, blank fields, and malformed data, ensuring high data quality for downstream analysis.

The implementation uses Spark's distributed processing capabilities to ensure scalability.
"""

In [None]:
import os
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, length, trim, when, lit, regexp_replace, udf
from pyspark.sql.types import StringType, BooleanType
import re
import string
from datetime import datetime

In [None]:
# Configure Spark session optimized for Databricks Community Edition.
# getOrCreate() hands back the already-running session when one exists, so
# the options below are requests rather than guarantees.
# NOTE(review): spark.driver.memory is normally fixed when the driver JVM
# starts; setting it here likely has no effect on a live session - confirm
# against the cluster configuration.
spark = (
    SparkSession.builder
    .appName("FakeNewsDetection_DataValidation")
    .config("spark.sql.shuffle.partitions", "8")
    .config("spark.driver.memory", "8g")
    .enableHiveSupport()
    .getOrCreate()
)

# Display Spark configuration
print(f"Spark version: {spark.version}")
print(f"Shuffle partitions: {spark.conf.get('spark.sql.shuffle.partitions')}")
# A default is supplied so the lookup cannot raise when the key is not
# defined on the session that getOrCreate() returned.
print(f"Driver memory: {spark.conf.get('spark.driver.memory', 'not set')}")

In [None]:
# Import custom modules
# NOTE(review): project-local import; the BDA02_preprocessing package must be
# importable (on sys.path) before this cell runs.
from BDA02_preprocessing.data_validation_utils import DataValidator

# Initialize validator
# One instance, constructed with no arguments, is reused by the later cells
# that call analyze_data_quality() and validate_and_clean().
validator = DataValidator()

In [None]:
# Load sample data.
# These are the driver-local (FUSE-mounted) locations of the uploaded files;
# they are what os.path.exists() can see on a Databricks driver.
fake_path = "/dbfs/FileStore/tables/fake.csv"
true_path = "/dbfs/FileStore/tables/true.csv"

# Check if files exist, otherwise use local paths.
# Both files must be present, and the check is done on the local mount path.
if os.path.exists(fake_path) and os.path.exists(true_path):
    # Spark addresses DBFS directly, so swap the local "/dbfs" mount prefix
    # for the "dbfs:" scheme before handing the paths to spark.read.
    fake_path = "dbfs:" + fake_path[len("/dbfs"):]
    true_path = "dbfs:" + true_path[len("/dbfs"):]
else:
    fake_path = "../01_data_ingestion/Fake.csv"
    true_path = "../01_data_ingestion/True.csv"

# Load data with sampling for demonstration.
# A fixed seed keeps the 10% sample identical across Restart & Run All.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 42
fake_df = spark.read.csv(fake_path, header=True, inferSchema=True).sample(
    fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED
)
true_df = spark.read.csv(true_path, header=True, inferSchema=True).sample(
    fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED
)

# Combine datasets with label
fake_df = fake_df.withColumn("label", lit(0))  # 0 for fake
true_df = true_df.withColumn("label", lit(1))  # 1 for true
# unionByName matches columns by header name instead of position, so a
# different column order in the two CSVs cannot silently misalign the data.
df = fake_df.unionByName(true_df)

# Display sample
print(f"Total rows: {df.count()}")
display(df.limit(5))

In [None]:
# Ask the validator for quality metrics on the combined (uncleaned) dataset
quality_metrics = validator.analyze_data_quality(df)

# Report each metric family under its own heading, one line per column,
# with the score formatted to two decimals.
print("Data Quality Metrics:")
for heading, metric_key in (("\nCompleteness:", 'completeness'), ("\nValidity:", 'validity')):
    print(heading)
    for column, score in quality_metrics[metric_key].items():
        print(f"  {column}: {score:.2f}")

In [None]:
# Run the validator's cleaning pass over the combined dataset
cleaned_df = validator.validate_and_clean(df)

# Report the row counts before and after cleaning, then preview the result.
# Each count is a separate Spark action and is printed as soon as it is known.
original_row_count = df.count()
print(f"Original row count: {original_row_count}")
cleaned_row_count = cleaned_df.count()
print(f"Cleaned row count: {cleaned_row_count}")
cleaned_preview = cleaned_df.limit(5)
display(cleaned_preview)

In [None]:
# Leakage check: count rows per (subject, label) pair so it is visible
# whether subject values line up with a single label.
print("Subject distribution by label:")
grouping_columns = ["subject", "label"]
subject_label_counts = (
    cleaned_df
    .groupBy(*grouping_columns)
    .count()
    .orderBy(*grouping_columns)
)
display(subject_label_counts)

In [None]:
# Drop the subject column so it cannot leak the label into later steps
columns_to_remove = ("subject",)
final_df = cleaned_df.drop(*columns_to_remove)

# Show the resulting schema followed by a five-row preview
print("Final dataset schema:")
final_df.printSchema()
final_preview = final_df.limit(5)
display(final_preview)

In [None]:
# Persist the validated dataset as Parquet for the next steps.
# "overwrite" mode replaces whatever a previous run left at the same path.
output_path = "../processed_data/validated_data.parquet"
parquet_writer = final_df.write.mode("overwrite")
parquet_writer.parquet(output_path)
print(f"Validated data saved to {output_path}")

In [None]:
# Re-run the quality analysis on the cleaned dataset (subject column removed)
final_metrics = validator.analyze_data_quality(final_df)

# Report each metric family under its own heading, one line per column,
# with the score formatted to two decimals.
print("Final Data Quality Metrics:")
for heading, metric_key in (("\nCompleteness:", 'completeness'), ("\nValidity:", 'validity')):
    print(heading)
    for column, score in final_metrics[metric_key].items():
        print(f"  {column}: {score:.2f}")