# DATA ASSURANCE

In [None]:
# Load post-imputation dataset and initialize quality assessment tools
import os
import pandas as pd
from pathlib import Path
from scipy.stats import zscore

# Resolve the data folder without assuming the kernel started in notebooks/.
# The project-root layout (data/ one level above the working directory) is
# tried first; a data/ folder directly under the working directory is the
# fallback, which covers kernels launched from the project root.
data_filename = 'my_clean_data_with_imputation.csv'
candidate_folders = [Path.cwd().parent / 'data', Path.cwd() / 'data']
data_folder = next(
    (folder for folder in candidate_folders if (folder / data_filename).exists()),
    candidate_folders[0],  # nothing found: report the primary location in the error
)
data_file = data_folder / data_filename
if not data_file.exists():
    searched = ", ".join(str(folder) for folder in candidate_folders)
    raise FileNotFoundError(f"File not found: {data_file} (searched: {searched})")

df = pd.read_csv(data_file)
print(f"Successfully loaded dataset from: {data_file}")

# Collect the distinct day identifiers in sorted order.
# An empty list means there is no day_id column, so per-day breakdowns are skipped.
if 'day_id' in df.columns:
    print("Processing data by day using day_id column")
    unique_days = sorted(df['day_id'].unique())
    print(f"Number of unique days: {len(unique_days)}")
else:
    print("Warning: No day_id column found in dataset")
    unique_days = []

FileNotFoundError: File not found: c:\Users\13min\Final-Group-ML-Project-Theme-5\notebooks\data\my_clean_data_with_imputation.csv

## Dataset Dimensions Check

In [None]:
# Baseline size of the loaded dataset, reported as (rows, columns)
dataset_shape = df.shape
print("Initial dataset shape:", dataset_shape)

Initial dataset shape: (2837629, 13)


## Duplicate Row Detection

In [None]:
# Exact duplicate rows can inflate model performance, so count them first
duplicate_flags = df.duplicated()
duplicates = duplicate_flags.sum()
print(f"Duplicate rows: {duplicates}")

# Per-day breakdown, only when day_id exists and at least one day was found
has_day_column = 'day_id' in df.columns
if has_day_column and len(unique_days) > 0:
    print("\nDuplicates by day:")
    for day in unique_days:
        in_day = df['day_id'] == day
        day_data = df[in_day]
        # Duplicates are re-detected using only this day's rows
        day_duplicates = day_data.duplicated().sum()
        print(f"Day {day}: {day_duplicates} duplicates")

## GPS Coordinate Validation

In [None]:
# Latitude must lie in [-90, 90] and longitude in [-180, 180];
# the out-of-range mask is built once and reused for the per-day counts.
latitude = df["latitude"]
longitude = df["longitude"]
bad_latitude = (latitude < -90) | (latitude > 90)
bad_longitude = (longitude < -180) | (longitude > 180)
out_of_range = bad_latitude | bad_longitude

invalid_gps = df[out_of_range]
print(f"Invalid GPS rows: {len(invalid_gps)}")

# Per-day breakdown, only when day_id exists and at least one day was found
has_day_column = 'day_id' in df.columns
if has_day_column and len(unique_days) > 0:
    print("\nInvalid GPS by day:")
    for day in unique_days:
        in_day = df['day_id'] == day
        day_data = df[in_day]
        # Restrict the precomputed mask to this day's rows
        day_invalid = day_data[out_of_range[in_day]]
        print(f"Day {day}: {len(day_invalid)} invalid GPS rows")

## Negative Value Validation

In [None]:
# Latency and throughput cannot be negative; count every cell below zero
latency_fields = ["svr1", "svr2", "svr3", "svr4"]
throughput_fields = ["upload_bitrate_mbits/sec", "download_bitrate_rx_mbits/sec"]

neg_latency = df[latency_fields].lt(0).sum().sum()
neg_throughput = df[throughput_fields].lt(0).sum().sum()
print(f"Negative latency values: {neg_latency}")
print(f"Negative throughput values: {neg_throughput}")

# Per-day breakdown, only when day_id exists and at least one day was found
has_day_column = 'day_id' in df.columns
if has_day_column and len(unique_days) > 0:
    print("\nNegative values by day:")
    for day in unique_days:
        day_data = df[df['day_id'] == day]
        day_neg_latency = day_data[latency_fields].lt(0).sum().sum()
        day_neg_throughput = day_data[throughput_fields].lt(0).sum().sum()
        print(f"Day {day}: {day_neg_latency} negative latency, {day_neg_throughput} negative throughput")

## Define Key Metric Column Groups

In [None]:
# Shared column groups so the latency and throughput checks use the same names
latency_cols = [
    "svr1",
    "svr2",
    "svr3",
    "svr4",
]
throughput_cols = [
    "upload_bitrate_mbits/sec",
    "download_bitrate_rx_mbits/sec",
]

## Post-Imputation Completeness Verification

In [None]:
# Total the remaining missing cells across all columns; imputation should leave none
missing_per_column = df.isna().sum()
missing_any = missing_per_column.sum()
print(f"Missing values remaining: {missing_any} (should be 0 after imputation)")

Missing values remaining: 0 (should be 0 after imputation)


## Data Type Schema Validation

In [None]:
# Columns that must be numeric; any that are present with a non-numeric dtype are reported
expected_numeric = [
    "svr1",
    "svr2",
    "svr3",
    "svr4",
    "upload_bitrate_mbits/sec",
    "download_bitrate_rx_mbits/sec",
    "latitude",
    "longitude",
]
# Columns absent from the dataset are skipped rather than reported
type_issues = [
    f"{col}: {df[col].dtype}"
    for col in expected_numeric
    if col in df.columns and not pd.api.types.is_numeric_dtype(df[col])
]
issues_report = type_issues if type_issues else 'None'
print(f"Non-numeric columns that should be numeric: {issues_report}")

Non-numeric columns that should be numeric: None


## Feature Variance Assessment

In [None]:
# Numeric columns with at most one distinct value provide no predictive signal
import numpy as np

numeric_cols = df.select_dtypes(include=[np.number]).columns
# Count distinct values one column at a time
distinct_counts = {name: df[name].nunique() for name in numeric_cols}
zero_var = [name for name, n_distinct in distinct_counts.items() if n_distinct <= 1]
variance_report = zero_var if zero_var else 'None'
print(f"Zero/constant variance columns: {variance_report}")

Zero/constant variance columns: None


## Row-Level Anomaly Detection

In [None]:
# Count rows where any latency or throughput metric has an absolute z-score above 3
metric_cols = latency_cols + throughput_cols
zscores = df[metric_cols].apply(zscore)
extreme_mask = zscores.abs().gt(3).any(axis=1)
extreme_rows = extreme_mask.sum()
print(f"Rows with extreme values (z>3): {extreme_rows}")

# Per-day breakdown, only when day_id exists and at least one day was found
has_day_column = 'day_id' in df.columns
if has_day_column and len(unique_days) > 0:
    print("\nExtreme values by day:")
    for day in unique_days:
        day_data = df[df['day_id'] == day]
        # z-scores are recomputed from this day's rows only
        day_zscores = day_data[metric_cols].apply(zscore)
        day_extreme = day_zscores.abs().gt(3).any(axis=1).sum()
        print(f"Day {day}: {day_extreme} rows with extreme values")

row_count, column_count = df.shape
print(f"\nDataset ready for EDA: {row_count} rows, {column_count} columns")

## Remove Duplicate Rows and Save Clean Dataset

In [None]:
# Duplicates were judged to be data collection errors rather than valid measurements,
# so exact duplicate rows are dropped before the dataset is handed over.
print("Before duplicate removal:", df.shape)
df_clean = df.drop_duplicates()
print("After duplicate removal:", df_clean.shape)
removed_count = len(df) - len(df_clean)
print(f"Removed {removed_count} duplicate rows")

# Build one summary record per day from the de-duplicated data
if 'day_id' in df_clean.columns and len(unique_days) > 0:
    print("\nProcessing and combining knowledge by day:")
    daily_stats = []
    for day in unique_days:
        day_data = df_clean[df_clean['day_id'] == day]
        record_count = len(day_data)
        # Mean of the per-column means across each metric group
        mean_latency = day_data[latency_cols].mean().mean()
        mean_throughput = day_data[throughput_cols].mean().mean()
        daily_stats.append({
            'day': day,
            'records': record_count,
            'avg_latency': mean_latency,
            'avg_throughput': mean_throughput,
        })
        print(f"Day {day}: {record_count} records, avg latency: {mean_latency:.2f}ms, avg throughput: {mean_throughput:.2f}Mbps")

    # One row per day, in the order of unique_days
    daily_df = pd.DataFrame(daily_stats)
    print("\nCombined daily knowledge:")
    print(daily_df)

# Write the cleaned data next to the input file; the input file itself is not modified
output_file = data_folder / "my_clean_data_after_assurance.csv"
df_clean.to_csv(output_file, index=False)
clean_rows, clean_columns = df_clean.shape
print(f"\n Clean dataset saved to: {output_file}")
print(f" Ready for EDA: {clean_rows} rows, {clean_columns} columns")
for message in (
    "\nNext steps:",
    "1. Use the new file for EDA: pd.read_csv('../data/my_clean_data_after_assurance.csv')",
    "2. Original file preserved at: ../data/my_clean_data_with_imputation.csv",
):
    print(message)