# DATA ASSURANCE

In [1]:
# Load post-imputation dataset and initialize quality assessment tools
import os
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import zscore

# The input CSV lives in a `data` directory under the current working directory.
data_folder = Path.cwd() / 'data'
input_csv = data_folder / 'my_clean_data_with_imputation.csv'
df = pd.read_csv(input_csv)

## Dataset Dimensions Check

In [2]:
# Baseline (rows, columns) of the loaded frame, reported before any checks run.
initial_shape = df.shape
print("Initial dataset shape:", initial_shape)

Initial dataset shape: (2837629, 13)


## Duplicate Row Detection

In [3]:
# Count rows that exactly repeat an earlier row across all columns.
# duplicated() marks every repeat after the first occurrence as True.
is_repeated_row = df.duplicated()
duplicates = is_repeated_row.sum()
print(f"Duplicate rows: {duplicates}")

Duplicate rows: 109379


## GPS Coordinate Validation

In [4]:
# Flag rows whose coordinates fall outside the valid geographic ranges:
# latitude must lie in [-90, 90] and longitude in [-180, 180].
lat_out_of_range = df["latitude"].lt(-90) | df["latitude"].gt(90)
lon_out_of_range = df["longitude"].lt(-180) | df["longitude"].gt(180)
invalid_gps = df[lat_out_of_range | lon_out_of_range]
print(f"Invalid GPS rows: {len(invalid_gps)}")

Invalid GPS rows: 0


## Negative Value Validation

In [5]:
# Latency and throughput cannot be negative; count any cells that are.
def _count_negative_values(columns):
    """Return how many cells in the given columns of `df` are below zero."""
    return df[columns].lt(0).sum().sum()


neg_latency = _count_negative_values(["svr1", "svr2", "svr3", "svr4"])
neg_throughput = _count_negative_values(["upload_bitrate_mbits/sec", "download_bitrate_rx_mbits/sec"])
print(f"Negative latency values: {neg_latency}")
print(f"Negative throughput values: {neg_throughput}")

Negative latency values: 0
Negative throughput values: 0


## Define Key Metric Column Groups

In [6]:
# Shared column groups so later cells refer to the same metric sets by name.
# Latency columns, one per server.
latency_cols = [
    "svr1",
    "svr2",
    "svr3",
    "svr4",
]
# Upload and download bitrate columns.
throughput_cols = [
    "upload_bitrate_mbits/sec",
    "download_bitrate_rx_mbits/sec",
]

## Post-Imputation Completeness Verification

In [7]:
# Total number of missing cells across the whole frame; imputation should
# have left none behind.
missing_per_column = df.isna().sum()
missing_any = missing_per_column.sum()
print(f"Missing values remaining: {missing_any} (should be 0 after imputation)")

Missing values remaining: 0 (should be 0 after imputation)


## Data Type Schema Validation

In [8]:
# Verify critical columns are present and keep numeric dtypes for ML compatibility.
expected_numeric = [
    "svr1", "svr2", "svr3", "svr4",
    "upload_bitrate_mbits/sec", "download_bitrate_rx_mbits/sec",
    "latitude", "longitude",
]

# A column that is absent cannot be dtype-checked. Track it explicitly so the
# schema check does not report a clean result when a required column is gone.
missing_cols = [col for col in expected_numeric if col not in df.columns]

# Present columns whose dtype is not numeric, reported as "<name>: <dtype>".
type_issues = [
    f"{col}: {df[col].dtype}"
    for col in expected_numeric
    if col in df.columns and not pd.api.types.is_numeric_dtype(df[col])
]

print(f"Non-numeric columns that should be numeric: {type_issues if type_issues else 'None'}")
# Only printed when something is missing, so the output for a complete schema
# is unchanged.
if missing_cols:
    print(f"Expected numeric columns missing from dataset: {missing_cols}")

Non-numeric columns that should be numeric: None


## Feature Variance Assessment

In [9]:
# Identify zero or constant variance columns that provide no predictive value.
# numpy is imported in the notebook's import cell rather than here so that all
# dependencies are declared in one place.
# nunique() ignores missing values by default, so a numeric column is reported
# when it has at most one distinct non-missing value.
numeric_cols = df.select_dtypes(include=[np.number]).columns
zero_var = [col for col in numeric_cols if df[col].nunique() <= 1]
print(f"Zero/constant variance columns: {zero_var if zero_var else 'None'}")

Zero/constant variance columns: None


## Row-Level Anomaly Detection

In [10]:
# Standardize each latency/throughput column with scipy's zscore, then count
# the rows in which at least one metric has an absolute z-score above 3.
metric_cols = latency_cols + throughput_cols
zscores = df[metric_cols].apply(zscore)
is_extreme_row = zscores.abs().gt(3).any(axis=1)
extreme_rows = is_extreme_row.sum()
print(f"Rows with extreme values (z>3): {extreme_rows}")

# NOTE(review): this summary is printed before the duplicate rows are dropped
# in the final cell, so the row count here still includes them.
print(f"\nDataset ready for EDA: {len(df)} rows, {len(df.columns)} columns")

Rows with extreme values (z>3): 74191

Dataset ready for EDA: 2837629 rows, 13 columns


## Remove Duplicate Rows and Save Clean Dataset

In [11]:
# Based on investigation - these are data collection errors, not valid measurements
# Drop exact duplicate rows into a new frame; `df` itself is left untouched.
print("Before duplicate removal:", df.shape)
df_clean = df.drop_duplicates()
print("After duplicate removal:", df_clean.shape)
print(f"Removed {len(df) - len(df_clean)} duplicate rows")

# Write the de-duplicated frame to a separate CSV for the EDA team; the input
# file is not overwritten.
output_file = "data/my_clean_data_after_assurance.csv"
df_clean.to_csv(output_file, index=False)
print(f"\n Clean dataset saved to: {output_file}")
print(f" Ready for EDA: {len(df_clean)} rows, {len(df_clean.columns)} columns")

# Hand-off instructions for whoever picks up the cleaned file.
for message in (
    "\nNext steps:",
    "1. Use the new file for EDA: pd.read_csv('data/my_clean_data_after_assurance.csv')",
    "2. Original file preserved at: data/my_clean_data_with_imputation.csv",
):
    print(message)

Before duplicate removal: (2837629, 13)
After duplicate removal: (2728250, 13)
Removed 109379 duplicate rows

 Clean dataset saved to: data/my_clean_data_after_assurance.csv
 Ready for EDA: 2728250 rows, 13 columns

Next steps:
1. Use the new file for EDA: pd.read_csv('data/my_clean_data_after_assurance.csv')
2. Original file preserved at: data/my_clean_data_with_imputation.csv
