**Task D — Missing Data Detection & Simple Imputation**

**Description:**
Given a JSON file of daily weather records that contains gaps (`null` or empty strings), implement imputation: linear interpolation for temperatures and a moving average for wind speed and humidity. Produce an imputed JSON file and a brief imputation report indicating how many values were filled for each field.

**Deliverables:**

* `tokyo_imputed.json` (or `<city>_imputed.json`)
* `imputation_report.json` with counts per field

**Expected report (example):**

```
{
  "imputed_counts": {
    "temp_max": 4,
    "temp_min": 2,
    "humidity": 3,
    "wind_speed": 1
  }
}
```

**Hints:**

* Treat edge cases (leading/trailing nulls) specially: propagate the nearest non-null value, or keep the entries as null if too many points are missing.
* Don’t impute if >50% of values in a period are missing — instead flag for manual review.
* Log which strategy you used for each field.

**Run-and-paste (live check):**
Run the cell that prints `imputation_report['imputed_counts']` and paste the dictionary into the chat.

In [None]:
# Task D - Missing Data Detection & Simple Imputation (BUGGY VERSION)
import json
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

def create_sample_data_with_gaps():
    """
    Build the Tokyo sample dataset with gaps, save it, and return it.

    The dataset holds eight consecutive daily records in which some
    measurements are ``None``. It is written to ``tokyo_weather_gaps.json``
    in the current working directory and also returned as a dict.
    """
    columns = ("date", "temp_max", "temp_min", "humidity", "wind_speed")
    # One tuple per day, in the same column order as `columns`.
    rows = [
        ("2024-08-15", 32.5, 22.1, 65, 15.2),
        ("2024-08-16", None, 21.8, 68, None),
        ("2024-08-17", 31.2, None, None, 18.5),
        ("2024-08-18", None, None, 72, 16.8),
        ("2024-08-19", 33.8, 23.5, None, None),
        ("2024-08-20", 34.1, 24.2, 58, 14.3),
        ("2024-08-21", None, 23.8, 61, 17.2),
        ("2024-08-22", 30.9, None, None, None),
    ]
    dataset = {
        "city": "Tokyo",
        "weather_data": [dict(zip(columns, row)) for row in rows],
    }

    with open('tokyo_weather_gaps.json', 'w') as out_file:
        json.dump(dataset, out_file, indent=2)

    print(" Sample data with gaps created: tokyo_weather_gaps.json")
    return dataset

def linear_interpolation_buggy(values):
    """
    Fill gaps in a numeric series by linear interpolation (teaching version).

    NOTE: this is the deliberately flawed variant kept for comparison with
    the fixed implementation. The shortcomings below are intentional:
      * the overall share of missing values is never checked;
      * with fewer than two known points the input is handed back untouched;
      * leading/trailing gaps get no dedicated treatment and are simply
        copied from the one known neighbour that exists.

    Returns the input object itself when it is empty, has no gaps, or has
    fewer than two known points; otherwise a new list of floats.
    """
    if not values:
        return values

    series = np.array(values, dtype=float)
    is_gap = np.isnan(series)

    if not is_gap.any():
        return values  # nothing to fill

    if len(np.where(~is_gap)[0]) < 2:
        return values  # not enough known points to draw a line

    size = len(series)
    for pos in range(size):
        if not is_gap[pos]:
            continue

        # Nearest known neighbours, judged by the mask taken before filling.
        prev_known = next(
            (j for j in range(pos - 1, -1, -1) if not is_gap[j]), None
        )
        next_known = next(
            (j for j in range(pos + 1, size) if not is_gap[j]), None
        )

        if prev_known is not None and next_known is not None:
            y_left = series[prev_known]
            y_right = series[next_known]
            series[pos] = y_left + (y_right - y_left) * (pos - prev_known) / (next_known - prev_known)
        elif prev_known is not None:
            series[pos] = series[prev_known]  # trailing gap: copy from the left
        elif next_known is not None:
            series[pos] = series[next_known]  # leading gap: copy from the right

    return series.tolist()

def moving_average_imputation_buggy(values, window=3):
    """
    Fill gaps with the mean of a centred window (teaching version).

    NOTE: this is the deliberately flawed variant kept for comparison with
    the fixed implementation. The shortcomings below are intentional:
      * the window size is not adapted to the length of the series;
      * a gap whose window holds no usable value is left as NaN;
      * gaps are filled in place from left to right, so values filled
        earlier in the scan are visible to later windows.

    Returns the input object itself when it is empty or has no gaps;
    otherwise a new list of floats (possibly still containing NaN).
    """
    if not values:
        return values

    series = np.array(values, dtype=float)
    gap_positions = np.flatnonzero(np.isnan(series))

    if gap_positions.size == 0:
        return values

    half = window // 2
    size = len(series)

    for raw_pos in gap_positions:
        pos = int(raw_pos)
        lo = max(0, pos - half)
        hi = min(size, pos + half + 1)

        # Live slice of the working array: earlier fills are included.
        neighbours = series[lo:hi]
        usable = neighbours[~np.isnan(neighbours)]

        if len(usable) > 0:
            series[pos] = np.mean(usable)

    return series.tolist()

def impute_weather_data_buggy(data):
    """
    Impute the four weather fields of a dataset (teaching version).

    Temperatures go through ``linear_interpolation_buggy``; humidity and
    wind speed go through ``moving_average_imputation_buggy``. The input is
    not modified: a deep copy is imputed and returned.

    NOTE: deliberately flawed. The reported count for a field is the number
    of ``None`` entries it had before imputation, whether or not those
    entries actually received a value.

    Returns:
        Tuple ``(imputed_data, imputation_counts)``.
    """
    result = json.loads(json.dumps(data))  # JSON round-trip acts as a deep copy
    records = result.get('weather_data', [])
    temperature_fields = ('temp_max', 'temp_min')
    counts = {}

    for name in ('temp_max', 'temp_min', 'humidity', 'wind_speed'):
        column = [rec.get(name) for rec in records]
        nulls_before = sum(1 for value in column if value is None)

        if name in temperature_fields:
            filled = linear_interpolation_buggy(column)
        else:
            filled = moving_average_imputation_buggy(column)

        counts[name] = nulls_before

        # Write the column back; zip stops at the shorter of the two.
        for rec, new_value in zip(records, filled):
            rec[name] = new_value

    return result, counts

def main_imputation_buggy():
    """
    Drive the buggy pipeline end to end.

    Creates the sample dataset, imputes it with the buggy routines, writes
    ``tokyo_imputed_buggy.json`` and ``imputation_report_buggy.json`` to the
    current working directory, and returns the report dict.
    """
    print("=== Task D: Missing Data Imputation (BUGGY VERSION) ===")

    sample = create_sample_data_with_gaps()

    print("\n Performing imputation...")
    imputed, field_counts = impute_weather_data_buggy(sample)

    summary = {"imputed_counts": field_counts}
    outputs = (
        ('tokyo_imputed_buggy.json', imputed),
        ('imputation_report_buggy.json', summary),
    )
    for path, payload in outputs:
        with open(path, 'w') as handle:
            json.dump(payload, handle, indent=2)

    print(" Imputation complete (buggy version)")
    print(f" Imputation counts: {field_counts}")

    return summary

# Run buggy version
# Only executes when this code runs as the main module (not on import); the
# report dict returned by main_imputation_buggy() is kept in `result_buggy`.
if __name__ == "__main__":
    result_buggy = main_imputation_buggy()

In [None]:
# Task D - Missing Data Detection & Simple Imputation (FIXED VERSION)
import json
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

def linear_interpolation_fixed(values, max_missing_percent=0.5):
    """
    Fill gaps in a numeric series by linear interpolation.

    A value is treated as missing when it is ``None``, an empty or
    whitespace-only string, or NaN. Interior gaps are interpolated linearly
    between the nearest known neighbours; leading and trailing gaps take the
    nearest known value.

    Args:
        values: Sequence of numbers in which gaps are marked as described
            above. It is never modified.
        max_missing_percent: Largest fraction of missing values (0.0-1.0)
            for which imputation is attempted. Above this threshold a
            warning is printed and the series is returned untouched so it
            can be reviewed manually.

    Returns:
        Tuple ``(series, imputed_count)``. When gaps were filled, ``series``
        is a new list of floats and ``imputed_count`` is the number of
        values filled in. When nothing was imputed (empty input, no gaps,
        too many gaps, or no known value at all) the original ``values``
        object is returned together with ``0``.

    Raises:
        ValueError: If a non-missing entry cannot be converted to float.
    """
    if not values:
        return values, 0

    # The task allows gaps to be encoded as null OR as empty strings; map
    # both to NaN before the float conversion.
    arr = np.array(
        [
            np.nan if v is None or (isinstance(v, str) and not v.strip()) else v
            for v in values
        ],
        dtype=float,
    )
    missing = np.isnan(arr)
    missing_count = int(missing.sum())

    if missing_count == 0:
        return values, 0  # No missing values

    # Too sparse to impute credibly: flag and leave the data as it is.
    missing_percent = missing_count / len(arr)
    if missing_percent > max_missing_percent:
        print(f" Warning: {missing_percent:.1%} values missing (>{max_missing_percent:.0%}). Flagged for manual review.")
        return values, 0

    known_idx = np.flatnonzero(~missing)
    if known_idx.size == 0:
        # Only reachable when max_missing_percent >= 1.0.
        print(" Warning: No valid values found for interpolation")
        return values, 0

    # np.interp interpolates linearly between known points and clamps to the
    # first/last known value outside them, which is exactly the wanted
    # treatment of leading and trailing gaps (also with a single known point).
    filled = np.interp(np.arange(len(arr)), known_idx, arr[known_idx])

    return filled.tolist(), missing_count

def moving_average_imputation_fixed(values, window=3, max_missing_percent=0.5):
    """
    Fill gaps with the mean of the observed values in a centred window.

    A value is treated as missing when it is ``None``, an empty or
    whitespace-only string, or NaN. Each gap is replaced by the mean of the
    *observed* values inside its window; values imputed for other gaps are
    never fed back in, so the result does not depend on scan order. A gap
    whose window holds no observed value falls back to the mean of all
    observed values in the series.

    Args:
        values: Sequence of numbers in which gaps are marked as described
            above. It is never modified.
        window: Requested window width. It is capped at the series length
            and raised to 3 when the series has at least three entries.
        max_missing_percent: Largest fraction of missing values (0.0-1.0)
            for which imputation is attempted. Above this threshold a
            warning is printed and the series is returned untouched so it
            can be reviewed manually.

    Returns:
        Tuple ``(series, imputed_count)``. When gaps were filled, ``series``
        is a new list of floats without NaN and ``imputed_count`` is the
        number of values filled in. When nothing was imputed (empty input,
        no gaps, too many gaps, or no observed value at all) the original
        ``values`` object is returned together with ``0``.

    Raises:
        ValueError: If a non-missing entry cannot be converted to float.
    """
    if not values:
        return values, 0

    # The task allows gaps to be encoded as null OR as empty strings; map
    # both to NaN before the float conversion.
    observed = np.array(
        [
            np.nan if v is None or (isinstance(v, str) and not v.strip()) else v
            for v in values
        ],
        dtype=float,
    )
    missing = np.isnan(observed)
    missing_count = int(missing.sum())

    if missing_count == 0:
        return values, 0

    size = len(observed)

    # Too sparse to impute credibly: flag and leave the data as it is.
    missing_percent = missing_count / size
    if missing_percent > max_missing_percent:
        print(f" Warning: {missing_percent:.1%} values missing (>{max_missing_percent:.0%}). Flagged for manual review.")
        return values, 0

    if missing_count == size:
        # Only reachable when max_missing_percent >= 1.0: nothing to average.
        # Returning the input keeps NaN floats out of the JSON output.
        return values, 0

    # Adjust the window to the data length.
    effective_window = min(window, size)
    if effective_window < 3 and size >= 3:
        effective_window = 3
    half = effective_window // 2

    # Fallback for gaps wider than the window: mean of the observed values.
    series_mean = float(np.mean(observed[~missing]))

    filled = observed.copy()
    for raw_pos in np.flatnonzero(missing):
        pos = int(raw_pos)
        # Read from `observed`, not `filled`, so earlier fills cannot leak in.
        neighbours = observed[max(0, pos - half):min(size, pos + half + 1)]
        usable = neighbours[~np.isnan(neighbours)]
        filled[pos] = np.mean(usable) if usable.size else series_mean

    return filled.tolist(), missing_count

def impute_weather_data_fixed(data, strategy_log=None):
    """
    Impute the four weather fields of a dataset and report what was done.

    Temperatures go through ``linear_interpolation_fixed``; humidity and
    wind speed go through ``moving_average_imputation_fixed``. The input is
    not modified: a deep copy is imputed and returned.

    Args:
        data: Dataset dict; its records are read from ``'weather_data'``.
        strategy_log: Optional dict that receives the strategy name used for
            each field. A new dict is created when omitted.

    Returns:
        Tuple ``(imputed_data, imputation_counts, strategy_log)``. The
        counts are the numbers reported by the imputation routines. With no
        records, the counts are empty and the log is left untouched.
    """
    if strategy_log is None:
        strategy_log = {}

    result = json.loads(json.dumps(data))  # JSON round-trip acts as a deep copy
    counts = {}
    records = result.get('weather_data', [])

    if not records:
        return result, counts, strategy_log

    # Field -> strategy name, in processing order.
    method_by_field = {
        'temp_max': "linear_interpolation",
        'temp_min': "linear_interpolation",
        'humidity': "moving_average",
        'wind_speed': "moving_average",
    }

    for name, method in method_by_field.items():
        series = [rec.get(name) for rec in records]
        print(f"\n Processing {name}...")

        # Log the strategy before running it.
        strategy_log[name] = method
        if method == "linear_interpolation":
            filled, n_filled = linear_interpolation_fixed(series)
        else:
            filled, n_filled = moving_average_imputation_fixed(series)

        counts[name] = n_filled
        print(f"   Imputed {n_filled} values using {strategy_log[name]}")

        # Write the column back; zip stops at the shorter of the two.
        for rec, new_value in zip(records, filled):
            rec[name] = new_value

    return result, counts, strategy_log

def validate_imputation_quality(original_data, imputed_data):
    """
    Print a per-field comparison of null counts before and after imputation.

    For each of the four weather fields the number of ``None`` entries in
    both datasets is printed, followed by a verdict line: fully imputed,
    partially imputed, or unchanged. No verdict is printed when the imputed
    data has more nulls than the original. Nothing is returned.
    """
    print("\n Imputation Quality Check:")

    def null_count(records, key):
        # Entries that are None or lack the key altogether.
        return sum(1 for rec in records if rec.get(key) is None)

    before_records = original_data.get('weather_data', [])
    after_records = imputed_data.get('weather_data', [])

    for name in ('temp_max', 'temp_min', 'humidity', 'wind_speed'):
        before = null_count(before_records, name)
        after = null_count(after_records, name)

        print(f"   {name}: {before} → {after} nulls")

        if after > before:
            continue  # more nulls than before: no verdict line
        if after == before:
            print(f"    {name}: No imputation performed")
        elif after == 0:
            print(f"    {name}: All missing values successfully imputed")
        else:
            print(f"    {name}: Partially imputed ({before - after} filled)")

def main_imputation_fixed():
    """
    Drive the fixed pipeline end to end.

    Loads ``tokyo_weather_gaps.json`` (creating the sample data when the
    file does not exist), imputes it with the fixed routines, prints the
    quality check, writes ``tokyo_imputed_fixed.json`` and
    ``imputation_report_fixed.json`` to the current working directory, and
    returns the detailed report dict.
    """
    print("=== Task D: Missing Data Imputation (FIXED VERSION) ===")

    # Reuse the sample file if an earlier run already produced it.
    try:
        with open('tokyo_weather_gaps.json', 'r') as source:
            raw = json.load(source)
    except FileNotFoundError:
        print(" Creating new sample data...")
        raw = create_sample_data_with_gaps()
    else:
        print(" Loaded existing sample data")

    print("\n Performing imputation (fixed version)...")
    imputed, field_counts, used_strategies = impute_weather_data_fixed(raw, {})

    validate_imputation_quality(raw, imputed)

    with open('tokyo_imputed_fixed.json', 'w') as sink:
        json.dump(imputed, sink, indent=2)

    report = {
        "imputed_counts": field_counts,
        "strategies_used": used_strategies,
        "total_records": len(imputed.get('weather_data', [])),
        "imputation_timestamp": datetime.now().isoformat()
    }

    with open('imputation_report_fixed.json', 'w') as sink:
        json.dump(report, sink, indent=2)

    print("\n Imputation complete (fixed version)")
    print(f" Imputation counts: {field_counts}")
    print(f" Strategies used: {used_strategies}")

    return report

# Run fixed version
# Only executes when this code runs as the main module (not on import); the
# report dict returned by main_imputation_fixed() is kept in `result_fixed`.
if __name__ == "__main__":
    result_fixed = main_imputation_fixed()

In [10]:
# Demo Cell - Run Both Versions and Show Results
# Each print groups the lines of one banner; sep="\n" keeps one line per item.
print(" RUNNING BOTH VERSIONS", "=" * 50, sep="\n")

# Buggy pipeline first
print(" BUGGY VERSION:")
result_buggy = main_imputation_buggy()

# Then the fixed pipeline
print("\n" + "=" * 50, " FIXED VERSION:", sep="\n")
result_fixed = main_imputation_fixed()

# Side-by-side summary of the two reports
print("\n" + "=" * 50, " COMPARISON RESULTS", "=" * 50, sep="\n")

print(
    " BUGGY VERSION RESULTS:",
    f"   Imputation counts: {result_buggy['imputed_counts']}",
    sep="\n",
)

print(
    "\n FIXED VERSION RESULTS:",
    f"   Imputation counts: {result_fixed['imputed_counts']}",
    f"   Strategies used: {result_fixed['strategies_used']}",
    sep="\n",
)

# The dictionary requested by the task's run-and-paste check
print(
    "\n REQUESTED OUTPUT (Fixed Version):",
    f"imputation_report['imputed_counts'] = {result_fixed['imputed_counts']}",
    sep="\n",
)

 RUNNING BOTH VERSIONS
 BUGGY VERSION:
=== Task D: Missing Data Imputation (BUGGY VERSION) ===
 Sample data with gaps created: tokyo_weather_gaps.json

🔧 Performing imputation...
 Imputation complete (buggy version)
 Imputation counts: {'temp_max': 3, 'temp_min': 3, 'humidity': 3, 'wind_speed': 3}

 FIXED VERSION:
=== Task D: Missing Data Imputation (FIXED VERSION) ===
 Loaded existing sample data

 Performing imputation (fixed version)...

 Processing temp_max...
   Imputed 3 values using linear_interpolation

 Processing temp_min...
   Imputed 3 values using linear_interpolation

 Processing humidity...
   Imputed 3 values using moving_average

 Processing wind_speed...
   Imputed 3 values using moving_average

 Imputation Quality Check:
   temp_max: 3 → 0 nulls
    temp_max: All missing values successfully imputed
   temp_min: 3 → 0 nulls
    temp_min: All missing values successfully imputed
   humidity: 3 → 0 nulls
    humidity: All missing values successfully imputed
   wind_speed