In [None]:
import json
import os

# Path of the raw JSON dataset; passed to load_data() by the driver cell.
INPUT_FILE = "large_data.json"
# Path the cleaned dataset is written to; passed to save_data() by the driver cell.
OUTPUT_FILE = "cleaned_large_data.json"

def load_data(filename):
    """Read and parse a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The parsed JSON value, or an empty dict (after printing an error
        message) when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
        OSError: For I/O failures other than a missing file (for example a
            permission error or a directory path).
    """
    try:
        # JSON is UTF-8 by specification; without an explicit encoding Python
        # falls back to the platform locale, which corrupts non-ASCII text on
        # systems whose default codec is not UTF-8.
        with open(filename, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # Handling the error from open() itself avoids the check-then-open
        # race of a separate existence test.
        print(f"[ERROR] '{filename}' not found.")
        return {}

def save_data(data, filename):
    """Write ``data`` to ``filename`` as JSON indented by four spaces.

    Any existing file at ``filename`` is overwritten. A success message is
    printed once the file has been written and closed.
    """
    with open(filename, "w") as out_file:
        json.dump(obj=data, fp=out_file, indent=4)

    success_message = f"[SUCCESS] Cleaned data saved to '{filename}'"
    print(success_message)

In [None]:
def _dedupe_preserving_order(items):
    """Return ``items`` as a list with repeated entries dropped, keeping first-seen order."""
    # dict keys are unique and keep insertion order, so the result is
    # deterministic; list(set(...)) would scramble the original ordering.
    return list(dict.fromkeys(items))


def clean_dataset(data):
    """Build a cleaned copy of a ``{"users": [...], "pages": [...]}`` dataset.

    Cleaning steps:
      * Users whose ``name`` is missing, not a string, or blank after
        stripping whitespace are dropped.
      * Each remaining user's ``friends`` and ``liked_pages`` lists are
        de-duplicated, keeping the first occurrence of every entry in its
        original position.
      * Users left with no friends or no liked pages are dropped as inactive.
        A missing or null list counts as empty.
      * Pages sharing an ``id`` are collapsed into one entry; the last record
        seen for an id wins, placed where that id first appeared.

    The input ``data`` is not modified.

    Args:
        data: Mapping that may contain ``users`` and ``pages`` lists.

    Returns:
        A ``(new_data, stats)`` tuple. ``new_data`` is a dict with the cleaned
        ``users`` and ``pages`` lists. ``stats`` counts nameless users removed,
        inactive users removed, duplicate friend/like entries removed, and
        duplicate pages removed.

    Raises:
        KeyError: If a kept user or any page has no ``id`` key.
    """
    stats = {
        "users_removed_nameless": 0,
        "users_removed_inactive": 0,
        "duplicate_connections_removed": 0,
        "duplicate_pages_removed": 0
    }

    # `or []` also covers an explicit JSON null for either collection.
    raw_users = data.get('users') or []
    raw_pages = data.get('pages') or []

    cleaned_users = []

    for user in raw_users:
        original_name = user.get('name')
        # A JSON null (or any non-string) name is treated like a blank one
        # instead of crashing on .strip().
        clean_name = original_name.strip() if isinstance(original_name, str) else ""

        if not clean_name:
            stats['users_removed_nameless'] += 1
            continue

        # Records without these keys are handled as having no connections.
        friends = user.get('friends') or []
        liked_pages = user.get('liked_pages') or []

        unique_friends = _dedupe_preserving_order(friends)
        unique_likes = _dedupe_preserving_order(liked_pages)

        # Duplicates are counted even when the user is dropped as inactive below.
        stats['duplicate_connections_removed'] += (
            (len(friends) - len(unique_friends))
            + (len(liked_pages) - len(unique_likes))
        )

        if not unique_friends or not unique_likes:
            stats['users_removed_inactive'] += 1
            continue

        cleaned_users.append({
            "id": user['id'],
            "name": clean_name,
            "friends": unique_friends,
            "liked_pages": unique_likes
        })

    # Keyed by id: re-assigning an existing key keeps its original position
    # but replaces the stored record, so the last duplicate wins.
    unique_pages_dict = {}
    for page in raw_pages:
        unique_pages_dict[page['id']] = page

    stats['duplicate_pages_removed'] = len(raw_pages) - len(unique_pages_dict)

    cleaned_pages = list(unique_pages_dict.values())

    new_data = {"users": cleaned_users, "pages": cleaned_pages}
    return new_data, stats

In [None]:
# Driver: load the raw dataset, clean it, print a summary report, and save.
raw_data = load_data(INPUT_FILE)

# load_data() returns an empty dict when the input file is missing, in which
# case there is nothing to clean or save.
if raw_data:
    # Count with .get() so a file lacking one of the two top-level keys is
    # reported as having zero entries instead of raising KeyError, matching
    # how clean_dataset() reads the same keys.
    original_user_count = len(raw_data.get('users') or [])
    original_page_count = len(raw_data.get('pages') or [])
    print(f"Loaded {original_user_count} users and {original_page_count} pages.\n")

    clean_data, report = clean_dataset(raw_data)

    print("="*40)
    print("🧹 DATA CLEANING REPORT")
    print("="*40)
    print(f"❌ Users Removed (No Name):   {report['users_removed_nameless']}")
    print(f"❌ Users Removed (Inactive):  {report['users_removed_inactive']}")
    print(f"❌ Duplicate Links Removed:   {report['duplicate_connections_removed']}")
    print(f"❌ Duplicate Pages Removed:   {report['duplicate_pages_removed']}")
    print("-" * 40)
    print(f"✅ Final User Count:  {len(clean_data['users'])} (Original: {original_user_count})")
    print(f"✅ Final Page Count:  {len(clean_data['pages'])} (Original: {original_page_count})")
    print("="*40)

    save_data(clean_data, OUTPUT_FILE)

Loaded 500 users and 55 pages.

🧹 DATA CLEANING REPORT
❌ Users Removed (No Name):   55
❌ Users Removed (Inactive):  49
❌ Duplicate Links Removed:   76
❌ Duplicate Pages Removed:   5
----------------------------------------
✅ Final User Count:  396 (Original: 500)
✅ Final Page Count:  50 (Original: 55)
[SUCCESS] Cleaned data saved to 'cleaned_large_data.json'
