# **JSON file analysis experiment**

In [None]:
import json
import pandas as pd
from pathlib import Path

def create_reduced_json(input_json_path, output_json_path, valid_keys_set):
    """Copy a JSON object to a new file, keeping only whitelisted top-level keys.

    Each top-level key is lowercased before the membership test, so
    ``valid_keys_set`` is expected to hold lowercase names. Keys that pass keep
    their original spelling, and their values are copied unchanged.

    Args:
        input_json_path: Path of the JSON file to read; its top level must be
            a JSON object.
        output_json_path: Path the reduced JSON is written to (overwritten if
            it already exists).
        valid_keys_set: Container of lowercase key names to keep.

    Returns:
        None. A two-line summary (output path and key counts) is printed.
    """
    # JSON text is UTF-8; pin the encoding so reading the source file does not
    # depend on the platform's default locale encoding.
    with open(input_json_path, "r", encoding="utf-8") as f:
        full_data = json.load(f)

    reduced_data = {
        key: value
        for key, value in full_data.items()
        if key.lower() in valid_keys_set
    }

    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(reduced_data, f)

    print(f"✅ Reduced JSON saved to: {output_json_path}")
    print(f"🧮 Original keys: {len(full_data)} → Filtered keys: {len(reduced_data)}")

# Load the object whitelist and normalize it so it can be compared against the
# lowercased JSON keys inside create_reduced_json.
csv_path = "/content/drive/MyDrive/Grad/CAP6412-0001/Project/interesting_objects_v3.csv"
df = pd.read_csv(csv_path)
# Trim surrounding whitespace as well as lowercasing: a padded CSV cell such as
# "extra " would otherwise never match any JSON key and be dropped silently.
valid_objects = set(df["Object"].dropna().astype(str).str.strip().str.lower())

base_path = Path("/content/drive/MyDrive/Grad/CAP6412-0001/Project")
input_jsons = {
    "LLaVA-mix665k": base_path / "object_cooccurences_LLaVA-mix665k.json",
    "LLaVA-Pretrain": base_path / "object_cooccurences_LLaVA-Pretrain.json"
}
# Each output sits next to its input and is named reduced_<dataset>.json.
output_jsons = {
    name: base_path / f"reduced_{name}.json" for name in input_jsons
}

# Reduce both co-occurrence files to the whitelisted top-level keys
for name in input_jsons:
    create_reduced_json(input_jsons[name], output_jsons[name], valid_objects)



✅ Reduced JSON saved to: /content/drive/MyDrive/Grad/CAP6412-0001/Project/reduced_LLaVA-mix665k.json
🧮 Original keys: 52806 → Filtered keys: 283
✅ Reduced JSON saved to: /content/drive/MyDrive/Grad/CAP6412-0001/Project/reduced_LLaVA-Pretrain.json
🧮 Original keys: 140583 → Filtered keys: 159


In [None]:
from pathlib import Path

def filter_and_format_json(input_json_path, output_json_path, valid_objects_set):
    """Keep only whitelisted objects on both levels of a co-occurrence JSON.

    The input must be a JSON object mapping each object name to a nested
    object of ``{co-occurring name: value}``. A top-level entry survives only
    if its lowercased name is in ``valid_objects_set`` and at least one of its
    nested names (also compared lowercased) is in the set too; nested names
    outside the set are removed. Original spellings are preserved.

    The output is NOT a single JSON document: every surviving entry is written
    as its own pretty-printed ``{name: {...}}`` object followed by a blank
    line, so readers have to split the file on blank lines.

    Args:
        input_json_path: Path of the JSON file to read.
        output_json_path: Path of the block-formatted file to write.
        valid_objects_set: Container of lowercase object names to keep.

    Returns:
        None. The output path and number of entries written are printed.
    """
    with open(input_json_path, "r") as source:
        cooccurrences = json.load(source)

    kept = {}
    for obj_name, neighbours in cooccurrences.items():
        if obj_name.lower() in valid_objects_set:
            allowed = {
                other: value
                for other, value in neighbours.items()
                if other.lower() in valid_objects_set
            }
            # Entries left without any whitelisted neighbour are dropped.
            if allowed:
                kept[obj_name] = allowed

    with open(output_json_path, "w") as sink:
        for obj_name, allowed in kept.items():
            # One standalone JSON object per entry, separated by a blank line.
            sink.write(json.dumps({obj_name: allowed}, indent=2) + "\n\n")

    print(f"✅ Final filtered and formatted JSON saved to: {output_json_path}")
    print(f"📦 Total keys written: {len(kept)}")

csv_path = "/content/drive/MyDrive/Grad/CAP6412-0001/Project/interesting_objects_v3.csv"
df = pd.read_csv(csv_path)
# Trim surrounding whitespace as well as lowercasing: filter_and_format_json
# compares lowercased JSON names against this set, so a padded CSV cell would
# otherwise never match and the object would be filtered out silently.
valid_objects = set(df["Object"].dropna().astype(str).str.strip().str.lower())

# File paths
base_path = Path("/content/drive/MyDrive/Grad/CAP6412-0001/Project")
# Keys are the input file names themselves; they are reused below to build the
# matching output names.
input_jsons = {
    "reduced_LLaVA-mix665k.json": base_path / "reduced_LLaVA-mix665k.json",
    "reduced_LLaVA-Pretrain.json": base_path / "reduced_LLaVA-Pretrain.json"
}
output_jsons = {
    name: base_path / f"final_filtered_{name}" for name in input_jsons
}

# Filter and reformat both reduced files
for name in input_jsons:
    filter_and_format_json(input_jsons[name], output_jsons[name], valid_objects)



✅ Final filtered and formatted JSON saved to: /content/drive/MyDrive/Grad/CAP6412-0001/Project/final_filtered_reduced_LLaVA-mix665k.json
📦 Total keys written: 270
✅ Final filtered and formatted JSON saved to: /content/drive/MyDrive/Grad/CAP6412-0001/Project/final_filtered_reduced_LLaVA-Pretrain.json
📦 Total keys written: 146


In [None]:
from collections import Counter

def count_kv_pairs(json_path):
    """Count top-level keys in a file of blank-line-separated JSON objects.

    The file is split on blank lines and every non-empty chunk is parsed as
    its own JSON value. Chunks that fail to parse are reported on stdout and
    skipped.

    Args:
        json_path: Path of the block-formatted file to inspect.

    Returns:
        A ``(unique_key_count, duplicates)`` tuple, where ``duplicates`` maps
        each key seen in more than one chunk to the number of chunks it
        appeared in.
    """
    with open(json_path, "r") as handle:
        text = handle.read()

    seen = Counter()
    for chunk in text.strip().split("\n\n"):
        if not chunk.strip():
            continue
        try:
            parsed = json.loads(chunk)
        except json.JSONDecodeError:
            print("⚠️ Skipped an invalid block")
            continue
        for name in parsed:
            seen[name] += 1

    repeated = {name: hits for name, hits in seen.items() if hits > 1}
    return len(seen), repeated

# Block-formatted files to summarise, keyed by the label shown in the report
base_path = Path("/content/drive/MyDrive/Grad/CAP6412-0001/Project")
paths = {
    "Pretrain": base_path / "final_filtered_reduced_LLaVA-Pretrain.json",
    "Mix665k": base_path / "final_filtered_reduced_LLaVA-mix665k.json"
}

# Print one short report per file, each followed by a blank line
for label, path in paths.items():
    total_keys, duplicate_keys = count_kv_pairs(path)
    report = [
        f"📊 {label} JSON Stats",
        f"Total unique keys: {total_keys}",
        f"Duplicate keys found: {len(duplicate_keys)}",
    ]
    if duplicate_keys:
        # Show at most five duplicated keys with their counts.
        report.append(f"Sample duplicates: {list(duplicate_keys.items())[:5]}")
    print("\n".join(report), end="\n\n")



📊 Pretrain JSON Stats
Total unique keys: 146
Duplicate keys found: 0

📊 Mix665k JSON Stats
Total unique keys: 270
Duplicate keys found: 0



In [None]:
import json
from pathlib import Path
from collections import Counter

def list_sorted_keys(json_path):
    """Print key statistics and a sorted key listing for a block-formatted file.

    The file is split on blank lines and every non-empty chunk is parsed as
    its own JSON value; chunks that fail to parse are reported and skipped.
    The report shows the file name, the number of distinct top-level keys, the
    number of keys that occur in more than one chunk, and then every distinct
    key on its own line, sorted case-insensitively.

    Args:
        json_path: Path of the file to inspect, as a ``str`` or a
            ``pathlib.Path``.

    Returns:
        None. Everything is printed to stdout.
    """
    # Normalise to Path so that plain string paths work too (``.name`` is used
    # in the report header below).
    json_path = Path(json_path)

    # Pin the encoding so parsing does not depend on the platform default.
    with open(json_path, "r", encoding="utf-8") as f:
        content = f.read()

    blocks = [block for block in content.strip().split("\n\n") if block.strip()]

    key_counter = Counter()
    for block in blocks:
        try:
            d = json.loads(block)
        except json.JSONDecodeError:
            print("⚠️ Skipped an invalid block")
            continue
        for key in d:
            key_counter[key] += 1

    duplicates = {k: v for k, v in key_counter.items() if v > 1}
    # Sort case-insensitively, breaking ties on the exact spelling so keys that
    # differ only by case come out in a reproducible order (sorting a set with
    # a case-folding key alone leaves such ties to hash iteration order).
    sorted_keys = sorted(key_counter, key=lambda name: (name.lower(), name))

    print(f"📊 {json_path.name} Stats")
    print(f"Total unique keys: {len(key_counter)}")
    print(f"Duplicate keys found: {len(duplicates)}\n")

    print("🔤 Sorted Unique Keys:")
    for k in sorted_keys:
        print(k)

# Locations of the two block-formatted files to list
base_path = Path("/content/drive/MyDrive/Grad/CAP6412-0001/Project")
pretrain_path = base_path / "final_filtered_reduced_LLaVA-Pretrain.json"
mix_path = base_path / "final_filtered_reduced_LLaVA-mix665k.json"

# Report on the pretrain file first, then the mix file, with a ruled separator
# printed between the two listings
for position, target in enumerate((pretrain_path, mix_path)):
    if position:
        print("\n" + "="*60 + "\n")
    list_sorted_keys(target)


📊 final_filtered_reduced_LLaVA-Pretrain.json Stats
Total unique keys: 146
Duplicate keys found: 0

🔤 Sorted Unique Keys:
agent
antiquity
article
bed
beef
bitch
block
blood
body
button
buttons
carpet
catch
cause
cement
center
chalk
challenge
charm
chemistry
cloth
commodity
complaint
cone
cones
consolidation
construction
cosmos
cover
covering
crank
creation
curio
curiosity
decker
decoration
dissent
draw
earth
element
enamel
essence
excavation
fabric
facility
filler
film
fixture
float
floater
fluid
food
formation
fuel
glass
glasses
good
goods
ground
grounds
growth
head
humor
ice
ink
inks
insert
installation
jelly
juice
keepsake
kick
land
layer
lemon
line
location
lot
lots
love
marijuana
marker
material
matter
mechanism
media
medium
melancholy
milk
mixture
moon
neighbor
nest
object
opening
ornament
padding
part
passion
pavement
paving
portion
process
prop
property
props
protest
radiator
remains
restoration
ribbon
rock
sample
seed
serum
sheet
shiner
slip
snake
soil
sphere
square
stone
strip

In [None]:
from collections import defaultdict
import json
from pathlib import Path

def load_and_normalize(filepath):
    """Load a block-formatted co-occurrence file into lowercase-keyed totals.

    The file is split on blank lines and every non-empty chunk is parsed as a
    JSON object of ``{name: {co-occurring name: frequency}}``. Both levels of
    names are lowercased, and frequencies that land on the same lowercased
    pair are summed. Chunks that fail to parse are reported and skipped.

    Args:
        filepath: Path of the block-formatted file to read.

    Returns:
        A ``defaultdict`` of ``defaultdict(int)`` mapping
        ``name -> co-occurring name -> summed frequency``. Because of the
        default factories, looking up a missing name inserts an empty entry.
    """
    with open(filepath, "r") as handle:
        raw_text = handle.read()

    totals = defaultdict(lambda: defaultdict(int))

    for chunk in raw_text.strip().split("\n\n"):
        if not chunk.strip():
            continue
        try:
            record = json.loads(chunk)
        except json.JSONDecodeError:
            print("⚠️ Skipping invalid JSON block.")
            continue
        for obj_name, neighbours in record.items():
            bucket_key = obj_name.lower()
            # The outer entry is only touched inside this loop, so a name with
            # no neighbours does not create an empty bucket.
            for other, count in neighbours.items():
                totals[bucket_key][other.lower()] += count

    return totals

def merge_two_dictionaries(dict1, dict2):
    """Add the nested frequency counts of ``dict2`` into ``dict1`` in place.

    Both arguments map ``name -> {co-occurring name: frequency}``. For every
    pair in ``dict2`` the frequency is added to the matching pair in ``dict1``,
    creating the outer and inner entries when they do not exist yet.

    ``dict1`` may be a ``defaultdict`` of ``defaultdict(int)`` or an ordinary
    dict of dicts. When ``dict1`` is a ``defaultdict``, new outer entries are
    produced by its own default factory; otherwise a plain ``dict`` is used.

    Args:
        dict1: Destination mapping; mutated and returned.
        dict2: Source mapping; left unchanged.

    Returns:
        ``dict1`` itself (the same object, not a copy).
    """
    for key, subdict in dict2.items():
        if not subdict:
            # Nothing to add; don't create an empty entry in dict1.
            continue
        try:
            # For a defaultdict this creates the inner mapping on demand.
            target = dict1[key]
        except KeyError:
            # Plain dict without this key: start a fresh inner mapping rather
            # than aliasing dict2's, so later updates cannot leak into dict2.
            target = dict1[key] = {}
        for subkey, freq in subdict.items():
            target[subkey] = target.get(subkey, 0) + freq
    return dict1

def sort_nested_dict(d):
    """Return a plain-dict copy of ``d`` with both levels put in order.

    Outer keys are arranged in ascending order. Within each inner mapping the
    items are arranged by value, largest first; items with equal values keep
    their original relative order. The input is not modified.

    Args:
        d: Mapping of ``key -> {subkey: numeric value}``.

    Returns:
        A new ``dict`` of ``dict`` with the ordering described above.
    """
    return {
        outer: dict(sorted(d[outer].items(), key=lambda pair: -pair[1]))
        for outer in sorted(d)
    }

# Inputs (the two block-formatted files) and the merged output location
base_path = Path("/content/drive/MyDrive/Grad/CAP6412-0001/Project")
pretrain_path = base_path / "final_filtered_reduced_LLaVA-Pretrain.json"
mix_path = base_path / "final_filtered_reduced_LLaVA-mix665k.json"
output_path = base_path / "final_merged_sorted_cooccur.json"

# Load each file into lowercase-keyed totals (pretrain first, then mix)
pretrain_data, mix_data = (
    load_and_normalize(source) for source in (pretrain_path, mix_path)
)

# merge_two_dictionaries adds mix_data into pretrain_data in place and returns
# it, so `merged` and `pretrain_data` are the same object from here on.
merged = merge_two_dictionaries(pretrain_data, mix_data)
sorted_merged = sort_nested_dict(merged)

# Persist the result as one indented JSON document
with open(output_path, "w") as f:
    json.dump(sorted_merged, f, indent=2)

print(f"✅ Final merged and sorted co-occurrence data saved to:\n{output_path}")


✅ Final merged and sorted co-occurrence data saved to:
/content/drive/MyDrive/Grad/CAP6412-0001/Project/final_merged_sorted_cooccur.json


In [None]:
# Path to the merged, sorted co-occurrence JSON that this cell verifies.
# Kept as a Path object; it is only passed to open() by the function below.
final_json_path = Path("/content/drive/MyDrive/Grad/CAP6412-0001/Project/final_merged_sorted_cooccur.json")

def verify_final_json_stats(filepath, expected_max_values=150):
    """Print sanity checks for a merged co-occurrence JSON file.

    The file must hold a single JSON object mapping each name to a nested
    object of ``{co-occurring name: frequency}``. The report contains:

    * the number of top-level keys;
    * every key whose nested object has more than ``expected_max_values``
      entries (or a confirmation that there is none);
    * the first five keys with their first three nested items;
    * for those same five keys only, whether the nested values appear in
      non-increasing order.

    Args:
        filepath: Path of the JSON file to check.
        expected_max_values: Largest number of nested entries a key may have
            without being flagged.

    Returns:
        None. Everything is printed to stdout.
    """
    with open(filepath, "r") as handle:
        cooccur = json.load(handle)

    oversized = {
        name: len(neighbours)
        for name, neighbours in cooccur.items()
        if len(neighbours) > expected_max_values
    }

    print(f"📊 Total unique main objects (keys): {len(cooccur)}")
    if not oversized:
        print(f"✅ All keys have ≤ {expected_max_values} co-occurring objects")
    else:
        print(f"⚠️ Keys with more than {expected_max_values} co-occurring objects:")
        for name, size in oversized.items():
            print(f"  - {name}: {size}")

    # Both sample sections below look at the same leading five keys.
    sample = list(cooccur)[:5]

    print("\n🔍 Sample order check (first 5 keys):")
    for name in sample:
        print(f"  - {name}")
        print(f"    ↳ Top 3 co-objects: {list(cooccur[name].items())[:3]}")
    print("\n📌 Order of values is descending by frequency? Checking sample...")

    for name in sample:
        counts = list(cooccur[name].values())
        if counts == sorted(counts, reverse=True):
            print(f"✅ {name} values are correctly ordered")
        else:
            print(f"⚠️ Order issue found in: {name}")

# Run verification on the merged file; expected_max_values is left at its
# default of 150, so only keys with more than 150 co-occurring objects are flagged.
verify_final_json_stats(final_json_path)


📊 Total unique main objects (keys): 158
✅ All keys have ≤ 150 co-occurring objects

🔍 Sample order check (first 5 keys):
  - agent
    ↳ Top 3 co-objects: [('process', 19), ('milk', 14), ('food', 9)]
  - allergen
    ↳ Top 3 co-objects: [('food', 14), ('growth', 2), ('allergen', 2)]
  - antiquity
    ↳ Top 3 co-objects: [('stone', 4), ('structure', 4), ('charm', 3)]
  - article
    ↳ Top 3 co-objects: [('cover', 23), ('media', 15), ('part', 9)]
  - bed
    ↳ Top 3 co-objects: [('wall', 365), ('surface', 275), ('part', 198)]

📌 Order of values is descending by frequency? Checking sample...
✅ agent values are correctly ordered
✅ allergen values are correctly ordered
✅ antiquity values are correctly ordered
✅ article values are correctly ordered
✅ bed values are correctly ordered


In [None]:
# Check which objects listed in the CSV are missing from the final merged JSON
csv_path = "/content/drive/MyDrive/Grad/CAP6412-0001/Project/interesting_objects_v3.csv"
final_json_path = "/content/drive/MyDrive/Grad/CAP6412-0001/Project/final_merged_sorted_cooccur.json"

df = pd.read_csv(csv_path)
# Trim surrounding whitespace as well as lowercasing so a padded CSV cell is
# compared by its actual name instead of being reported as missing (and then
# printed in a way that hides the stray space).
csv_objects = set(df["Object"].dropna().astype(str).str.strip().str.lower())

with open(final_json_path, "r") as f:
    json_data = json.load(f)
json_keys = set(json_data.keys())

# Names present in the CSV but absent from the JSON's top-level keys
missing = sorted(csv_objects - json_keys)

print(f"❌ Missing objects ({len(missing)}):")
for obj in missing:
    print(f"- {obj}")


❌ Missing objects (1):
- extra


In [None]:
import pandas as pd
from collections import Counter

# Location of the object whitelist
csv_path = "/content/drive/MyDrive/Grad/CAP6412-0001/Project/interesting_objects_v3.csv"

df = pd.read_csv(csv_path)
objects_raw = df["Object"].dropna().astype(str).tolist()
# Compare names after trimming whitespace and lowercasing
objects_normalized = []
for raw_name in objects_raw:
    objects_normalized.append(raw_name.strip().lower())

# Tally how often each normalized name occurs
object_counter = Counter(objects_normalized)

total_count = len(objects_normalized)
unique_count = len(set(objects_normalized))
duplicates = {}
for obj, count in object_counter.items():
    if count > 1:
        duplicates[obj] = count

# Summary
print(f"🔢 Total objects (original): {total_count}")
print(f"🔡 Unique objects (normalized): {unique_count}")
print(f"♻️ Duplicate entries found: {len(duplicates)}")

if len(duplicates) > 0:
    print("\n🔁 Duplicate examples:")
    # Show at most ten duplicated names
    for obj, count in list(duplicates.items())[:10]:
        print(f"- {obj}: {count} times")


🔢 Total objects (original): 159
🔡 Unique objects (normalized): 159
♻️ Duplicate entries found: 0


In [None]:
# Merged co-occurrence file to scan for names containing whitespace
final_json_path = "/content/drive/MyDrive/Grad/CAP6412-0001/Project/final_merged_sorted_cooccur.json"

with open(final_json_path, "r") as f:
    data = json.load(f)

# A name counts as multi-word when whitespace splitting yields several tokens
multi_word_keys = []
for k in data:
    if len(k.split()) > 1:
        multi_word_keys.append(k)

multi_word_values = {
    co_obj
    for co_dict in data.values()
    for co_obj in co_dict
    if len(co_obj.split()) > 1
}

print(f"Total multi-word keys: {len(multi_word_keys)}")
if multi_word_keys:
    print("Sample multi-word keys:")
    # At most ten examples
    for k in multi_word_keys[:10]:
        print(f"- {k}")

print(f"\nTotal multi-word co-occurring objects (values): {len(multi_word_values)}")
if multi_word_values:
    print("Sample multi-word values:")
    # At most ten examples, in the set's iteration order
    for v in list(multi_word_values)[:10]:
        print(f"- {v}")


Total multi-word keys: 0

Total multi-word co-occurring objects (values): 0
