In [2]:
import os 
import json
import pandas as pd
from tqdm import tqdm
from geopy.geocoders import Nominatim
from time import sleep

In [12]:
# Input/output locations. All three live under one data directory; the shared
# prefix is spelled once so the paths cannot drift apart.
_data_root = "d:/multiagent-disaster-reasoning/data"

# CSV with one row per (tweet, image) pair.
fusion_path = f"{_data_root}/cleaned_fusion_with_text.csv"
# Directory holding the per-disaster "<disaster_type>_final_data.json" files.
json_dir = f"{_data_root}/CrisisMMD_v2.0/json"
# Destination of the generated metadata file.
output_path = f"{_data_root}/meta.json"

In [None]:
# Build meta.json: for every row of the fusion CSV, look up the matching tweet
# in the per-disaster JSON-lines file, geocode its "location" string, and
# collect one record per row that has a matching tweet.
fusion_df = pd.read_csv(fusion_path)
print(f"✅ Loaded fusion_df: {fusion_df.shape}")

# Setup geolocator
# NOTE(review): "geoapiExercises" looks like a tutorial placeholder; the
# geocoding service expects an application-specific user agent — confirm.
geolocator = Nominatim(user_agent="geoapiExercises", timeout=10)
meta_data = []
# location string -> (lat, lon). Lookups that returned no result are cached as
# (None, None) so the same unresolvable string is not requested again.
location_cache = {}
# json_path -> {tweet id as str: tweet dict}, or None when the file is missing
# or unreadable. Each file is parsed once instead of once per CSV row.
tweet_index_cache = {}


def _load_tweet_index(json_path):
    """Parse a JSON-lines file into a ``{str(id): tweet}`` dict.

    Blank lines are skipped. Records without an ``id`` key are ignored, and
    when an id occurs more than once the first occurrence wins.
    """
    index = {}
    with open(json_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            tweet = json.loads(line)
            if "id" in tweet:
                index.setdefault(str(tweet["id"]), tweet)
    return index


def _get_tweet_index(json_path):
    """Return the cached tweet index for ``json_path``, loading it on first use.

    Returns None when the file does not exist or cannot be parsed; a parse
    failure is reported once rather than once per row.
    """
    if json_path not in tweet_index_cache:
        index = None
        if os.path.exists(json_path):
            try:
                index = _load_tweet_index(json_path)
            except (OSError, ValueError) as e:  # ValueError covers JSON/Unicode decode errors
                print(f"💥 Could not load {json_path}: {e}")
        tweet_index_cache[json_path] = index
    return tweet_index_cache[json_path]


def _geocode(location):
    """Return ``(lat, lon)`` for a location string, or ``(None, None)``.

    Results (including "no match") are memoised in ``location_cache``. A lookup
    that raises is reported and NOT cached, so a later row can retry it.
    """
    if location in location_cache:
        return location_cache[location]

    coords = (None, None)
    try:
        geo = geolocator.geocode(location)
        if geo:
            coords = (geo.latitude, geo.longitude)
        location_cache[location] = coords
    except Exception as e:  # deliberately not a bare except: Ctrl-C must still stop the loop
        print(f"⚠️ Geocoding failed for location={location!r}: {e}")
    finally:
        sleep(1)  # Respect geocoding API rate limit
    return coords


# Iterate over each row in CSV
for _, row in tqdm(fusion_df.iterrows(), total=len(fusion_df)):
    tweet_id = str(row['tweet_id'])
    disaster_type = row['disaster_type']
    image_name = row['image_name']

    # JSON file path
    json_path = os.path.join(json_dir, f"{disaster_type}_final_data.json")

    try:
        tweet_index = _get_tweet_index(json_path)
        if tweet_index is None:
            continue

        # Match tweet
        tweet_data = tweet_index.get(tweet_id)
        if not tweet_data:
            continue

        created_at = tweet_data.get("created_at", None)
        # NOTE(review): reads a top-level "location" key; confirm the JSON
        # records actually carry it there (and not nested under another object).
        location = tweet_data.get("location", None)

        lat, lon = _geocode(location) if location else (None, None)

        meta_data.append({
            "tweet_id": tweet_id,
            "image_name": image_name,
            "disaster_type": disaster_type,
            "timestamp": created_at,
            "latitude": lat,
            "longitude": lon
        })

    except Exception as e:
        print(f"💥 Failed for tweet_id={tweet_id}, disaster_type={disaster_type}: {e}")

# Save meta.json
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(meta_data, f, indent=4)

print(f"✅ meta.json saved with {len(meta_data)} records → {output_path}")

✅ Loaded fusion_df: (8534, 25)


  2%|█▉                                                                             | 213/8534 [01:43<55:38,  2.49it/s]