In [5]:
import json
import pandas as pd
import os
import requests
from tqdm import tqdm
# Path (relative to the notebook's working directory) of the JSON export that the
# next cell loads. NOTE(review): the filename suggests sky-condition observations
# covering 2024-07-01 to 2025-08-30 -- confirm against the data source.
file_path = "sky_conditions_20240701_20250830.json"

In [6]:
# Read the whole export into memory as a Python object.
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, so the same file could fail to decode (or decode
# incorrectly) on a machine whose default is not UTF-8.
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [7]:
# Keep only the `properties` mapping of every entry under data['features'];
# each mapping is treated as one observation record by the cells below.
# (To inspect a record while exploring: json.dumps(observations[0], indent=2).)
observations = [feature['properties'] for feature in data['features']]

In [8]:
# Observation keys that flag the presence of a given cloud type.
# The flag values are compared against the string 'true' (not a boolean).
cloud_types = [
    "skyconditionsAltocumulus",
    "skyconditionsAltostratus",
    "skyconditionsCirrocumulus",
    "skyconditionsCirrostratus",
    "skyconditionsCirrus",
    "skyconditionsCumulonimbus",
    "skyconditionsCumulus",
    "skyconditionsNimbostratus",
    "skyconditionsStratocumulus",
    "skyconditionsStratus"
]


def has_cloud_type(obs):
    """Return True if at least one cloud-type flag in `obs` equals 'true'.

    Args:
        obs: Mapping of observation property names to values.

    Returns:
        bool: True when any key listed in `cloud_types` maps to the string
        'true'. A key that is absent from `obs` is treated as not flagged
        instead of raising KeyError.
    """
    # .get() keeps sparse records (missing some flags) from aborting the scan.
    return any(obs.get(cloud_type) == 'true' for cloud_type in cloud_types)

# Observation keys that hold the URL of a photo for each viewing direction.
photo_types = [
    "skyconditionsNorthPhotoUrl",
    "skyconditionsSouthPhotoUrl",
    "skyconditionsEastPhotoUrl",
    "skyconditionsWestPhotoUrl",
    "skyconditionsUpwardPhotoUrl"
]


def _photo_present(obs, photo_type):
    """Return True if `obs` carries a value for `photo_type`.

    A missing key, a None value, and the literal string 'null' all count as
    "no photo". Any other value counts as present, including strings that are
    not URLs.
    """
    value = obs.get(photo_type)
    return value is not None and value != 'null'


def has_photos(obs):
    """Return True if `obs` has a photo value for at least one direction.

    Args:
        obs: Mapping of observation property names to values.

    Returns:
        bool: True when any key in `photo_types` is present and not 'null'.
    """
    return any(_photo_present(obs, photo_type) for photo_type in photo_types)


def count_photos(obs):
    """Return how many of the keys in `photo_types` hold a photo value in `obs`.

    Args:
        obs: Mapping of observation property names to values.

    Returns:
        int: Number of directions whose value is present and not 'null'.
    """
    return sum(int(_photo_present(obs, photo_type)) for photo_type in photo_types)

# Number of observations that have both a flagged cloud type and a photo value.
n_labelled_with_photos = sum(
    1 for obs in observations if has_cloud_type(obs) and has_photos(obs)
)
print(n_labelled_with_photos)

# Total photo values across observations with a flagged cloud type
# (observations without one contribute 0).
n_labelled_photos = sum(
    int(has_cloud_type(obs)) * count_photos(obs) for obs in observations
)
print(n_labelled_photos)

77050
341695


In [9]:
# Build one row per usable photo: the photo URL plus a 0/1 column for every
# cloud type flagged on the observation the photo belongs to.
rows = []
for obs in observations:
    if not (has_cloud_type(obs) and has_photos(obs)):
        continue

    # Column name = cloud-type key with the 'skyconditions' prefix removed,
    # lower-cased (e.g. 'skyconditionsCirrus' -> 'cirrus'). .get() treats a
    # flag that is absent from the record as 0 instead of raising KeyError.
    one_hot = {
        ct[len('skyconditions'):].lower(): int(obs.get(ct) == 'true')
        for ct in cloud_types
    }
    for pt in photo_types:
        photo_url = obs.get(pt)
        # Validate BEFORE looking at the suffix: obs.get() returns None for a
        # missing key, and slicing None would raise TypeError.
        # Only strings ending in lowercase 'jpg' are kept; this also drops the
        # 'null' placeholder, empty strings, and any other suffix.
        if not isinstance(photo_url, str) or not photo_url.endswith('jpg'):
            continue
        rows.append({'photo_url': photo_url, **one_hot})

print(f'number of observations: {len(rows)}')
# Sanity check: the set of 3-character URL suffixes that survived the filter.
s = {row['photo_url'][-3:] for row in rows}
print(f'types of data: {s}')

number of observations: 334540
types of data: {'jpg'}


In [10]:
# Tabulate the collected rows: one line per photo, with `photo_url` followed by
# the per-cloud-type 0/1 columns.
df = pd.DataFrame(data=rows)
preview = df.head()
print(preview)

                                           photo_url  altocumulus  \
0  https://data.globe.gov/system/photos/2025/08/3...            1   
1  https://data.globe.gov/system/photos/2025/08/3...            1   
2  https://data.globe.gov/system/photos/2025/08/3...            0   
3  https://data.globe.gov/system/photos/2025/08/3...            0   
4  https://data.globe.gov/system/photos/2025/08/3...            0   

   altostratus  cirrocumulus  cirrostratus  cirrus  cumulonimbus  cumulus  \
0            1             0             0       0             0        1   
1            1             0             0       0             0        1   
2            1             0             0       0             0        0   
3            1             0             0       0             0        0   
4            1             0             0       0             0        0   

   nimbostratus  stratocumulus  stratus  
0             0              0        1  
1             0              0        

In [None]:
# Work on an independent copy of the first 2000 rows. `df.head()` returns a
# slice of `df`; a later cell adds a column to `subset`, which on a slice
# triggers pandas' SettingWithCopyWarning.
subset = df.head(2000).copy()
subset.to_csv('clouds_subset.csv', index=False)
os.makedirs('cloud_images', exist_ok=True)

# Downloading is OFF by default, which reproduces the previous behaviour of this
# cell (the download code was commented out): every row gets its intended local
# path whether or not the file exists on disk. Set to True to actually fetch.
DOWNLOAD_IMAGES = False


def fetch_image(url, filepath, timeout=10):
    """Download `url` and write the response body to `filepath`.

    Args:
        url: Address of the image to fetch.
        filepath: Destination path; overwritten if it already exists.
        timeout: Seconds passed to `requests.get` as its timeout.

    Returns:
        bool: True if the server answered with status 200 and the file was
        written; False on any request error, non-200 status, or write error.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return False
        with open(filepath, 'wb') as f:
            f.write(response.content)
    except (requests.RequestException, OSError):
        return False
    return True


# image_paths[i] is the local path for row i of `subset`, or None when a
# download was attempted and failed.
image_paths = []
for i, url in tqdm(enumerate(subset['photo_url']), total=len(subset)):
    filename = f'cloud_{i}.jpg'
    filepath = os.path.join('cloud_images', filename)
    if DOWNLOAD_IMAGES and not fetch_image(url, filepath):
        image_paths.append(None)
    else:
        image_paths.append(filepath)

In [None]:
# Attach the local image paths as a `local_path` column and re-export the CSV.
# `assign` returns a new frame instead of writing into `subset` in place, so no
# SettingWithCopyWarning is raised when `subset` is a slice of another frame
# (it comes from `df.head(...)`), and re-running this cell gives the same result.
subset = subset.assign(local_path=image_paths)
subset.to_csv('clouds_subset.csv', index=False)