In [None]:
import aiohttp
import asyncio
import nest_asyncio
import pandas as pd
import numpy as np
import requests
import json

In [None]:
# Using the SBDB Close-Approach Data (CAD) API: fetch every Earth close-approach
# record for NEOs from 2004-01-01 through 2024-06-01 (roughly 20 years of data).
data_url = "https://ssd-api.jpl.nasa.gov/cad.api?date-min=2004-01-01&date-max=2024-06-01"
r = requests.get(data_url, timeout=60)  # bounded wait so a stalled connection cannot hang the notebook
r.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error body as data
data = r.json()

# The response carries the column names in 'fields' and one list per record in 'data'.
fields = data['fields']
data_list = data['data']

df = pd.DataFrame(data_list, columns=fields)
df.to_csv('neo_data.csv', index=False) # this outputs a dataset of ~15000 entries, with the first 11 columns from the SBDB API
print(df.head())

In [None]:
import os
import time
from tqdm import tqdm # to track progress and time taken to extract data

# The NASA API key is a credential: read it from the environment instead of
# committing it to the notebook.
API_KEY = os.environ.get("NASA_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "NASA_API_KEY is not set. Export your api.nasa.gov key as the "
        "NASA_API_KEY environment variable before running this cell."
    )
# '{des}' is filled in per object by fetch_neo_data via str.format.
BASE_URL = "https://api.nasa.gov/neo/rest/v1/neo/{des}?api_key=" + API_KEY

file_path = 'neo_data.csv'
neo_data = pd.read_csv(file_path)

# define new columns/data to be extracted from NASA API
new_columns = [
    'estimated_diameter_min_km', 'estimated_diameter_max_km', 'data_arc_in_days',
    'observations_used', 'orbit_uncertainty', 'minimum_orbit_intersection',
    'jupiter_tisserand_invariant', 'epoch_osculation', 'eccentricity',
    'semi_major_axis', 'inclination', 'ascending_node_longitude',
    'orbital_period', 'perihelion_distance', 'perihelion_argument',
    'aphelion_distance', 'perihelion_time', 'mean_anomaly', 'mean_motion',
    'is_potentially_hazardous_asteroid'
]

# adding new columns to neo data df, initializing with None values
# (rows whose lookup fails keep None in these columns)
for col in new_columns:
    neo_data[col] = None
# fetch data from the NASA API
def fetch_neo_data(des, timeout=30):
    """Fetch the extra features for one object from the NASA API.

    Args:
        des: Object designation (value from the 'des' column). It is
            percent-encoded and substituted into BASE_URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        dict: One entry per new column (diameter bounds, orbital elements and
        the 'is_potentially_hazardous_asteroid' label). An empty dict is
        returned when the request fails or the response does not contain the
        expected fields, so a single bad object never aborts the caller's loop.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    # orbital_data fields copied verbatim into columns of the same name
    orbital_fields = (
        'data_arc_in_days', 'observations_used', 'orbit_uncertainty',
        'minimum_orbit_intersection', 'jupiter_tisserand_invariant',
        'epoch_osculation', 'eccentricity', 'semi_major_axis', 'inclination',
        'ascending_node_longitude', 'orbital_period', 'perihelion_distance',
        'perihelion_argument', 'aphelion_distance', 'perihelion_time',
        'mean_anomaly', 'mean_motion',
    )

    # Encode the designation as a single path segment so characters such as
    # spaces or '/' cannot change the route of the request.
    url = BASE_URL.format(des=quote(str(des), safe=''))
    try:
        response = requests.get(url, timeout=timeout)  # HTTP get request to NASA API
        response.raise_for_status()  # HTTP errors
        data = response.json()  # converting JSON response from NASA API to dict

        diameter_km = data['estimated_diameter']['kilometers']
        orbital = data['orbital_data']

        features = {
            'estimated_diameter_min_km': diameter_km['estimated_diameter_min'],
            'estimated_diameter_max_km': diameter_km['estimated_diameter_max'],
        }
        for field in orbital_fields:
            features[field] = orbital[field]
        features['is_potentially_hazardous_asteroid'] = data['is_potentially_hazardous_asteroid']  # label
        return features
    # handling objects that are looked up and not in the NASA API (and network failures)
    except requests.exceptions.RequestException as e:
        print(f"API request failed for {des}: {e}")
        return {}
    # handling responses that are not valid JSON or lack an expected field
    except (KeyError, TypeError, ValueError) as e:
        print(f"Unexpected API response for {des}: {e!r}")
        return {}

# For each row, look up the object's extra features and write them into neo_data.
# The close-approach table has one row per approach, so the same designation can
# appear many times: successful lookups are cached and only real HTTP requests
# count toward the hourly rate limit.
request_count = 0
start_time = time.time()
save_interval = None  # set to e.g. 100 to write an intermediate CSV every N rows
fetched_by_des = {}  # designation -> features dict from a successful lookup

# https://github.com/softhints/Pandas-Tutorials/blob/master/tqdm/1.progress-bars-pandas-python-tqdm.ipynb
for index, row in tqdm(neo_data.iterrows(), total=neo_data.shape[0]):
    des = row['des']  # designation of the object on the current row
    api_data = fetched_by_des.get(des)

    if api_data is None:
        # once 1000 requests have been made, wait out the rest of the hour
        if request_count >= 1000: # 1000 requests per hour
            elapsed_time = time.time() - start_time
            sleep_time = 3600 - elapsed_time
            if sleep_time > 0:
                print(f"Rate limit reached, sleeping for {sleep_time:.2f} seconds.")
                time.sleep(sleep_time)
            start_time = time.time()
            request_count = 0

        api_data = fetch_neo_data(des)  # dict of features, or {} if the lookup failed
        request_count += 1 # increment request count
        if api_data:
            # failed lookups are not cached, so a later row can retry them
            fetched_by_des[des] = api_data

    # if api_data is not empty, copy each feature into the current row
    if api_data:
        for key, value in api_data.items():
            neo_data.at[index, key] = value

    # optional checkpoint so a crash does not lose hours of fetched data
    if save_interval and (index + 1) % save_interval == 0:
        print(f"Processed {index + 1} rows. Saving intermediate results.")
        neo_data.to_csv(f'intermediate_neo_data_{index + 1}.csv', index=False)

# save expanded data to new csv file
neo_data.to_csv('final_neo_data.csv', index=False)
print("Data fetching complete. CSV file has been saved.")