In [3]:
import csv
import os
from datetime import datetime, timedelta

import pandas as pd
import requests

# Function to fetch NEO data from NASA API
def get_neo_data(api_key, start_date, end_date, timeout=30):
    """Fetch one date window of near-Earth-object data from the NASA feed API.

    Args:
        api_key: API key sent as the ``api_key`` query parameter.
        start_date: Window start, sent as the ``start_date`` query parameter.
        end_date: Window end, sent as the ``end_date`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The value stored under ``near_earth_objects`` in the JSON response,
        or ``None`` when the request fails, the status is not 200, the body
        is not valid JSON, or the key is absent.
    """
    base_url = "https://api.nasa.gov/neo/rest/v1/feed"

    params = {
        'api_key': api_key,
        'start_date': start_date,
        'end_date': end_date
    }

    # Network-level failures (DNS, connection reset, timeout) are reported
    # the same way as HTTP errors so callers only have to handle ``None``.
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    try:
        data = response.json()
    except ValueError as exc:
        # A 200 with a non-JSON body (e.g. a proxy error page).
        print(f"Error: {exc}")
        return None

    return data.get('near_earth_objects')

# Function to save data to CSV
def save_to_csv(data, filename="neo_data.csv"):
    """Write every record in ``data`` to a CSV file with a stable column order.

    Args:
        data: Mapping whose values are lists of record dicts (the caller
            keys it by date). Records may have differing keys.
        filename: Path of the CSV file to create or overwrite.

    Returns:
        None. Prints a message and writes nothing when ``data`` is empty.

    The header is the union of all record keys in first-seen order, so the
    column layout is the same on every run for the same input. Keys missing
    from a record are written as empty strings. Values are written as-is, so
    nested dicts/lists end up as their string representation in one cell.
    """
    if not data:
        print("No data to save.")
        return

    records = [neo for neos in data.values() for neo in neos]

    # dict.fromkeys de-duplicates while keeping insertion order; a set would
    # give an arbitrary order that changes between interpreter runs.
    columns = list(dict.fromkeys(key for neo in records for key in neo))

    # Explicit UTF-8 so the output does not depend on the platform default.
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=columns, restval='')
        writer.writeheader()
        writer.writerows(records)

    print(f"Data saved to {filename}")

# Fetching and saving data
#
# Walks the date range in fixed-size windows, accumulating records per date
# until either the range is exhausted or ``max_records`` is reached, then
# writes everything to CSV.

# NOTE(review): a literal API key is committed here. Prefer supplying it via
# the NASA_API_KEY environment variable; the literal is kept only as a
# fallback so existing runs keep working. Consider rotating and removing it.
api_key = os.environ.get("NASA_API_KEY", "gevjvSl4oaSmxEWlxGWK3828qUugSvhf9eJrF7Ix")
start_date = datetime.strptime("2015-01-01", "%Y-%m-%d")
end_date = datetime.strptime("2024-01-31", "%Y-%m-%d")

all_neo_data = {}
total_records_fetched = 0
max_records = 100000
days_per_request = 7

# The ``start_date <= end_date`` bound is essential: without it the loop keeps
# issuing requests with an inverted date range forever once the range has been
# consumed but fewer than ``max_records`` records were collected.
while start_date <= end_date and total_records_fetched < max_records:
    # Clamp the final window so it never runs past the requested end date.
    next_end_date = min(start_date + timedelta(days=days_per_request), end_date)

    neo_data = get_neo_data(
        api_key,
        start_date.strftime("%Y-%m-%d"),
        next_end_date.strftime("%Y-%m-%d"),
    )

    # A failed or empty window is skipped; get_neo_data has already printed
    # the error, and the loop moves on to the next window.
    if neo_data:
        for date, neos in neo_data.items():
            all_neo_data.setdefault(date, []).extend(neos)
            total_records_fetched += len(neos)

        print(f"Fetched {total_records_fetched} records so far.")

    # Windows are inclusive on both ends, so the next one starts a day later.
    start_date = next_end_date + timedelta(days=1)

if all_neo_data:
    save_to_csv(all_neo_data)

# Data Cleaning
#
# Loads the raw CSV, de-duplicates, forward-fills gaps, keeps the analysis
# columns, drops non-positive measurements, and writes the cleaned file.
neo_df = pd.read_csv('neo_data.csv')

# Normalize column names for consistency. This runs before any column is
# looked up so that dotted names (e.g. "a.b") are already in their
# underscore form when referenced below.
neo_df.columns = [col.replace('.', '_') for col in neo_df.columns]

# Remove duplicates
neo_df = neo_df.drop_duplicates()

# Handle missing values by forward filling.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
neo_df = neo_df.ffill()

# NOTE(review): the flattened column names used from here on (e.g.
# 'close_approach_data_close_approach_date', 'estimated_diameter_meters_min')
# must already exist in neo_data.csv. save_to_csv writes only each record's
# top-level keys, so if the API payload nests these fields they need to be
# flattened before this step - confirm against the actual CSV header.

# Convert date columns to datetime objects
neo_df['close_approach_date'] = pd.to_datetime(neo_df['close_approach_data_close_approach_date'])

# Extract relevant columns for analysis
columns_of_interest = [
    'name', 'id', 'absolute_magnitude_h', 'estimated_diameter_meters_min',
    'estimated_diameter_meters_max', 'is_potentially_hazardous_asteroid',
    'close_approach_date', 'relative_velocity_kilometers_per_hour',
    'miss_distance_kilometers', 'orbiting_body'
]
# .copy() gives an independent frame so the assignments below do not trigger
# pandas' chained-assignment warnings.
neo_df = neo_df[columns_of_interest].copy()

# Handle outliers in numerical columns: non-positive values become missing
# and are removed by the dropna() that follows.
for col in ['absolute_magnitude_h', 'estimated_diameter_meters_min', 'estimated_diameter_meters_max']:
    neo_df[col] = neo_df[col].where(neo_df[col] > 0)
neo_df = neo_df.dropna()

# Save the cleaned data
neo_df.to_csv('cleaned_neo_data.csv', index=False)

Fetched 101 records so far.


KeyboardInterrupt: 