### Import Required Libraries and Set Up Environment Variables

In [1]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
from datetime import datetime

# Load the NASA_API_KEY from the env file
load_dotenv()
NASA_API_KEY = os.getenv('NASA_API_KEY')

### Retrieve CME Data

In [2]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for CMEs:
CME = "CME"

# Search for CMEs published between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for CME
cme_url = f"{base_url}{CME}?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}"

In [3]:
# Make a "GET" request for the CME URL and store it in a variable named cme_response
cme_response = requests.get(cme_url)

In [4]:
# Convert the response variable to json and store it as a variable named cme_json
cme_json = cme_response.json()

In [5]:
# Preview ONLY the first element from the cme_json list you created in JSON format
# Do NOT print out the entire list
# Use json.dumps with argument indent=4 to format data
cme_json_preview = json.dumps(cme_json[0], indent=4)
print(cme_json_preview)

{
    "activityID": "2013-05-01T03:12:00-CME-001",
    "catalog": "M2M_CATALOG",
    "startTime": "2013-05-01T03:12Z",
    "instruments": [
        {
            "displayName": "SOHO: LASCO/C2"
        },
        {
            "displayName": "SOHO: LASCO/C3"
        },
        {
            "displayName": "STEREO A: SECCHI/COR2"
        },
        {
            "displayName": "STEREO B: SECCHI/COR2"
        }
    ],
    "sourceLocation": "",
    "activeRegionNum": null,
    "note": "",
    "submissionTime": "2013-08-07T16:54Z",
    "versionId": 1,
    "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/CME/2349/-1",
    "cmeAnalyses": [
        {
            "isMostAccurate": true,
            "time21_5": "2013-05-01T07:07Z",
            "latitude": 12.0,
            "longitude": -120.0,
            "halfAngle": 36.0,
            "speed": 860.0,
            "type": "C",
            "featureCode": "null",
            "imageType": null,
            "measurementTechnique": "null",
   

In [6]:
# Convert cme_json to a Pandas DataFrame
cme_df = pd.DataFrame(cme_json)

# Keep only the columns: activityID, startTime, linkedEvents
cme_df = cme_df[["activityID", "startTime", "linkedEvents"]]

In [7]:
# Notice that the linkedEvents column allows us to identify the corresponding GST
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to GSTs
cme_df = cme_df[cme_df["linkedEvents"].notnull()]

### Retrieve GST Data

In [8]:
# Set the specifier for Geomagnetic Storms (GST):
GST = "GST"

# Build URL for GST
gst_url = f"{base_url}{GST}?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}"

In [9]:
# Make a "GET" request for the GST URL and store it in a variable named gst_response
gst_response = requests.get(gst_url)

In [10]:
# Convert the response variable to json and store it as a variable named gst_json
gst_json = gst_response.json()

In [11]:
# Preview ONLY the first element from the gst_json list you created in JSON format
print(json.dumps(gst_json[0], indent=4))

{
    "gstID": "2013-06-01T01:00:00-GST-001",
    "startTime": "2013-06-01T01:00Z",
    "allKpIndex": [
        {
            "observedTime": "2013-06-01T01:00Z",
            "kpIndex": 6.0,
            "source": "NOAA"
        }
    ],
    "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/GST/326/-1",
    "linkedEvents": [
        {
            "activityID": "2013-05-31T15:45:00-HSS-001"
        }
    ],
    "submissionTime": "2013-07-15T19:26Z",
    "versionId": 1
}


In [12]:
# Convert gst_json to a Pandas DataFrame
gst_df = pd.DataFrame(gst_json)

# Keep only the columns: gstID, startTime, linkedEvents
gst_df = gst_df[["gstID", "startTime", "linkedEvents"]]

### Merge both datasets

In [14]:
# Extract the first activityID from the linkedEvents column in gst_df
gst_df["linkedActivityID"] = gst_df["linkedEvents"].apply(
    lambda x: x[0]["activityID"] if isinstance(x, list) and len(x) > 0 else None
)

# Now merge both datasets using 'activityID' from cme_df and 'linkedActivityID' from gst_df
merged_df = pd.merge(
    cme_df, gst_df,
    left_on="activityID", right_on="linkedActivityID",
    how="inner"
)

### Computing the time it takes for a CME to cause a GST

In [15]:
# Compute the time diff between startTime_GST and startTime_CME by creating a new column called `timeDiff`.
merged_df["startTime_CME"] = pd.to_datetime(merged_df["startTime_x"])
merged_df["startTime_GST"] = pd.to_datetime(merged_df["startTime_y"])
merged_df["timeDiff"] = merged_df["startTime_GST"] - merged_df["startTime_CME"]

In [16]:
# Use describe() to compute the mean and median time 
# that it takes for a CME to cause a GST.
print(merged_df["timeDiff"].describe())

count                           48
mean               3 days 01:31:55
std      1 days 00:32:58.230543740
min                1 days 08:36:00
25%                2 days 05:53:00
50%                2 days 20:44:30
75%                3 days 18:51:00
max                6 days 03:00:00
Name: timeDiff, dtype: object


### Exporting data in csv format

In [17]:
# Export data to CSV without the index
merged_df.to_csv("processed_cme_gst_data.csv", index=False)