### Import Required Libraries and Set Up Environment Variables

In [7]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import os
from datetime import datetime
## Load the NASA_API_KEY from the env file
load_dotenv()
NASA_API_KEY = os.getenv('NASA_API_KEY')

### CME Data

In [8]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for CMEs:
CME = "CME"

# Search for CMEs published between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for CME
url = f"{base_url}CME?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}"

In [None]:
# Make a "GET" request for the CME URL and store it in a variable named cme_response

cme_response = requests.get(url)
cme_response

In [10]:
# Convert the response variable to json and store it as a variable named cme_json

cme_json = cme_response.json()

In [None]:
# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data

print(json.dumps(cme_json[0], indent=4))

In [None]:
# Convert cme_json to a Pandas DataFrame 

cme_df = pd.DataFrame(cme_json)

# Keep only the columns: activityID, startTime, linkedEvents

cme_df = cme_df[["activityID", "startTime", "linkedEvents"]]
cme_df.head()


In [None]:
# Notice that the linkedEvents column allows us to identify the corresponding GST
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to GSTs

cme_df = cme_df.dropna(subset=["linkedEvents"])
cme_df.head()


In [None]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Write a nested for loop that iterates first over each row in the cme DataFrame (using the index)
# and then iterates over the values in 'linkedEvents' 
# and adds the elements individually to a list of dictionaries where each row is one element 

# Initialize an empty list to store the expanded rows
expanded_rows = []


# Iterate over each index in the DataFrame
for index, row in cme_df.iterrows():

    # Iterate over each dictionary in the list
    for linked_event in row["linkedEvents"]:
    
        # Append a new dictionary to the expanded_rows list for each dictionary item and corresponding 'activityID' and 'startTime' value
        expanded_rows.append({"activityID": row["activityID"], "startTime": row["startTime"], "linkedEvents" : linked_event, **linked_event})
      
# Create a new DataFrame from the expanded rows
expanded_df = pd.DataFrame(expanded_rows)
expanded_df.head()


In [41]:
# Create a function called extract_activityID_from_dict that takes a dict as input such as in linkedEvents
# and verify below that it works as expected using one row from linkedEvents as an example
# Be sure to use a try and except block to handle errors

        # Log the error or print it for debugging

def extract_activityID_from_dict(linked_event):
    try:
        return linked_event["activityID"]
    except (ValueError, TypeError) as e:
        print(f"Error: {e}")




In [None]:
# Apply this function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'GST_ActivityID' using loc indexer:

expanded_df.loc[:, "GST_ActivityID"] = expanded_df["linkedEvents"].apply(lambda x: extract_activityID_from_dict(x))
expanded_df.head()


In [43]:
# Remove rows with missing GST_ActivityID, since we can't assign them to GSTs:

expanded_df = expanded_df.dropna(subset=["GST_ActivityID"])

In [None]:
# print out the datatype of each column in this DataFrame:

print(expanded_df.dtypes)

In [None]:
# Convert the 'GST_ActivityID' column to string format 

expanded_df["GST_ActivityID"] = expanded_df["GST_ActivityID"].astype(str)

# Convert startTime to datetime format  

expanded_df["startTime"] = pd.to_datetime(expanded_df["startTime"])

# Rename startTime to startTime_CME and activityID to cmeID

expanded_df = expanded_df.rename(columns={"startTime": "startTime_CME", "activityID": "cmeID"})

# Drop linkedEvents

expanded_df = expanded_df.drop(columns=["linkedEvents"])


# Verify that all steps were executed correctly

expanded_df.head()
print(expanded_df.dtypes)


In [48]:
# We are only interested in CMEs related to GSTs so keep only rows where the GST_ActivityID column contains 'GST'
# use the method 'contains()' from the str library.  

expanded_df = expanded_df[expanded_df["GST_ActivityID"].str.contains("GST")]



### GST Data

In [50]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for Geomagnetic Storms (GST):
GST = "GST"

# Search for GSTs between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for GST

url = f"{base_url}GST?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}"


In [52]:
# Make a "GET" request for the GST URL and store it in a variable named gst_response

gst_response = requests.get(url)
gst_response


<Response [200]>

In [53]:
# Convert the response variable to json and store it as a variable named gst_json

gst_json = gst_response.json()

# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data

print(json.dumps(gst_json[0], indent=4))


{
    "gstID": "2013-06-01T01:00:00-GST-001",
    "startTime": "2013-06-01T01:00Z",
    "allKpIndex": [
        {
            "observedTime": "2013-06-01T01:00Z",
            "kpIndex": 6.0,
            "source": "NOAA"
        }
    ],
    "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/GST/326/-1",
    "linkedEvents": [
        {
            "activityID": "2013-05-31T15:45:00-HSS-001"
        }
    ],
    "submissionTime": "2013-07-15T19:26Z",
    "versionId": 1
}


In [56]:
# Convert gst_json to a Pandas DataFrame 

gst_df = pd.DataFrame(gst_json) 

# Keep only the columns: activityID, startTime, linkedEvents

gst_df = gst_df[["gstID", "startTime", "linkedEvents"]]
gst_df.head()


Unnamed: 0,gstID,startTime,linkedEvents
0,2013-06-01T01:00:00-GST-001,2013-06-01T01:00Z,[{'activityID': '2013-05-31T15:45:00-HSS-001'}]
1,2013-06-07T03:00:00-GST-001,2013-06-07T03:00Z,[{'activityID': '2013-06-02T20:24:00-CME-001'}]
2,2013-06-29T03:00:00-GST-001,2013-06-29T03:00Z,
3,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,[{'activityID': '2013-09-29T22:40:00-CME-001'}...
4,2013-12-08T00:00:00-GST-001,2013-12-08T00:00Z,[{'activityID': '2013-12-04T23:12:00-CME-001'}...


In [57]:
# Notice that the linkedEvents column allows us to identify the corresponding CME
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to CME

gst_df = gst_df.dropna(subset=["linkedEvents"])


In [58]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Use the explode method to ensure that each row is one element. Ensure to reset the index and drop missing values.

gst_df = gst_df.explode("linkedEvents").reset_index(drop=True).dropna(subset=["linkedEvents"])


In [59]:
# Apply the extract_activityID_from_dict function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'CME_ActivityID' using loc indexer:

gst_df.loc[:, "CME_ActivityID"] = gst_df["linkedEvents"].apply(lambda x: extract_activityID_from_dict(x))

# Remove rows with missing CME_ActivityID, since we can't assign them to CMEs:

gst_df = gst_df.dropna(subset=["CME_ActivityID"])


In [60]:
# Convert the 'CME_ActivityID' column to string format 

gst_df["CME_ActivityID"] = gst_df["CME_ActivityID"].astype(str)

# Convert the 'gstID' column to string format 

gst_df["gstID"] = gst_df["gstID"].astype(str)

# Convert startTime to datetime format  

gst_df["startTime"] = pd.to_datetime(gst_df["startTime"])

# Rename startTime to startTime_GST 

gst_df = gst_df.rename(columns={"startTime": "startTime_GST"})

# Drop linkedEvents

gst_df = gst_df.drop(columns=["linkedEvents"])
# Verify that all steps were executed correctly

gst_df.head()


Unnamed: 0,gstID,startTime_GST,CME_ActivityID
0,2013-06-01T01:00:00-GST-001,2013-06-01 01:00:00+00:00,2013-05-31T15:45:00-HSS-001
1,2013-06-07T03:00:00-GST-001,2013-06-07 03:00:00+00:00,2013-06-02T20:24:00-CME-001
2,2013-10-02T03:00:00-GST-001,2013-10-02 03:00:00+00:00,2013-09-29T22:40:00-CME-001
3,2013-10-02T03:00:00-GST-001,2013-10-02 03:00:00+00:00,2013-10-02T01:54:00-IPS-001
4,2013-10-02T03:00:00-GST-001,2013-10-02 03:00:00+00:00,2013-10-02T02:47:00-MPC-001


In [61]:
# We are only interested in GSTs related to CMEs so keep only rows where the CME_ActivityID column contains 'CME'
# use the method 'contains()' from the str library.  

gst_df = gst_df[gst_df["CME_ActivityID"].str.contains("CME")]


### Merge both datatsets

In [68]:
# Now merge both datasets using 'gstID' and 'CME_ActivityID' for gst and 'GST_ActivityID' and 'cmeID' for cme. Use the 'left_on' and 'right_on' specifiers.

ValueError: len(right_on) must equal len(left_on)

In [65]:
# Verify that the new DataFrame has the same number of rows as cme and gst

print(len(merged_df), len(cme_df), len(gst_df))


0 1023 61


### Computing the time it takes for a CME to cause a GST

In [32]:
# Compute the time diff between startTime_GST and startTime_CME by creating a new column called `timeDiff`.


In [33]:
# Use describe() to compute the mean and median time 
# that it takes for a CME to cause a GST. 


### Exporting data in csv format

In [34]:
# Export data to CSV without the index
