<a href="https://colab.research.google.com/github/nickcanoy/PSMDSRC103_SY20252026A/blob/main/7_1_DataCollection_tru_API_Exercise_CanoyNick.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#RODENICK CANOY Data Collaction in API Exercise 103-Programming

In [None]:
import requests

def make_request(endpoint, payload=None):
    """
    Make a request to a specific endpoint on the weather API
    passing headers and optional payload.

    Parameters:
        - endpoint: The endpoint of the API you want to
          make a GET request to.
        - payload: A dictionary of data to pass along
          with the request.

    Returns:
        Response object.
    """
    return requests.get(
        f'https://www.ncdc.noaa.gov/cdo-web/api/v2/{endpoint}',
        headers={
            'token': 'aGQokEAzndYvedTahoYxvESuyGeYxmMd'
        },
        params=payload
    )

In [None]:
# see what datasets are available
response = make_request('datasets', {'startdate': '2018-10-01'})
response.status_code

200

In [None]:
response.json().keys()


dict_keys(['metadata', 'results'])

In [None]:
response.json()['metadata']

{'resultset': {'offset': 1, 'count': 11, 'limit': 25}}

In [None]:
response.json()['results'][0].keys()

dict_keys(['uid', 'mindate', 'maxdate', 'name', 'datacoverage', 'id'])

In [None]:
[(data['id'], data['name']) for data in response.json()['results']]

[('GHCND', 'Daily Summaries'),
 ('GSOM', 'Global Summary of the Month'),
 ('GSOY', 'Global Summary of the Year'),
 ('NEXRAD2', 'Weather Radar (Level II)'),
 ('NEXRAD3', 'Weather Radar (Level III)'),
 ('NORMAL_ANN', 'Normals Annual/Seasonal'),
 ('NORMAL_DLY', 'Normals Daily'),
 ('NORMAL_HLY', 'Normals Hourly'),
 ('NORMAL_MLY', 'Normals Monthly'),
 ('PRECIP_15', 'Precipitation 15 Minute'),
 ('PRECIP_HLY', 'Precipitation Hourly')]

In [None]:
# get data category id
response = make_request(
    'datacategories',
    payload={
        'datasetid': 'GHCND'
    }
)
response.status_code

200

In [None]:
response.json()['results']

[{'name': 'Evaporation', 'id': 'EVAP'},
 {'name': 'Land', 'id': 'LAND'},
 {'name': 'Precipitation', 'id': 'PRCP'},
 {'name': 'Sky cover & clouds', 'id': 'SKY'},
 {'name': 'Sunshine', 'id': 'SUN'},
 {'name': 'Air Temperature', 'id': 'TEMP'},
 {'name': 'Water', 'id': 'WATER'},
 {'name': 'Wind', 'id': 'WIND'},
 {'name': 'Weather Type', 'id': 'WXTYPE'}]

In [None]:
#Get Data Type ID
response = make_request(
    'datatypes',
    payload={
        'datacategoryid': 'TEMP',
        'limit': 100
    }
)
response.status_code

200

In [None]:
[(datatype['id'], datatype['name']) for datatype in response.json()['results']][-5:] # look at the last 5

[('MNTM', 'Monthly mean temperature'),
 ('TAVG', 'Average Temperature.'),
 ('TMAX', 'Maximum temperature'),
 ('TMIN', 'Minimum temperature'),
 ('TOBS', 'Temperature at the time of observation')]

In [None]:
# get location category id
response = make_request(
    'locationcategories',
    {
        'datasetid' : 'GHCND'
    }
)
response.status_code

200

In [None]:
import pprint
pprint.pprint(response.json())

{'metadata': {'resultset': {'count': 12, 'limit': 25, 'offset': 1}},
 'results': [{'id': 'CITY', 'name': 'City'},
             {'id': 'CLIM_DIV', 'name': 'Climate Division'},
             {'id': 'CLIM_REG', 'name': 'Climate Region'},
             {'id': 'CNTRY', 'name': 'Country'},
             {'id': 'CNTY', 'name': 'County'},
             {'id': 'HYD_ACC', 'name': 'Hydrologic Accounting Unit'},
             {'id': 'HYD_CAT', 'name': 'Hydrologic Cataloging Unit'},
             {'id': 'HYD_REG', 'name': 'Hydrologic Region'},
             {'id': 'HYD_SUB', 'name': 'Hydrologic Subregion'},
             {'id': 'ST', 'name': 'State'},
             {'id': 'US_TERR', 'name': 'US Territory'},
             {'id': 'ZIP', 'name': 'Zip Code'}]}


In [None]:
import requests

def make_request(endpoint, payload=None):
    """
    Make a request to a specific endpoint on the weather API
    passing headers and optional payload.

    Parameters:
        - endpoint: The endpoint of the API you want to
          make a GET request to.
        - payload: A dictionary of data to pass along
          with the request.

    Returns:
        Response object.
    """
    return requests.get(
        f'https://www.ncdc.noaa.gov/cdo-web/api/v2/{endpoint}',
        headers={
            'token': 'aGQokEAzndYvedTahoYxvESuyGeYxmMd'
        },
        params=payload
    )

def get_item(name, what, endpoint, start=1, end=None):
    """
    Grab the JSON payload for a given field by name using binary search.

    Parameters:
    - name: The item to look for.
    - what: Dictionary specifying what the item in `name` is.
    - endpoint: Where to look for the item.
    - start: The position to start at. We don't need to touch this, but the
             function will manipulate this with recursion.
    - end: The last position of the cities. Used to find the midpoint, but
           like `start` this is not something we need to worry about.

    Returns:
    Dictionary of the information for the item if found otherwise
    an empty dictionary.
    """

    # find the midpoint which we use to cut the data in half each time
    mid = (start + (end if end else 1)) // 2

    # lowercase the name so this is not case-sensitive
    name = name.lower()

    # define the payload we will send with each request
    payload = {
        'datasetid': 'GHCND',
        'sortfield': 'name',
        'offset': mid,  # we will change the offset each time
        'limit': 1  # we only want one value back
    }

    # make our request adding any additional filter parameters from `what`
    response = make_request(endpoint, payload={**payload, **what})

    if response.ok:
        # if response is ok, grab the end index from the response metadata the first time through
        end = end if end else response.json()['metadata']['resultset']['count']

        # grab the lowercase version of the current name
        current_name = response.json()['results'][0]['name'].lower()

        # if what we are searching for is in the current name, we have found our item
        if name in current_name:
            return response.json()['results'][0]  # return the found item
        # else was the problem here
        elif start >= end:
            # if our start index is greater than or equal to our end, we couldn't find it
            return {}
        elif name < current_name:
            # our name comes before the current name in the alphabet, so we search further to the left
            return get_item(name, what, endpoint, start, mid - 1)
        elif name > current_name:
            # our name comes after the current name in the alphabet, so we search further to the right
            return get_item(name, what, endpoint, mid + 1, end)
    else:
        # response wasn't ok, use code to determine why
        print(f'Response not OK, status: {response.status_code}')
        return {}


def get_location(name):
    """
    Grab the JSON payload for the location by name using binary search.

    Parameters:
    - name: The city to look for.

    Returns:
    Dictionary of the information for the city if found otherwise
    an empty dictionary.
    """
    return get_item(name, {'locationcategoryid': 'CITY'}, 'locations')

In [None]:
#Get NYC ID
nyc = get_location('New York')
nyc

Response not OK, status: 503


{}

In [None]:
if nyc:
    central_park = get_item('NY City Central Park', {'locationid': nyc['id']}, 'stations')
    print(central_park)
else:
    print("Could not retrieve location data for New York.")

Could not retrieve location data for New York.


In [None]:
# get NYC daily summaries data
response = make_request(
    'data',
    {
        'datasetid': 'GHCND',
        'stationid': central_park['id'],
        'locationid': nyc['id'],
        'startdate': '2018-10-01',
        'enddate': '2018-10-31',
        'datatypeid': ['TMIN', 'TMAX', 'TOBS'],  # temperature at time of observation, min, and max
        'units': 'metric',
        'limit': 1000
    }
)
response.status_code

NameError: name 'central_park' is not defined

In [None]:
import pandas as pd
df = pd.DataFrame(response.json()['results'])
df.head()

Unnamed: 0,name,id
0,City,CITY
1,Climate Division,CLIM_DIV
2,Climate Region,CLIM_REG
3,Country,CNTRY
4,County,CNTY


In [None]:
df.datatype.unique()

array(['City', 'Climate Division', 'Climate Region', 'Country', 'County',
       'Hydrologic Accounting Unit', 'Hydrologic Cataloging Unit',
       'Hydrologic Region', 'Hydrologic Subregion', 'State',
       'US Territory', 'Zip Code'], dtype=object)

In [None]:
if get_item(
    'NY City Central Park',
    {'locationid': nyc['id']},
    'stations'
)
central_park

SyntaxError: expected ':' (ipython-input-601415547.py, line 5)

In [None]:
# get NYC daily summaries data
if 'central_park' in locals() and central_park:
    response = make_request(
        'data',
        {
            'datasetid': 'GHCND',
            'stationid': central_park['id'],
            'locationid': nyc['id'],
            'startdate': '2018-10-01',
            'enddate': '2018-10-31',
            'datatypeid': ['TMIN', 'TMAX', 'TOBS'],  # temperature at time of observation, min, and max
            'units': 'metric',
            'limit': 1000
        }
    )
    print(response.status_code)
else:
    print("Could not retrieve Central Park station data.")

Could not retrieve Central Park station data.


In [None]:
df = pd.DataFrame(response.json()['results'])
df.head()

Unnamed: 0,name,id
0,City,CITY
1,Climate Division,CLIM_DIV
2,Climate Region,CLIM_REG
3,Country,CNTRY
4,County,CNTY


In [None]:
df.datatype.value_counts()

AttributeError: 'DataFrame' object has no attribute 'datatype'

In [None]:
df.to_csv('data/nyc_temperatures.csv'), index=False)

SyntaxError: unmatched ')' (ipython-input-674715374.py, line 1)