### API to get data

List of endpoints : https://www.ncdc.noaa.gov/cdo-web/webservices/v2#gettingStarted

|Endpoint|Description|
|-------|-------|
|`/datasets`|A dataset is the primary grouping for data at NCDC.|
|`/datacategories`	|A data category is a general type of data used to group similar data types.|
|`/datatypes`	|A data type is a specific type of data that is often unique to a dataset.|
|`/locationcategories`	|A location category is a grouping of similar locations.|
|`/locations`	|A location is a geopolitical entity.|
|`/stations`	|A station is any weather observing platform where data is recorded.|
|`/data`	|A datum is an observed value along with any ancillary attributes at a specific place and time.|

In [55]:
import pandas as pd

import src.acquire as ac

### `datasets` endpoint

In [2]:
# see what datasets are available
response = ac.make_request('datasets', {'startdate':'2018-10-01'})

# check the status code, 200 -> successful
response.status_code

200

In [3]:
# check if successful with boolean response
response.ok

True

In [4]:
# response.json()

In [5]:
response.json().keys()

dict_keys(['metadata', 'results'])

In [6]:
response.json()['metadata'] # 11 rows

{'resultset': {'offset': 1, 'count': 11, 'limit': 25}}

In [7]:
# response.json()['results']

In [8]:
response.json()['results'][0].keys()

dict_keys(['uid', 'mindate', 'maxdate', 'name', 'datacoverage', 'id'])

In [9]:
# list an (id, name) pair for every dataset returned by the request
datasets = response.json()['results']
[(entry['id'], entry['name']) for entry in datasets]

[('GHCND', 'Daily Summaries'),
 ('GSOM', 'Global Summary of the Month'),
 ('GSOY', 'Global Summary of the Year'),
 ('NEXRAD2', 'Weather Radar (Level II)'),
 ('NEXRAD3', 'Weather Radar (Level III)'),
 ('NORMAL_ANN', 'Normals Annual/Seasonal'),
 ('NORMAL_DLY', 'Normals Daily'),
 ('NORMAL_HLY', 'Normals Hourly'),
 ('NORMAL_MLY', 'Normals Monthly'),
 ('PRECIP_15', 'Precipitation 15 Minute'),
 ('PRECIP_HLY', 'Precipitation Hourly')]

### `datacategories` endpoint

In [10]:
# dataset id = 'GHCND'
# now that we know which dataset we want to request, let's see which data categories it offers

Payload parameters for this endpoint are documented at https://www.ncdc.noaa.gov/cdo-web/webservices/v2#dataCategories

In [11]:
response = ac.make_request('datacategories', payload={'datasetid':'GHCND'})

In [12]:
response.status_code, response.ok

(200, True)

In [13]:
response.json()

{'metadata': {'resultset': {'offset': 1, 'count': 9, 'limit': 25}},
 'results': [{'name': 'Evaporation', 'id': 'EVAP'},
  {'name': 'Land', 'id': 'LAND'},
  {'name': 'Precipitation', 'id': 'PRCP'},
  {'name': 'Sky cover & clouds', 'id': 'SKY'},
  {'name': 'Sunshine', 'id': 'SUN'},
  {'name': 'Air Temperature', 'id': 'TEMP'},
  {'name': 'Water', 'id': 'WATER'},
  {'name': 'Wind', 'id': 'WIND'},
  {'name': 'Weather Type', 'id': 'WXTYPE'}]}

In [14]:
response.json()['results'][5]

{'name': 'Air Temperature', 'id': 'TEMP'}

### `datatypes` endpoint

In [15]:
response = ac.make_request('datatypes', payload={'datasetid':'GHCND', 'datacategoryid':'TEMP','limit':100})

In [16]:
response.status_code, response.ok

(200, True)

In [17]:
response.json().keys()

dict_keys(['metadata', 'results'])

In [18]:
response.json()['results'][0]

{'mindate': '1863-05-04',
 'maxdate': '2022-12-15',
 'name': 'Number of days included in the multiday minimum temperature (MDTN)',
 'datacoverage': 1,
 'id': 'DATN'}

In [79]:
# list an (id, name) pair for every temperature data type in the response
temp_datatypes = response.json()['results']
[(entry['id'], entry['name']) for entry in temp_datatypes]

[('DATN',
  'Number of days included in the multiday minimum temperature (MDTN)'),
 ('DATX',
  'Number of days included in the multiday maximum temperature (MDTX)'),
 ('MDTN', 'Multiday minimum temperature (use with DATN)'),
 ('MDTX', 'Multiday maximum temperature (use with DATX)'),
 ('TAVG', 'Average Temperature.'),
 ('TMAX', 'Maximum temperature'),
 ('TMIN', 'Minimum temperature'),
 ('TOBS', 'Temperature at the time of observation')]

### `locationcategories` endpoint

In [20]:
response = ac.make_request('locationcategories', {'datasetid':'GHCND', 'startdate':'2018-10-01'})

In [21]:
response.status_code, response.ok

(200, True)

In [22]:
response.json()

{'metadata': {'resultset': {'offset': 1, 'count': 12, 'limit': 25}},
 'results': [{'name': 'City', 'id': 'CITY'},
  {'name': 'Climate Division', 'id': 'CLIM_DIV'},
  {'name': 'Climate Region', 'id': 'CLIM_REG'},
  {'name': 'Country', 'id': 'CNTRY'},
  {'name': 'County', 'id': 'CNTY'},
  {'name': 'Hydrologic Accounting Unit', 'id': 'HYD_ACC'},
  {'name': 'Hydrologic Cataloging Unit', 'id': 'HYD_CAT'},
  {'name': 'Hydrologic Region', 'id': 'HYD_REG'},
  {'name': 'Hydrologic Subregion', 'id': 'HYD_SUB'},
  {'name': 'State', 'id': 'ST'},
  {'name': 'US Territory', 'id': 'US_TERR'},
  {'name': 'Zip Code', 'id': 'ZIP'}]}

In [26]:
# pretty-print the payload; pprint sorts dict keys by default, so the key
# order differs from the raw repr shown by the previous cell
import pprint

location_categories = response.json()
pprint.pprint(location_categories)

{'metadata': {'resultset': {'count': 12, 'limit': 25, 'offset': 1}},
 'results': [{'id': 'CITY', 'name': 'City'},
             {'id': 'CLIM_DIV', 'name': 'Climate Division'},
             {'id': 'CLIM_REG', 'name': 'Climate Region'},
             {'id': 'CNTRY', 'name': 'Country'},
             {'id': 'CNTY', 'name': 'County'},
             {'id': 'HYD_ACC', 'name': 'Hydrologic Accounting Unit'},
             {'id': 'HYD_CAT', 'name': 'Hydrologic Cataloging Unit'},
             {'id': 'HYD_REG', 'name': 'Hydrologic Region'},
             {'id': 'HYD_SUB', 'name': 'Hydrologic Subregion'},
             {'id': 'ST', 'name': 'State'},
             {'id': 'US_TERR', 'name': 'US Territory'},
             {'id': 'ZIP', 'name': 'Zip Code'}]}


In [24]:
response.json()['results'][0]

{'name': 'City', 'id': 'CITY'}

In [31]:
austin = ac.get_location('Austin')

In [32]:
austin

{'mindate': '1893-02-28',
 'maxdate': '2022-12-26',
 'name': 'Austin, TX US',
 'datacoverage': 1,
 'id': 'CITY:US480005'}

### `stations` endpoint

In [33]:
# find weather stations in Austin, TX

In [79]:
response = ac.make_request('stations', {'datasetid':'GHCND', 'datacategoryid':'TEMP', 'locationid':austin['id']})

In [80]:
response.status_code, response.ok

(200, True)

In [81]:
# list an (id, name) pair for every station found for the Austin location
austin_stations = response.json()['results']
[(entry['id'], entry['name']) for entry in austin_stations]

[('GHCND:USC00410420', 'AUSTIN, TX US'),
 ('GHCND:USC00410427', 'AUSTIN WATER TREATMENT PLANT, TX US'),
 ('GHCND:USC00410431', 'AUSTIN 6 S, TX US'),
 ('GHCND:USC00410433', 'AUSTIN GREAT HILLS, TX US'),
 ('GHCND:USC00412655', 'DUVAL, TX US'),
 ('GHCND:USC00414185', 'HILL S RANCH, TX US'),
 ('GHCND:USC00415561', 'MANSFIELD DAM, TX US'),
 ('GHCND:USC00417790', 'ROUND ROCK, TX US'),
 ('GHCND:USW00000230', 'AUSTIN EXECUTIVE AIRPORT, TX US'),
 ('GHCND:USW00013904', 'AUSTIN BERGSTROM INTERNATIONAL AIRPORT, TX US'),
 ('GHCND:USW00013958', 'AUSTIN CAMP MABRY, TX US')]

In [84]:
# info about the Austin Bergstrom International Airport station
# (index 9 in the station list above; Round Rock is index 7)
response.json()['results'][9]

{'elevation': 146.5,
 'mindate': '1948-03-01',
 'maxdate': '2022-12-25',
 'latitude': 30.18311,
 'name': 'AUSTIN BERGSTROM INTERNATIONAL AIRPORT, TX US',
 'datacoverage': 0.5697,
 'id': 'GHCND:USW00013904',
 'elevationUnit': 'METERS',
 'longitude': -97.67989}

In [85]:
# save the airport station's record; look it up by station id rather than by
# list position so the selection does not silently change if the API returns
# the stations in a different order
airport = next(
    station
    for station in response.json()['results']
    if station['id'] == 'GHCND:USW00013904'
)

In [86]:
airport

{'elevation': 146.5,
 'mindate': '1948-03-01',
 'maxdate': '2022-12-25',
 'latitude': 30.18311,
 'name': 'AUSTIN BERGSTROM INTERNATIONAL AIRPORT, TX US',
 'datacoverage': 0.5697,
 'id': 'GHCND:USW00013904',
 'elevationUnit': 'METERS',
 'longitude': -97.67989}

### Final Part 
### `data` endpoint. Save the results into a `DataFrame`

In [100]:
# grab the data from Austin, TX from Nov, 2022

In [87]:
# query parameters for the `data` endpoint: temperature readings for
# November 2022 at the station stored in `airport`, in 'standard' units
data_params = {
    'datasetid': 'GHCND',
    'datacategoryid': 'TEMP',
    'locationid': austin['id'],
    'stationid': airport['id'],
    'startdate': '2022-11-01',
    'enddate': '2022-11-30',
    'datatypeid': ['TAVG', 'TOBS', 'TMIN', 'TMAX'],
    'units': 'standard',
    'limit': 1000,
}
response = ac.make_request('data', data_params)

In [88]:
response.status_code, response.ok

(200, True)

In [62]:
# response.json() # max date at round rock is 1895 :( 

In [51]:
# reload without stationid

In [75]:
response.json().keys()

dict_keys(['metadata', 'results'])

In [89]:
response.json()['metadata']

{'resultset': {'offset': 1, 'count': 90, 'limit': 1000}}

In [90]:
df = pd.DataFrame(response.json()['results'])

In [91]:
df

Unnamed: 0,date,datatype,station,attributes,value
0,2022-11-01T00:00:00,TAVG,GHCND:USW00013904,"H,,S,",61.0
1,2022-11-01T00:00:00,TMAX,GHCND:USW00013904,",,D,2400",66.0
2,2022-11-01T00:00:00,TMIN,GHCND:USW00013904,",,D,2400",53.0
3,2022-11-02T00:00:00,TAVG,GHCND:USW00013904,"H,,S,",63.0
4,2022-11-02T00:00:00,TMAX,GHCND:USW00013904,",,D,2400",77.0
...,...,...,...,...,...
85,2022-11-29T00:00:00,TMAX,GHCND:USW00013904,",,D,2400",84.0
86,2022-11-29T00:00:00,TMIN,GHCND:USW00013904,",,D,2400",49.0
87,2022-11-30T00:00:00,TAVG,GHCND:USW00013904,"H,,S,",53.0
88,2022-11-30T00:00:00,TMAX,GHCND:USW00013904,",,D,2400",59.0


In [93]:
df.datatype.unique() # TOBS was requested but is absent from the results; it may not be available at this station

array(['TAVG', 'TMAX', 'TMIN'], dtype=object)

In [94]:
df.columns

Index(['date', 'datatype', 'station', 'attributes', 'value'], dtype='object')

In [95]:
# give the measurement and flag columns descriptive names; reassign the
# result instead of using inplace=True, which pandas discourages and which
# prevents method chaining
df = df.rename(columns={'value': 'temp_f', 'attributes': 'flags'})

In [97]:
df.head(1)

Unnamed: 0,date,datatype,station,flags,temp_f
0,2022-11-01T00:00:00,TAVG,GHCND:USW00013904,"H,,S,",61.0


In [99]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
temp_f,90.0,57.366667,14.02041,29.0,47.0,56.0,67.0,86.0


In [103]:
# rename all columns to upper case (reassigning rather than mutating in place)
df = df.rename(columns=str.upper)

In [104]:
df.head(1)

Unnamed: 0,DATE,DATATYPE,STATION,FLAGS,TEMP_F
0,2022-11-01T00:00:00,TAVG,GHCND:USW00013904,"H,,S,",61.0


In [106]:
# rename back to lower case (reassigning rather than mutating in place)
df = df.rename(columns=str.lower)

In [107]:
df.head(1)

Unnamed: 0,date,datatype,station,flags,temp_f
0,2022-11-01T00:00:00,TAVG,GHCND:USW00013904,"H,,S,",61.0


In [108]:
df.dtypes

date         object
datatype     object
station      object
flags        object
temp_f      float64
dtype: object

In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      90 non-null     object 
 1   datatype  90 non-null     object 
 2   station   90 non-null     object 
 3   flags     90 non-null     object 
 4   temp_f    90 non-null     float64
dtypes: float64(1), object(4)
memory usage: 3.6+ KB


In [112]:
# df.temp_f.to_list() all end with .0 can be converted into integer

In [124]:
# replace the whole column so its dtype becomes datetime64[ns];
# `df.loc[:, 'date'] = ...` writes into the existing object column on
# pandas >= 2.0, which leaves the dtype as object
df['date'] = pd.to_datetime(df['date'])

In [114]:
df.dtypes

date        datetime64[ns]
datatype            object
station             object
flags               object
temp_f             float64
dtype: object

In [115]:
df.head(1)

Unnamed: 0,date,datatype,station,flags,temp_f
0,2022-11-01,TAVG,GHCND:USW00013904,"H,,S,",61.0


In [120]:
# datetime_is_numeric=False gives the categorical-style summary
# (count / unique / top / freq / first / last).
# NOTE(review): this keyword was deprecated and appears to have been removed in
# pandas 2.0, where datetimes are always summarised as numeric — confirm
# against the installed pandas version before upgrading
df.date.describe(datetime_is_numeric=False)

  df.date.describe(datetime_is_numeric=False)


count                      90
unique                     30
top       2022-11-01 00:00:00
freq                        3
first     2022-11-01 00:00:00
last      2022-11-30 00:00:00
Name: date, dtype: object

In [122]:
# NOTE(review): `datetime_is_numeric` appears to have been removed in pandas 2.0,
# where this numeric summary is the default — confirm before upgrading
df.date.describe(datetime_is_numeric=True) # numeric summary (mean / min / quartiles / max); not very informative here

count                     90
mean     2022-11-15 12:00:00
min      2022-11-01 00:00:00
25%      2022-11-08 00:00:00
50%      2022-11-15 12:00:00
75%      2022-11-23 00:00:00
max      2022-11-30 00:00:00
Name: date, dtype: object

In [130]:
df.set_index('date')

Unnamed: 0_level_0,datatype,station,flags,temp_f
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-11-01,TAVG,GHCND:USW00013904,"H,,S,",61.0
2022-11-01,TMAX,GHCND:USW00013904,",,D,2400",66.0
2022-11-01,TMIN,GHCND:USW00013904,",,D,2400",53.0
2022-11-02,TAVG,GHCND:USW00013904,"H,,S,",63.0
2022-11-02,TMAX,GHCND:USW00013904,",,D,2400",77.0
...,...,...,...,...
2022-11-29,TMAX,GHCND:USW00013904,",,D,2400",84.0
2022-11-29,TMIN,GHCND:USW00013904,",,D,2400",49.0
2022-11-30,TAVG,GHCND:USW00013904,"H,,S,",53.0
2022-11-30,TMAX,GHCND:USW00013904,",,D,2400",59.0


In [131]:
df.head(1)

Unnamed: 0,date,datatype,station,flags,temp_f
0,2022-11-01,TAVG,GHCND:USW00013904,"H,,S,",61.0


In [137]:
df.to_csv('data/austin_temp.csv', index=False)

In [138]:
# reload the saved CSV with `date` parsed as the index, then attach the
# US/Central timezone to the timestamps
central = (
    pd.read_csv('data/austin_temp.csv', index_col='date', parse_dates=True)
    .tz_localize('US/Central')
)

In [139]:
central.head()

Unnamed: 0_level_0,datatype,station,flags,temp_f
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-11-01 00:00:00-05:00,TAVG,GHCND:USW00013904,"H,,S,",61.0
2022-11-01 00:00:00-05:00,TMAX,GHCND:USW00013904,",,D,2400",66.0
2022-11-01 00:00:00-05:00,TMIN,GHCND:USW00013904,",,D,2400",53.0
2022-11-02 00:00:00-05:00,TAVG,GHCND:USW00013904,"H,,S,",63.0
2022-11-02 00:00:00-05:00,TMAX,GHCND:USW00013904,",,D,2400",77.0


In [140]:
central.tz_convert('UTC').head()

Unnamed: 0_level_0,datatype,station,flags,temp_f
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-11-01 05:00:00+00:00,TAVG,GHCND:USW00013904,"H,,S,",61.0
2022-11-01 05:00:00+00:00,TMAX,GHCND:USW00013904,",,D,2400",66.0
2022-11-01 05:00:00+00:00,TMIN,GHCND:USW00013904,",,D,2400",53.0
2022-11-02 05:00:00+00:00,TAVG,GHCND:USW00013904,"H,,S,",63.0
2022-11-02 05:00:00+00:00,TMAX,GHCND:USW00013904,",,D,2400",77.0


In [143]:
central.to_period('M').index #dtype 'period[M]



PeriodIndex(['2022-11', '2022-11', '2022-11', '2022-11', '2022-11', '2022-11',
             '2022-11', '2022-11', '2022-11', '2022-11', '2022-11', '2022-11',
             '2022-11', '2022-11', '2022-11', '2022-11', '2022-11', '2022-11',
             '2022-11', '2022-11', '2022-11', '2022-11', '2022-11', '2022-11',
             '2022-11', '2022-11', '2022-11', '2022-11', '2022-11', '2022-11',
             '2022-11', '2022-11', '2022-11', '2022-11', '2022-11', '2022-11',
             '2022-11', '2022-11', '2022-11', '2022-11', '2022-11', '2022-11',
             '2022-11', '2022-11', '2022-11', '2022-11', '2022-11', '2022-11',
             '2022-11', '2022-11', '2022-11', '2022-11', '2022-11', '2022-11',
             '2022-11', '2022-11', '2022-11', '2022-11', '2022-11', '2022-11',
             '2022-11', '2022-11', '2022-11', '2022-11', '2022-11', '2022-11',
             '2022-11', '2022-11', '2022-11', '2022-11', '2022-11', '2022-11',
             '2022-11', '2022-11', '2022-11', '2022-

In [145]:
central.to_period('M').to_timestamp().index # to_timestamp() back to datetime format,
#but drops the timezone info and all dates are Nov, 1

DatetimeIndex(['2022-11-01', '2022-11-01', '2022-11-01', '2022-11-01',
               '2022-11-01', '2022-11-01', '2022-11-01', '2022-11-01',
               '2022-11-01', '2022-11-01', '2022-11-01', '2022-11-01',
               '2022-11-01', '2022-11-01', '2022-11-01', '2022-11-01',
               '2022-11-01', '2022-11-01', '2022-11-01', '2022-11-01',
               '2022-11-01', '2022-11-01', '2022-11-01', '2022-11-01',
               '2022-11-01', '2022-11-01', '2022-11-01', '2022-11-01',
               '2022-11-01', '2022-11-01', '2022-11-01', '2022-11-01',
               '2022-11-01', '2022-11-01', '2022-11-01', '2022-11-01',
               '2022-11-01', '2022-11-01', '2022-11-01', '2022-11-01',
               '2022-11-01', '2022-11-01', '2022-11-01', '2022-11-01',
               '2022-11-01', '2022-11-01', '2022-11-01', '2022-11-01',
               '2022-11-01', '2022-11-01', '2022-11-01', '2022-11-01',
               '2022-11-01', '2022-11-01', '2022-11-01', '2022-11-01',
      

In [157]:
# remove any `temp_c` column left over from an earlier run; errors='ignore'
# makes this a no-op (instead of a KeyError) on a fresh kernel, where the
# column has not been created yet
central = central.drop(columns='temp_c', errors='ignore')

In [162]:
# add a Celsius column derived from temp_f, rounded to one decimal place.
# The callable receives the frame being assigned to; the same expression
# could be written directly against `central` without the lambda.
central = central.assign(
    temp_c=lambda frame: ((frame.temp_f - 32) * 5/9).round(1)
)

In [163]:
central.head(1)

Unnamed: 0_level_0,datatype,station,flags,temp_f,temp_c
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-11-01 00:00:00-05:00,TAVG,GHCND:USW00013904,"H,,S,",61.0,16.1


In [161]:
# NOTE(review): `temp_c` is not assigned in any visible cell, so this raises
# NameError on a fresh kernel; the recorded output suggests it was a lambda
# left over from an earlier session — consider removing this cell
temp_c

<function __main__.<lambda>(x)>

In [164]:
# store the repetitive string columns as categoricals
central_cat = central.astype({'station': 'category', 'datatype': 'category'})

In [165]:
central_cat.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 90 entries, 2022-11-01 00:00:00-05:00 to 2022-11-30 00:00:00-06:00
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   datatype  90 non-null     category
 1   station   90 non-null     category
 2   flags     90 non-null     object  
 3   temp_f    90 non-null     float64 
 4   temp_c    90 non-null     float64 
dtypes: category(2), float64(2), object(1)
memory usage: 3.2+ KB


In [167]:
central_cat.describe(include='category')

Unnamed: 0,datatype,station
count,90,90
unique,3,1
top,TAVG,GHCND:USW00013904
freq,30,90


In [171]:
# categoricals can be ordered when needed: the categories rank in the order
# they are listed, so here 'station' < 'datatype'
pd.Categorical(
    values=['station', 'datatype'],
    categories=['station', 'datatype'],
    ordered=True,
)

['station', 'datatype']
Categories (2, object): ['station' < 'datatype']