**How to Query the Daily NOAA Global Historical Climatology Network Weather Data (BigQuery Dataset)**

In [1]:
import datetime

import bq_helper
from bq_helper import BigQueryHelper
# Helper bound to the `ghcn_d` dataset of the `bigquery-public-data` project.
# Introduction to the helper package:
# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package
noaa = BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="ghcn_d",
)

In [2]:
# Reuse the `noaa` helper created in the first cell rather than constructing a
# second, identical BigQueryHelper for the same project and dataset.
# `bq_assistant` is kept as an alias because later cells refer to it by name.
bq_assistant = noaa
# Show the names of all tables in the dataset.
bq_assistant.list_tables()

['ghcnd_1763',
 'ghcnd_1764',
 'ghcnd_1765',
 'ghcnd_1766',
 'ghcnd_1767',
 'ghcnd_1768',
 'ghcnd_1769',
 'ghcnd_1770',
 'ghcnd_1771',
 'ghcnd_1772',
 'ghcnd_1773',
 'ghcnd_1774',
 'ghcnd_1775',
 'ghcnd_1776',
 'ghcnd_1777',
 'ghcnd_1778',
 'ghcnd_1779',
 'ghcnd_1780',
 'ghcnd_1781',
 'ghcnd_1782',
 'ghcnd_1783',
 'ghcnd_1784',
 'ghcnd_1785',
 'ghcnd_1786',
 'ghcnd_1787',
 'ghcnd_1788',
 'ghcnd_1789',
 'ghcnd_1790',
 'ghcnd_1791',
 'ghcnd_1792',
 'ghcnd_1793',
 'ghcnd_1794',
 'ghcnd_1795',
 'ghcnd_1796',
 'ghcnd_1797',
 'ghcnd_1798',
 'ghcnd_1799',
 'ghcnd_1800',
 'ghcnd_1801',
 'ghcnd_1802',
 'ghcnd_1803',
 'ghcnd_1804',
 'ghcnd_1805',
 'ghcnd_1806',
 'ghcnd_1807',
 'ghcnd_1808',
 'ghcnd_1809',
 'ghcnd_1810',
 'ghcnd_1811',
 'ghcnd_1812',
 'ghcnd_1813',
 'ghcnd_1814',
 'ghcnd_1815',
 'ghcnd_1816',
 'ghcnd_1817',
 'ghcnd_1818',
 'ghcnd_1819',
 'ghcnd_1820',
 'ghcnd_1821',
 'ghcnd_1822',
 'ghcnd_1823',
 'ghcnd_1824',
 'ghcnd_1825',
 'ghcnd_1826',
 'ghcnd_1827',
 'ghcnd_1828',
 'ghcnd_18

In [3]:
# Preview three rows of the 2018 observations table.
bq_assistant.head(table_name="ghcnd_2018", num_rows=3)

Unnamed: 0,id,date,element,value,mflag,qflag,sflag,time
0,USC00360868,2018-04-21,PRCP,0.0,,,H,800
1,USC00415427,2018-04-21,PRCP,0.0,,,H,800
2,USC00473636,2018-04-21,PRCP,0.0,,,H,700


In [4]:
# Inspect the column names, types and modes of the 2018 observations table.
bq_assistant.table_schema(table_name="ghcnd_2018")

[SchemaField('id', 'STRING', 'REQUIRED', '', ()),
 SchemaField('date', 'DATE', 'NULLABLE', '', ()),
 SchemaField('element', 'STRING', 'NULLABLE', '', ()),
 SchemaField('value', 'FLOAT', 'NULLABLE', '', ()),
 SchemaField('mflag', 'STRING', 'NULLABLE', '', ()),
 SchemaField('qflag', 'STRING', 'NULLABLE', '', ()),
 SchemaField('sflag', 'STRING', 'NULLABLE', '', ()),
 SchemaField('time', 'STRING', 'NULLABLE', '', ())]

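How do we find weather stations close to a specific location? In this case, we search for stations near Chicago (latitude=41.88, longitude=-87.63) by filtering the station list to a small bounding box around the city (latitude between 41.7 and 42, longitude between -87.7 and -87.5).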



In [5]:
# Station metadata restricted to a small latitude/longitude box around Chicago.
query1 = """SELECT
  id,
  name,
  state,
  latitude,
  longitude
FROM
  `bigquery-public-data.ghcn_d.ghcnd_stations`
WHERE
  latitude > 41.7
  AND latitude < 42
  AND longitude > -87.7
  AND longitude < -87.5;
        """
# No scan cap is passed here, so the helper's default limit applies.
response1 = noaa.query_to_pandas_safe(query=query1)
# Display the first ten matching stations.
response1.head(n=10)

Unnamed: 0,id,name,state,latitude,longitude
0,USC00111564,"""CHICAGO S WTR FILT PLT""",IL,41.75,-87.55
1,US1ILCK0204,"""BOOT CAMP-CHICAGO 4.2 SW""",IL,41.8421,-87.6891
2,US1ILCK0151,"""EVERGREEN PARK 0.5 NE""",IL,41.7274,-87.6949
3,USC00111562,"""CHICAGO SAN DIST OFC""",IL,41.9,-87.6333
4,US1ILCK0240,"""CHICAGO 2.7 WNW""",IL,41.9018,-87.6726
5,US1ILCK0179,"""CHICAGO 4.8 NNW""",IL,41.9481,-87.6588
6,US1ILCK0097,"""CHICAGO 6.8 NNE""",IL,41.9301,-87.6393
7,US1ILCK0104,"""LINCOLNWOOD 5.1 SE""",IL,41.9494,-87.6703
8,USC00111584,"""CHICAGO WB CITY 2""",IL,41.8833,-87.6333
9,USC00111526,"""CHICAGO GRANT PARK""",IL,41.8833,-87.6167


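What are the daily rainfall amounts at a specific station? Here, we obtain rainfall (in mm) for all days in 2015 from a weather station in Chicago whose id is provided in the query (the station corresponds to O’Hare airport). The query divides the stored value by 10 to get millimetres and keeps only observations without a quality flag (`qflag IS NULL`).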

In [6]:
# Daily precipitation for one station from the 2015 table, skipping rows that
# carry a quality flag and scaling the stored value by 1/10.
query2 = """SELECT
  wx.date,
  wx.value/10.0 AS prcp
FROM
  `bigquery-public-data.ghcn_d.ghcnd_2015` AS wx
WHERE
  id = 'USW00094846'
  AND qflag IS NULL
  AND element = 'PRCP'
ORDER BY wx.date;
        """
# Allow the query to scan up to 10 GB before the helper refuses to run it.
response2 = noaa.query_to_pandas_safe(query=query2, max_gb_scanned=10)
# Display the first ten days.
response2.head(n=10)

Unnamed: 0,date,prcp
0,2015-01-01,0.0
1,2015-01-02,0.0
2,2015-01-03,14.5
3,2015-01-04,3.3
4,2015-01-05,3.8
5,2015-01-06,0.3
6,2015-01-07,0.0
7,2015-01-08,1.5
8,2015-01-09,0.0
9,2015-01-10,0.0


What was the weather over the past two weeks? This query pulls the daily min/max temperature (in Celsius) and rainfall (in mm) for the past 14 days at the same Chicago station used above.

In [7]:
# Daily precipitation and min/max temperature for one station over the
# trailing two weeks.
#
# The dataset stores one table per calendar year (ghcnd_<year>). The query
# filters on CURRENT_DATE(), so a hard-coded year only returns rows while that
# year is the current one. The yearly table(s) are therefore derived from
# today's date. BigQuery evaluates CURRENT_DATE() in UTC by default, so the
# window is computed in UTC here as well.
today_utc = datetime.datetime.now(datetime.timezone.utc).date()
# DATE_DIFF(...) < 15 keeps day offsets 0..14, so the oldest day is 14 days back.
window_start = today_utc - datetime.timedelta(days=14)

# Usually one table; two when the window straddles New Year.
# NOTE(review): assumes the table for the current year already exists in the
# public dataset - confirm behaviour in the first days of January.
yearly_selects = [
    "    SELECT id, date, element, value\n"
    "    FROM `bigquery-public-data.ghcn_d.ghcnd_{}`".format(year)
    for year in range(window_start.year, today_utc.year + 1)
]
recent_observations = "\n    UNION ALL\n".join(yearly_selects)

# Inner query: spread the element rows into prcp/tmin/tmax columns (value/10).
# Outer query: collapse them to one row per date with MAX.
query3 = """SELECT
  date,
  MAX(prcp) AS prcp,
  MAX(tmin) AS tmin,
  MAX(tmax) AS tmax
FROM (
  SELECT
    wx.date AS date,
    IF (wx.element = 'PRCP', wx.value/10, NULL) AS prcp,
    IF (wx.element = 'TMIN', wx.value/10, NULL) AS tmin,
    IF (wx.element = 'TMAX', wx.value/10, NULL) AS tmax
  FROM (
{recent_observations}
  ) AS wx
  WHERE
    wx.id = 'USW00094846'
    AND DATE_DIFF(CURRENT_DATE(), wx.date, DAY) < 15
)
GROUP BY
  date
ORDER BY
  date ASC;
        """.format(recent_observations=recent_observations)
# Allow the query to scan up to 10 GB before the helper refuses to run it.
response3 = noaa.query_to_pandas_safe(query3, max_gb_scanned=10)
response3.head(10)

Unnamed: 0,date,prcp,tmin,tmax
0,2018-04-09,3.3,-1.1,3.9
1,2018-04-10,0.0,-2.2,7.8
2,2018-04-11,1.3,3.9,18.9
3,2018-04-12,0.8,5.6,22.8
4,2018-04-13,1.8,3.9,8.3
5,2018-04-14,24.4,2.2,3.9
6,2018-04-15,18.8,-2.2,2.8
7,2018-04-16,0.3,-2.8,1.7
8,2018-04-17,0.0,-1.7,6.1
9,2018-04-18,10.4,-1.1,2.8
