In [1]:
import numpy as np
import pandas as pd

import requests
import datetime

import os
import sys

Check out this website for further documentation regarding the weather API. <br>
https://www.ncdc.noaa.gov/cdo-web/webservices/v2#data

In [102]:
def get_region_info(mytoken, level='country'):
    """
    Look up the NOAA CDO location IDs for one category of region.

    Queries the ``/locations`` endpoint for every location in the requested
    category (at most 1000 per call) and keeps only the name and ID of each.

    Parameters
    ----------
    mytoken : str
        NOAA CDO web-services token; sent in the ``token`` request header.
    level : str, default 'country'
        Region category: one of 'country', 'state', 'city', 'county' or
        'zip code'.

    Returns
    -------
    pandas.DataFrame or None
        The ``name`` and ``id`` columns of the returned locations, or ``None``
        when the response could not be converted (bad request, missing
        ``results`` key, empty result set, ...).

    Raises
    ------
    AssertionError
        If ``level`` is not one of the supported categories.
    """

    level_dict = {'country': 'CNTRY',
                  'state': 'ST',
                  'city': 'CITY',
                  'county': 'CNTY',
                  'zip code': 'ZIP'}

    assert level in [*level_dict.keys()], f'level should be one of the {[*level_dict.keys()]}'

    token = {'token': mytoken}

    # passing as string instead of dict because NOAA API does not like percent encoding
    region = f'locationcategoryid={level_dict[level]}' + '&units=standard' + '&limit=1000'
    base_url = 'https://www.ncdc.noaa.gov/cdo-web/api/v2/locations'
    r = requests.get(base_url, headers=token, params=region)
    print("Request status code: " + str(r.status_code))

    try:
        # results comes in json form. Convert to dataframe
        df = pd.DataFrame.from_dict(r.json()['results'])
        print("Successfully retrieved " + str(len(df['id'].unique())) + " stations")

        if df.count().max() >= 1000:
            print('WARNING: Maximum data limit was reached (limit = 1000)')
            print('Consider breaking your request into smaller pieces')

        return df[['name','id']]
    # Best-effort: report a bad request or missing data instead of raising.
    # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works.
    except Exception as exc:
        print("Error converting station data to dataframe. Missing data?")
        print("Reason: " + repr(exc))
        return None
        

In [321]:
def get_temp(locationid, begin_date, end_date, mytoken, data='daily'):
    """
    Average the temperature and precipitation readings reported for a location.

    Queries the NOAA CDO ``/data`` endpoint (at most 1000 records per call)
    and averages the ``value`` field over all returned records of each
    datatype.

    Parameters
    ----------
    locationid : str
        NOAA location ID, e.g. an ``id`` returned by the ``/locations``
        endpoint.
    begin_date, end_date : str
        Start and end dates of the request; passed to the API as
        ``startdate`` / ``enddate`` after ``str()`` conversion.
    mytoken : str
        NOAA CDO web-services token; sent in the ``token`` request header.
    data : str, default 'daily'
        Dataset to query: 'daily' (GHCND), 'monthly' (GSOM) or
        'yearly' (GSOY).

    Returns
    -------
    tuple
        ``(tavg, tmax, tmin, prcp)``: the mean of the TAVG, TMAX, TMIN and
        PRCP records respectively. An entry is NaN when no record of that
        datatype was returned; all four are NaN when the response could not
        be processed.

    Raises
    ------
    AssertionError
        If ``data`` is not one of the supported dataset names.
    """

    data_dict = {'daily': 'GHCND',
                 'monthly': 'GSOM',
                 'yearly': 'GSOY'}

    # Adjacent literals are concatenated so the message is one string, not a tuple.
    assert data in [*data_dict.keys()], (
        f'data should be one of the {[*data_dict.keys()]}. For more datasets visit '
        'https://www.ncdc.noaa.gov/cdo-web/api/v2/datasets')

    token = {'token': mytoken}

    # passing as string instead of dict because NOAA API does not like percent encoding
    # NOTE(review): the request filters on datacategoryid=TEMP yet PRCP is
    # averaged below -- confirm the API returns precipitation under this filter.
    params = ('datasetid=' + data_dict[data]
              + '&locationid=' + str(locationid)
              + '&startdate=' + str(begin_date)
              + '&enddate=' + str(end_date)
              + '&limit=1000' + '&units=standard' + '&datacategoryid=TEMP')

    base_url = 'https://www.ncdc.noaa.gov/cdo-web/api/v2/data'

    r = requests.get(base_url, params=params, headers=token)
    print("Request status code: " + str(r.status_code))

    try:
        # results comes in json form. Convert to dataframe
        df = pd.DataFrame.from_dict(r.json()['results'])
        print("Successfully retrieved " + str(len(df['station'].unique())) + " stations")
        dates = pd.to_datetime(df['date'])
        print("Last date retrieved: " + str(dates.iloc[-1]))

        if df.count().max() >= 1000:
            print('WARNING: Maximum data limit was reached (limit = 1000)')
            print('Consider breaking your request into smaller pieces')

        def mean_of(datatype):
            # Mean over every record of one datatype; NaN (without numpy's
            # empty-slice warning) when that datatype is absent.
            values = df.loc[df['datatype'] == datatype, 'value'].values
            return values.mean() if values.size else np.nan

        return mean_of('TAVG'), mean_of('TMAX'), mean_of('TMIN'), mean_of('PRCP')

    # Best-effort: report a bad request or missing data instead of raising.
    # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works.
    except Exception as exc:
        print("Error converting weather data to dataframe. Missing data?")
        print("Reason: " + repr(exc))
        # Same arity as the success path so callers can unpack uniformly.
        return np.nan, np.nan, np.nan, np.nan

# Implementation

In [371]:
# Countries to fetch weather for; names must match the NOAA location names.
countries = ["Italy", "France"]

In [372]:
# Country name -> NOAA location id lookup.
df_locations = get_region_info(mytoken, level='country')
df_locations_dict = df_locations.set_index('name')['id'].to_dict()

# Full date range of interest, one entry per day.
WINDOW = pd.date_range(start='2020-01-01', end='2021-07-10', freq='D')

# One (tavg, tmax, tmin, prcp, country, day) record per country and day.
output_list = []

for country in countries:
    country_id = df_locations_dict[country]

    # Only the first five days of WINDOW are requested, one API call per day.
    for day in WINDOW[:5].strftime('%Y-%m-%d'):
        temp = (
            *get_temp(locationid=country_id, begin_date=day, end_date=day,
                      mytoken=mytoken, data='daily'),
            country,
            day,
        )
        output_list.append(temp)

output = pd.DataFrame(
    output_list,
    columns=['tavg', 'tmax', 'tmin', 'prcp', 'Country', 'Day'],
)

Request status code: 200
Successfully retrieved 201 stations
Request status code: 200
Successfully retrieved 34 stations
Last date retrieved: 2020-01-01 00:00:00
Request status code: 200
Successfully retrieved 36 stations
Last date retrieved: 2020-01-02 00:00:00
Request status code: 200
Successfully retrieved 36 stations
Last date retrieved: 2020-01-03 00:00:00
Request status code: 200
Successfully retrieved 34 stations
Last date retrieved: 2020-01-04 00:00:00
Request status code: 200
Successfully retrieved 34 stations
Last date retrieved: 2020-01-05 00:00:00
Request status code: 200
Successfully retrieved 79 stations
Last date retrieved: 2020-01-01 00:00:00
Request status code: 200
Successfully retrieved 79 stations
Last date retrieved: 2020-01-02 00:00:00
Request status code: 200
Successfully retrieved 78 stations
Last date retrieved: 2020-01-03 00:00:00
Request status code: 200
Successfully retrieved 78 stations
Last date retrieved: 2020-01-04 00:00:00
Request status code: 200
Succe

In [373]:
output

Unnamed: 0,tavg,tmax,tmin,prcp,Country,Day
0,42.1875,49.034483,34.269231,0.0,Italy,2020-01-01
1,41.617647,50.074074,33.227273,0.0,Italy,2020-01-02
2,42.264706,51.592593,33.793103,0.0,Italy,2020-01-03
3,44.125,50.965517,38.56,0.001724,Italy,2020-01-04
4,43.4375,52.16,32.6875,0.01,Italy,2020-01-05
5,43.076923,50.0,36.910448,0.014487,France,2020-01-01
6,45.307692,53.186441,39.295775,0.027692,France,2020-01-02
7,48.103896,54.072464,41.354839,0.023974,France,2020-01-03
8,46.428571,52.411765,39.032787,0.016026,France,2020-01-04
9,43.909091,51.378788,37.163934,0.004744,France,2020-01-05
