In [None]:
import os
import sys
import time

import pandas as pd
import requests
from requests.exceptions import SSLError

# Matthew's keys
# Supply real keys through the NOAA_API_KEY / USDA_API_KEY environment variables
# so they never need to be committed with the notebook; the literals below are
# only used as a fallback when those variables are not set.
noaa_key = os.environ.get("NOAA_API_KEY", "***REMOVED***")
usda_key = os.environ.get("USDA_API_KEY", "***REMOVED***")

This notebook is not needed to run the analysis. As of 10/7, the NOAA data is unavailable due to the effects of the hurricane, so we have downloaded the data and saved it to the folder /data.

In [None]:
def get_usda_data(usda_key, year, timeout=60):
    """Download county-level Michigan corn grain yields for one year from USDA Quick Stats.

    Args:
        usda_key: Quick Stats API key, sent as the ``key`` query parameter.
        year: Year to request, sent as the ``year`` query parameter.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A DataFrame built from the ``data`` list of the JSON response when the
        server answers with status 200; otherwise None (the status code is
        printed).

    Raises:
        requests.exceptions.RequestException: network failures, including a
            timeout, are not caught here and propagate to the caller.
    """
    # API endpoint
    url = "https://quickstats.nass.usda.gov/api/api_GET/"

    # Filters selecting county-level corn grain yield (bushels per acre) in Michigan.
    params = {
        "key": usda_key,
        "commodity_desc": "CORN",
        "statisticcat_desc": "YIELD",
        "unit_desc": "BU / ACRE",
        "state_alpha": "MI",
        "year": year,
        "short_desc": "CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",
        "agg_level_desc": "COUNTY",
        "format": "JSON"
    }

    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        print(f"Request failed with status code {response.status_code}")
        return None

    data = response.json()
    return pd.DataFrame(data['data'])

In [None]:
import time
import pandas as pd
import requests
from requests.exceptions import SSLError

def is_valid_data(data):
    """Report whether a decoded JSON response carries at least one result record.

    Args:
        data: The decoded response body (any type is accepted).

    Returns:
        True only when ``data`` is a dict whose "results" entry is present and
        truthy; False otherwise. Always a real bool, never the operand itself.
    """
    return isinstance(data, dict) and bool(data.get("results"))

def get_raw_weather(state_ansi, county_ansi, startdate, enddate, timeout=60):
    """Download daily TMAX/TMIN/PRCP records for one county from the NOAA CDO API.

    Ranges longer than 30 days are split into consecutive windows (each window
    spans its start date plus 30 days, the next one starts the day after) and
    the per-window results are concatenated.

    Args:
        state_ansi: State code; concatenated with ``county_ansi`` to form the
            ``FIPS:`` location id.
        county_ansi: County code (see ``state_ansi``).
        startdate: First day to request; anything ``pd.to_datetime`` accepts.
        enddate: Last day to request; anything ``pd.to_datetime`` accepts.
        timeout: Seconds ``requests`` waits on the server for each attempt.
            Without a timeout a stalled connection would block forever.

    Returns:
        A DataFrame of the "results" records (for multi-window ranges, the
        concatenation of every window that returned data, index not reset),
        or None when no request produced data.

    Side effects:
        Prints progress and error messages. Every window that yielded no data,
        failed with an unexpected status, or exhausted its retries is appended
        to 'counties_failed.txt' in the working directory as
        "<location id>, <year>".
    """
    url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/data"
    headers = {
        'token': noaa_key
    }
    location_id = f"FIPS:{state_ansi}{county_ansi}"

    startdate = pd.to_datetime(startdate)
    enddate = pd.to_datetime(enddate)
    num_days = (enddate - startdate).days

    counties_failed = []

    def build_params(window_start, window_end):
        # Query parameters for a single request window.
        # NOTE(review): at most 1000 records are requested and no paging is
        # done, so a window with more observations would presumably come back
        # truncated - confirm against the NOAA API documentation.
        return {
            "datasetid": "GHCND",
            "startdate": window_start.strftime("%Y-%m-%d"),
            "enddate": window_end.strftime("%Y-%m-%d"),
            "units": "standard",
            "limit": "1000",
            "locationid": location_id,
            'datatypeid': ['TMAX', 'TMIN', "PRCP"]
        }

    def record_failure(params):
        # Remember the location and the year (first four characters of the start date).
        counties_failed.append((params['locationid'], params['startdate'][:4]))

    def make_request(params):
        # Issue one request, retrying transient failures; returns a DataFrame or None.
        retries = 3
        backoff = 3
        for _ in range(retries):
            try:
                response = requests.get(url, headers=headers, params=params, timeout=timeout)
            except SSLError as e:
                print(f"SSL Error encountered: {e}. Retrying in 10 seconds...")
                time.sleep(10)
                continue
            except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
                # A dropped connection or timeout is as transient as an SSL
                # hiccup; retry instead of aborting the whole download run.
                print(f"Network error encountered: {e}. Retrying in 10 seconds...")
                time.sleep(10)
                continue

            if response.status_code == 200:
                data = response.json()
                if is_valid_data(data):
                    return pd.DataFrame(data["results"])
                print(f"No data found for FIPS:{params['locationid']}")
                record_failure(params)
                return None
            elif response.status_code == 429:
                print(f"Rate limit exceeded. Retrying in {backoff} seconds...")
                time.sleep(backoff)
                backoff *= 2  # Exponential backoff
            elif response.status_code == 503:
                print("Server error. Retrying in 3 seconds...")
                time.sleep(3)
            else:
                # Unexpected status: not retried, but still recorded so the
                # failure log lists every window that produced no data.
                print(response.text)
                record_failure(params)
                return None

        print(f"Failed to retrieve data after {retries} attempts.")
        record_failure(params)
        return None

    if num_days > 30:
        requests_list = []
        current_date = startdate
        while current_date <= enddate:
            # Clip the last window so it never runs past the requested end date.
            current_enddate = min(current_date + pd.DateOffset(days=30), enddate)

            print(location_id)
            result = make_request(build_params(current_date, current_enddate))
            if result is not None:
                requests_list.append(result)

            current_date = current_enddate + pd.DateOffset(days=1)

        final_result = pd.concat(requests_list) if requests_list else None
    else:
        print(location_id)
        final_result = make_request(build_params(startdate, enddate))

    # At the end of the process, append counties and year to a file if there are any
    if counties_failed:
        with open('counties_failed.txt', 'a') as file:  # Open in append mode
            for county, year in counties_failed:
                file.write(f"{county}, {year}\n")
        print("Counties with no data have been appended to 'counties_failed.txt'.")

    return final_result

# Usage
# df = get_raw_weather(state_ansi, county_ansi, "2023-01-01", "2023-01-31")


In [None]:
def clean_weather(df, state_ansi, county_ansi):
    """Collapse raw weather records into one row per date with mean TMAX, TMIN and PRCP.

    The input frame is left untouched; all derived columns are built on a copy.

    Args:
        df: Raw records with at least the columns "date", "datatype" and
            "value" (one row per observation).
        state_ansi: Value stored in the output's "state_ansi" column.
        county_ansi: Value stored in the output's "county_ansi" column.

    Returns:
        A new DataFrame with columns date, TMAX, TMIN, PRCP, state_ansi and
        county_ansi. Each measurement is the mean over every input row of that
        datatype sharing the date; it is NaN when the date has no such row.
    """
    values = df["value"]
    datatype = df["datatype"]

    # assign() works on a copy, so the caller's frame keeps its original
    # columns and its unparsed "date" values.
    # Each measurement column holds the value on rows of its own datatype and
    # NaN elsewhere, so the per-date mean only averages matching rows.
    wide = df.assign(
        date=pd.to_datetime(df["date"]),
        TMAX=values.where(datatype == "TMAX"),
        TMIN=values.where(datatype == "TMIN"),
        PRCP=values.where(datatype == "PRCP"),
    )

    daily = (
        wide.groupby("date")
        .agg({"TMAX": "mean", "TMIN": "mean", "PRCP": "mean"})
        .reset_index()
    )

    # add state and county columns
    daily["state_ansi"] = state_ansi
    daily["county_ansi"] = county_ansi

    return daily

In [None]:
# for each state_ansi county_ansi combo in the USDA data, get the weather data

def get_weather_data(usda_df, year):
    """Fetch and clean weather for every distinct state/county pair in ``usda_df``.

    For each pair, records from May 15 through October 15 of ``year`` are
    requested with ``get_raw_weather`` and aggregated with ``clean_weather``.

    Args:
        usda_df: DataFrame with "state_ansi" and "county_ansi" columns.
        year: Year substituted into the start and end dates of the request.

    Returns:
        The concatenation of the cleaned per-county frames. When no county
        returned any data, an empty DataFrame with the columns date, TMAX,
        TMIN, PRCP, state_ansi and county_ansi is returned instead.
    """
    # Each combination is fetched once, even if the USDA frame repeats it;
    # repeats would only trigger duplicate downloads and duplicate output rows.
    combos = usda_df[["state_ansi", "county_ansi"]].drop_duplicates()

    weather_data = []
    for state_ansi, county_ansi in combos.itertuples(index=False, name=None):
        weather_raw = get_raw_weather(state_ansi, county_ansi, f"{year}-05-15T00:00:00", f"{year}-10-15T00:00:00")
        if weather_raw is not None:
            weather_data.append(clean_weather(weather_raw, state_ansi, county_ansi))

    if not weather_data:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # frame with the usual columns so callers can still write it out.
        return pd.DataFrame(columns=["date", "TMAX", "TMIN", "PRCP", "state_ansi", "county_ansi"])

    return pd.concat(weather_data)

# weather_data = get_weather_data(usda_df, year)
# weather_data.to_csv("weather_data.csv", index=False)

In [None]:
# Years to download: 1995 through 2012 inclusive.
years = range(1995, 2013)

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write.
os.makedirs("data", exist_ok=True)

for year in years:
    print(year)
    usda_data = get_usda_data(usda_key, year)
    if usda_data is None:
        # get_usda_data returns None when the request fails; without county
        # rows there is nothing to save or to look up weather for.
        print(f"Skipping {year}: no USDA data returned.")
        continue
    usda_data.to_csv(f"data/usda_data_{year}.csv", index=False)
    weather_data = get_weather_data(usda_data, year)
    weather_data.to_csv(f"data/weather_data_{year}.csv", index=False)