In [None]:
DIR = './ExcelDataFiles/'

In [None]:
import requests
import json
import numpy as np
from io import StringIO
import csv
import time

In [None]:
def csv2dict(fname):
    """Collect the distinct years available for every county in a temperature CSV.

    The file ``DIR + fname + '.csv'`` is read with ``csv.reader``; its first row
    is treated as a header and skipped. In every other row, column 1 is the
    county name (lower-cased before use as a key) and column 2 is the year,
    which is kept as the string read from the file.

    Returns a dict mapping lower-cased county name -> set of year strings.
    """
    years_by_county = {}
    with open(DIR+fname+'.csv', newline='') as csv_file:
        records = csv.reader(csv_file)
        next(records, None)  # drop the header row (no-op on an empty file)
        for record in records:
            county = record[1].lower()
            years_by_county.setdefault(county, set()).add(record[2])
    return years_by_county

In [None]:
def csv2fullDict(fname):
    """Load the per-day maxt/mint values of a temperature CSV, grouped by county and year.

    The file ``DIR + fname + '.csv'`` is read with ``csv.reader``; its first row
    is treated as a header and skipped. Columns used: 1 = county name
    (lower-cased), 2 = year, 4 = maxt, 5 = mint. Values are stored exactly as
    read from the file, i.e. as strings, in file order.

    Returns ``{county: {year: [maxt_values, mint_values]}}``.
    """
    temps_by_county = {}
    with open(DIR+fname+'.csv', newline='') as csv_file:
        records = csv.reader(csv_file)
        next(records, None)  # drop the header row (no-op on an empty file)
        for record in records:
            per_year = temps_by_county.setdefault(record[1].lower(), {})
            maxt_mint = per_year.setdefault(record[2], [[], []])
            maxt_mint[0].append(record[4])
            maxt_mint[1].append(record[5])
    return temps_by_county

# 1. Fetching web temp data
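This step runs first: it downloads the daily maximum temperature, minimum temperature and precipitation records for every county from the ACIS web service (data.rcc-acis.org).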

In [None]:
def dicts_for_counties(county_meta):
    '''Build county-name <-> county-code lookup tables from a census code listing.

    county_meta is a multi-line string with "|"-separated fields per line
    (STATE|STATEFP|COUNTYFP|COUNTYNS|COUNTYNAME|...). A line is used only when it
    has more than one field and its second field is all digits, which skips the
    header line and blank lines.

    The county name is everything in COUNTYNAME before " County", so names made
    of two words (e.g. "St. Clair", "Rock Island") are kept whole. The code is
    STATEFP and COUNTYFP joined together, e.g. "20" + "001" -> "20001".

    Returns the tuple (name -> code dict, code -> name dict).
    Raises ValueError if a data line's COUNTYNAME does not contain " County".
    '''
    name_to_code = {}
    code_to_name = {}
    for record in county_meta.split("\n"):
        fields = record.split("|")
        if len(fields) <= 1 or not fields[1].isdigit():
            continue  # header line, blank line, or anything that is not a data row
        full_name = fields[4]
        short_name = full_name[:full_name.index(" County")]
        code = fields[1] + fields[2]
        name_to_code[short_name] = code
        code_to_name[code] = short_name
    return (name_to_code, code_to_name)

In [None]:
def get_stations_for_state(state):
    """Fetch the metadata of the weather stations of one state from the ACIS StnMeta service.

    state is passed to the service as the 'state' field (callers in this
    notebook use postal abbreviations such as "KS").

    The request asks for these metadata fields: name, county, sids, state,
    valid_daterange and ll, for the elements maxt, mint and pcpn.
    Per the original notes in this notebook: 'sids' holds several station ids,
    each element carrying an id and a type; 'll' is longitude, latitude; and
    'valid_daterange' holds one valid range per requested element.

    Returns the value stored under the 'meta' key of the JSON response.
    The request is sent with timeout=10 seconds.
    """
    query = {
        'state': state,
        'meta': 'name,county,sids,state,valid_daterange,ll',
        'elems': "maxt,mint,pcpn",
    }
    response = requests.post(
        'http://data.rcc-acis.org/StnMeta',
        data={'params': json.dumps(query)},
        headers={'Accept': 'application/json'},
        timeout=10,
    )
    return response.json()['meta']

In [None]:
def get_valid_stations_for_dates(st_meta, syear, eyear):
    """Keep only the stations whose records cover January 1 of syear through December 31 of eyear.

    st_meta is a list of station dicts; each must have a 'valid_daterange'
    entry holding one [start, end] pair of 'YYYY-MM-DD' strings per requested
    element (an empty pair means the element is not available).

    A station is kept when at least 3 of its ranges (maxt, mint and pcpn) start
    on or before syear-01-01 and end on or after eyear-12-31. Scanning stops at
    the first empty range, so a station missing any element is dropped.

    Returns a new list with the qualifying station dicts, in input order.
    """
    required_start = (syear, 1, 1)
    required_end = (eyear, 12, 31)
    station_valid_date_meta = []
    for stn in st_meta:
        covered = 0
        for date_range in stn['valid_daterange']:  # one range per requested element
            if len(date_range) == 0:
                break  # an element is missing entirely -> station cannot qualify
            first_day = tuple(int(part) for part in date_range[0].split('-'))
            last_day = tuple(int(part) for part in date_range[1].split('-'))
            # Tuples compare as (year, month, day). The end of the range must be
            # checked against the END date; the previous code compared the start
            # month/day here, which rejected ranges ending exactly on Dec 31.
            if first_day <= required_start and last_day >= required_end:
                covered += 1
        if covered < 3:
            continue  # not all of maxt, mint, pcpn cover the requested period
        station_valid_date_meta.append(stn)
    return station_valid_date_meta

In [None]:
def county_stations_dct(valid_stations, code_county):
    '''Group station records by the name of the county they belong to.

    valid_stations: list of station dicts; the 'county' entry, when present,
        holds the county code.
    code_county: dict mapping county code -> county name.

    Stations that have no 'county' entry are ignored. A county code that is not
    in code_county raises KeyError.

    Returns a dict mapping county name -> list of its station dicts (input order).
    '''
    stations_by_county = {}
    for station in valid_stations:
        if 'county' in station:  # some stations come without a county code
            county_name = code_county[station['county']]
            stations_by_county.setdefault(county_name, []).append(station)
    return stations_by_county

In [None]:
def show_csv(csv_string):
    """Print CSV text one record per line, with the fields separated by tabs."""
    for fields in csv.reader(StringIO(csv_string), delimiter=','):
        print('\t'.join(fields))

def get_maxt_mint_pcpn(sid, strt_date, end_date, fmt='csv', prnt=False, timeout=30):
    '''Request maxt, mint and pcpn for one station from the ACIS StnData service.

    sid: station id sent as the 'sid' field.
    strt_date, end_date: first and last date of the requested period
        (callers in this notebook pass 'YYYY-MM-DD' strings).
    fmt: value of the 'output' field, 'csv' by default.
    prnt: when False (the normal case) the raw response object is returned.
        When True, a csv response is printed with show_csv (which returns None)
        and any other format is returned as the parsed JSON body.
    timeout: seconds passed to requests.post, so a silent server raises a
        requests timeout exception instead of blocking the notebook forever.
    '''
    elems = '1,2,4'   # code for maxt,mint,pcpn
    input_dict = {
        'sid': sid,
        'sdate': strt_date,
        'edate': end_date,
        'elems': elems,
        'output': fmt,
    }
    params = {'params': json.dumps(input_dict)}
    headers = {'Accept': 'application/json'}
    resp = requests.post('http://data.rcc-acis.org/StnData', data=params,
                         headers=headers, timeout=timeout)
    if not prnt:
        return resp   # The normal case: hand the response to the caller, print nothing
    if fmt == 'csv':
        return show_csv(resp.text)  # CSV and print
    return resp.json()  # json will show nicely

def _to_float_or_none(text):
    '''Return float(text), or None when text is not numeric (e.g. "M" or "T").'''
    try:
        return float(text)
    except ValueError:
        return None

def csv_resp_to_data(csv_str):
    '''
    Parse the csv text of a station-data response into three per-day lists.

    Rows with fewer than 4 fields are skipped (e.g. the first line, which holds
    only the station name such as ['IOLA 1 W']). For every other row:
      field 1 -> maxt, converted from Fahrenheit to Celsius, rounded to 3 decimals
      field 2 -> mint, converted from Fahrenheit to Celsius, rounded to 3 decimals
      field 3 -> pcpn, converted from inches to mm, rounded to 2 decimals
    A field that is not numeric (such as "M" or "T") becomes None.

    Returns the tuple (maxt list, mint list, pcpn list), all of equal length.
    '''
    mxt = []; mnt = []; prc = []
    for row in csv.reader(StringIO(csv_str), delimiter=','):
        if len(row) < 4:
            continue
        maxt_f = _to_float_or_none(row[1])
        mint_f = _to_float_or_none(row[2])
        pcpn_in = _to_float_or_none(row[3])
        mxt.append(None if maxt_f is None else round((maxt_f-32)*5/9, 3))
        mnt.append(None if mint_f is None else round((mint_f-32)*5/9, 3))
        prc.append(None if pcpn_in is None else round(pcpn_in*25.4, 2))
    return (mxt, mnt, prc)

def get_valid_stations_for_year(st_meta, year):
    """Return the stations accepted by get_valid_stations_for_dates for the single year `year`."""
    return get_valid_stations_for_dates(st_meta, syear=year, eyear=year)

In [None]:
from copy import deepcopy

def nan_to_surrounding_mean(l):
    '''
    Fill runs of missing values (NaN / None) in up to 3 series: maxt, mint, pcpn.

    For each series of length n:
      * if the number of NaNs is not below int(n/2.5), nothing is filled and the
        series is flagged as not done;
      * otherwise every run of consecutive NaNs is handled on its own. A run
        whose (last index - first index) is below int(n/3.5), a bit over a
        season for a year of days, is filled with the mean of the value just
        before and the value just after it. A run touching the start uses the
        value after it; a run touching the end uses the value before it.
        A longer run is left as NaN and the series is flagged as not done.

    Every run is processed, including a single isolated NaN and an isolated NaN
    that follows earlier runs. The previous index-walking loop stopped one
    element early and silently left those unfilled while reporting success.

    Returns (done, num_nans, new_l):
      done     - list of 3 bools, True when no NaN had to be left in the series
      num_nans - list of 3 ints, NaN count of each series before filling
      new_l    - list of 3 lists with the (possibly) filled values
    '''
    num_nans = [0, 0, 0]  # number of nans in each of the 3 lists
    done = [True, True, True]  # Whether every nan of the list was replaced
    new_l = [None, None, None]
    for i in range(len(l)):
        npl = np.array(l[i], dtype=float)  # None becomes nan
        n_vals = len(npl)
        nan_idx = np.flatnonzero(np.isnan(npl))
        num_nans[i] = int(nan_idx.size)
        if nan_idx.size < int(n_vals/2.5):
            max_span = int(n_vals/3.5)  # a bit over a season
            # Cut the sorted nan indices wherever two neighbours are not consecutive.
            cuts = np.flatnonzero(np.diff(nan_idx) != 1) + 1
            for run in np.split(nan_idx, cuts):
                if run.size == 0:
                    continue  # the series has no nan at all
                start, end = int(run[0]), int(run[-1])
                if end - start >= max_span:
                    done[i] = False  # gap too long to interpolate
                elif start == 0:  # there is nothing before
                    npl[start:end+1] = npl[end+1]
                elif end == n_vals-1:  # the last element is nan
                    npl[start:end+1] = npl[start-1]
                else:  # In between - take average of before and after
                    npl[start:end+1] = (npl[start-1] + npl[end+1])/2
        else:
            done[i] = False
        new_l[i] = list(npl)
    return (done, num_nans, new_l)

def _missing_temp_count(data_lsts):
    '''Score a (maxt, mint, pcpn) tuple: the larger of the maxt / mint missing counts.

    An empty response scores infinity so that it is never preferred over a
    series that actually contains days.
    '''
    mxt, mnt = data_lsts[0], data_lsts[1]
    if len(mxt) == 0 or len(mnt) == 0:
        return float('inf')
    nan_cnt_mx = int(np.isnan(np.array(mxt, dtype=float)).sum())
    nan_cnt_mn = int(np.isnan(np.array(mnt, dtype=float)).sum())
    return max(nan_cnt_mx, nan_cnt_mn)

def get_cnty_data(cnty_st_meta, sdate, edate):
    '''Download the daily data of every station of every county.

    cnty_st_meta: dict with county as key and list of station dicts as value;
        each station dict has a 'sids' list whose elements start with the
        station id followed by a space.
    sdate, edate: first and last requested date.

    A station can have several sids. They are tried in order until one returns
    maxt and mint without missing values; otherwise the sid with the fewest
    missing values is kept (the first one wins ties). Previously, when a later
    sid came back complete, the loop ended and the earlier, incomplete series
    was stored instead of the complete one.

    Returns a dict with county as key and, per county, a list with one
    (maxt, mint, pcpn) tuple of lists per station. A station without any sid
    contributes nothing.
    '''
    counties_dat = {}
    for c, stn_lst in cnty_st_meta.items():  # Go over each county c
        cnty_dat = []
        for stn in stn_lst:
            best_missing = None
            best_data = None
            for sid in stn['sids']:
                dat = get_maxt_mint_pcpn(sid.split(' ')[0], sdate, edate)
                data_lsts = csv_resp_to_data(dat.text)
                missing = _missing_temp_count(data_lsts)
                if best_missing is None or missing < best_missing:
                    best_missing, best_data = missing, data_lsts
                if missing == 0:
                    break  # complete series found, no need to query other sids
            if best_data is not None:
                cnty_dat.append(best_data)
        counties_dat[c] = cnty_dat
    return counties_dat

def compare_datas(dat1, dat2):
    """Check that every county of dat1 has equal data in dat2.

    dat1, dat2: dicts keyed by county. For each key of dat1 whose value differs
    from dat2's value, or that is absent from dat2, an error line is printed.

    Returns True when all keys of dat1 match, False otherwise. The result used
    to be computed but never returned, so callers always received None.
    """
    good = True
    for k, v in dat1.items():
        #dat2[k][0][0][0] = -5000   # Uncomment to check that compare goes deep
        if k not in dat2 or not (v == dat2[k]):
            print(f"---ERROR--- in comparison of 2 requests for county {k}")
            good = False
    return good

In [None]:
from datetime import datetime

def extract_data(stations_meta, code_cnty, state, yrs = range(1999,2020)):
    '''
    Download, average per county and gap-fill the daily data of one state.

    stations_meta: station metadata list (as returned by get_stations_for_state).
    code_cnty: dict mapping county code -> county name.
    state: label used as the top-level key of the result and as the prefix of
        the report file name.
    yrs: iterable of years to process.

    For every year the stations valid for that year are grouped by county and
    their data is fetched once. The stations of a county are averaged day by
    day with np.nanmean, and the result goes through nan_to_surrounding_mean.

    Returns the dictionary
        results[state][year][county] = [maxt list, mint list, pcpn list]

    Side effects: prints "Year <yr>" for each year and writes a report file
    named <state>_report_<date time>.txt in the current directory. The report
    lists, per county and year, where missing values could not be removed and
    how many there were.

    Note: np.nanmean may emit "RuntimeWarning: Mean of empty slice"; per the
    original note this happens when the pcpn values are all nan.
    '''
    results = {state: {}}
    report = {'temp Nan Not Removed': {}, 'pcpn Nan Not Removed':{}, 'all Nan Removed #nans': {}}
    temp_report = report['temp Nan Not Removed']
    pcpn_report = report['pcpn Nan Not Removed']
    clean_report = report['all Nan Removed #nans']

    for yr in yrs:
        year_results = {}
        results[state][yr] = year_results
        print("Year", yr)
        # Stations whose records cover the whole year, grouped by county
        valid_stations = get_valid_stations_for_year(stations_meta, yr)
        stations_by_county = county_stations_dct(valid_stations, code_cnty)

        county_series = get_cnty_data(stations_by_county, str(yr)+'-01-01', str(yr)+'-12-31')
        for county, station_data in county_series.items():
            stacked = np.array(station_data, dtype=float)
            daily_mean = np.nanmean(stacked, axis=0)    # mean over the stations, ignore nan
            done, nans, lsts = nan_to_surrounding_mean(daily_mean.tolist())
            maxt_ok, mint_ok, pcpn_ok = done

            if not (maxt_ok and mint_ok):
                temp_note = ("maxt "+str(nans[0])+" " if not maxt_ok else "") + \
                    ("mint "+str(nans[1]) if not mint_ok else "")
                temp_report.setdefault(county, {}).setdefault(yr, temp_note)
            if not pcpn_ok:
                pcpn_report.setdefault(county, {}).setdefault(yr, str(nans[2]))
            if maxt_ok and mint_ok and pcpn_ok:
                clean_report.setdefault(county, {}).setdefault(yr, [str(e) for e in nans])

            year_results[county] = lsts

    # Write the report; the current date/time makes the file name unique
    timestamp = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
    fname = state+'_report_'+timestamp+'.txt'
    with open(fname, 'w') as f:
        for section, counties in report.items():
            f.write(section+":\n")
            for county, per_year in counties.items():
                f.write(f'    {county}:\n')
                for yr, v in per_year.items():
                    f.write(f'\t{yr}: {v}\n')

    return results

In [None]:
##### Kansas data ######
# To get codes for counties go to https://www.census.gov/library/reference/code-lists/ansi.html#cou
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st20_ks_cou2020.txt

KS_counties_codes = """STATE|STATEFP|COUNTYFP|COUNTYNS|COUNTYNAME|CLASSFP|FUNCSTAT
KS|20|001|00484970|Allen County|H1|A
KS|20|003|00484971|Anderson County|H1|A
KS|20|005|00484972|Atchison County|H1|A
KS|20|007|00484973|Barber County|H1|A
KS|20|009|00484974|Barton County|H1|A
KS|20|011|00484975|Bourbon County|H1|A
KS|20|013|00484976|Brown County|H1|A
KS|20|015|00484977|Butler County|H1|A
KS|20|017|00484978|Chase County|H1|A
KS|20|019|00484979|Chautauqua County|H1|A
KS|20|021|00484980|Cherokee County|H1|A
KS|20|023|00484981|Cheyenne County|H1|A
KS|20|025|00484982|Clark County|H1|A
KS|20|027|00484983|Clay County|H1|A
KS|20|029|00484984|Cloud County|H1|A
KS|20|031|00484985|Coffey County|H1|A
KS|20|033|00484986|Comanche County|H1|A
KS|20|035|00484987|Cowley County|H1|A
KS|20|037|00484988|Crawford County|H1|A
KS|20|039|00484989|Decatur County|H1|A
KS|20|041|00484990|Dickinson County|H1|A
KS|20|043|00484991|Doniphan County|H1|A
KS|20|045|00484992|Douglas County|H1|A
KS|20|047|00484993|Edwards County|H1|A
KS|20|049|00484994|Elk County|H1|A
KS|20|051|00484995|Ellis County|H1|A
KS|20|053|00484996|Ellsworth County|H1|A
KS|20|055|00485326|Finney County|H1|A
KS|20|057|00484997|Ford County|H1|A
KS|20|059|00484998|Franklin County|H1|A
KS|20|061|00484999|Geary County|H1|A
KS|20|063|00485000|Gove County|H1|A
KS|20|065|00481811|Graham County|H1|A
KS|20|067|00485099|Grant County|H1|A
KS|20|069|00485001|Gray County|H1|A
KS|20|071|00485002|Greeley County|H6|C
KS|20|073|00485003|Greenwood County|H1|A
KS|20|075|00485327|Hamilton County|H1|A
KS|20|077|00485004|Harper County|H1|A
KS|20|079|00485005|Harvey County|H1|A
KS|20|081|00485328|Haskell County|H1|A
KS|20|083|00485006|Hodgeman County|H1|A
KS|20|085|00485007|Jackson County|H1|A
KS|20|087|00485008|Jefferson County|H1|A
KS|20|089|00485009|Jewell County|H1|A
KS|20|091|00485010|Johnson County|H1|A
KS|20|093|00485011|Kearny County|H1|A
KS|20|095|00485012|Kingman County|H1|A
KS|20|097|00485013|Kiowa County|H1|A
KS|20|099|00485014|Labette County|H1|A
KS|20|101|00485015|Lane County|H1|A
KS|20|103|00485016|Leavenworth County|H1|A
KS|20|105|00485017|Lincoln County|H1|A
KS|20|107|00485018|Linn County|H1|A
KS|20|109|00485019|Logan County|H1|A
KS|20|111|00485020|Lyon County|H1|A
KS|20|113|00485021|McPherson County|H1|A
KS|20|115|00485022|Marion County|H1|A
KS|20|117|00485023|Marshall County|H1|A
KS|20|119|00485024|Meade County|H1|A
KS|20|121|00485025|Miami County|H1|A
KS|20|123|00485026|Mitchell County|H1|A
KS|20|125|00485027|Montgomery County|H1|A
KS|20|127|00485028|Morris County|H1|A
KS|20|129|00485135|Morton County|H1|A
KS|20|131|00485029|Nemaha County|H1|A
KS|20|133|00485030|Neosho County|H1|A
KS|20|135|00485031|Ness County|H1|A
KS|20|137|00485032|Norton County|H1|A
KS|20|139|00485033|Osage County|H1|A
KS|20|141|00485034|Osborne County|H1|A
KS|20|143|00485035|Ottawa County|H1|A
KS|20|145|00485036|Pawnee County|H1|A
KS|20|147|00485037|Phillips County|H1|A
KS|20|149|00485038|Pottawatomie County|H1|A
KS|20|151|00485039|Pratt County|H1|A
KS|20|153|00485040|Rawlins County|H1|A
KS|20|155|00485041|Reno County|H1|A
KS|20|157|00485042|Republic County|H1|A
KS|20|159|00485043|Rice County|H1|A
KS|20|161|00485044|Riley County|H1|A
KS|20|163|00485045|Rooks County|H1|A
KS|20|165|00485358|Rush County|H1|A
KS|20|167|00485046|Russell County|H1|A
KS|20|169|00485047|Saline County|H1|A
KS|20|171|00485048|Scott County|H1|A
KS|20|173|00485049|Sedgwick County|H1|A
KS|20|175|00485050|Seward County|H1|A
KS|20|177|00485051|Shawnee County|H1|A
KS|20|179|00485052|Sheridan County|H1|A
KS|20|181|00485053|Sherman County|H1|A
KS|20|183|00484969|Smith County|H1|A
KS|20|185|00485054|Stafford County|H1|A
KS|20|187|00485055|Stanton County|H1|A
KS|20|189|00485056|Stevens County|H1|A
KS|20|191|00481812|Sumner County|H1|A
KS|20|193|00485057|Thomas County|H1|A
KS|20|195|00485058|Trego County|H1|A
KS|20|197|00485059|Wabaunsee County|H1|A
KS|20|199|00485060|Wallace County|H1|A
KS|20|201|00485061|Washington County|H1|A
KS|20|203|00485062|Wichita County|H1|A
KS|20|205|00485063|Wilson County|H1|A
KS|20|207|00485064|Woodson County|H1|A
KS|20|209|00485065|Wyandotte County|H6|C
"""

# Build the two Kansas lookup tables from the listing above:
# county name -> code (STATEFP + COUNTYFP) and code -> county name
KS_county_code, KS_code_county = dicts_for_counties(KS_counties_codes)
# Fetch the metadata of all available weather stations in KS (network request)
KS_stations_meta = get_stations_for_state("KS")


Year 1999


  nparmean = np.nanmean(npar, axis=0)    # mean over row, ignore nan


In [None]:
# # Testing function
# hh = extract_data(KS_stations_meta, KS_code_county, state = "Kansas", yrs = range(1999,2001))

In [None]:
def save_file_as_csv(st_results, file_name, state):
    """Write the extracted data to ``file_name + ".csv"``, one line per county and day.

    st_results: nested dict {state key: {year: {county: [maxt, mint, pcpn]}}},
        where the three lists hold one value per day.
    file_name: output path without the ".csv" extension.
    state: text written in the State column of every line (the keys of
        st_results are not written).

    The file starts with the header State,County,Year,day,maxt,mint,prcp and
    the day column counts from 1. The number of lines per county is the length
    of its maxt list.
    """
    with open(file_name + ".csv", 'w') as out:
        out.write("State,County,Year,day,maxt,mint,prcp\n")
        for per_year in st_results.values():
            for year, per_county in per_year.items():
                for county, series in per_county.items():
                    for day in range(1, len(series[0]) + 1):  # Number of days
                        pos = day - 1
                        out.write(f'{state},{county},{year},{day},{series[0][pos]},{series[1][pos]},{series[2][pos]}\n')

In [None]:
# Extract the daily temperature and precipitation data for Kansas for the
# years 1999-2019 (this also writes a Kansas_report_<date time>.txt file),
# then save the result as KansasTempData.csv
KS_temp = extract_data(KS_stations_meta, KS_code_county, state = "Kansas", yrs = range(1999,2020))
save_file_as_csv(KS_temp, "KansasTempData", state = "Kansas")

In [None]:
##### Iowa data ######
IA_counties_codes = """STATE|STATEFP|COUNTYFP|COUNTYNS|COUNTYNAME|CLASSFP|FUNCSTAT
IA|19|001|00465190|Adair County|H1|A
IA|19|003|00465191|Adams County|H1|A
IA|19|005|00465192|Allamakee County|H1|A
IA|19|007|00465193|Appanoose County|H1|A
IA|19|009|00465194|Audubon County|H1|A
IA|19|011|00465195|Benton County|H1|A
IA|19|013|00465196|Black Hawk County|H1|A
IA|19|015|00465197|Boone County|H1|A
IA|19|017|00465198|Bremer County|H1|A
IA|19|019|00465199|Buchanan County|H1|A
IA|19|021|00465200|Buena Vista County|H1|A
IA|19|023|00465201|Butler County|H1|A
IA|19|025|00465202|Calhoun County|H1|A
IA|19|027|00465203|Carroll County|H1|A
IA|19|029|00465204|Cass County|H1|A
IA|19|031|00465205|Cedar County|H1|A
IA|19|033|00465206|Cerro Gordo County|H1|A
IA|19|035|00465207|Cherokee County|H1|A
IA|19|037|00465208|Chickasaw County|H1|A
IA|19|039|00465209|Clarke County|H1|A
IA|19|041|00465625|Clay County|H1|A
IA|19|043|00465210|Clayton County|H1|A
IA|19|045|00465211|Clinton County|H1|A
IA|19|047|00465212|Crawford County|H1|A
IA|19|049|00465213|Dallas County|H1|A
IA|19|051|00465214|Davis County|H1|A
IA|19|053|00465215|Decatur County|H1|A
IA|19|055|00465216|Delaware County|H1|A
IA|19|057|00465217|Des Moines County|H1|A
IA|19|059|00465218|Dickinson County|H1|A
IA|19|061|00465219|Dubuque County|H1|A
IA|19|063|00465220|Emmet County|H1|A
IA|19|065|00465221|Fayette County|H1|A
IA|19|067|00465222|Floyd County|H1|A
IA|19|069|00465223|Franklin County|H1|A
IA|19|071|00465224|Fremont County|H1|A
IA|19|073|00465225|Greene County|H1|A
IA|19|075|00465226|Grundy County|H1|A
IA|19|077|00465227|Guthrie County|H1|A
IA|19|079|00465228|Hamilton County|H1|A
IA|19|081|00465229|Hancock County|H1|A
IA|19|083|00465230|Hardin County|H1|A
IA|19|085|00465231|Harrison County|H1|A
IA|19|087|00465232|Henry County|H1|A
IA|19|089|00465233|Howard County|H1|A
IA|19|091|00465234|Humboldt County|H1|A
IA|19|093|00465235|Ida County|H1|A
IA|19|095|00465236|Iowa County|H1|A
IA|19|097|00465237|Jackson County|H1|A
IA|19|099|00465238|Jasper County|H1|A
IA|19|101|00465239|Jefferson County|H1|A
IA|19|103|00465240|Johnson County|H1|A
IA|19|105|00465241|Jones County|H1|A
IA|19|107|00465242|Keokuk County|H1|A
IA|19|109|00465243|Kossuth County|H1|A
IA|19|111|00465244|Lee County|H1|A
IA|19|113|00465245|Linn County|H1|A
IA|19|115|00465246|Louisa County|H1|A
IA|19|117|00465247|Lucas County|H1|A
IA|19|119|00465248|Lyon County|H1|A
IA|19|121|00465249|Madison County|H1|A
IA|19|123|00465250|Mahaska County|H1|A
IA|19|125|00465251|Marion County|H1|A
IA|19|127|00465252|Marshall County|H1|A
IA|19|129|00465253|Mills County|H1|A
IA|19|131|00465254|Mitchell County|H1|A
IA|19|133|00465255|Monona County|H1|A
IA|19|135|00465256|Monroe County|H1|A
IA|19|137|00465257|Montgomery County|H1|A
IA|19|139|00465258|Muscatine County|H1|A
IA|19|141|00465259|O'Brien County|H1|A
IA|19|143|00465260|Osceola County|H1|A
IA|19|145|00465261|Page County|H1|A
IA|19|147|00465262|Palo Alto County|H1|A
IA|19|149|00465263|Plymouth County|H1|A
IA|19|151|00465264|Pocahontas County|H1|A
IA|19|153|00465265|Polk County|H1|A
IA|19|155|00465266|Pottawattamie County|H1|A
IA|19|157|00465267|Poweshiek County|H1|A
IA|19|159|00465268|Ringgold County|H1|A
IA|19|161|00465269|Sac County|H1|A
IA|19|163|00465270|Scott County|H1|A
IA|19|165|00465271|Shelby County|H1|A
IA|19|167|00465272|Sioux County|H1|A
IA|19|169|00465273|Story County|H1|A
IA|19|171|00465274|Tama County|H1|A
IA|19|173|00465275|Taylor County|H1|A
IA|19|175|00465276|Union County|H1|A
IA|19|177|00465277|Van Buren County|H1|A
IA|19|179|00465278|Wapello County|H1|A
IA|19|181|00465279|Warren County|H1|A
IA|19|183|00465280|Washington County|H1|A
IA|19|185|00465281|Wayne County|H1|A
IA|19|187|00465282|Webster County|H1|A
IA|19|189|00465283|Winnebago County|H1|A
IA|19|191|00465284|Winneshiek County|H1|A
IA|19|193|00465285|Woodbury County|H1|A
IA|19|195|00465286|Worth County|H1|A
IA|19|197|00465287|Wright County|H1|A"""
# Build the two Iowa lookup tables from the listing above:
# county name -> code (STATEFP + COUNTYFP) and code -> county name
IA_county_code, IA_code_county = dicts_for_counties(IA_counties_codes)
# Fetch the metadata of all available weather stations in IA (network request)
IA_stations_meta = get_stations_for_state("IA")

In [None]:
# Extract the daily temperature and precipitation data for Iowa for the
# years 1999-2019 (this also writes an Iowa_report_<date time>.txt file),
# then save the result as IowaTempData.csv
IA_temp = extract_data(IA_stations_meta, IA_code_county, state = "Iowa", yrs = range(1999,2020))
save_file_as_csv(IA_temp, "IowaTempData", state = "Iowa")

Year 1999


  nparmean = np.nanmean(npar, axis=0)    # mean over row, ignore nan


Year 2000
Year 2001
Year 2002
Year 2003
Year 2004
Year 2005
Year 2006
Year 2007
Year 2008
Year 2009


In [None]:
#### Illinois Data #####
IL_counties_codes = """STATE|STATEFP|COUNTYFP|COUNTYNS|COUNTYNAME|CLASSFP|FUNCSTAT
IL|17|001|00424202|Adams County|H1|A
IL|17|003|00424203|Alexander County|H1|A
IL|17|005|00424204|Bond County|H1|A
IL|17|007|00424205|Boone County|H1|A
IL|17|009|00424206|Brown County|H1|A
IL|17|011|00424207|Bureau County|H1|A
IL|17|013|00424208|Calhoun County|H1|A
IL|17|015|00424209|Carroll County|H1|A
IL|17|017|00424210|Cass County|H1|A
IL|17|019|00424211|Champaign County|H1|A
IL|17|021|00424212|Christian County|H1|A
IL|17|023|00424213|Clark County|H1|A
IL|17|025|00424214|Clay County|H1|A
IL|17|027|00424215|Clinton County|H1|A
IL|17|029|00424216|Coles County|H1|A
IL|17|031|01784766|Cook County|H1|A
IL|17|033|00424218|Crawford County|H1|A
IL|17|035|00424219|Cumberland County|H1|A
IL|17|037|00422190|DeKalb County|H1|A
IL|17|039|00426598|De Witt County|H1|A
IL|17|041|00424222|Douglas County|H1|A
IL|17|043|00422191|DuPage County|H1|A
IL|17|045|00424224|Edgar County|H1|A
IL|17|047|00424225|Edwards County|H1|A
IL|17|049|00424226|Effingham County|H1|A
IL|17|051|00424227|Fayette County|H1|A
IL|17|053|00424228|Ford County|H1|A
IL|17|055|00424229|Franklin County|H1|A
IL|17|057|00424230|Fulton County|H1|A
IL|17|059|00424231|Gallatin County|H1|A
IL|17|061|00424232|Greene County|H1|A
IL|17|063|00424233|Grundy County|H1|A
IL|17|065|00424234|Hamilton County|H1|A
IL|17|067|00424235|Hancock County|H1|A
IL|17|069|00424236|Hardin County|H1|A
IL|17|071|00424237|Henderson County|H1|A
IL|17|073|00424238|Henry County|H1|A
IL|17|075|00424239|Iroquois County|H1|A
IL|17|077|00424240|Jackson County|H1|A
IL|17|079|00424241|Jasper County|H1|A
IL|17|081|00424242|Jefferson County|H1|A
IL|17|083|00424243|Jersey County|H1|A
IL|17|085|00424244|Jo Daviess County|H1|A
IL|17|087|00424245|Johnson County|H1|A
IL|17|089|00424246|Kane County|H1|A
IL|17|091|00424247|Kankakee County|H1|A
IL|17|093|00424248|Kendall County|H1|A
IL|17|095|00424249|Knox County|H1|A
IL|17|097|01784796|Lake County|H1|A
IL|17|099|00422247|LaSalle County|H1|A
IL|17|101|00424252|Lawrence County|H1|A
IL|17|103|00424253|Lee County|H1|A
IL|17|105|00424254|Livingston County|H1|A
IL|17|107|00424255|Logan County|H1|A
IL|17|109|01784729|McDonough County|H1|A
IL|17|111|01784815|McHenry County|H1|A
IL|17|113|01784833|McLean County|H1|A
IL|17|115|00424256|Macon County|H1|A
IL|17|117|00424257|Macoupin County|H1|A
IL|17|119|00424258|Madison County|H1|A
IL|17|121|00424259|Marion County|H1|A
IL|17|123|00424260|Marshall County|H1|A
IL|17|125|00424261|Mason County|H1|A
IL|17|127|01784730|Massac County|H1|A
IL|17|129|00424266|Menard County|H1|A
IL|17|131|01784750|Mercer County|H1|A
IL|17|133|01784865|Monroe County|H1|A
IL|17|135|01784866|Montgomery County|H1|A
IL|17|137|00424270|Morgan County|H1|A
IL|17|139|01784885|Moultrie County|H1|A
IL|17|141|01784894|Ogle County|H1|A
IL|17|143|01784920|Peoria County|H1|A
IL|17|145|01784940|Perry County|H1|A
IL|17|147|00424275|Piatt County|H1|A
IL|17|149|01784941|Pike County|H1|A
IL|17|151|00424277|Pope County|H1|A
IL|17|153|01784966|Pulaski County|H1|A
IL|17|155|00424279|Putnam County|H1|A
IL|17|157|01784967|Randolph County|H1|A
IL|17|159|00424281|Richland County|H1|A
IL|17|161|00424282|Rock Island County|H1|A
IL|17|163|01784987|St. Clair County|H1|A
IL|17|165|00424283|Saline County|H1|A
IL|17|167|01785010|Sangamon County|H1|A
IL|17|169|01785037|Schuyler County|H1|A
IL|17|171|00424286|Scott County|H1|A
IL|17|173|01785051|Shelby County|H1|A
IL|17|175|00424288|Stark County|H1|A
IL|17|177|01785076|Stephenson County|H1|A
IL|17|179|01785094|Tazewell County|H1|A
IL|17|181|01785113|Union County|H1|A
IL|17|183|01785114|Vermilion County|H1|A
IL|17|185|00424293|Wabash County|H1|A
IL|17|187|01785134|Warren County|H1|A
IL|17|189|01785150|Washington County|H1|A
IL|17|191|00424296|Wayne County|H1|A
IL|17|193|00424297|White County|H1|A
IL|17|195|01785167|Whiteside County|H1|A
IL|17|197|01785190|Will County|H1|A
IL|17|199|01785215|Williamson County|H1|A
IL|17|201|01785216|Winnebago County|H1|A
IL|17|203|01785231|Woodford County|H1|A"""
# Build the two Illinois lookup tables from the listing above:
# county name -> code (STATEFP + COUNTYFP) and code -> county name
IL_county_code, IL_code_county = dicts_for_counties(IL_counties_codes)
# Fetch the metadata of all available weather stations in IL (network request)
IL_stations_meta = get_stations_for_state("IL")

In [None]:
# Extract the Illinois data for the years 1999 and 2000 only (range(1999,2001)).
# NOTE: the save step below is commented out, so no Illinois CSV is written.
IL_temp = extract_data(IL_stations_meta, IL_code_county, state = "Illinois", yrs = range(1999,2001))
#save_file_as_csv(IL_temp, "IllinoisTempData1", state = "Illinois")

Year 1999


  nparmean = np.nanmean(npar, axis=0)    # mean over row, ignore nan


Year 2000


In [None]:
##### Indiana Data #####
IN_counties_codes = """STATE|STATEFP|COUNTYFP|COUNTYNS|COUNTYNAME|CLASSFP|FUNCSTAT
IN|18|001|00450401|Adams County|H1|A
IN|18|003|00450402|Allen County|H1|A
IN|18|005|00451683|Bartholomew County|H1|A
IN|18|007|00450403|Benton County|H1|A
IN|18|009|00450404|Blackford County|H1|A
IN|18|011|00450405|Boone County|H1|A
IN|18|013|00451684|Brown County|H1|A
IN|18|015|00450406|Carroll County|H1|A
IN|18|017|00450339|Cass County|H1|A
IN|18|019|00450340|Clark County|H1|A
IN|18|021|00450341|Clay County|H1|A
IN|18|023|00450342|Clinton County|H1|A
IN|18|025|00451674|Crawford County|H1|A
IN|18|027|00450343|Daviess County|H1|A
IN|18|029|00450344|Dearborn County|H1|A
IN|18|031|00450345|Decatur County|H1|A
IN|18|033|00450346|DeKalb County|H1|A
IN|18|035|00450347|Delaware County|H1|A
IN|18|037|00451675|Dubois County|H1|A
IN|18|039|00450348|Elkhart County|H1|A
IN|18|041|00450349|Fayette County|H1|A
IN|18|043|00450350|Floyd County|H1|A
IN|18|045|00450351|Fountain County|H1|A
IN|18|047|00450352|Franklin County|H1|A
IN|18|049|00450353|Fulton County|H1|A
IN|18|051|00450354|Gibson County|H1|A
IN|18|053|00450355|Grant County|H1|A
IN|18|055|00451676|Greene County|H1|A
IN|18|057|00450356|Hamilton County|H1|A
IN|18|059|00450357|Hancock County|H1|A
IN|18|061|00451677|Harrison County|H1|A
IN|18|063|00450358|Hendricks County|H1|A
IN|18|065|00450359|Henry County|H1|A
IN|18|067|00450360|Howard County|H1|A
IN|18|069|00450361|Huntington County|H1|A
IN|18|071|00451678|Jackson County|H1|A
IN|18|073|00450494|Jasper County|H1|A
IN|18|075|00450362|Jay County|H1|A
IN|18|077|00450363|Jefferson County|H1|A
IN|18|079|00450364|Jennings County|H1|A
IN|18|081|00450365|Johnson County|H1|A
IN|18|083|00450366|Knox County|H1|A
IN|18|085|00450367|Kosciusko County|H1|A
IN|18|087|00450368|LaGrange County|H1|A
IN|18|089|00450495|Lake County|H1|A
IN|18|091|00450507|LaPorte County|H1|A
IN|18|093|00451703|Lawrence County|H1|A
IN|18|095|00450370|Madison County|H1|A
IN|18|097|00450371|Marion County|H6|C
IN|18|099|00450372|Marshall County|H1|A
IN|18|101|00451679|Martin County|H1|A
IN|18|103|00450373|Miami County|H1|A
IN|18|105|00451680|Monroe County|H1|A
IN|18|107|00450374|Montgomery County|H1|A
IN|18|109|00450375|Morgan County|H1|A
IN|18|111|00450376|Newton County|H1|A
IN|18|113|00450377|Noble County|H1|A
IN|18|115|00450378|Ohio County|H1|A
IN|18|117|00451681|Orange County|H1|A
IN|18|119|00450379|Owen County|H1|A
IN|18|121|00450380|Parke County|H1|A
IN|18|123|00451682|Perry County|H1|A
IN|18|125|00450381|Pike County|H1|A
IN|18|127|00450382|Porter County|H1|A
IN|18|129|00450383|Posey County|H1|A
IN|18|131|00446852|Pulaski County|H1|A
IN|18|133|00450384|Putnam County|H1|A
IN|18|135|00446853|Randolph County|H1|A
IN|18|137|00450385|Ripley County|H1|A
IN|18|139|00446854|Rush County|H1|A
IN|18|141|00452855|St. Joseph County|H1|A
IN|18|143|00450386|Scott County|H1|A
IN|18|145|00450387|Shelby County|H1|A
IN|18|147|00450388|Spencer County|H1|A
IN|18|149|00450389|Starke County|H1|A
IN|18|151|00450390|Steuben County|H1|A
IN|18|153|00450391|Sullivan County|H1|A
IN|18|155|00450392|Switzerland County|H1|A
IN|18|157|00450393|Tippecanoe County|H1|A
IN|18|159|00450394|Tipton County|H1|A
IN|18|161|00450395|Union County|H1|A
IN|18|163|00450396|Vanderburgh County|H1|A
IN|18|165|00450397|Vermillion County|H1|A
IN|18|167|00450398|Vigo County|H1|A
IN|18|169|00450399|Wabash County|H1|A
IN|18|171|00450400|Warren County|H1|A
IN|18|173|00450335|Warrick County|H1|A
IN|18|175|00451665|Washington County|H1|A
IN|18|177|00450336|Wayne County|H1|A
IN|18|179|00450337|Wells County|H1|A
IN|18|181|00450338|White County|H1|A
IN|18|183|00450369|Whitley County|H1|A"""
# The dictionaries for the counties: per the dicts_for_counties docstring, one maps
# county name -> code and the other maps code -> county name
IN_county_code, IN_code_county = dicts_for_counties(IN_counties_codes)
# Extracting all available weather stations in IN
IN_stations_meta = get_stations_for_state("IN")

In [None]:
# No yrs argument is passed, so extract_data uses its default year range
# (the recorded output lists Year 1999 through Year 2019).
IN_temp = extract_data(IN_stations_meta, IN_code_county, state = "Indiana")
# Persist the extracted Indiana data under the name "IndianaTempData"
save_file_as_csv(IN_temp, "IndianaTempData", state = "Indiana")

Year 1999


  nparmean = np.nanmean(npar, axis=0)    # mean over row, ignore nan


Year 2000
Year 2001
Year 2002
Year 2003
Year 2004
Year 2005
Year 2006
Year 2007
Year 2008
Year 2009
Year 2010
Year 2011
Year 2012
Year 2013
Year 2014
Year 2015
Year 2016
Year 2017
Year 2018
Year 2019


In [None]:
##### Missouri Data #####
MO_counties_codes = """STATE|STATEFP|COUNTYFP|COUNTYNS|COUNTYNAME|CLASSFP|FUNCSTAT
MO|29|001|00765805|Adair County|H1|A
MO|29|003|00758456|Andrew County|H1|A
MO|29|005|00758457|Atchison County|H1|A
MO|29|007|00758458|Audrain County|H1|A
MO|29|009|00758459|Barry County|H1|A
MO|29|011|00758460|Barton County|H1|A
MO|29|013|00758461|Bates County|H1|A
MO|29|015|00758462|Benton County|H1|A
MO|29|017|00758463|Bollinger County|H1|A
MO|29|019|00758464|Boone County|H1|A
MO|29|021|00758465|Buchanan County|H1|A
MO|29|023|00758466|Butler County|H1|A
MO|29|025|00758467|Caldwell County|H1|A
MO|29|027|00758468|Callaway County|H1|A
MO|29|029|00758469|Camden County|H1|A
MO|29|031|00758470|Cape Girardeau County|H1|A
MO|29|033|00758471|Carroll County|H1|A
MO|29|035|00758472|Carter County|H1|A
MO|29|037|00758473|Cass County|H1|A
MO|29|039|00758474|Cedar County|H1|A
MO|29|041|00758475|Chariton County|H1|A
MO|29|043|00758476|Christian County|H1|A
MO|29|045|00758477|Clark County|H1|A
MO|29|047|00758478|Clay County|H1|A
MO|29|049|00758479|Clinton County|H1|A
MO|29|051|00758480|Cole County|H1|A
MO|29|053|00758481|Cooper County|H1|A
MO|29|055|00758482|Crawford County|H1|A
MO|29|057|00758483|Dade County|H1|A
MO|29|059|00758484|Dallas County|H1|A
MO|29|061|00758485|Daviess County|H1|A
MO|29|063|00758486|DeKalb County|H1|A
MO|29|065|00758487|Dent County|H1|A
MO|29|067|00758488|Douglas County|H1|A
MO|29|069|00758489|Dunklin County|H1|A
MO|29|071|00758490|Franklin County|H1|A
MO|29|073|00758491|Gasconade County|H1|A
MO|29|075|00758492|Gentry County|H1|A
MO|29|077|00758493|Greene County|H1|A
MO|29|079|00758494|Grundy County|H1|A
MO|29|081|00758495|Harrison County|H1|A
MO|29|083|00758496|Henry County|H1|A
MO|29|085|00758497|Hickory County|H1|A
MO|29|087|00758498|Holt County|H1|A
MO|29|089|00758499|Howard County|H1|A
MO|29|091|00758500|Howell County|H1|A
MO|29|093|00758501|Iron County|H1|A
MO|29|095|00758502|Jackson County|H1|A
MO|29|097|00758503|Jasper County|H1|A
MO|29|099|00758504|Jefferson County|H1|A
MO|29|101|00758505|Johnson County|H1|A
MO|29|103|00758506|Knox County|H1|A
MO|29|105|00758507|Laclede County|H1|A
MO|29|107|00758508|Lafayette County|H1|A
MO|29|109|00758509|Lawrence County|H1|A
MO|29|111|00758510|Lewis County|H1|A
MO|29|113|00758511|Lincoln County|H1|A
MO|29|115|00758512|Linn County|H1|A
MO|29|117|00758513|Livingston County|H1|A
MO|29|119|00758518|McDonald County|H1|A
MO|29|121|00758514|Macon County|H1|A
MO|29|123|00758515|Madison County|H1|A
MO|29|125|00758516|Maries County|H1|A
MO|29|127|00758517|Marion County|H1|A
MO|29|129|00758519|Mercer County|H1|A
MO|29|131|00758520|Miller County|H1|A
MO|29|133|00758521|Mississippi County|H1|A
MO|29|135|00758522|Moniteau County|H1|A
MO|29|137|00758523|Monroe County|H1|A
MO|29|139|00758524|Montgomery County|H1|A
MO|29|141|00758525|Morgan County|H1|A
MO|29|143|00758526|New Madrid County|H1|A
MO|29|145|00758527|Newton County|H1|A
MO|29|147|00758528|Nodaway County|H1|A
MO|29|149|00758529|Oregon County|H1|A
MO|29|151|00758530|Osage County|H1|A
MO|29|153|00758531|Ozark County|H1|A
MO|29|155|00758532|Pemiscot County|H1|A
MO|29|157|00758533|Perry County|H1|A
MO|29|159|00758534|Pettis County|H1|A
MO|29|161|00758535|Phelps County|H1|A
MO|29|163|00758536|Pike County|H1|A
MO|29|165|00758537|Platte County|H1|A
MO|29|167|00758538|Polk County|H1|A
MO|29|169|00758539|Pulaski County|H1|A
MO|29|171|00758540|Putnam County|H1|A
MO|29|173|00758541|Ralls County|H1|A
MO|29|175|00758542|Randolph County|H1|A
MO|29|177|00758543|Ray County|H1|A
MO|29|179|00758544|Reynolds County|H1|A
MO|29|181|00758545|Ripley County|H1|A
MO|29|183|00758546|St. Charles County|H1|A
MO|29|185|00758547|St. Clair County|H1|A
MO|29|186|00765806|Ste. Genevieve County|H1|A
MO|29|187|00758548|St. Francois County|H1|A
MO|29|189|00758549|St. Louis County|H1|A
MO|29|195|00758550|Saline County|H1|A
MO|29|197|00758551|Schuyler County|H1|A
MO|29|199|00758552|Scotland County|H1|A
MO|29|201|00758553|Scott County|H1|A
MO|29|203|00758554|Shannon County|H1|A
MO|29|205|00758555|Shelby County|H1|A
MO|29|207|00758556|Stoddard County|H1|A
MO|29|209|00758557|Stone County|H1|A
MO|29|211|00758558|Sullivan County|H1|A
MO|29|213|00758559|Taney County|H1|A
MO|29|215|00758560|Texas County|H1|A
MO|29|217|00758561|Vernon County|H1|A
MO|29|219|00758562|Warren County|H1|A
MO|29|221|00758563|Washington County|H1|A
MO|29|223|00758564|Wayne County|H1|A
MO|29|225|00758565|Webster County|H1|A
MO|29|227|00758566|Worth County|H1|A
MO|29|229|00758567|Wright County|H1|A
MO|29|510|00767557|St. Louis city|C7|F"""
# The dictionaries for the counties: per the dicts_for_counties docstring, one maps
# county name -> code and the other maps code -> county name
# NOTE(review): the last row of the Missouri table is "St. Louis city", which has no
# " County" suffix — confirm that dicts_for_counties copes with such a row.
MO_county_code, MO_code_county = dicts_for_counties(MO_counties_codes)
# Extracting all available weather stations in MO
MO_stations_meta = get_stations_for_state("MO")

In [None]:
# No yrs argument is passed, so extract_data uses its default year range
# (the recorded output lists Year 1999 through Year 2019).
MO_temp = extract_data(MO_stations_meta, MO_code_county, state = "Missouri")
# Persist the extracted Missouri data under the name "MissouriTempData"
save_file_as_csv(MO_temp, "MissouriTempData", state = "Missouri")

Year 1999


  nparmean = np.nanmean(npar, axis=0)    # mean over row, ignore nan


Year 2000
Year 2001
Year 2002
Year 2003
Year 2004
Year 2005
Year 2006
Year 2007
Year 2008
Year 2009
Year 2010
Year 2011
Year 2012
Year 2013
Year 2014
Year 2015
Year 2016
Year 2017
Year 2018
Year 2019


### 2. Check name consistency between regdat and temp dat
The first time we extracted the temp data and used it in the R program, we found that some counties were missing. We then found that they were not actually missing; their names were simply recorded differently, such as du pont vs. du.pont.

Take the Kansas regdat, `midw_ks`, as an example: we iterate over its counties (`c` in the loop below), check whether each county name appears in the temp data, `ks_cnties`, and print the names that were not found.

The purpose is to modify the names so that they match the regdat names, since names sometimes appear with a - instead of a space, or with other differences.

In [None]:
# Nested lookup built from the regdat file:
#   midw[state][county name, lower-cased] -> set of the row[1] values seen for it
# followed by an alphabetically sorted county list for each state.
midw = {}
with open(DIR+'midw_regdat.csv', newline='') as f:
    reader = csv.reader(f)
    i = 0  # 1-based row counter; ends up as the number of rows read (header included)
    for i, row in enumerate(reader, start=1):
        if i == 1:
            continue  # header row
        counties_of_state = midw.setdefault(row[2], {})
        counties_of_state.setdefault(row[3].lower(), set()).add(row[1])

midw_il = sorted(midw['ILLINOIS'])
midw_in = sorted(midw['INDIANA'])
midw_io = sorted(midw['IOWA'])
midw_ks = sorted(midw['KANSAS'])
midw_mis = sorted(midw['MISSOURI'])

In [None]:
# Print every Kansas regdat county whose name does not appear in the Kansas temp data
KS = csv2dict('KansasTempData')
ks_cnties = sorted(KS)
for c in midw_ks:
    if c in ks_cnties:
        continue
    print(c)

In [None]:
# Earlier runs read the Illinois data from two partial files:
#IL1 = csv2dict('IllinoisTempData1')
#IL2 = csv2dict('IllinoisTempData2')

# Print every Illinois regdat county whose name does not appear in the Illinois temp data
IL = csv2dict('IllinoisTempData')
il_cnties = sorted(IL)

for c in midw_il:
    if c in il_cnties:
        continue
    print(c)

de kalb
du page
gallatin
jo daviess
rock island
st clair
williamson


In [None]:
# Print every Indiana regdat county whose name does not appear in the Indiana temp data
IN = csv2dict('IndianaTempData')
in_cnties = sorted(IN)
for c in midw_in:
    if c in in_cnties:
        continue
    print(c)

de kalb


In [None]:
# Print every Iowa regdat county whose name does not appear in the Iowa temp data
IO = csv2dict('IowaTempData')
io_cnties = sorted(IO)
for c in midw_io:
    if c in io_cnties:
        continue
    print(c)

black hawk
buena vista
cerro gordo
des moines
o brien
palo alto
van buren


In [None]:
# Print every Missouri regdat county whose name does not appear in the Missouri temp data
MIS = csv2dict('MissouriTempData')
mis_cnties = sorted(MIS)
for c in midw_mis:
    if c in mis_cnties:
        continue
    print(c)

cape girardeau
de kalb
new madrid
st charles
st clair
st francois
st louis


# ADDED section
Read the previous temp data from the old CSV, without replacing NaNs, after fixing the names of counties and adding Willsons.

In [None]:
# Load the unfilled temperature file into a nested dictionary:
#   full_dct[state][county][year (int)] -> [maxt values, mint values, prcp values]
# Cells holding the text 'NA' are stored as None; all others are parsed as float.
full_dct = {}
with open(DIR+"MidwTempDataNoFillNA.csv") as f:
    reader = csv.reader(f)
    i = 0  # 1-based row counter; printed below as the number of rows read (header included)
    for i, row in enumerate(reader, start=1):
        if i == 1:
            continue # header row
        #if i > 412: break
        st, cnty, yr, dy, mxt, mnt, prcp = row
        counties = full_dct.setdefault(st, {})
        years = counties.setdefault(cnty, {})
        yr_series = years.setdefault(int(yr), [[], [], []])
        for series, raw in zip(yr_series, (mxt, mnt, prcp)):
            series.append(None if raw == 'NA' else float(raw))
print(len(full_dct['kansas']['russell'][1999][0]))
print(i)

365
3577885


In [None]:
# Cap each yearly series at 365 entries (this drops the 366th day where one exists)
# and store it as a float numpy array, so None entries become NaN.
for s, cd in full_dct.items():
    for c, yd in cd.items():
        for yr, v in yd.items():
            for k in range(3):  # 0: maxt, 1: mint, 2: prcp
                v[k] = np.array(v[k][:365], dtype=float)


In [None]:
# County names in the centroid file that are spelled differently from the fundat;
# each key is rewritten to its value while the centroids are loaded.
name_replace_for_dist = {
    'new madrid': 'newmadrid',
    'cape girardeau': 'capegirardeau',
    'st. francois': 'stfrancois',
    'st. clair': 'stclair',
    'van buren': 'vanburen',
    'des moines': 'desmoines',
    'buena vista': 'buenavista',
    'palo alto': 'paloalto',
    'jo daviess': 'jodaviess',
    'cerro gordo': 'cerrogordo',
}

# cent is a list of ((state, county), (longitude, latitude)) entries, one per CSV row
cent = []
with open(DIR+"MidwCentroids.csv") as f:
    reader = csv.reader(f)
    i = 0
    for i, row in enumerate(reader, start=1):
        if i == 1:
            continue # header row
        long, lat, cnty, st = row
        cnty = name_replace_for_dist.get(cnty, cnty)
        cent.append(((st, cnty), (float(long), float(lat))))

In [None]:
import math
from operator import itemgetter
def dist(a, b):
    '''Euclidean distance between two points.

    a, b: indexable pairs; only elements 0 and 1 of each are used.
    Returns the square root of the summed squared coordinate differences.
    '''
    dx = a[0] - b[0]
    dy = a[1] - b[1]
    return math.sqrt(dx**2 + dy**2)

# For every (state, county) key, collect (distance, other key) pairs to every other
# centroid. Each unordered pair is measured once and recorded under both keys.
dist_dct = {}
for i, (key_i, xy_i) in enumerate(cent):
    for j in range(i+1, len(cent)):
        key_j, xy_j = cent[j]
        dis = dist(xy_i, xy_j)
        dist_dct.setdefault(key_i, []).append((dis, key_j))
        dist_dct.setdefault(key_j, []).append((dis, key_i))


In [None]:
# Order each county's neighbour list by distance, nearest first
# (the sort is stable, so equal distances keep their insertion order)
for k, pairs in dist_dct.items():
    pairs.sort(key=itemgetter(0))


In [None]:
###############################################
# Filling NA using data from nearby counties
##############################################
# For every county/year series (maxt and mint only) that contains NaN, each NaN day
# is replaced by the value of the same day and year from the nearest county whose
# centroid distance is below 1.0 and whose own value for that day is not NaN.
# full_dct is only read; the filled copy is full_dct_na_repl.
# rplcd counts the series that contained at least one NaN (not the values replaced).

import copy
full_dct_na_repl = copy.deepcopy(full_dct)
rplcd = 0
for s, cd in full_dct.items():
    for c, yd in cd.items():
        cand_nearby = None  # neighbour keys, computed lazily once per county
        for yr, v in yd.items():
            for k in [0,1]:  # 0: mxt, 1: mnt
                isnn = np.isnan(v[k])
                if isnn.sum() == 0:
                    continue
                if cand_nearby is None:
                    # dist_dct[(s, c)] is sorted by distance, so stop at the first
                    # neighbour that is too far; iterating the list (instead of an
                    # unbounded index scan) cannot run past its end.
                    # A missing (s, c) key still raises KeyError on purpose: it
                    # signals a county-name mismatch with the centroid file.
                    cand_nearby = []
                    for nb_dist, nb_key in dist_dct[(s, c)]:
                        if nb_dist >= 1.0:
                            break
                        cand_nearby.append(nb_key)  # Only the st, cnty name
                for i in np.where(isnn)[0]:  # go over all NaN indices
                    for nbst, nbct in cand_nearby:
                        nb_years = full_dct.get(nbst, {}).get(nbct)
                        if nb_years is None or yr not in nb_years:
                            continue
                        nb_series = nb_years[yr][k]
                        if i >= len(nb_series):
                            continue  # case where there is no full year data
                        # NaN never compares equal to anything (so `x != np.nan`
                        # is always True); np.isnan is the only reliable test.
                        if not np.isnan(nb_series[i]):
                            full_dct_na_repl[s][c][yr][k][i] = nb_series[i]
                            break
                rplcd += 1
                remaining = np.isnan(full_dct_na_repl[s][c][yr][k]).sum()
                if remaining > 0:
                    # series label, state, county, year, NaNs before, NaNs still left
                    p_s = 'mxt' if k == 0 else 'mnt'
                    print(p_s, s, c, yr, isnn.sum(), remaining)
print(rplcd)


mnt kansas barber 2008 2 1
mxt kansas seward 2003 8 4
mnt kansas seward 2003 8 5
mnt kansas seward 2004 5 1
mxt kansas seward 2005 1 1
mnt kansas seward 2005 2 1
mxt kansas seward 2007 1 1
mnt kansas seward 2007 1 1
mxt kansas chautauqua 2000 17 1
mnt kansas chautauqua 2000 15 1
mxt kansas chautauqua 2002 12 1
mnt kansas chautauqua 2002 13 1
mxt kansas meade 2001 1 1
mnt kansas meade 2002 8 1
mnt kansas meade 2005 10 1
mnt kansas meade 2006 31 1
mxt kansas meade 2008 31 2
mnt kansas meade 2008 32 2
mxt kansas meade 2010 32 1
mnt kansas meade 2010 35 2
mxt kansas elk 2000 8 1
mnt kansas elk 2000 3 1
mxt kansas elk 2002 4 1
mnt kansas elk 2002 4 1
mxt kansas haskell 2003 95 4
mnt kansas haskell 2003 98 5
mnt kansas haskell 2004 12 1
mxt kansas haskell 2005 39 1
mnt kansas haskell 2005 37 1
mxt kansas haskell 2007 318 1
mnt kansas haskell 2007 318 1
mxt kansas wilson 1999 30 3
mnt kansas wilson 1999 69 5
mxt kansas wilson 2000 26 2
mnt kansas wilson 2000 54 2
mxt kansas wilson 2001 43 1
m

mnt indiana grant 1999 15 2
mxt indiana grant 2000 10 2
mnt indiana grant 2000 20 2
mnt indiana grant 2001 6 2
mnt indiana grant 2004 10 1
mxt indiana grant 2005 3 1
mnt indiana grant 2005 7 3
mxt indiana adams 1999 3 1
mxt indiana adams 2000 27 5
mnt indiana adams 2000 32 5
mxt indiana adams 2003 156 2
mnt indiana adams 2003 158 6
mxt indiana adams 2004 135 1
mnt indiana adams 2004 135 4
mnt indiana wabash 2000 5 1
mnt indiana wabash 2005 149 1
mnt indiana newton 2005 79 1
mxt indiana wells 1999 8 2
mnt indiana wells 1999 10 1
mxt indiana wells 2000 8 3
mnt indiana wells 2000 11 2
mnt indiana wells 2003 30 1
mnt indiana wells 2005 4 1
mxt indiana wells 2012 73 2
mnt indiana wells 2012 73 1
mxt indiana wells 2019 94 4
mnt indiana wells 2019 94 2
mxt indiana huntington 1999 8 1
mnt indiana jasper 2005 1 1
mnt indiana pulaski 1999 1 1
mxt indiana pulaski 2000 2 2
mnt indiana pulaski 2000 3 3
mxt indiana pulaski 2003 1 1
mnt indiana pulaski 2003 3 3
mnt indiana pulaski 2004 2 2
mnt indian

In [None]:
def replace_nan_with_surrounding_mean_old(l):
    '''
    Earlier version of the NaN filler, kept for reference.

    Argument: a numpy array.
    If there are at most 100 NaN, trace consecutive sequences of NaN, and if a
    sequence spans at most 20 index steps (end - start <= 20) fill it with the mean
    of the value just before and the value just after it. A sequence starting at
    index 0 is filled with the value after it; a sequence ending at the last index
    is filled with the value before it.

    Returns the tuple (fail_total, fail_seq_length, num_nans, l_new):
      fail_total       True when there are more than 100 NaN (nothing is filled)
      fail_seq_length  True when at least one sequence was too long to fill
      num_nans         number of NaN in the input
      l_new            copy of l with the filled values; l itself is not modified

    Known limitations visible in the code below:
      * the outer loop stops before the final NaN index, so a final NaN that is not
        adjacent to the previous one is left unfilled (unless it is the only NaN,
        which the dedicated branch at the end handles);
      * a sequence touching both ends of the array would index past the end of l.
    '''
    fail_total = False  # Whether we replace nan or not
    fail_seq_length = False
    nans = np.isnan(l)
    num_nans = nans.sum()
    l_new = np.copy(l)
    if nans.sum() <= 100:
        nan_indcis = np.where(nans)[0].tolist()  # ascending positions of the NaN
        j = 0
        # j never reaches the last position here, so nan_indcis[j+1] is always valid
        while j < (len(nan_indcis)-1):
            start = nan_indcis[j]; end = nan_indcis[j]
            # print(nan_indcis, 'Outer', j)
            # Advance j to the last index of the current run of adjacent NaN
            while nan_indcis[j] == (nan_indcis[j+1]-1):  # consecutive
                j += 1
                # print('\tInner', j)
                if j == (len(nan_indcis) - 1): break
            end = nan_indcis[j]
            if end - start <= 20:   # run of at most 21 values
                if start == 0:  # there is nothing before
                    l_new[start:end+1] = l[end+1]
                elif end == (len(l)-1):  # the last element is nan
                    l_new[start:end+1] = l[start-1]
                else:  # In between - take average of before and after
                    l_new[start:end+1] = (l[start-1] + l[end+1])/2
            else: fail_seq_length = True
            j += 1

        ### Adding a condition to check if num of NA's is 1 ####
        # With exactly one NaN the loop above does not run at all, so fill it here
        if len(nan_indcis) == 1:
            if nan_indcis[0] == 0: # The beginning of the year
                l_new[nan_indcis[0]] = l[nan_indcis[0]+1] # replace day 1 with second day
            elif nan_indcis[0] == len(l)-1: # The last day of the year is NA
                l_new[nan_indcis[0]] = l[nan_indcis[0]-1] # Take from previous day
            else: # middle of year, avg of day before and after
                l_new[nan_indcis[0]] = (l[nan_indcis[0]-1] + l[nan_indcis[0]+1])/2

            # Debug output: the NaN position and the value written there
            print(nan_indcis, l_new[nan_indcis[0]])

    else: fail_total = True
    return (fail_total, fail_seq_length, num_nans, l_new)

In [None]:
def replace_nan_with_surrounding_mean(l, max_nans=100, max_run_span=20):
    '''
    Fill short runs of NaN in a 1-D numpy array from the values next to each run.

    Arguments:
      l             1-D numpy array of floats; it is not modified
      max_nans      if l holds more than this many NaN, nothing is filled
      max_run_span  a run of consecutive NaN is filled only when
                    (last index - first index) <= max_run_span, i.e. when it
                    holds at most max_run_span + 1 values

    Fill rule for a run (a single NaN is a run of length one):
      * run starts at index 0      -> the value right after the run
      * run ends at the last index -> the value right before the run
      * otherwise                  -> mean of the values right before and right after

    Returns the tuple (fail_total, fail_seq_length, num_nans, l_new):
      fail_total       True when l has more than max_nans NaN
      fail_seq_length  True when at least one run could not be filled, either
                       because it is too long or because it covers the whole
                       array and there is no value to copy from
      num_nans         number of NaN in l
      l_new            copy of l with every fillable run filled (runs that could
                       not be filled stay NaN)
    '''
    fail_total = False
    fail_seq_length = False
    nans = np.isnan(l)
    num_nans = nans.sum()
    l_new = np.copy(l)
    if num_nans > max_nans:
        fail_total = True
        return (fail_total, fail_seq_length, num_nans, l_new)

    nan_indcis = np.where(nans)[0].tolist()  # ascending positions of the NaN
    last_pos = len(l) - 1
    j = 0
    while j < len(nan_indcis):
        # Extend [start, end] over the maximal run of adjacent NaN positions
        start = nan_indcis[j]
        while j + 1 < len(nan_indcis) and nan_indcis[j+1] == nan_indcis[j] + 1:
            j += 1
        end = nan_indcis[j]
        j += 1

        if end - start > max_run_span:
            fail_seq_length = True
        elif start == 0 and end == last_pos:
            # The run covers the entire array: there is no neighbour to copy from
            fail_seq_length = True
        elif start == 0:  # there is nothing before
            l_new[start:end+1] = l[end+1]
        elif end == last_pos:  # there is nothing after
            l_new[start:end+1] = l[start-1]
        else:  # In between - take average of before and after
            # Runs are maximal, so both neighbours are guaranteed to be non-NaN
            l_new[start:end+1] = (l[start-1] + l[end+1])/2

    return (fail_total, fail_seq_length, num_nans, l_new)

In [None]:
# Second pass over the neighbour-filled data: every maxt/mint series that still has
# NaN is handed to replace_nan_with_surrounding_mean. The result replaces the stored
# series only when neither failure flag is set; otherwise the reason is printed.
for s, cd in full_dct_na_repl.items():
    for c, yd in cd.items():
        for yr, v in yd.items():
            for k in [0,1]:
                isnn = np.isnan(v[k])
                if isnn.sum() == 0:
                    continue
                series_label = "maxt" if k == 0 else "mint"
                print("***---", series_label, s,c,yr,isnn.sum())
                fail_tot, fail_seq, nm_nans, v_new = replace_nan_with_surrounding_mean(v[k])
                if not (fail_tot or fail_seq):
                    full_dct_na_repl[s][c][yr][k] = v_new
                    continue
                p_s = 'mxt' if k == 0 else 'mnt'
                if fail_tot:
                    print(s, c, yr, p_s, 'Too many nans', nm_nans)
                else:
                    print(s, c, yr, p_s, 'nan sequence too long', nm_nans)


***--- mint kansas barber 2008 1
single last [27] [-5.5555]
***--- maxt kansas seward 2003 4
***--- mint kansas seward 2003 5
single first/middle [8, 9, 69, 156, 157] [-4.167  -4.167  -3.0555     nan     nan]
***--- mint kansas seward 2004 1
single last [0] [-1.667]
***--- maxt kansas seward 2005 1
single last [337] [8.889]
***--- mint kansas seward 2005 1
single last [337] [-6.111]
***--- maxt kansas seward 2007 1
single last [47] [2.5]
***--- mint kansas seward 2007 1
single last [47] [-11.6665]
***--- maxt kansas chautauqua 2000 1
single last [168] [20.]
***--- mint kansas chautauqua 2000 1
single last [168] [13.3335]
***--- maxt kansas chautauqua 2002 1
single last [30] [1.667]
***--- mint kansas chautauqua 2002 1
single last [30] [-4.4445]
***--- maxt kansas meade 2001 1
single last [82] [15.5555]
***--- mint kansas meade 2002 1
single last [304] [-1.3885]
***--- mint kansas meade 2005 1
single last [120] [1.6665]
***--- mint kansas meade 2006 1
single last [59] [2.2225]
***--- ma

single first/middle [9, 14, 18, 67, 111, 129, 211, 218, 219, 236, 247, 299, 315, 316, 338, 339, 341, 344, 362] [-12.778  -13.6115  -2.2225  -1.111   -1.3885   5.8335  18.333   18.8885
  18.8885   8.0555  21.389    5.278    1.9445   1.9445  -3.889   -3.889
  -0.5555   1.1115      nan]
single last [9, 14, 18, 67, 111, 129, 211, 218, 219, 236, 247, 299, 315, 316, 338, 339, 341, 344, 362] [-12.778  -13.6115  -2.2225  -1.111   -1.3885   5.8335  18.333   18.8885
  18.8885   8.0555  21.389    5.278    1.9445   1.9445  -3.889   -3.889
  -0.5555   1.1115  -9.4445]
***--- maxt iowa taylor 2016 24
single first/middle [11, 17, 68, 69, 81, 88, 97, 125, 145, 159, 160, 171, 172, 211, 218, 234, 245, 253, 254, 255, 264, 274, 277, 279] [-8.0555     nan     nan     nan     nan     nan     nan     nan     nan
     nan     nan     nan     nan     nan     nan     nan     nan     nan
     nan     nan     nan     nan     nan     nan]
single first/middle [11, 17, 68, 69, 81, 88, 97, 125, 145, 159, 160, 171, 17

In [None]:
# Write the filled data out, one line per state/county/year/day.
# The day column is the 1-based position within the yearly series.
header = "State,County,Year,day,maxt,mint,prcp\n"
with open(DIR+'MidwTempDataNA_filled.csv', 'w') as f:
    f.write(header)
    k = 0  # number of data lines written
    for s, cd in full_dct_na_repl.items():
        for c, yd in cd.items():
            for yr, v in yd.items():
                mxt, mnt, prcp = v[0], v[1], v[2]
                for i in range(len(mxt)):
                    fields = (s, c, str(yr), str(i+1), str(mxt[i]), str(mnt[i]), str(prcp[i]))
                    f.write(','.join(fields) + '\n')
                    k += 1

3575540


# End ADDED section

### Check one state to see which counties have more than 50 NaNs in the temp data

In [None]:
# Illinois temp data as {county: {year: [maxt list, mint list]}}; csv2fullDict keeps
# the values exactly as the strings read from the CSV (missing values are counted
# below as the text 'nan').
ILF = csv2fullDict('IllinoisTempData')

In [None]:
# List the Illinois county/year pairs whose maxt or mint series has more than 50
# 'nan' entries, restricted to county/year pairs that also occur in the regdat.
for c, d in ILF.items():  # c: county name, d: {year: [maxt list, mint list]}
    for yr, tpl in d.items():
        mx_cnt = tpl[0].count('nan')  # 'nan' strings in maxt for that year
        mn_cnt = tpl[1].count('nan')  # 'nan' strings in mint for that year
        if mx_cnt <= 50 and mn_cnt <= 50:
            continue
        if yr in midw['ILLINOIS'].get(c, ()):
            print(c, yr, mx_cnt, mn_cnt)


saline 2006 59 59
union 2008 104 48
union 2009 81 43
macoupin 2014 176 176
macoupin 2015 365 365
macoupin 2016 366 366
macoupin 2017 365 365
montgomery 2000 67 18
montgomery 2001 55 3
montgomery 2002 90 30
montgomery 2003 65 0
montgomery 2004 78 2
montgomery 2005 75 3
montgomery 2012 81 37
schuyler 2000 27 73
randolph 2018 104 106
pulaski 2001 26 66
wabash 2002 52 61
white 2011 33 56
ogle 2018 60 61
jefferson 2008 10 109
jefferson 2009 2 89
jefferson 2010 0 60
effingham 2002 360 360
effingham 2003 363 363
effingham 2004 366 366
effingham 2005 365 365
effingham 2006 365 365
effingham 2007 365 365
effingham 2008 366 366
effingham 2009 365 365
effingham 2010 243 243
effingham 2017 125 125
effingham 2018 365 365
iroquois 2013 56 55
iroquois 2014 65 65
iroquois 2015 78 79
iroquois 2016 85 85
kankakee 2007 54 53
kankakee 2008 58 59
kankakee 2010 57 57
madison 1999 125 130
madison 2000 66 72
madison 2010 60 45
madison 2011 62 84
madison 2012 169 213
madison 2013 113 148
madison 2014 182 191
m