Extracting weather data from www.ncdc.noaa.gov website. This web scraper this [url](https://www.ncdc.noaa.gov/cag/statewide/time-series/), and downloads a JOSN file. The url to download is generated for each state using state code, identified from the dropdown in the page form.<br>
In case of the loop being interupted for any reason, (url not responding is a common occurance), data collected at certain stages is saved as picked object to save some processing time.

In [17]:
import pandas as pd
import json
import pandas as pd
from mechanize import Browser
from bs4 import BeautifulSoup
import numpy as np
from collections import OrderedDict
import pickle, os

fileDir = os.path.dirname(os.path.abspath(''))

In [12]:
base_url = 'https://www.ncdc.noaa.gov/cag/statewide/time-series/'
avg_temp_url = "-tavg-1-1-1984-2018.json?base_prd=true&begbaseyear=1984&endbaseyear=2018"
max_temp_url = "-tmax-1-1-1984-2018.json?base_prd=true&begbaseyear=1984&endbaseyear=2018"
min_temp_url = "-tmin-1-1-1984-2018.json?base_prd=true&begbaseyear=1984&endbaseyear=2018"
pcp_url = "-pcp-1-1-1984-2018.json?base_prd=true&begbaseyear=1984&endbaseyear=2018"

url_list = OrderedDict(sorted({'avg_temp':avg_temp_url, 'max_temp': max_temp_url, 'min_temp': min_temp_url, 'percipitation':pcp_url}.items()))

states = OrderedDict(sorted({'Alabama': 1, 'Alaska': 50, 'Arizona': 2, 'Arkansas': 3, 'California': 4, 'Colorado': 5, 'Connecticut': 6, 'Delaware': 7, 'Florida': 8, 'Georgia': 9, 'Idaho': 10, 'Illinois': 11, 'Indiana': 12, 'Iowa': 13, 'Kansas': 14, 'Kentucky': 15, 'Louisiana': 16, 'Maine': 17, 'Maryland': 18, 'Massachusetts': 19, 'Michigan': 20, 'Minnesota': 21, 'Mississippi': 22, 'Missouri': 23, 'Montana': 24, 'Nebraska': 25, 'Nevada': 26, 'New Hampshire': 27, 'New Jersey': 28, 'New Mexico': 29, 'New York': 30, 'North Carolina': 31, 'North Dakota': 32, 'Ohio': 33, 'Oklahoma': 34, 'Oregon': 35, 'Pennsylvania': 36, 'Rhode Island': 37, 'South Carolina': 38, 'South Dakota': 39, 'Tennessee': 40, 'Texas': 41, 'Utah': 42, 'Vermont': 43, 'Virginia': 44, 'Washington': 45, 'West Virginia': 46, 'Wisconsin': 47, 'Wyoming': 48}.items()))

def read_data(url,col, state_name):   
    browser = Browser()
    browser.open(url)
    r_json=json.loads(browser.response().read())
    response_df = pd.DataFrame(r_json['data']).transpose()
    if (not response_df.empty) and ('value' in response_df.columns):
        response_df = response_df.drop(columns='anomaly').reset_index()
#         response_df.loc[:,'County']=r_json['description']['title'].split(',')[0].replace(' County','')
        response_df.loc[:,'Year']=response_df['index'].str[:4]
        response_df.loc[:,'State']=state_name
        response_df = response_df.rename(columns={'value':col}).drop(columns='index')
    else:
        county_name = r_json['description']['title'].split(',')[0].replace(' County','')
        response_df=pd.DataFrame({'Year':np.nan,'State':state_name,'County':county_name,'avg_temp':np.nan},index=[0])
    return response_df.set_index(['Year','State'])

In [13]:
data_df_path = os.path.join(fileDir, 'Data\pickle\state_df.pkl')

try:
    data_df = pd.read_pickle(data_df_path)
except:
    data_df = pd.DataFrame()

for col, url in url_list.items():
    print(col)
    
    if col in data_df.columns:
        print('Found col')
        if sorted(list(data_df.index.levels[1]))==sorted(states.keys()):
            print('Found saved data for {c}, skipping data point.'.format(c=col))
            continue
        else:
            inner_df = pd.DataFrame()
    else:
        print('Col not found in data_df, loading empty inner_df')
        inner_df = pd.DataFrame()

    
    for state, code in states.items():
        print('\t',state)        
        if inner_df.empty:
            print('\t\t overwritting inner df')
            inner_df = read_data(base_url+str(code)+url,col, state)
        else:
            print('\t\t appending to inner df')
            inner_df = pd.concat([inner_df,read_data(base_url+str(code)+url,col, state)],axis=0)

        
    if data_df.empty:
        data_df = inner_df.copy()
    else:
        data_df = data_df.merge(inner_df,how='outer',left_index=True,right_index=True)
    data_df.to_pickle(data_df_path)

data_df.head()

avg_temp
Found col
	 Alabama
		 overwritting inner df
	 Alaska
		 appending to inner df
	 Arizona
		 appending to inner df
	 Arkansas
		 appending to inner df
	 California
		 appending to inner df
	 Colorado
		 appending to inner df
	 Connecticut
		 appending to inner df
	 Delaware
		 appending to inner df
	 Florida
		 appending to inner df
	 Georgia
		 appending to inner df
	 Idaho
		 appending to inner df
	 Illinois
		 appending to inner df
	 Indiana
		 appending to inner df
	 Iowa
		 appending to inner df
	 Kansas
		 appending to inner df
	 Kentucky
		 appending to inner df
	 Louisiana
		 appending to inner df
	 Maine
		 appending to inner df
	 Maryland
		 appending to inner df
	 Massachusetts
		 appending to inner df
	 Michigan
		 appending to inner df
	 Minnesota
		 appending to inner df
	 Mississippi
		 appending to inner df
	 Missouri
		 appending to inner df
	 Montana
		 appending to inner df
	 Nebraska
		 appending to inner df
	 Nevada
		 appending to inner df
	 New Hampshire


Unnamed: 0_level_0,Unnamed: 1_level_0,avg_temp_x,max_temp,min_temp,percipitation,avg_temp_y
Year,State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1984,Alabama,,,,,40.3
1984,Alaska,,,,,3.1
1984,Arizona,42.4,55.1,29.6,0.45,42.4
1984,Arkansas,,,,,34.0
1984,California,45.4,55.7,35.1,0.33,45.4


In [14]:
#Looks like there could some empty columns
print('Columns with any NULLs:\n',data_df.isnull().any())

Columns with any NULLs:
 avg_temp_x        True
max_temp          True
min_temp          True
percipitation     True
avg_temp_y       False
dtype: bool


In [19]:
writer = pd.ExcelWriter(os.path.join(fileDir, 'Data\Weather_Historical_US_State_Yearly.xlsx'))
data_df.to_excel(writer,'StateWeather')
writer.save()