## Statistical Modeling on Past vs Current Year Weather Data
My Spring 2017 study to determining if the highs and lows of this year are more severe than those of last year using statistical methods. 

The raw weather data can be accessed [here](http://www.georgiaweather.net/index.php?variable=HI&site=WATHORT) and is collected at the University of Georgia [Horticulture Research Farm](http://www.caes.uga.edu/departments/horticulture/about/facilities/farm.html) in Watkinsville, Georgia.

In [1]:
import bs4 as bs
import urllib
import urllib2
import numpy as np
import scipy.stats as stats
import math

def scrape_temperatures(fromMonth, fromDay, fromYear, toMonth, toDay, toYear):
    # request website
    url = 'http://www.georgiaweather.net/index.php?variable=HI&site=WATHORT'
    action_url = 'index.php?variable=HI&site=WATHORT'
    # get source
    source = urllib2.urlopen(url)
    # submitting form
    print('Retrieving daily temperatures from %s-%s-%s to %s-%s-%s' %(fromMonth, fromDay, fromYear, toMonth, toDay, toYear))
    req = urllib2.Request(url,
                          data=urllib.urlencode({'fromMonth': fromMonth,
                                                 'toMonth': toMonth,
                                                 'fromDay': fromDay,
                                                 'toDay': toDay,
                                                 'fromYear': fromYear,
                                                 'toYear': toYear}),
                          headers={'User-Agent': 'Mozilla something',
                                   'Cookie': 'name=value; name2=value2'})
    response = urllib2.urlopen(req)
    # get new source
    soup = bs.BeautifulSoup(response, "lxml", from_encoding="utf-8")
    # retrieve table data
    td = []
    for table in soup.findAll('table', {'class': 'tableBackground', 'width': '90%'}):
        td = table.findAll('td', {'class': 'tdClass'})

    # Organize table data
    date = []
    max = []
    min = []
    rain = []
    temp_range = []
    # separates data by date, max temp, min temp, and rain
    for i in range(len(td) / 4):
        # unused attributes
        date.append(str(td[i * 4].text))
        max.append(float(td[(i * 4) + 1].text))
        min.append(float(td[(i * 4) + 2].text))
        rain.append(float(td[(i * 4) + 3].text))
        # value to return
        temp_range.append([max[i], min[i]])
    return temp_range

# retrieve daily temperatures
X = scrape_temperatures(fromMonth='January', fromDay='1', fromYear='2017', toMonth='January', toDay='31', toYear='2017')
Y = scrape_temperatures(fromMonth='January', fromDay='1', fromYear='2016', toMonth='January', toDay='31', toYear='2016')

# separate max & min values
xmax = (np.array(X)[:,0])
xmin = (np.array(X)[:,1])
ymax = (np.array(Y)[:,0])
ymin = (np.array(Y)[:,1])


def paired_ttest(X, Y, alph):
    # difference
    d = X - Y
    # mean of difference
    d_bar = np.mean(d)

    N = len(X)
    df = len(X)+len(Y) - 2

    # np.std divides the statistic by N - ddof
    s = np.std(d, ddof=2)

    # critical value: 1-alpha/2 because it is a two sided test
    cval = stats.t.ppf(1 - alph/2, df)
    print "critical value=", cval

    # t-statistic
    t = (d_bar - 0) / (s / math.sqrt(N))

    # p-value: multiplying by 2 because its a two tailed test
    pval = 2.0 * (1 - stats.t.cdf(t, df))
    print "pval=", pval

    # critical value and pvalue
    return [cval, pval]

alph = 0.05
# calculate critical and p values
highs = paired_ttest(xmax, ymax, alph)
lows = paired_ttest(xmin, ymin, alph)

# hypothesis testing using alpha and the calculated p-value
if highs[1] < alph:
    print 'Based on the p-value of the daily highs we Reject Null Hypothesis'
else:
    print 'Based on the p-value of the daily highs we Fail to Reject Null Hypothesis'
# hypothesis testing using alpha and the calculated p-value
if lows[1] < alph:
    print 'Based on the p-value of the daily lows we Reject Null Hypothesis'
else:
    print 'Based on the p-value of the daily lows we Fail to Reject Null Hypothesis'


Retrieving daily temperatures from January-1-2017 to January-31-2017
Retrieving daily temperatures from January-1-2016 to January-31-2016
critical value= 2.00171748301
pval= 0.000771604180284
critical value= 2.00171748301
pval= 0.00260974274885
Based on the p-value of the daily highs we Reject Null Hypothesis
Based on the p-value of the daily lows we Reject Null Hypothesis
