In [1]:
import os.path as op

thefiles = {'CC_allstations': 'climatecentral/allstations.csv', 
            'CC_daymet': 'climatecentral/climatecentraldaymet.csv',
           'CC_prism': 'climatecentral/climatecentralprism.csv',
            'CC_ccstationmethod': 'climatecentral/stationresults.csv',
           'Dabbage_allstations': 'Dabbage/dabbage_allstations.csv',
           'Dabbage_daymet': 'Dabbage/dabbage_daymet.csv',
           'Dabbage_prism': 'Dabbage/dabbage_prism.csv',
           'Dabbage_results': 'litresults/DabbageResults.csv',
           'CC_results': 'litresults/cc_results.csv'}

#urbanname,uhi

#placename, uhimin, uhimax, uhiavg

import csv

thedata = {}
for k, finname in thefiles.iteritems():
    thedata[k] = {}
    with open(finname, 'r') as fin:
        reader  = csv.DictReader(fin)
        for row in reader:
            thedata[k][row['placename']] = row



In [2]:
import numpy as np

from scipy import stats

from bokeh.io import gridplot, output_notebook, show, output_file
from bokeh.models import CustomJS,TapTool, ColumnDataSource
from bokeh.plotting import figure
output_notebook()

def clean_data(x,y):
    retx = []
    rety = []
    for tx, ty in zip(x, y):
        if not np.isnan(float(tx)) and not np.isnan(float(ty)):
            retx.append(float(tx))
            rety.append(float(ty))
    return retx, rety

def prepare_data(dataset1, dataset2, measure):
    dataset = []
    for d1placename, d1row in dataset1.iteritems():
        for d2placename, d2row in dataset2.iteritems():
            if d1placename.strip().lower().find(d2placename.strip().lower()) != -1 \
                    and d1row.get(measure, False) and d2row.get(measure, False):
                dataset.append([d1row[measure], d2row[measure]])
    
    npds = np.array(dataset).T
    return clean_data(npds[0], npds[1])
    

def create_scatter(dataset2, dataset1, measure, title="",
                  x_label="", y_label=""):
    x,y = prepare_data(dataset1, dataset2, measure)
      
    temps = figure(width=700, plot_height=700, title=title, 
               tools="pan,wheel_zoom,box_zoom,reset,tap,save",
                  x_axis_label=x_label,
                  y_axis_label=y_label,
                  x_range=(-10, 15),
                  y_range=(-10, 15))
    

    source = ColumnDataSource(data=dict(
    x=x,
    y=y
    ))

    

    slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
    print "Slope", slope, "Intercept", intercept, "RValue", r_value, "P value", p_value, "Standard Error", std_err
    print "r-squared:", r_value**2
    predict_y = intercept + slope * np.array(x)


    temps.circle('x', 'y', color='blue', size=5, source=source, alpha=1)
    temps.line(x, predict_y, line_width=2)
    show(temps)
                                
create_scatter(thedata['CC_results'], thedata['CC_allstations'], 'uhitmin', 'CC AllStations x CC Lit Results',
              'CC AllStations', 'CC Lit Results')

Slope 0.571751165577 Intercept 2.59382368102 RValue 0.549284459994 P value 3.61261150755e-05 Standard Error 0.125546894909
r-squared: 0.301713417991


In [3]:
create_scatter(thedata['CC_results'], thedata['CC_prism'], 'uhitmin', 'CC Prism x CC Lit Results',
              'CC Prism', 'CC Lit Results')

Slope -0.418218764725 Intercept 4.39394143968 RValue -0.133309338646 P value 0.361158228419 Standard Error 0.453524031257
r-squared: 0.0177713797701


In [4]:
create_scatter(thedata['CC_results'], thedata['CC_daymet'], 'uhitmin', 'CC Daymet x CC Lit Results',
              'CC Daymet', 'CC Lit Results')

Slope -0.117850047341 Intercept 4.12849772014 RValue -0.0348224929087 P value 0.810270525817 Standard Error 0.488186550183
r-squared: 0.00121260601238


In [5]:
create_scatter(thedata['CC_results'], thedata['CC_ccstationmethod'], 'uhitmin', 'CC Station Method x CC Lit Results',
              'CC Station Method', 'CC Lit Results')

Slope 0.316083680267 Intercept 3.24271653072 RValue 0.473521351964 P value 0.000774454105493 Standard Error 0.0876445362622
r-squared: 0.224222470766


In [6]:
create_scatter(thedata['Dabbage_results'], thedata['Dabbage_allstations'], 'uhitmin', 'Dabbage AllStations x Dabbage Results',
              'Dabbage AllStations', 'Dabbage Lit Results')

Slope 0.0971334532872 Intercept 0.209860880122 RValue 0.333161899204 P value 0.0290280897835 Standard Error 0.0429312191051
r-squared: 0.110996851081


In [7]:
create_scatter(thedata['Dabbage_results'], thedata['Dabbage_prism'], 'uhitmin', 'Dabbage Prism x Dabbage Results',
              'Dabbage Prism', 'Dabbage Lit Results')

Slope 0.0196512405879 Intercept 0.343890064188 RValue 0.106377936772 P value 0.502533395205 Standard Error 0.0290427080343
r-squared: 0.0113162654319


In [8]:
create_scatter(thedata['Dabbage_results'], thedata['Dabbage_daymet'], 'uhitmin', 'Dabbage DaymeD x Dabbage Results',
              'Dabbage Daymet', 'Dabbage Lit Results')

Slope 0.0194181711492 Intercept 0.349821729678 RValue 0.0985422095596 P value 0.529560956681 Standard Error 0.0306249347916
r-squared: 0.00971056706489


In [11]:
# stats.ttest_ind(rvs1, rvs3)
from IPython.display import display, HTML
import pandas as pd

# Assuming that dataframes df1 and df2 are already defined:
# print "Dataframe 1:"
# display(df1)
# print "Dataframe 2:"
# HTML(df2.to_html())


comparisonkeys = [['Dabbage_results', 'Dabbage_daymet'],
                 ['Dabbage_results', 'Dabbage_prism'],
                 ['Dabbage_results', 'Dabbage_allstations']]
results = [["Comparison", "T stat", "p-value"]]
results_related = [["Comparison", "T stat", "p-value"]]


for xkey, ykey in comparisonkeys:
    x,y = prepare_data(thedata[xkey], thedata[ykey], 'uhitmin')
    tempt, tempp = stats.ttest_ind(x, y, equal_var=False)
    results.append([xkey + "," + ykey, tempt, tempp])
    
    tempt, tempp = stats.ttest_rel(x,y)
    results_related.append([xkey + "," + ykey, tempt, tempp])

print display(pd.DataFrame(results))
print display(pd.DataFrame(results_related))


comparisonkeys2 = [['CC_results', 'CC_daymet'],
                 ['CC_results', 'CC_prism'],
                 ['CC_results', 'CC_allstations']]
results = [["Comparison", "T stat", "p-value"]]
results_related = [["Comparison", "T stat", "p-value"]]
for xkey, ykey in comparisonkeys2:
    x,y = prepare_data(thedata[xkey], thedata[ykey], 'uhitmin')
    tempt, tempp = stats.ttest_ind(x, y, equal_var=False)
    results.append([xkey + "," + ykey, tempt, tempp])
    
    tempt, tempp = stats.ttest_rel(x,y)
    results_related.append([xkey + "," + ykey, tempt, tempp])
    

print display(pd.DataFrame(results))
print display(pd.DataFrame(results_related))



Unnamed: 0,0,1,2
0,Comparison,T stat,p-value
1,"Dabbage_results,Dabbage_daymet",-0.223292,0.824289
2,"Dabbage_results,Dabbage_prism",-1.03811,0.304782
3,"Dabbage_results,Dabbage_allstations",-5.10105,5.18598e-06


None


Unnamed: 0,0,1,2
0,Comparison,T stat,p-value
1,"Dabbage_results,Dabbage_daymet",-0.22756,0.821066
2,"Dabbage_results,Dabbage_prism",-1.05822,0.296004
3,"Dabbage_results,Dabbage_allstations",-5.63599,1.23439e-06


None


Unnamed: 0,0,1,2
0,Comparison,T stat,p-value
1,"CC_results,CC_daymet",10.5556,4.39211e-15
2,"CC_results,CC_prism",9.58573,1.51372e-13
3,"CC_results,CC_allstations",3.29791,0.00135883


None


Unnamed: 0,0,1,2
0,Comparison,T stat,p-value
1,"CC_results,CC_daymet",10.4592,4.44047e-14
2,"CC_results,CC_prism",9.2316,3.24513e-12
3,"CC_results,CC_allstations",4.87963,1.1675e-05


None


In [10]:
def create_histogram(dataset1, dataset2, measure, title=""):
    x,y = prepare_data(dataset1, dataset2, measure)
    diffxy = np.array(x) - np.array(y)
    

    p1 = figure(title=title,
                background_fill_color="#E8DDCB")

    hist, edges = np.histogram(diffxy, density=False, bins=len(diffxy))


    p1.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
            fill_color="#036564", line_color="#033649")



    p1.xaxis.axis_label = 'difference'
    show(p1)

    
comparisonkeys = [['Dabbage_results', 'Dabbage_daymet'],
                 ['Dabbage_results', 'Dabbage_prism'],
                 ['Dabbage_results', 'Dabbage_allstations']]
for xkey, ykey in comparisonkeys:
    create_histogram(thedata[xkey], thedata[ykey], 'uhitmin', title=xkey + ", " + ykey)
    
    
comparisonkeys2 = [['CC_results', 'CC_daymet'],
                 ['CC_results', 'CC_prism'],
                 ['CC_results', 'CC_allstations'],
                  ['CC_results', 'CC_ccstationmethod']]
for xkey, ykey in comparisonkeys2:
    create_histogram(thedata[xkey], thedata[ykey], 'uhitmin', title=xkey + ", " + ykey)
    
