<h1 id="tocheading">Table of Contents and Notebook Setup</h1>
<div id="toc"></div>

In [13]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import scipy as sp
from scipy import stats
from datetime import timedelta, datetime

# Importing Data

## Importing Hourly Resolution Data

### Obtaining Longitudes and Latitudes from All Stations

We start by reading the first two lines of the data file to obtain the longitude and latitude information of the stations. We zip them together into a list of tuples, and add an element to the beginning of this list corresponding to the time.

In [15]:
lon_and_lat = np.genfromtxt('AllStations_temperature_h_2017.dat', delimiter=" ", max_rows=2)
longitudes = lon_and_lat[0][~np.isnan(lon_and_lat[0])]
latitudes = lon_and_lat[1][~np.isnan(lon_and_lat[1])]

indices = list(zip(longitudes, latitudes))
indices.insert(0, "time")

### Reading the Temperature Data

We now read in the temperature data, skipping the first two rows that contain the longitude and latitude information of the stations. The column names of the dataframe are the longitude and latitude tuples created above, and the index column is the timestamps.

In [16]:
df_ = pd.read_csv('AllStations_temperature_h_2017.dat', sep='\s+', skiprows=[0,1], names=indices, index_col = 'time')
df_.head()

Unnamed: 0_level_0,236.554,236.499,236.679,236.607,236.514,236.630,236.523,236.543,236.662,236.304,...,236.641,236.619,236.514,236.357,236.574,236.596,236.676,236.691,236.689,236.451
Unnamed: 0_level_1,48.5745,48.5376,48.4655,48.4608,48.4356,48.5273,48.4529,48.6804,48.4562,48.3891,...,48.4683,48.4744,48.4205,48.6568,48.4533,48.6529,48.4865,48.4623,48.4359,48.4572
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
733408.0,3.25,1.43,4.21,4.62,3.45,4.44,2.72,3.97,4.68,5.12,...,4.29,4.1,3.01,,4.0,4.81,4.58,,4.87,4.07
733408.0417,3.06,1.38,4.28,4.77,3.46,4.21,2.64,4.15,4.61,4.95,...,4.41,4.3,2.87,,4.15,4.6,4.44,,4.85,4.05
733408.0833,1.9,1.18,4.12,4.7,3.46,4.07,2.64,4.18,4.62,5.11,...,4.24,4.23,2.97,,4.06,4.65,4.47,,4.91,4.02
733408.125,1.69,0.98,4.2,4.77,3.65,3.67,2.81,4.27,4.71,5.33,...,4.36,4.3,3.1,,4.12,4.75,4.53,,4.97,4.06
733408.1667,2.09,0.93,4.61,4.99,3.9,3.55,3.11,4.29,4.93,5.56,...,4.58,4.53,3.37,,4.25,4.97,4.68,,5.17,4.27


### Changing the Datetimes into Appropriate Timestamps

The Datetimes used in this file are MATLAB datetimes. We use a function to convert each time in the time column to the appropriate timestamp and then change the index column into timestamps.

In [17]:
def matlab_to_python_datetime(matlab_datenum):
    return datetime.fromordinal(int(matlab_datenum)) + timedelta(days=matlab_datenum%1) - timedelta(days = 366)

df_.index = [matlab_to_python_datetime(i) for i in df_.index]
df_.head()

Unnamed: 0_level_0,236.554,236.499,236.679,236.607,236.514,236.630,236.523,236.543,236.662,236.304,...,236.641,236.619,236.514,236.357,236.574,236.596,236.676,236.691,236.689,236.451
Unnamed: 0_level_1,48.5745,48.5376,48.4655,48.4608,48.4356,48.5273,48.4529,48.6804,48.4562,48.3891,...,48.4683,48.4744,48.4205,48.6568,48.4533,48.6529,48.4865,48.4623,48.4359,48.4572
2008-01-01 00:00:00.000000,3.25,1.43,4.21,4.62,3.45,4.44,2.72,3.97,4.68,5.12,...,4.29,4.1,3.01,,4.0,4.81,4.58,,4.87,4.07
2008-01-01 01:00:02.879995,3.06,1.38,4.28,4.77,3.46,4.21,2.64,4.15,4.61,4.95,...,4.41,4.3,2.87,,4.15,4.6,4.44,,4.85,4.05
2008-01-01 01:59:57.120005,1.9,1.18,4.12,4.7,3.46,4.07,2.64,4.18,4.62,5.11,...,4.24,4.23,2.97,,4.06,4.65,4.47,,4.91,4.02
2008-01-01 03:00:00.000000,1.69,0.98,4.2,4.77,3.65,3.67,2.81,4.27,4.71,5.33,...,4.36,4.3,3.1,,4.12,4.75,4.53,,4.97,4.06
2008-01-01 04:00:02.879995,2.09,0.93,4.61,4.99,3.9,3.55,3.11,4.29,4.93,5.56,...,4.58,4.53,3.37,,4.25,4.97,4.68,,5.17,4.27


### Finding the Required Station

These are the coordinates where we wish to measure the temperature. We look for the station nearest to this point.

In [18]:
station_lon = 236.691
station_lat =  48.462

In [19]:
lon_diff = abs(longitudes - station_lon)
lat_diff = abs(latitudes - station_lat)

station_num = list((lon_diff+lat_diff)).index(min(lon_diff+lat_diff))

station_num provides the column number of the data we wish to observe.

### Obtain August 7th 2015/2017 Data

In [20]:
df = df_.iloc[:,station_num]

def get_winter_data(year):
    winter_start = pd.Timestamp(year=year-1, month=12, day=1, hour=0)
    winter_end = pd.Timestamp(year=year, month=3, day=1, hour=0)
    return df[(df.index>winter_start) & (df.index<winter_end)]

def get_summer_data(year):
    summer_start = pd.Timestamp(year=year, month=6, day=1, hour=0)
    summer_end = pd.Timestamp(year=year, month=9, day=1, hour=0)
    return df[(df.index>summer_start) & (df.index<summer_end)]

sum17 = get_summer_data(2017)
sum16 = get_summer_data(2016)
sum15 = get_summer_data(2015)
sum14 = get_summer_data(2014)
sum13 = get_summer_data(2013)
sum12 = get_summer_data(2012)
sum11 = get_summer_data(2011)
sum10 = get_summer_data(2010)

sums = np.array([sum10, sum11, sum12, sum13, sum14, sum15, sum16, sum17])

win17 = get_winter_data(2017)
win16 = get_winter_data(2016)
win15 = get_winter_data(2015)
win14 = get_winter_data(2014)
win13 = get_winter_data(2013)
win12 = get_winter_data(2012)
win11 = get_winter_data(2011)
win10 = get_winter_data(2010)

wins = np.array([win10, win11, win12, win13, win14, win15, win16, win17])

In [23]:
win10.std()

2.7091746700505577

### 95% Confidence Calculations

<b> Theorem: </b>

A one sided confidence interval that gives a lower bound for $\mu$ is given by

$$(\bar{X}-t_{\alpha,n-1}\frac{s}{\sqrt{n}}, \infty) $$

A one sided confidence interval that gives a upper bound for $\mu$ is given by

$$(\bar{X}+t_{\alpha,n-1}\frac{s}{\sqrt{n}}, \infty) $$

<b> Application to our Problem: </b>

We want to know if August 7th was warmer in 2017 than in 2015. We will thus calculate two confidence intervals

$$(-\infty \hspace{3mm} , \hspace{3mm} \text{Aug 2015 Upper}) \hspace{25mm}(\text{Aug 2017 Lower} \hspace{3mm} , \hspace{3mm} \infty) $$
    
Each interval will be calculated at $\sqrt{0.95}$ certainty. The probability that the true mean for each lies in their respective interval is

$$\sqrt{0.95}\sqrt{0.95}=0.95$$

This follows from $P(A \cap B)=P(A)P(B)$. If the two intervals don't overlap then we can claim with $95\%$ certainty that August 7th was warmer in 2017 than in 2015.

In [82]:
def find_bound(data, alpha, lim):
    u= data.mean()
    s = data.std()
    N = len(data)
    t = sp.stats.t.ppf(1-alpha, N-1)

    if lim=='lwr':
        return u-t*s/(np.sqrt(N))
    if lim=='upr':
        return u+t*s/(np.sqrt(N))   

In [83]:
def find_overlap_alpha(data1, data2):
    mean1 = data1.mean()
    mean2 = data2.mean()
    
    if (mean1>mean2):
        data_upr = data1
        data_lwr = data2
    else:
        data_upr = data2
        data_lwr = data1
        
    for alpha in np.arange(0, 1, 0.0005):
        bound_upr = find_bound(data_upr, alpha, 'lwr')
        bound_lwr = find_bound(data_lwr, alpha, 'upr')
        if (bound_upr>bound_lwr):
            break
    return alpha

In [84]:
find_overlap_alpha(win13, win11)

0.0115

In [85]:
def get_confidence_matrix(wins):
    a = np.empty([len(wins), len(wins)])
    for i, win_1 in enumerate(wins):
        for j, win_2 in enumerate(wins):
            a[i][j] = ((1-find_overlap_alpha(win_1, win_2))**2)*100
    return a

In [86]:
a = get_confidence_matrix(wins)
print(a)

[[24.950025 99.900025 99.900025 99.900025 99.900025 99.900025 99.900025
  99.900025]
 [99.900025 24.950025 65.044225 97.713225 99.900025 99.900025 99.900025
  99.900025]
 [99.900025 65.044225 24.950025 84.4561   99.900025 99.900025 99.900025
  99.900025]
 [99.900025 97.713225 84.4561   24.950025 99.900025 99.900025 99.900025
  99.900025]
 [99.900025 99.900025 99.900025 99.900025 24.950025 99.900025 99.900025
  99.900025]
 [99.900025 99.900025 99.900025 99.900025 99.900025 24.950025 99.900025
  99.900025]
 [99.900025 99.900025 99.900025 99.900025 99.900025 99.900025 24.950025
  99.900025]
 [99.900025 99.900025 99.900025 99.900025 99.900025 99.900025 99.900025
  24.950025]]


In [87]:
a = get_confidence_matrix(sums)
print(a)

[[24.950025 65.367225 99.900025 99.900025 99.900025 99.900025 99.900025
  99.900025]
 [65.367225 24.950025 99.900025 99.900025 99.900025 99.900025 99.900025
  99.900025]
 [99.900025 99.900025 24.950025 99.900025 99.900025 99.900025 99.900025
  99.900025]
 [99.900025 99.900025 99.900025 24.950025 99.900025 99.900025 99.700225
  99.900025]
 [99.900025 99.900025 99.900025 99.900025 24.950025 99.900025 44.957025
  49.5616  ]
 [99.900025 99.900025 99.900025 99.900025 99.900025 24.950025 99.900025
  99.900025]
 [99.900025 99.900025 99.900025 99.700225 44.957025 99.900025 24.950025
  69.305625]
 [99.900025 99.900025 99.900025 99.900025 49.5616   99.900025 69.305625
  24.950025]]


In [88]:
def hot_cold_matrix(wins):
    a = np.empty([len(wins), len(wins)])
    for i, win_1 in enumerate(wins):
        for j, win_2 in enumerate(wins):
            if (win_1.mean() == win_2.mean()):
                a[i][j] = 0
            if (win_1.mean() > win_2.mean()):
                a[i][j] = 1
            if (win_1.mean() < win_2.mean()):
                a[i][j] = -1
    return a

In [92]:
print(hot_cold_matrix(sums))

[[ 0.  1.  1. -1. -1. -1. -1. -1.]
 [-1.  0.  1. -1. -1. -1. -1. -1.]
 [-1. -1.  0. -1. -1. -1. -1. -1.]
 [ 1.  1.  1.  0. -1. -1. -1. -1.]
 [ 1.  1.  1.  1.  0. -1.  1. -1.]
 [ 1.  1.  1.  1.  1.  0.  1.  1.]
 [ 1.  1.  1.  1. -1. -1.  0. -1.]
 [ 1.  1.  1.  1.  1. -1.  1.  0.]]


In [104]:
win12.std()

2.46477442758805

In [106]:
win11.std()

2.865750274979971