# Temperature Analysis I

In [217]:
import pandas as pd
from datetime import datetime as dt

In [218]:
# "tobs" is "temperature observations"
df = pd.read_csv('Resources/hawaii_measurements.csv')
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [219]:
# Convert the date column format from string to datetime
df['date']=pd.to_datetime(df['date'])

In [220]:
# Set the date column as the DataFrame index
df2 = df.set_index('date')
df2

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.00,63
2010-01-03,USC00519397,0.00,74
2010-01-04,USC00519397,0.00,76
2010-01-06,USC00519397,,73
...,...,...,...
2017-08-19,USC00516128,0.09,71
2017-08-20,USC00516128,,78
2017-08-21,USC00516128,0.56,76
2017-08-22,USC00516128,0.50,76


### Compare June and December data across all years 

In [221]:
from scipy import stats

In [222]:
# Filter data for desired months
june_df = df[df['date'].dt.month==6]
dec_df = df[df['date'].dt.month==12]

In [223]:
# Identify the average temperature for June
june_df.tobs.mean()

74.94411764705882

In [224]:
# Identify the average temperature for December
dec_df.tobs.mean()

71.04152933421226

In [225]:
# Create collections of temperature data

# Need to compare same sized data sets for paired t-test
june_df = june_df.sort_values('date')
dec_df = dec_df.sort_values('date')
june_df = june_df.reset_index(drop=True)
june_df = june_df[june_df['date'] < '2016-12-31'] # limit dataframe to 2016 as there is no data for Dec 2016
dec_df = dec_df.reset_index(drop=True)
june_df2 = june_df[['date','tobs']]
dec_df2 = dec_df[['date','tobs']]
june_dict = june_df2.to_dict()
dec_dict = dec_df2.to_dict()

In [226]:
# December loop (31 days)
# June only has 30 days, so we'll need to match up these two data sets for paired t-test

sum_of_tobs = 0
count_of_tobs = 0
prev_day = 1
date_list = []
day_avg = []

for x in range(len(dec_dict['date'])):
    if(dec_dict['date'][x].day < 31): # if day is less than 31, then do this
        if(dec_dict['date'][x].day == prev_day): # if day number equal to previous day then do this
            prev_day = dec_dict['date'][x].day
            sum_of_tobs = sum_of_tobs + dec_dict['tobs'][x]
            count_of_tobs += 1
        else: # take average of the tob values and append to the day_avg list
            avg = sum_of_tobs / count_of_tobs
            date_list.append(dec_dict['date'][x-1])
            day_avg.append(avg)
            count_of_tobs = 0 # reset count
            sum_of_tobs = 0 # reset sum
            prev_day = dec_dict['date'][x].day
    else:
        if prev_day == 30:
            prev_day = 1
            avg = sum_of_tobs / count_of_tobs
            date_list.append(dec_dict['date'][x-1])
            day_avg.append(avg)
            count_of_tobs = 0 # reset count
            sum_of_tobs = 0 # reset sum
            prev_day = 1
        else:
            continue

In [227]:
# Length of June and December datasets need to match for paired t-test
print(f'Length of December list before removing day 30 of the last year: {len(day_avg)}')
day_avg.pop() # need to remove the last 30 day data to match up with the last June 29 dataframe size
dec_tobs_list = day_avg
print(f'Length of December list after removing day 30 of the last year: {len(dec_tobs_list)}')
ttest_dec_df = pd.DataFrame(list(zip(date_list,day_avg)),columns=['Date','Avg_TOB'])
# dataframe.to_csv('test.csv')
ttest_dec_df

Length of December list before removing day 30 of the last year: 210
Length of December list after removing day 30 of the last year: 209


Unnamed: 0,Date,Avg_TOB
0,2010-12-01,73.125000
1,2010-12-02,72.714286
2,2010-12-03,72.571429
3,2010-12-04,74.833333
4,2010-12-05,72.166667
...,...,...
204,2016-12-25,73.750000
205,2016-12-26,73.600000
206,2016-12-27,72.800000
207,2016-12-28,71.666667


In [228]:
# June loop (30 days)
# December has 31 days, so we'll need to match up these two data sets for paired t-test

sum_of_tobs = 0
count_of_tobs = 0
prev_day = 1
date_list = []
day_avg = []

for x in range(len(june_dict['date'])):
    if(june_dict['date'][x].day < 31): # if day is less than 31, then do this
        if(june_dict['date'][x].day == prev_day): # if day number equal to previous day then do this
            prev_day = june_dict['date'][x].day
            sum_of_tobs = sum_of_tobs + june_dict['tobs'][x]
            count_of_tobs += 1
        else: # take average of the tob values and append to the day_avg list
            avg = sum_of_tobs / count_of_tobs
            date_list.append(june_dict['date'][x-1])
            day_avg.append(avg)
            count_of_tobs = 0 # reset count
            sum_of_tobs = 0 # reset sum
            prev_day = june_dict['date'][x].day
    else:
        continue

In [229]:
june_tobs_list = day_avg
print(f'The last day in the June dataset is 29, while the last day in the December data is 30.')
print('The last day 30 in the December data set has been removed for paired t-test.')
print(f'Length of the June tobs data: {len(june_tobs_list)}')
ttest_june_df = pd.DataFrame(list(zip(date_list,day_avg)),columns=['Date','Avg_TOB'])
ttest_june_df

The last day in the June dataset is 29, while the last day in the December data is 30.
The last day 30 in the December data set has been removed for paired t-test.
Length of the June tobs data: 209


Unnamed: 0,Date,Avg_TOB
0,2010-06-01,73.555556
1,2010-06-02,74.125000
2,2010-06-03,74.500000
3,2010-06-04,72.875000
4,2010-06-05,76.428571
...,...,...
204,2016-06-25,75.200000
205,2016-06-26,77.800000
206,2016-06-27,76.333333
207,2016-06-28,75.166667


In [230]:
# Run paired t-test
stats.ttest_rel(june_tobs_list, dec_tobs_list)

Ttest_relResult(statistic=16.945931794720433, pvalue=4.828803813837092e-41)

### Analysis

Based on the data analysis showing a p-value of less than 0.05, we can say that the difference in temperatures between June and December for Hawaii is statistically insignificant.

We use the paired t-test in this case because we are looking at the same place, just at different times of the year.  We would use an unpaired t-test when comparing two different places which would constitute two separate groups.