# Bonus: Temperature Analysis I

## Analysis
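###### Since the p-value (about 6.6e-178) is far below 0.05, we reject the null hypothesis that June and December have the same mean temperature, i.e. that the observed difference is due to random chance alone.
###### Rejecting the null hypothesis supports (but does not prove) the alternative hypothesis: the difference between the June mean (about 74.9) and the December mean (about 70.9) is statistically significant.
##### I used an unpaired (independent two-sample) t-test because the June and December observations are separate samples of different sizes (1,574 vs. 1,405 readings) and cannot be matched into pairs.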

In [18]:
import pandas as pd
from datetime import datetime as dt

In [44]:
# "tobs" is "temperature observations"
measurements_csv = 'Resources/hawaii_measurements.csv'
df = pd.read_csv(measurements_csv)
# Preview the first rows to check that the columns loaded as expected
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [38]:
# Inspect the dtype of every column
df.dtypes

station            object
date       datetime64[ns]
prcp              float64
tobs                int64
dtype: object

In [39]:
# Convert the date column from strings to pandas datetime values
df['date'] = pd.to_datetime(df['date'])

In [40]:
# Check the dtype of the date column
df['date'].dtype

dtype('<M8[ns]')

In [45]:
# Display the DataFrame as loaded. The date column is not made the index here;
# that happens in a later cell, right before the rows are filtered by month.
df

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.00,63
2,USC00519397,2010-01-03,0.00,74
3,USC00519397,2010-01-04,0.00,76
4,USC00519397,2010-01-06,,73
...,...,...,...,...
19545,USC00516128,2017-08-19,0.09,71
19546,USC00516128,2017-08-20,,78
19547,USC00516128,2017-08-21,0.56,76
19548,USC00516128,2017-08-22,0.50,76


In [16]:
# Drop the date column (disabled: the date is still needed for the month filtering below)
#df.reset_index(drop=True, inplace=True)


Unnamed: 0,station,prcp,tobs
0,USC00519397,0.08,65
1,USC00519397,0.00,63
2,USC00519397,0.00,74
3,USC00519397,0.00,76
4,USC00519397,,73
...,...,...,...
19545,USC00516128,0.09,71
19546,USC00516128,,78
19547,USC00516128,0.56,76
19548,USC00516128,0.50,76


In [46]:
# Drop every row that has a missing value in any column.
# NOTE(review): how='any' also discards rows where only prcp is missing even
# though tobs is present, so those temperature readings are excluded from the
# analysis below; consider dropna(subset=['tobs']) — TODO confirm intent.
df = df.dropna(how='any')
df

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.00,63
2,USC00519397,2010-01-03,0.00,74
3,USC00519397,2010-01-04,0.00,76
5,USC00519397,2010-01-07,0.06,70
...,...,...,...,...
19543,USC00516128,2017-08-17,0.13,72
19545,USC00516128,2017-08-19,0.09,71
19547,USC00516128,2017-08-21,0.56,76
19548,USC00516128,2017-08-22,0.50,76


### Compare June and December data across all years 

In [56]:
from scipy import stats

In [47]:
# Index the rows by date so they can be selected by month; the date column itself is kept
date_index = pd.DatetimeIndex(df['date'])
df = df.set_index(date_index)
df

Unnamed: 0_level_0,station,date,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,USC00519397,2010-01-01,0.08,65
2010-01-02,USC00519397,2010-01-02,0.00,63
2010-01-03,USC00519397,2010-01-03,0.00,74
2010-01-04,USC00519397,2010-01-04,0.00,76
2010-01-07,USC00519397,2010-01-07,0.06,70
...,...,...,...,...
2017-08-17,USC00516128,2017-08-17,0.13,72
2017-08-19,USC00516128,2017-08-19,0.09,71
2017-08-21,USC00516128,2017-08-21,0.56,76
2017-08-22,USC00516128,2017-08-22,0.50,76


In [35]:
# Keep only the rows whose date index falls in June (month 6), across all years
is_june = df.index.month == 6
junedf = df.loc[is_june]
junedf

Unnamed: 0_level_0,station,date,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-06-01,USC00519397,2010-06-01,0.00,78
2010-06-02,USC00519397,2010-06-02,0.01,76
2010-06-03,USC00519397,2010-06-03,0.00,78
2010-06-04,USC00519397,2010-06-04,0.00,76
2010-06-05,USC00519397,2010-06-05,0.00,77
...,...,...,...,...
2017-06-26,USC00516128,2017-06-26,0.02,79
2017-06-27,USC00516128,2017-06-27,0.10,74
2017-06-28,USC00516128,2017-06-28,0.02,74
2017-06-29,USC00516128,2017-06-29,0.04,76


In [36]:
# Mean of the June temperature observations (tobs)
june_tobs = junedf["tobs"]
avgtemp_june = june_tobs.mean()
avgtemp_june

74.88754764930114

In [49]:
# Keep only the rows whose date index falls in December (month 12), across all years
is_december = df.index.month == 12
decdf = df.loc[is_december]
decdf

Unnamed: 0_level_0,station,date,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-12-01,USC00519397,2010-12-01,0.04,76
2010-12-03,USC00519397,2010-12-03,0.00,74
2010-12-04,USC00519397,2010-12-04,0.00,74
2010-12-06,USC00519397,2010-12-06,0.00,64
2010-12-07,USC00519397,2010-12-07,0.00,64
...,...,...,...,...
2016-12-27,USC00516128,2016-12-27,0.14,71
2016-12-28,USC00516128,2016-12-28,0.14,71
2016-12-29,USC00516128,2016-12-29,1.03,69
2016-12-30,USC00516128,2016-12-30,2.37,65


In [50]:
# Mean of the December temperature observations (tobs)
dec_tobs = decdf["tobs"]
avgtemp_dec = dec_tobs.mean()
avgtemp_dec

70.93024911032029

In [53]:
# Collect the June temperature observations into a plain Python list
june_tobs_column = junedf['tobs']
june_temp = june_tobs_column.tolist()
len(june_temp)

1574

In [55]:
# Collect the December temperature observations into a plain Python list
dec_tobs_column = decdf['tobs']
dec_temp = dec_tobs_column.tolist()
len(dec_temp)

1405

In [57]:
# Unpaired two-sample t-test of June vs. December temperatures;
# equal_var=False selects Welch's t-test, which does not assume equal variances
welch_result = stats.ttest_ind(june_temp, dec_temp, equal_var=False)
welch_result

Ttest_indResult(statistic=30.624201480767336, pvalue=6.622829250184814e-178)

### Analysis

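###### Since the p-value (about 6.6e-178) is far below 0.05, we reject the null hypothesis that June and December have the same mean temperature, i.e. that the observed difference is due to random chance alone.
###### Rejecting the null hypothesis supports (but does not prove) the alternative hypothesis: the difference between the June mean (about 74.9) and the December mean (about 70.9) is statistically significant.
##### I used an unpaired (independent two-sample) t-test because the June and December observations are separate samples of different sizes (1,574 vs. 1,405 readings) and cannot be matched into pairs.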