# Bonus: Temperature Analysis I

In [1]:
import pandas as pd
from datetime import datetime as dt

In [2]:
path_csv = "./Resources/hawaii_measurements.csv"

In [3]:
# "tobs" is "temperature observations"
df = pd.read_csv(path_csv)
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [4]:
# Convert the date column format from string to datetime
df['date'] = pd.to_datetime(df['date'],infer_datetime_format=True)

In [5]:
df['yyyy'] = pd.to_datetime(df['date']).dt.year
df['mm'] = pd.to_datetime(df['date']).dt.month

In [6]:
df.head()

Unnamed: 0,station,date,prcp,tobs,yyyy,mm
0,USC00519397,2010-01-01,0.08,65,2010,1
1,USC00519397,2010-01-02,0.0,63,2010,1
2,USC00519397,2010-01-03,0.0,74,2010,1
3,USC00519397,2010-01-04,0.0,76,2010,1
4,USC00519397,2010-01-06,,73,2010,1


In [7]:
# Set the date column as the DataFrame index
df.set_index('date').sort_values('date', inplace = True)
df.head()

Unnamed: 0,station,date,prcp,tobs,yyyy,mm
0,USC00519397,2010-01-01,0.08,65,2010,1
1,USC00519397,2010-01-02,0.0,63,2010,1
2,USC00519397,2010-01-03,0.0,74,2010,1
3,USC00519397,2010-01-04,0.0,76,2010,1
4,USC00519397,2010-01-06,,73,2010,1


In [8]:
# Drop the date column


### Compare June and December data across all years 

In [20]:
from scipy.stats import ttest_ind

In [21]:
# Filter data for desired months
monthly_meanddf = df.groupby('mm', as_index = False).mean('tobs')

monthly_meanddf.head(12)

Unnamed: 0,mm,prcp,tobs,yyyy
0,1,0.129975,68.726115,2013.382166
1,2,0.137271,69.442236,2013.409938
2,3,0.204227,70.059067,2013.35381
3,4,0.150639,72.357268,2013.326912
4,5,0.145677,73.6809,2013.273514
5,6,0.13636,74.944118,2013.338824
6,7,0.167922,76.082408,2013.319696
7,8,0.146662,76.412454,2013.101726
8,9,0.164249,76.164865,2012.874324
9,10,0.155606,75.391388,2012.892031


In [22]:
monthly_meanddf.drop(columns=['yyyy'], inplace = True)

In [23]:
monthly_meanddf.head(12)

Unnamed: 0,mm,prcp,tobs
0,1,0.129975,68.726115
1,2,0.137271,69.442236
2,3,0.204227,70.059067
3,4,0.150639,72.357268
4,5,0.145677,73.6809
5,6,0.13636,74.944118
6,7,0.167922,76.082408
7,8,0.146662,76.412454
8,9,0.164249,76.164865
9,10,0.155606,75.391388


In [24]:
monthly_meanddf.index

RangeIndex(start=0, stop=12, step=1)

In [25]:
# Identify the average temperature for June
print(f"The average temperature for june is, {monthly_meanddf.iloc[5,2]}")

The average temperature for june is, 74.94411764705882


In [26]:
# Identify the average temperature for December
print(f"The average temperature for december is {monthly_meanddf.iloc[11,2]}")

The average temperature for december is 71.04152933421226


In [27]:
# Create collections of temperature data
monthly_meanddf.head(12)

Unnamed: 0,mm,prcp,tobs
0,1,0.129975,68.726115
1,2,0.137271,69.442236
2,3,0.204227,70.059067
3,4,0.150639,72.357268
4,5,0.145677,73.6809
5,6,0.13636,74.944118
6,7,0.167922,76.082408
7,8,0.146662,76.412454
8,9,0.164249,76.164865
9,10,0.155606,75.391388


In [None]:
print(monthly_meanddf.iloc[5,2])

### Comparing june and december data

In [33]:
monthly_station_average = df.groupby(['station', 'mm'], as_index = False).mean('tobs')
monthly_station_average.head()

Unnamed: 0,station,mm,prcp,tobs,yyyy
0,USC00511918,1,0.070391,66.854749,2012.536313
1,USC00511918,2,0.043836,67.271605,2012.450617
2,USC00511918,3,0.079817,68.565476,2012.309524
3,USC00511918,4,0.03284,70.792899,2012.343195
4,USC00511918,5,0.041617,72.22093,2012.337209


In [34]:
june_station_mean = monthly_station_average.query(("mm == 6")).reset_index()
june_station_mean.head(10)

Unnamed: 0,index,station,mm,prcp,tobs,yyyy
0,5,USC00511918,6,0.015157,74.139394,2012.333333
1,17,USC00513117,6,0.118248,74.050847,2013.474576
2,29,USC00514830,6,0.114192,76.005376,2013.607527
3,41,USC00516128,6,0.495748,71.93722,2013.565022
4,53,USC00517948,6,0.057975,76.655405,2013.418919
5,65,USC00518838,6,0.094615,73.394737,2010.526316
6,77,USC00519281,6,0.151525,73.271186,2013.542373
7,89,USC00519397,6,0.022661,77.559322,2013.542373
8,101,USC00519523,6,0.050044,76.668103,2013.478448


In [35]:
dec_station_mean = monthly_station_average.query(("mm == 12")).reset_index()
dec_station_mean.head(10)

Unnamed: 0,index,station,mm,prcp,tobs,yyyy
0,11,USC00511918,12,0.138146,69.684211,2011.980263
1,23,USC00513117,12,0.203241,71.069444,2012.990741
2,35,USC00514830,12,0.154966,73.224719,2012.960674
3,47,USC00516128,12,0.507005,69.291262,2012.946602
4,59,USC00517948,12,0.152727,71.834862,2013.311927
5,71,USC00518838,12,0.638182,72.421053,2010.842105
6,83,USC00519281,12,0.244931,69.903226,2013.0
7,95,USC00519397,12,0.075314,71.109524,2013.090476
8,107,USC00519523,12,0.16201,72.433333,2012.928571


In [46]:
# June yearly mean values
june_yearly_mean_values = june_station_mean.iloc[:,4]

In [47]:
print(june_yearly_mean_values)

0    74.139394
1    74.050847
2    76.005376
3    71.937220
4    76.655405
5    73.394737
6    73.271186
7    77.559322
8    76.668103
Name: tobs, dtype: float64


In [48]:
# December yearly mean values
dec_yearly_mean_values = dec_station_mean.iloc[:,4]
print(dec_yearly_mean_values )

0    69.684211
1    71.069444
2    73.224719
3    69.291262
4    71.834862
5    72.421053
6    69.903226
7    71.109524
8    72.433333
Name: tobs, dtype: float64


In [49]:
# Run paired t-test
ttest_ind(dec_yearly_mean_values, june_yearly_mean_values,  equal_var=False)



Ttest_indResult(statistic=-4.615865424404701, pvalue=0.0003657335214469917)

## Conclusions

According to the student, t-test differences between two datasets is statistically significant if the pvalue is less than 0.05. 
The pvalue from the December and June t-test in this anlaysis is 0.0003657 less than the critical value of 0.05. This suggests that there is a significant statistical difference between the december and June temperatures.