# Bonus: Temperature Analysis I

In [1]:
import pandas as pd
from datetime import datetime as dt
from scipy import stats

In [2]:
# "tobs" is "temperature observations"
df = pd.read_csv('Resources/hawaii_measurements.csv')
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [3]:
df.dtypes

station     object
date        object
prcp       float64
tobs         int64
dtype: object

In [4]:
# Convert the date column format from string to datetime
df['date'] = pd.to_datetime(df['date'])
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [5]:
# Set the date column as the DataFrame index
df_date_indexed = df.set_index("date")
df_date_indexed.head()

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.0,63
2010-01-03,USC00519397,0.0,74
2010-01-04,USC00519397,0.0,76
2010-01-06,USC00519397,,73


In [6]:
# Drop the date column
df_no_date = df_date_indexed.reset_index(drop=True)
df_no_date.head()

Unnamed: 0,station,prcp,tobs
0,USC00519397,0.08,65
1,USC00519397,0.0,63
2,USC00519397,0.0,74
3,USC00519397,0.0,76
4,USC00519397,,73


### Compare June and December data across all years 

In [7]:
# Filter data for desired months - June & December
filtered_df = df.loc[(pd.DatetimeIndex(df['date']).month == 6) | (pd.DatetimeIndex(df['date']).month == 12)]
filtered_df['month'] = pd.DatetimeIndex(filtered_df['date']).month
filtered_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['month'] = pd.DatetimeIndex(filtered_df['date']).month


Unnamed: 0,station,date,prcp,tobs,month
133,USC00519397,2010-06-01,0.0,78,6
134,USC00519397,2010-06-02,0.01,76,6
135,USC00519397,2010-06-03,0.0,78,6
136,USC00519397,2010-06-04,0.0,76,6
137,USC00519397,2010-06-05,0.0,77,6


In [8]:
filtered_df.value_counts('month')

month
6     1700
12    1517
dtype: int64

In [9]:
# Identify the average temperature for June
filtered_df.loc[filtered_df['month'] == 6].groupby('month').agg(avg_tmp = ('tobs', 'mean'))

Unnamed: 0_level_0,avg_tmp
month,Unnamed: 1_level_1
6,74.944118


In [10]:
# Identify the average temperature for December
filtered_df.loc[filtered_df['month'] == 12].groupby('month').agg(avg_tmp = ('tobs', 'mean'))

Unnamed: 0_level_0,avg_tmp
month,Unnamed: 1_level_1
12,71.041529


In [11]:
# Create collections of temperature data
june_df = filtered_df[['date', 'tobs']].loc[filtered_df['month'] == 6].set_index('date')
june_df.head()
dec_df = filtered_df[['date', 'tobs']].loc[filtered_df['month'] == 12].set_index('date')
dec_df.head()

Unnamed: 0_level_0,tobs
date,Unnamed: 1_level_1
2010-12-01,76
2010-12-03,74
2010-12-04,74
2010-12-06,64
2010-12-07,64


### Null hypothesis: The mean difference between the temperatures in June and December is zero.

In [12]:
# Run unpaired t-test
stats.ttest_ind(june_df.tobs,dec_df.tobs)

Ttest_indResult(statistic=31.60372399000329, pvalue=3.9025129038616655e-191)

### Analysis

The mean temperature difference between the June and December is a mere ~3.9 degrees Fahrenheit. This is not much of a difference. The unpaired t-test (as these 2 are not the same group even though they share a common variable) with an extremely low p-value indicates that average (mean) temperatures of Hawaii in June and December are statistically significant.

In [13]:
# average june temperature observations by station ---
june_temps = filtered_df[filtered_df['month'] == 6].groupby(['station']).mean()


# average december temperature observations by station ---
dec_temps = filtered_df[filtered_df['month'] == 12].groupby(['station']).mean()

In [14]:
# Run paired t-test
stats.ttest_rel(june_temps.tobs,dec_temps.tobs)

Ttest_relResult(statistic=6.95696617044294, pvalue=0.00011759380231523222)

### Analysis

A paired t-test can be used in this case as we are comparing the means of the same group i.e. in this case, the mean temperature observations are of the same stations, just for different timepoints. The p-value of 0.0001 is less than 0.05 so we reject the null hypothesis and conclude that the data is statistically significant.
Again the p-value is less than .05% which means we can reject the null hypthesis and conclude that the data is statistically significant