## Findings

Based on the extremely low p-value (p ≈ 3.9e-191) from an unpaired (independent two-sample) t-test, the difference between the June mean temperature (≈ 74.94) and the December mean temperature (≈ 71.04) is statistically significant.

In [41]:
# Import dependencies
import pandas as pd
import numpy as np
from datetime import datetime as dt
from scipy import stats

In [42]:
# Load the Hawaii measurements CSV into a DataFrame.
# "tobs" is "temperature observations".
# parse_dates=True only asks pandas to parse the *index*; since no index_col
# is given, that would leave "date" as plain strings. Naming the column
# explicitly makes read_csv return it as datetime64 right away.
measurements_df = pd.read_csv(
    "Resources/hawaii_measurements.csv",
    parse_dates=["date"],
)
measurements_df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [43]:
# Ensure the "date" column holds datetime64 values (not strings), then list
# the column dtypes and non-null counts to confirm the conversion
measurements_df["date"] = measurements_df["date"].pipe(pd.to_datetime)
measurements_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   station  19550 non-null  object        
 1   date     19550 non-null  datetime64[ns]
 2   prcp     18103 non-null  float64       
 3   tobs     19550 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 611.1+ KB


In [44]:
# Build a date-indexed copy of the measurements: set_index returns a new
# frame whose index is "date"; drop=True (the default, spelled out here)
# removes "date" from the regular columns
measurements_by_date_df = measurements_df.set_index(keys="date", drop=True)
measurements_by_date_df.head()

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.0,63
2010-01-03,USC00519397,0.0,74
2010-01-04,USC00519397,0.0,76
2010-01-06,USC00519397,,73


In [45]:
# Keep only the rows whose index date falls in June (month number 6)
is_june = measurements_by_date_df.index.month == 6
measurements_june_df = measurements_by_date_df.loc[is_june]

# Mean of the June temperature observations ("tobs")
avg_june_temp = measurements_june_df["tobs"].mean()
avg_june_temp

74.94411764705882

In [46]:
# Keep only the rows whose index date falls in December (month number 12)
is_december = measurements_by_date_df.index.month == 12
measurements_dec_df = measurements_by_date_df.loc[is_december]

# Mean of the December temperature observations ("tobs")
avg_dec_temp = measurements_dec_df["tobs"].mean()
avg_dec_temp

71.04152933421226

In [49]:
# Extract the per-observation temperatures ("tobs") for each month; these two
# Series are the samples passed to the t-test below.
# NOTE: despite the "avg_..._list" names, these hold the individual
# observations (pandas Series), not averages and not Python lists.
avg_june_temp_list = measurements_june_df["tobs"]
avg_dec_temp_list = measurements_dec_df["tobs"]
avg_dec_temp_list

date
2010-12-01    76
2010-12-03    74
2010-12-04    74
2010-12-06    64
2010-12-07    64
              ..
2016-12-27    71
2016-12-28    71
2016-12-29    69
2016-12-30    65
2016-12-31    65
Name: tobs, Length: 1517, dtype: int64

In [50]:
# Test whether the difference between the June and December mean
# temperatures is statistically significant, using an unpaired (independent
# two-sample) t-test. It is unpaired because the comparison is between two
# separate "populations" of data: June and December.
# equal_var=True is SciPy's default (pooled-variance Student's t-test) and is
# written out only to make that assumption visible.
# NOTE(review): consider equal_var=False (Welch's t-test) if the two months'
# variances turn out to differ — confirm before changing.
stats.ttest_ind(avg_june_temp_list, avg_dec_temp_list, equal_var=True)

Ttest_indResult(statistic=31.60372399000329, pvalue=3.9025129038616655e-191)