# Bonus: Temperature Analysis I

In [22]:
import pandas as pd
from datetime import datetime
import numpy as np

In [9]:
# "tobs" is "temperature observations"
rain_df = pd.read_csv('Resources/hawaii_measurements.csv').sort_values("date", ascending=True)
rain_df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
9007,USC00518838,2010-01-01,0.21,72
9518,USC00519523,2010-01-01,0.15,75
12187,USC00519281,2010-01-01,0.15,70
14959,USC00511918,2010-01-01,0.05,66


In [10]:
# Convert the date column format from string to datetime
rain_df["date"] = pd.to_datetime(rain_df["date"])
rain_df.dtypes

station            object
date       datetime64[ns]
prcp              float64
tobs                int64
dtype: object

In [11]:
# Set the date column as the DataFrame index
# Again, it does not make sense to set the "date" column as the index since it is not unique.

date_as_index = rain_df.set_index("date")
date_as_index

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-01,USC00518838,0.21,72
2010-01-01,USC00519523,0.15,75
2010-01-01,USC00519281,0.15,70
2010-01-01,USC00511918,0.05,66
...,...,...,...
2017-08-22,USC00516128,0.50,76
2017-08-23,USC00519397,0.00,81
2017-08-23,USC00514830,0.00,82
2017-08-23,USC00519523,0.08,82


In [None]:
# Drop the date column

### Compare June and December data across all years 

In [12]:
from scipy import stats

In [14]:
# Filter data for desired months
rain_df["month"] = pd.DatetimeIndex(rain_df["date"]).month

jun_and_dec = rain_df.loc[(rain_df["month"] == 6) | (rain_df["month"] == 12)]
jun_and_dec

Unnamed: 0,station,date,prcp,tobs,month
15104,USC00511918,2010-06-01,0.00,74,6
9667,USC00519523,2010-06-01,0.03,76,6
5518,USC00514830,2010-06-01,0.01,73,6
12338,USC00519281,2010-06-01,0.00,71,6
17082,USC00516128,2010-06-01,0.08,70,6
...,...,...,...,...,...
5401,USC00513117,2017-06-30,0.04,74,6
7588,USC00514830,2017-06-30,0.00,81,6
8989,USC00517948,2017-06-30,0.12,74,6
12139,USC00519523,2017-06-30,0.07,75,6


In [15]:
avg_temp = jun_and_dec.groupby("month").mean() 
avg_temp

Unnamed: 0_level_0,prcp,tobs
month,Unnamed: 1_level_1,Unnamed: 2_level_1
6,0.13636,74.944118
12,0.216819,71.041529


In [19]:
# Identify the average temperature for June
jun_avg_temp = round(avg_temp["tobs"][6],2)
print(f"The average temp in June is {jun_avg_temp} deg F.")

The average temp in June is 74.94 deg F.


In [20]:
# Identify the average temperature for December
dec_avg_temp = round(avg_temp["tobs"][12],2)
print(f"The average temp in December is {dec_avg_temp} deg F.")

The average temp in December is 71.04 deg F.


In [47]:
# Create collections of temperature data
june_df = jun_and_dec.loc[jun_and_dec["month"] == 6]
june_df = june_df.loc[june_df["prcp"].isna() == False]
june = june_df["prcp"].to_numpy()
print(type(june))
# june_df

<class 'numpy.ndarray'>


In [48]:
december_df = jun_and_dec.loc[jun_and_dec["month"] == 12]
december_df = december_df.loc[december_df["prcp"].isna() == False]
december = december_df["prcp"].to_numpy()
print(type(december))

<class 'numpy.ndarray'>


In [49]:
# Run paired t-test

stats.ttest_ind(june, december, equal_var=False)

Ttest_indResult(statistic=-4.806345313914344, pvalue=1.6366855096194443e-06)

### Analysis

With a p-value less than 0.05, we can confidently reject the null hypothosis and that the difference between the population means (June temps and December temps) is statistically significant.