# Bonus: Temperature Analysis I

In [1]:
import pandas as pd
from datetime import datetime as dt

In [2]:
# "tobs" is "temperature observations"
# Read the raw measurements file and preview the first five rows
df = pd.read_csv(filepath_or_buffer='Resources/hawaii_measurements.csv')
df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [3]:
# Summarize column dtypes and non-null counts of the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   station  19550 non-null  object 
 1   date     19550 non-null  object 
 2   prcp     18103 non-null  float64
 3   tobs     19550 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 611.1+ KB


In [16]:
# Replace the string-typed date column with parsed datetime values
df = df.assign(date=pd.to_datetime(df['date']))

In [5]:
# Re-check the column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   station  19550 non-null  object        
 1   date     19550 non-null  datetime64[ns]
 2   prcp     18103 non-null  float64       
 3   tobs     19550 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 611.1+ KB


In [4]:
# Alternatively, astype can convert the column to datetime:
#df['date'] = df['date'].astype('datetime64[ns]')

In [6]:
# Parse the date column while reading the CSV, so no separate conversion step is needed.
# The column is selected by name (not position) so the parse does not depend on column order.
df = pd.read_csv('Resources/hawaii_measurements.csv', parse_dates=['date'])
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [7]:
# Summarize column dtypes and non-null counts after loading with parse_dates
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   station  19550 non-null  object        
 1   date     19550 non-null  datetime64[ns]
 2   prcp     18103 non-null  float64       
 3   tobs     19550 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 611.1+ KB


In [8]:
# Use the date column as the row index
df = df.set_index('date')
df

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.00,63
2010-01-03,USC00519397,0.00,74
2010-01-04,USC00519397,0.00,76
2010-01-06,USC00519397,,73
...,...,...,...
2017-08-19,USC00516128,0.09,71
2017-08-20,USC00516128,,78
2017-08-21,USC00516128,0.56,76
2017-08-22,USC00516128,0.50,76


### Compare June and December data across all years 

In [None]:
from scipy import stats

In [12]:
# Filter data for desired months.
# Select on the month component of the date index instead of matching substrings
# of the stringified timestamps: this states the intent directly and does not
# depend on how pandas renders a timestamp as text.
df_june = df[df.index.month == 6]
df_dec = df[df.index.month == 12]

In [18]:
# Display the June subset
df_june

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-06-01,USC00519397,0.00,78
2010-06-02,USC00519397,0.01,76
2010-06-03,USC00519397,0.00,78
2010-06-04,USC00519397,0.00,76
2010-06-05,USC00519397,0.00,77
...,...,...,...
2017-06-26,USC00516128,0.02,79
2017-06-27,USC00516128,0.10,74
2017-06-28,USC00516128,0.02,74
2017-06-29,USC00516128,0.04,76


In [14]:
# Mean of the temperature observations over all June rows
june_temps = df_june.loc[:, 'tobs']
avg_temp_june = june_temps.mean()
avg_temp_june

74.94411764705882

In [16]:
# Mean of the temperature observations over all December rows
dec_temps = df_dec.loc[:, 'tobs']
avg_temp_dec = dec_temps.mean()
avg_temp_dec

71.04152933421226

In [24]:
# Create collections of temperature data: the mean temperature per calendar year
# for each month. One groupby per month replaces a hand-written filter and mean
# for every individual year.
june_by_year = df_june.groupby(df_june.index.year)['tobs'].mean()
dec_by_year = df_dec.groupby(df_dec.index.year)['tobs'].mean()

# Build the paired table. The inner join keeps only years that have both a June
# and a December mean, so every row is a complete pair and no NaN enters later steps.
# NOTE(review): the previous version listed 2010-2016 by hand; the visible tail of the
# data ends in August 2017, so the inner join is expected to select those same years.
df_avg = (
    pd.concat({'Temp_june': june_by_year, 'Temp_dec': dec_by_year}, axis=1, join='inner')
    .rename_axis('Date')
    .reset_index()
)
df_avg

Unnamed: 0,Date,Temp_june,Temp_dec
0,2010,74.92562,70.208511
1,2011,73.938326,70.820628
2,2012,74.0,71.188073
3,2013,74.599078,71.094017
4,2014,75.027907,69.896861
5,2015,74.990148,73.423913
6,2016,75.175258,71.13


In [25]:
# Paired t-test on the yearly means: each June value is matched with the
# December value from the same row of df_avg
from scipy import stats
stats.ttest_rel(a=df_avg['Temp_june'], b=df_avg['Temp_dec'])

Ttest_relResult(statistic=7.780060705002921, pvalue=0.00023742611093245777)