In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from scipy.stats import norm
from scipy.stats import chi2_contingency
from scipy.stats import pearsonr
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import calendar

In [2]:
# Load the hourly Vancouver weather observations from the cleaned CSV export.
df = pd.read_csv(
    filepath_or_buffer='data/weatherstats_vancouver_hourly_excelclean.csv',
)
# Preview the first five rows as a quick sanity check on the parse.
df.head(5)

Unnamed: 0,date_time_local,pressure_station,pressure_sea,wind_dir,wind_dir_10s,wind_speed,wind_gust,relative_humidity,dew_point,temperature,windchill,humidex,visibility,health_index,cloud_okta,max_air_temp_pst1hr,min_air_temp_pst1hr
0,2013-07-01 00:00:00 PDT,101.18,101.16,SSE,16.0,7,,91,18.2,19.7,0.0,0.0,32200.0,2.9,,19.4,18.5
1,2013-07-01 01:00:00 PDT,101.22,101.21,SE,13.0,6,,89,17.8,19.6,0.0,0.0,32200.0,3.0,,20.1,18.7
2,2013-07-01 02:00:00 PDT,101.26,101.24,E,10.0,11,,88,16.7,18.7,0.0,0.0,32200.0,3.0,,19.8,18.0
3,2013-07-01 03:00:00 PDT,101.26,101.25,E,10.0,4,,84,16.5,19.2,0.0,0.0,32200.0,2.7,,18.5,17.5
4,2013-07-01 04:00:00 PDT,101.3,101.28,NNW,33.0,5,,87,15.7,17.9,0.0,0.0,32200.0,2.6,,18.8,17.3


In [3]:
# Inspect dtypes and non-null counts of the freshly loaded frame
# (useful for spotting columns that still need type conversion or have gaps).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87648 entries, 0 to 87647
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   date_time_local      87648 non-null  object 
 1   pressure_station     87648 non-null  float64
 2   pressure_sea         87648 non-null  float64
 3   wind_dir             87648 non-null  object 
 4   wind_dir_10s         87617 non-null  float64
 5   wind_speed           87648 non-null  int64  
 6   wind_gust            7872 non-null   float64
 7   relative_humidity    87648 non-null  int64  
 8   dew_point            87648 non-null  float64
 9   temperature          87648 non-null  float64
 10  windchill            87642 non-null  float64
 11  humidex              85631 non-null  float64
 12  visibility           87606 non-null  float64
 13  health_index         86746 non-null  float64
 14  cloud_okta           84593 non-null  float64
 15  max_air_temp_pst1hr  87412 non-null 

In [4]:
# Convert the timestamp strings to datetime and use them as the index.
#
# The raw values carry a trailing timezone abbreviation (e.g. "2013-07-01 00:00:00 PDT").
# `utc=False` is the default of `pd.to_datetime` and does NOT remove that suffix; leaving
# the suffix in place relies on the parser silently discarding an unrecognised
# abbreviation, which depends on the machine's local timezone and is deprecated in
# recent pandas releases. Strip the abbreviation explicitly so the result is always a
# timezone-naive local wall-clock timestamp.
timestamps_local = df['date_time_local'].str.replace(r'\s+[A-Z]{2,5}$', '', regex=True)
df['date_time_local'] = pd.to_datetime(timestamps_local)

# NOTE(review): with the abbreviation removed these are naive local times; the autumn
# DST fall-back hour will presumably appear twice in the index -- verify before any
# resampling or label-based slicing that assumes a unique index.
df = df.set_index('date_time_local')
df.head()



Unnamed: 0_level_0,pressure_station,pressure_sea,wind_dir,wind_dir_10s,wind_speed,wind_gust,relative_humidity,dew_point,temperature,windchill,humidex,visibility,health_index,cloud_okta,max_air_temp_pst1hr,min_air_temp_pst1hr
date_time_local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2013-07-01 00:00:00,101.18,101.16,SSE,16.0,7,,91,18.2,19.7,0.0,0.0,32200.0,2.9,,19.4,18.5
2013-07-01 01:00:00,101.22,101.21,SE,13.0,6,,89,17.8,19.6,0.0,0.0,32200.0,3.0,,20.1,18.7
2013-07-01 02:00:00,101.26,101.24,E,10.0,11,,88,16.7,18.7,0.0,0.0,32200.0,3.0,,19.8,18.0
2013-07-01 03:00:00,101.26,101.25,E,10.0,4,,84,16.5,19.2,0.0,0.0,32200.0,2.7,,18.5,17.5
2013-07-01 04:00:00,101.3,101.28,NNW,33.0,5,,87,15.7,17.9,0.0,0.0,32200.0,2.6,,18.8,17.3


In [5]:
# Re-inspect the frame after re-indexing: the index should now be a DatetimeIndex
# and the timestamp should no longer be listed among the data columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 87648 entries, 2013-07-01 00:00:00 to 2023-06-30 23:00:00
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   pressure_station     87648 non-null  float64
 1   pressure_sea         87648 non-null  float64
 2   wind_dir             87648 non-null  object 
 3   wind_dir_10s         87617 non-null  float64
 4   wind_speed           87648 non-null  int64  
 5   wind_gust            7872 non-null   float64
 6   relative_humidity    87648 non-null  int64  
 7   dew_point            87648 non-null  float64
 8   temperature          87648 non-null  float64
 9   windchill            87642 non-null  float64
 10  humidex              85631 non-null  float64
 11  visibility           87606 non-null  float64
 12  health_index         86746 non-null  float64
 13  cloud_okta           84593 non-null  float64
 14  max_air_temp_pst1hr  87412 non-null  float64
 15  m

In [6]:
# Count the missing (NaN) entries in each column to see which fields need attention.
df.isna().sum(axis=0)

pressure_station           0
pressure_sea               0
wind_dir                   0
wind_dir_10s              31
wind_speed                 0
wind_gust              79776
relative_humidity          0
dew_point                  0
temperature                0
windchill                  6
humidex                 2017
visibility                42
health_index             902
cloud_okta              3055
max_air_temp_pst1hr      236
min_air_temp_pst1hr      236
dtype: int64

In [7]:
# wind_dir_10s duplicates the information in wind_dir: it gives the wind direction
# numerically (apparently in tens of degrees) rather than as a compass label, so it is
# dropped as redundant.
df = df.drop(columns=['wind_dir_10s'])

In [8]:
# Re-tally missing values per column to confirm wind_dir_10s is gone and to see
# which remaining columns still contain NaNs.
df.isna().sum(axis='index')

pressure_station           0
pressure_sea               0
wind_dir                   0
wind_speed                 0
wind_gust              79776
relative_humidity          0
dew_point                  0
temperature                0
windchill                  6
humidex                 2017
visibility                42
health_index             902
cloud_okta              3055
max_air_temp_pst1hr      236
min_air_temp_pst1hr      236
dtype: int64