In [89]:
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

In [72]:
dailyobs_df = pd.read_csv("daily_obs.csv")
dailyobs_df.head()

Unnamed: 0,id,obs_date_time,observer,obs_location,sky_cover,precip_type,precip_rate,air_temp_min,air_temp_max,air_temp_current,...,rain,accumulated_precip,blowing_snow,blowing_snow_dir,frz_lvl_min,frz_lvl_max,frz_lvl_cur,sno_stl,inversion,sh_nsf_obs
0,351,11/10/2015 6:00,mike,Mt Roberts Tram Wx,OVC,RS,S-1,33.0,34.8,33.5,...,,,,,,,,,,
1,352,11/11/2015 6:00,mike,Mt Roberts Tram Wx,OVC,SN,S-1,29.9,33.5,30.0,...,,,,,,,,,,
2,353,11/12/2015 6:00,mike,Mt Roberts Tram Wx,OVC,SN,S2,29.6,32.3,31.9,...,,,,,,,,,,
3,354,11/13/2015 6:00,mike,Mt Roberts Tram Wx,OVC,SN,S2,31.6,32.4,31.7,...,,,,,,,,,,
4,355,11/13/2015 6:00,mike,Speel Arm Balcony Wx,OVC,SN,S-1,30.6,32.5,31.4,...,,,,,,,,,,


In [73]:
# Create new DataFrame with only important columns from the daily_obs DataFrame
daily_obs_clean = dailyobs_df[['obs_date_time', 'obs_location', 'sky_cover', 'precip_type', 
                            'air_temp_min', 'air_temp_max', 'air_temp_current',
                            'snow_height', 'new_snow_height', 'wind_direction', 'wind_speed',
                            'wind_gust', 'hazard']]
daily_obs_clean.head()

Unnamed: 0,obs_date_time,obs_location,sky_cover,precip_type,air_temp_min,air_temp_max,air_temp_current,snow_height,new_snow_height,wind_direction,wind_speed,wind_gust,hazard
0,11/10/2015 6:00,Mt Roberts Tram Wx,OVC,RS,33.0,34.8,33.5,2.8,,SW,5.0,10.0,0.0
1,11/11/2015 6:00,Mt Roberts Tram Wx,OVC,SN,29.9,33.5,30.0,4.3,,SE,11.0,14.0,0.0
2,11/12/2015 6:00,Mt Roberts Tram Wx,OVC,SN,29.6,32.3,31.9,12.6,7.0,SE,27.0,42.0,0.0
3,11/13/2015 6:00,Mt Roberts Tram Wx,OVC,SN,31.6,32.4,31.7,14.2,5.0,SE,26.0,29.0,0.0
4,11/13/2015 6:00,Speel Arm Balcony Wx,OVC,SN,30.6,32.5,31.4,19.0,5.0,SSW,7.1,23.3,0.0


In [74]:
daily_obs_clean.isnull().sum()

obs_date_time          0
obs_location           0
sky_cover            350
precip_type         1509
air_temp_min        1165
air_temp_max        1163
air_temp_current    1519
snow_height         1700
new_snow_height     1664
wind_direction       693
wind_speed           732
wind_gust            644
hazard               419
dtype: int64

In [75]:
# Drop Null value
daily_obs_clean = daily_obs_clean.dropna()

In [76]:
# Check to make sure Null values were dropped
daily_obs_clean.isnull().sum()

obs_date_time       0
obs_location        0
sky_cover           0
precip_type         0
air_temp_min        0
air_temp_max        0
air_temp_current    0
snow_height         0
new_snow_height     0
wind_direction      0
wind_speed          0
wind_gust           0
hazard              0
dtype: int64

In [77]:
daily_obs_clean.shape

(3512, 13)

In [78]:
#Drop observation date
daily_obs_clean = daily_obs_clean.drop('obs_date_time',axis=1)

In [79]:
#Encode wind direction 
wind_direction_counts = daily_obs_clean.wind_direction.value_counts()
wind_direction_counts

# Determine which values to replace
replace_wind_direction = list(wind_direction_counts[wind_direction_counts < 150].index)

# Replace in DataFrame
for direction in replace_wind_direction:
    daily_obs_clean.wind_direction = daily_obs_clean.wind_direction.replace(direction,"Other")

In [80]:
#Encode location
obs_location_counts = daily_obs_clean.obs_location.value_counts()
obs_location_counts

# Determine which values to replace
replace_obs_location = list(obs_location_counts[obs_location_counts < 200].index)

# Replace in DataFrame
for location in replace_obs_location:
    daily_obs_clean.obs_location = daily_obs_clean.obs_location.replace(location,"Other")

In [81]:
#encode categorical columns
X = pd.get_dummies(daily_obs_clean, columns=["obs_location", "wind_direction", "sky_cover", "precip_type"])
X

Unnamed: 0,air_temp_min,air_temp_max,air_temp_current,snow_height,new_snow_height,wind_speed,wind_gust,hazard,obs_location_4-4 Diverter,obs_location_Mt Roberts Tram,...,sky_cover_FEW,sky_cover_OVC,sky_cover_SCT,sky_cover_X,precip_type_GR,precip_type_NO,precip_type_RA,precip_type_RS,precip_type_SN,precip_type_ZR
2,29.6,32.3,31.9,12.6,7.0,27.0,42.0,0.0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,31.6,32.4,31.7,14.2,5.0,26.0,29.0,0.0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,30.6,32.5,31.4,19.0,5.0,7.1,23.3,0.0,0,0,...,0,1,0,0,0,0,0,0,1,0
7,31.6,32.4,31.8,22.4,7.0,10.0,10.0,0.0,0,0,...,0,1,0,0,0,0,0,0,1,0
8,31.2,33.4,31.2,0.0,0.0,1.1,25.1,0.0,0,0,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6038,-14.3,-10.7,-10.7,99.2,1.0,0.0,0.0,1.0,0,0,...,0,0,0,1,0,0,0,0,1,0
6039,-6.6,-3.2,-4.2,142.3,7.2,3.9,9.8,1.0,0,0,...,0,0,0,0,0,1,0,0,0,0
6040,-6.5,-3.8,-6.2,14.0,5.0,0.8,5.3,1.0,0,0,...,0,0,0,0,0,1,0,0,0,0
6041,-8.3,-3.6,-5.9,85.0,9.0,1.9,6.3,1.0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [83]:
X.dtypes

air_temp_min                              float64
air_temp_max                              float64
air_temp_current                          float64
snow_height                               float64
new_snow_height                           float64
wind_speed                                float64
wind_gust                                 float64
hazard                                    float64
obs_location_4-4 Diverter                   uint8
obs_location_Mt Roberts Tram                uint8
obs_location_Mt Roberts Tram Combo Obs      uint8
obs_location_Other                          uint8
obs_location_SS Creek DOT                   uint8
obs_location_Snettisham Combo Obs           uint8
obs_location_Snettisham Dorm                uint8
obs_location_Speel Arm Balcony              uint8
obs_location_Thane Road Combo Obs           uint8
wind_direction_E                            uint8
wind_direction_ENE                          uint8
wind_direction_ESE                          uint8


In [87]:
#Scale data
# Standardize the data with StandardScaler().
X_scaled = StandardScaler().fit_transform(X)
print(X_scaled)

[[ 5.69455775  4.99884527  5.84278865 ... -0.26534009  2.10630779
  -0.07567943]
 [ 6.06460106  5.01587083  5.80591447 ... -0.26534009  2.10630779
  -0.07567943]
 [ 5.87957941  5.03289639  5.7506032  ... -0.26534009  2.10630779
  -0.07567943]
 ...
 [-0.98472403 -1.14738341 -1.18174298 ... -0.26534009 -0.47476442
  -0.07567943]
 [-1.31776301 -1.11333228 -1.12643171 ... -0.26534009 -0.47476442
  -0.07567943]
 [-1.78031715 -1.79435485 -1.5504848  ... -0.26534009 -0.47476442
  -0.07567943]]


In [90]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=4)
daily_obs_pca = pca.fit_transform(X_scaled)
daily_obs_pca

array([[ 8.72432057,  5.24866092,  5.42889633,  3.22718945],
       [ 8.81016652,  4.47337037,  4.29134087,  3.01876484],
       [ 8.33320074,  2.30631248,  0.12046946,  4.1553077 ],
       ...,
       [-1.89647457, -1.68693503,  0.69316269, -0.73068812],
       [-2.52349387, -1.04697539,  0.5509441 ,  0.86598477],
       [-3.6753583 , -0.96464698,  0.60996836, -0.38959386]])

In [91]:
# Create a DataFrame with the three principal components.
daily_obs_df = pd.DataFrame(
    data=daily_obs_pca, columns=["PC 1", "PC 2", "PC 3", "PC4"], index = X.index
)
daily_obs_df.head()

Unnamed: 0,PC 1,PC 2,PC 3,PC4
2,8.724321,5.248661,5.428896,3.227189
3,8.810167,4.47337,4.291341,3.018765
4,8.333201,2.306312,0.120469,4.155308
7,8.211952,1.911576,-0.287159,3.307071
8,9.656568,0.80083,-0.003279,3.534785


In [92]:
inertia = []
k = list(range(1, 11))

In [94]:
# Looking for the best K
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(daily_obs_df)
    inertia.append(km.inertia_)

In [95]:
# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [96]:
def get_clusters(k, data):
    # Create a copy of the DataFrame
    data = daily_obs_df.copy()

    # Initialize the K-Means model
    model = KMeans(n_clusters=k, random_state=0)

    # Fit the model
    model.fit(data)

    # Predict clusters
    predictions = model.predict(data)

    # Create return DataFrame with predicted clusters
    data["class"] = model.labels_

    return data

In [97]:
#Five clusters
five_clusters = get_clusters(5, daily_obs_df)
five_clusters.head()

Unnamed: 0,PC 1,PC 2,PC 3,PC4,class
2,8.724321,5.248661,5.428896,3.227189,2
3,8.810167,4.47337,4.291341,3.018765,2
4,8.333201,2.306312,0.120469,4.155308,2
7,8.211952,1.911576,-0.287159,3.307071,2
8,9.656568,0.80083,-0.003279,3.534785,2


In [99]:
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
clustered_df = pd.concat([X, daily_obs_df], axis=1)
clustered_df.head()

Unnamed: 0,air_temp_min,air_temp_max,air_temp_current,snow_height,new_snow_height,wind_speed,wind_gust,hazard,obs_location_4-4 Diverter,obs_location_Mt Roberts Tram,...,precip_type_GR,precip_type_NO,precip_type_RA,precip_type_RS,precip_type_SN,precip_type_ZR,PC 1,PC 2,PC 3,PC4
2,29.6,32.3,31.9,12.6,7.0,27.0,42.0,0.0,0,0,...,0,0,0,0,1,0,8.724321,5.248661,5.428896,3.227189
3,31.6,32.4,31.7,14.2,5.0,26.0,29.0,0.0,0,0,...,0,0,0,0,1,0,8.810167,4.47337,4.291341,3.018765
4,30.6,32.5,31.4,19.0,5.0,7.1,23.3,0.0,0,0,...,0,0,0,0,1,0,8.333201,2.306312,0.120469,4.155308
7,31.6,32.4,31.8,22.4,7.0,10.0,10.0,0.0,0,0,...,0,0,0,0,1,0,8.211952,1.911576,-0.287159,3.307071
8,31.2,33.4,31.2,0.0,0.0,1.1,25.1,0.0,0,0,...,0,0,1,0,0,0,9.656568,0.80083,-0.003279,3.534785


In [None]:
#2D Graph
four_clusters.hvplot.scatter(x=?, y=?, class=hazard)