In [1]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score

## Where the data lives: bucket name and the key of the csv file inside it
bucket_name = 'data-445'
file_key = 'Fall_2021/Exams/weather.csv'

## Connecting to S3 and pointing at the csv object
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket_object = bucket.Object(file_key)

## Requesting the object and pulling the 'Body' entry out of the response
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file into a data-frame and previewing the first rows
weather = pd.read_csv(file_content_stream).reset_index(drop = True)
weather.head()

Unnamed: 0,rowID,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
0,0,2011-09-10 00:00:49,912.3,64.76,97.0,1.2,106.0,1.6,85.0,1.0,,,60.5
1,1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8


In [2]:
## Subsetting the data: keeping the rows whose hpwren_timestamp compares
## as smaller than the literal '2011-12-31', then renumbering the index
weather = (
    weather
    .loc[weather['hpwren_timestamp'] < '2011-12-31']
    .reset_index(drop = True)
)
weather.shape

(245934, 13)

In [3]:
## Dropping the rowID column (the columns keyword already selects the column axis)
weather = weather.drop(columns = 'rowID')
weather.head()

Unnamed: 0,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
0,2011-09-10 00:00:49,912.3,64.76,97.0,1.2,106.0,1.6,85.0,1.0,,,60.5
1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8


In [4]:
## Removing observations (rows) that contain at least one NA
weather = weather.dropna(axis = 0, how = 'any')
weather.shape

(245867, 12)

In [5]:
## Previewing the first 5 rows of the data-frame
weather.head(n = 5)

Unnamed: 0,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8
5,2011-09-10 00:05:49,912.3,63.5,76.0,2.5,92.0,3.0,61.0,2.0,0.0,0.0,62.6


In [6]:
## Previewing the last 5 rows of the data-frame
weather.tail(n = 5)

Unnamed: 0,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
245929,2012-02-27 23:55:41,914.4,35.78,240.0,8.2,251.0,10.5,231.0,6.2,0.0,0.0,90.9
245930,2012-02-27 23:56:41,914.4,35.78,246.0,7.0,252.0,8.1,241.0,6.4,0.0,0.0,91.0
245931,2012-02-27 23:57:41,914.4,35.78,244.0,9.8,251.0,11.1,238.0,8.9,0.0,0.0,91.0
245932,2012-02-27 23:58:41,914.4,35.78,241.0,7.6,252.0,9.2,234.0,5.8,0.0,0.0,90.9
245933,2012-02-27 23:59:41,914.3,35.78,236.0,5.5,248.0,6.5,223.0,4.3,0.0,0.0,90.9


In [7]:
## Transforming data to 0-1 scale: each selected column is min-max scaled
## and stored next to the original under the same name plus a '_0_1' suffix
scaler = MinMaxScaler()

weather[[
    'air_pressure_0_1',
    'air_temp_0_1',
    'avg_wind_direction_0_1',
    'avg_wind_speed_0_1',
    'max_wind_direction_0_1',
    'max_wind_speed_0_1',
    'relative_humidity_0_1',
]] = scaler.fit_transform(
    weather[[
        'air_pressure',
        'air_temp',
        'avg_wind_direction',
        'avg_wind_speed',
        'max_wind_direction',
        'max_wind_speed',
        'relative_humidity',
    ]]
)
weather.head()

Unnamed: 0,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity,air_pressure_0_1,air_temp_0_1,avg_wind_direction_0_1,avg_wind_speed_0_1,max_wind_direction_0_1,max_wind_speed_0_1,relative_humidity_0_1
1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9,0.258621,0.529412,0.448468,0.034188,0.598886,0.054264,0.420306
2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0,0.258621,0.535604,0.214485,0.029915,0.398329,0.042636,0.454148
3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5,0.258621,0.5387,0.247911,0.051282,0.311978,0.05814,0.525109
4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8,0.258621,0.5387,0.51532,0.017094,0.724234,0.034884,0.626638
5,2011-09-10 00:05:49,912.3,63.5,76.0,2.5,92.0,3.0,61.0,2.0,0.0,0.0,62.6,0.258621,0.52322,0.211699,0.106838,0.256267,0.112403,0.668122


In [None]:
## Defining the variables of interest (the 0-1 scaled columns)
X = weather[['air_pressure_0_1', 'air_temp_0_1', 'avg_wind_direction_0_1', 'avg_wind_speed_0_1', 'max_wind_direction_0_1', 'max_wind_speed_0_1', 'relative_humidity_0_1']]

## Candidate numbers of clusters; shared by the loop and the plot so the
## x-axis always lines up with the stored scores
cluster_range = range(2, 21)

## Defining list to store results (one silhouette score per candidate)
scores = list()

for i in cluster_range:
    print(i)
    ## Running k-means; random_state is fixed so that re-running the cell
    ## reproduces the same clusterings and therefore the same curve
    kmeans_md = KMeans(n_clusters = i, n_init = 20, random_state = 0).fit(X)
    kmeans_labels = kmeans_md.labels_
    
    ## Computing and storing silhouette score. The score must be appended:
    ## calling the list itself, scores(...), raises
    ## "TypeError: 'list' object is not callable" on the first iteration.
    ## NOTE(review): silhouette_score is expensive on a data-frame this large;
    ## its sample_size argument gives an approximate score on a random subset
    ## if the full computation turns out to be too slow -- confirm before use
    scores.append(silhouette_score(X, kmeans_labels))
    
## Visualizing results: silhouette score against number of clusters
plt.plot(cluster_range, scores)
plt.xlabel('Number of Clusters')
plt.ylabel('Silhoutte Score')
plt.grid()
plt.show()

2
