In [9]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score

## Location of the csv file: bucket name and object key
bucket_name = 'data-445'
file_key = 'Fall_2021/Exams/weather.csv'

## Opening the bucket and requesting the object
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()

## The 'Body' entry of the response is the stream holding the file content
file_content_stream = file_object.get('Body')

## Loading the stream into a data-frame and previewing the first rows
weather = pd.read_csv(file_content_stream).reset_index(drop = True)
weather.head()

Unnamed: 0,rowID,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
0,0,2011-09-10 00:00:49,912.3,64.76,97.0,1.2,106.0,1.6,85.0,1.0,,,60.5
1,1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8


In [10]:
## Keeping only the observations time-stamped before 2012-02-28
## (hpwren_timestamp is read as text, so this is a string comparison)
## NOTE(review): the saved tail() output further down still lists 2012-04-29 rows;
## confirm the saved outputs were produced with this cutoff
before_cutoff = weather['hpwren_timestamp'].lt('2012-02-28')
weather = weather.loc[before_cutoff].reset_index(drop = True)

In [11]:
## Removing the rowID column and previewing the result
weather = weather.drop(columns = 'rowID')
weather.head()

Unnamed: 0,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
0,2011-09-10 00:00:49,912.3,64.76,97.0,1.2,106.0,1.6,85.0,1.0,,,60.5
1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8


In [12]:
## Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
weather.describe()

Unnamed: 0,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
count,367606.0,367606.0,367525.0,367525.0,367525.0,367525.0,367525.0,367525.0,367605.0,367605.0,367606.0
mean,917.272808,56.845599,157.307321,3.236364,160.172954,3.964392,159.766095,2.500448,0.000549,0.690404,48.943402
std,3.365514,10.610905,93.759048,2.286027,90.401929,2.694948,96.964914,1.935097,0.011969,5.933074,27.650342
min,905.0,32.18,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,1.4
25%,915.0,48.74,59.0,1.4,67.0,1.9,60.0,1.0,0.0,0.0,23.7
50%,917.1,57.02,180.0,2.6,186.0,3.2,178.0,1.9,0.0,0.0,46.3
75%,919.2,64.04,217.0,4.7,223.0,5.7,206.0,3.8,0.0,0.0,72.8
max,929.5,91.22,359.0,23.8,359.0,25.9,359.0,23.2,1.41,60.0,93.0


In [13]:
## Discarding every row that has at least one missing value,
## then reporting the remaining (rows, columns)
weather = weather.dropna(axis = 0, how = 'any')
weather.shape

(367524, 12)

In [14]:
## Previewing the first rows after removing the observations with NAs
weather.head()

Unnamed: 0,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8
5,2011-09-10 00:05:49,912.3,63.5,76.0,2.5,92.0,3.0,61.0,2.0,0.0,0.0,62.6


In [15]:
## Previewing the last rows of the data
weather.tail()

Unnamed: 0,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
367601,2012-04-29 23:55:09,914.9,51.8,61.0,0.3,155.0,0.7,339.0,0.0,0.0,0.0,91.5
367602,2012-04-29 23:56:09,914.9,51.98,70.0,0.5,85.0,0.8,54.0,0.4,0.0,0.0,91.5
367603,2012-04-29 23:57:09,914.9,51.98,84.0,0.5,93.0,0.7,77.0,0.3,0.0,0.0,91.6
367604,2012-04-29 23:58:09,915.0,51.98,63.0,0.8,84.0,1.0,53.0,0.7,0.0,0.0,91.5
367605,2012-04-29 23:59:09,915.0,51.98,30.0,0.5,48.0,0.8,354.0,0.2,0.0,0.0,91.5


In [16]:
## Variables to be rescaled to the 0-1 range
feature_cols = ['air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed', 'max_wind_direction', 'max_wind_speed', 'relative_humidity']

## Each rescaled column keeps the original name plus the '_0_1' suffix
scaled_cols = [col + '_0_1' for col in feature_cols]

## Fitting the min-max scaler and storing the rescaled values as new columns
scaler = MinMaxScaler()
weather[scaled_cols] = scaler.fit_transform(weather[feature_cols])
weather.head()

Unnamed: 0,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity,air_pressure_0_1,air_temp_0_1,avg_wind_direction_0_1,avg_wind_speed_0_1,max_wind_direction_0_1,max_wind_speed_0_1,relative_humidity_0_1
1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9,0.297959,0.536585,0.448468,0.033613,0.598886,0.054264,0.420306
2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0,0.297959,0.542683,0.214485,0.029412,0.398329,0.042636,0.454148
3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5,0.297959,0.545732,0.247911,0.05042,0.311978,0.05814,0.525109
4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8,0.297959,0.545732,0.51532,0.016807,0.724234,0.034884,0.626638
5,2011-09-10 00:05:49,912.3,63.5,76.0,2.5,92.0,3.0,61.0,2.0,0.0,0.0,62.6,0.297959,0.530488,0.211699,0.105042,0.256267,0.112403,0.668122


In [None]:
## Defining the variables of interest
X = weather[['air_pressure_0_1', 'air_temp_0_1', 'avg_wind_direction_0_1', 'avg_wind_speed_0_1', 'max_wind_direction_0_1', 'max_wind_speed_0_1', 'relative_humidity_0_1']]

## Candidate numbers of clusters (2 through 20); shared by the loop and the plot
cluster_range = range(2, 21)

## Fixed seed so that the k-means initializations (and the optional silhouette
## sub-sample) give the same results on every run
seed = 42

## Number of rows used to compute the silhouette score.
## None scores every row (exact); an integer (e.g. 10000) scores a random
## sub-sample instead, which gives an estimate at a lower cost
silhouette_sample_size = None

## Defining list to store results 
scores = list()

for i in cluster_range:
    print(i)
    ## Running k-means
    kmeans_md = KMeans(n_clusters = i, n_init = 20, random_state = seed).fit(X)
    kmeans_labels = kmeans_md.labels_
    
    ## Computing and storing silhouette score
    ## (scores is a list: the value has to be appended, the list cannot be called)
    scores.append(silhouette_score(X, kmeans_labels, sample_size = silhouette_sample_size, random_state = seed))
    
## Visualizing results: silhouette score against the number of clusters
plt.plot(cluster_range, scores)
plt.xlabel('Number of Clusters')
plt.ylabel('Silhoutte Score')
plt.grid()
plt.show()

2
