In [14]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score

## Location of the csv file in S3
bucket_name = 'data-445'
file_key = 'Fall_2021/Exams/weather.csv'

## Connecting to the bucket
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## Fetching the object and exposing its body as a stream
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Loading the stream into a data-frame and resetting its index
weather = pd.read_csv(file_content_stream)
weather = weather.reset_index(drop = True)
weather.head()

Unnamed: 0,rowID,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
0,0,2011-09-10 00:00:49,912.3,64.76,97.0,1.2,106.0,1.6,85.0,1.0,,,60.5
1,1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8


In [6]:
## Dropping rowID
## `columns` already names the axis, so no `axis` argument is needed;
## errors = 'ignore' keeps this cell re-runnable once rowID has been removed
## (without it a second run raises KeyError because weather is reassigned here)
weather = weather.drop(columns = ['rowID'], errors = 'ignore')
weather.head()

Unnamed: 0,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
0,2011-09-10 00:00:49,912.3,64.76,97.0,1.2,106.0,1.6,85.0,1.0,,,60.5
1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8


In [8]:
## Reporting data-frame shape as (number of rows, number of columns)
weather.shape

(1587257, 12)

In [7]:
## Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns;
## a count below the total number of rows points to missing values in that column
weather.describe()

Unnamed: 0,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
count,1587257.0,1587257.0,1586824.0,1586824.0,1586824.0,1586824.0,1586824.0,1586824.0,1587256.0,1587256.0,1587257.0
mean,916.8301,61.85144,161.9654,2.774272,163.403,3.399813,166.8264,2.13313,0.001854836,0.536146,47.60837
std,3.051593,11.83362,95.20812,2.060758,92.36723,2.423167,97.46275,1.745345,0.9609716,81.14766,26.21454
min,905.0,31.64,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.7
25%,914.8,52.7,62.0,1.3,68.0,1.6,77.0,0.8,0.0,0.0,24.7
50%,916.7,62.24,182.0,2.2,187.0,2.7,180.0,1.6,0.0,0.0,44.7
75%,918.7,70.88,217.0,3.8,223.0,4.6,212.0,3.0,0.0,0.0,68.0
max,929.5,99.5,359.0,32.3,359.0,36.0,359.0,32.0,655.01,63305.0,93.0


In [9]:
## Discarding every observation (row) that has at least one missing value
weather = weather.dropna(axis = 0, how = 'any')
weather.shape

(1586823, 12)

In [11]:
## Previewing the first rows of the data-frame
weather.head()

Unnamed: 0,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8
5,2011-09-10 00:05:49,912.3,63.5,76.0,2.5,92.0,3.0,61.0,2.0,0.0,0.0,62.6


In [12]:
## Rescaling the selected numeric variables to the 0-1 range
## (each scaled copy is stored next to the original with a '_0_1' suffix)
cols_to_scale = ['air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed',
                 'max_wind_direction', 'max_wind_speed', 'relative_humidity']
scaled_cols = [col + '_0_1' for col in cols_to_scale]

scaler = MinMaxScaler()
weather[scaled_cols] = scaler.fit_transform(weather[cols_to_scale])
weather.head()

Unnamed: 0,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity,air_pressure_0_1,air_temp_0_1,avg_wind_direction_0_1,avg_wind_speed_0_1,max_wind_direction_0_1,max_wind_speed_0_1,relative_humidity_0_1
1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9,0.297959,0.474801,0.448468,0.024768,0.598886,0.038997,0.424702
2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0,0.297959,0.480106,0.214485,0.021672,0.398329,0.030641,0.458288
3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5,0.297959,0.482759,0.247911,0.037152,0.311978,0.041783,0.528711
4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8,0.297959,0.482759,0.51532,0.012384,0.724234,0.02507,0.629469
5,2011-09-10 00:05:49,912.3,63.5,76.0,2.5,92.0,3.0,61.0,2.0,0.0,0.0,62.6,0.297959,0.469496,0.211699,0.077399,0.256267,0.08078,0.670639
