# Clustering Algorithm 

## Install necessary dependencies

In [50]:
!pip install tslearn



## Import Necessary Libraries 

In [None]:
# Native libraries
import os
import math

# Essential Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import MinMaxScaler

# Algorithms
from tslearn.barycenters import dtw_barycenter_averaging
from tslearn.clustering import TimeSeriesKMeans
from sklearn.cluster import KMeans

from sklearn.decomposition import PCA

## Read data 

In [44]:
# Folder that holds one CSV file per measurement point.
directory = '/Users/perimtemizoz/Desktop/Doktar/Doktar_Case/point_data/'

mySeries = []         # one DataFrame (Time, Relative Humidity, Temperature) per CSV file
namesofMySeries = []  # file name without its extension, aligned index-by-index with mySeries

# os.listdir returns entries in arbitrary order; sorting keeps the order of the
# series (and of everything derived from it) identical on every run.
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".csv"):
        # os.path.join works whether or not `directory` ends with a path separator.
        df = pd.read_csv(os.path.join(directory, filename))
        # Keep only the columns used by the analysis, ordered by the row index.
        df = df.loc[:, ["Time", "Relative Humidity", "Temperature"]].sort_index()
        mySeries.append(df)
        namesofMySeries.append(os.path.splitext(filename)[0])

In [25]:
def stringToDate(frame):
    """Parse the 'Time' column and derive calendar columns from it.

    The frame is modified in place: 'Time' is converted to datetimes and the
    columns 'Hour', 'Month' and 'Week' are added (or overwritten).

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a 'Time' column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object that was passed in.
    """
    frame['Time'] = pd.to_datetime(frame['Time'])
    # The vectorised .dt accessor replaces a Python-level lambda call per row.
    timestamps = frame['Time'].dt
    frame['Hour'] = timestamps.hour
    frame['Month'] = timestamps.month
    # Despite its name, 'Week' holds the day of the week (Monday=0 ... Sunday=6).
    frame['Week'] = timestamps.dayofweek
    return frame

## New Feature 

Hourly temperature and relative humidity data are transformed to monthly mean as a new feature. 
\
\
In order to cluster the stations, I wanted to bring these data together in a common feature, so it makes sense to aggregate them *monthly* or *seasonally*, since the raw hourly data cannot serve as a common feature.

In [34]:
def getMean(mySeries):
    """Compute monthly mean temperature and relative humidity for each series.

    Each frame is passed through ``stringToDate`` and indexed by its 'Time'
    column; the 'Temperature' and 'Relative Humidity' columns are then
    resampled to month-start ("MS") bins and averaged.

    Parameters
    ----------
    mySeries : list of pandas.DataFrame
        Frames with 'Time', 'Temperature' and 'Relative Humidity' columns.

    Returns
    -------
    list of pandas.DataFrame
        One monthly-mean frame per input frame, in the same order.
    """
    value_columns = ['Temperature', 'Relative Humidity']
    monthly_means = []
    for frame in mySeries:
        indexed = stringToDate(frame).set_index("Time")
        # "MS" = month-start frequency: one row per calendar month.
        monthly_means.append(indexed[value_columns].resample("MS").mean())
    return monthly_means

In [39]:
# One frame of monthly mean Temperature / Relative Humidity per loaded series.
mean_calculated_frames = getMean(mySeries)

## Scaling 

MinMax scaling rescales each feature so that its values lie between **0 and 1**.

The shape of the scaled distribution stays the same as that of the original data.
\
MinMax scaling is sensitive to extreme values (*outliers*), so it may not perform well when the data contains them.

In [40]:
# Rescale every monthly-mean frame independently: a fresh scaler is fitted per frame,
# so each column is mapped to [0, 1] relative to that frame's own minimum and maximum.
for i, monthly_frame in enumerate(mean_calculated_frames):
    scaler = MinMaxScaler()
    # fit_transform returns a 2-D (n_rows, n_columns) array by default, so the
    # former reshape to (n_rows, 2) was a no-op and has been dropped.
    mean_calculated_frames[i] = scaler.fit_transform(monthly_frame)

## K-means
A good rule of thumb for k-means is to choose k as the square root of the number of points in the training data set.
In time series analysis, Dynamic Time Warping (DTW) is used to measure the similarity, or compute the distance, between two arrays or time series, which may have different lengths.

In [41]:
# Rule of thumb: k = ceil(sqrt(number of series)).
cluster_count = math.ceil(math.sqrt(len(mean_calculated_frames)))

# The centroid initialisation is random; a fixed seed makes the cluster
# assignment reproducible across kernel restarts.
km = TimeSeriesKMeans(n_clusters=cluster_count, metric="dtw", random_state=42)

# One cluster index per series, in the same order as mean_calculated_frames.
labels = km.fit_predict(mean_calculated_frames)

## Comment 

### Why K-means 
\
**1**- Simple to implement 
\
**2**- Scales to large data sets

## Conclusion 

K-means creates 3 clusters based on the monthly mean temperature and relative humidity of 15 stations. 
\
**As I understand from the clustering below, it grouped the points that are close to each other into the same clusters.**

In [None]:
# Human-readable label ("Cluster <n>") for each series' cluster index.
fancy_names_for_labels = [f"Cluster {label}" for label in labels]

# Pair each series name with its label, then show the table grouped by cluster.
station_clusters = pd.DataFrame(
    list(zip(namesofMySeries, fancy_names_for_labels)),
    columns=["Series", "Cluster"],
)
station_clusters.sort_values(by="Cluster").set_index("Series")

Unnamed: 0_level_0,Cluster
Series,Unnamed: 1_level_1
point_10,Cluster 0
point_6,Cluster 0
point_7,Cluster 0
point_1,Cluster 0
point_3,Cluster 0
point_5,Cluster 1
point_4,Cluster 1
point_2,Cluster 1
point_13,Cluster 2
point_9,Cluster 2
