### KMeans and KShape Clustering


In [None]:
import pandas as pd 
from pathlib import Path 
from typing import List
import numpy as np
import matplotlib.pyplot as plt
import datetime
import random
from random import randrange
from datetime import timedelta
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf,month_plot,quarter_plot
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Resolve the data directory (two levels above the current working directory)
# and load the SOPA CSV, using its first column as the row index.
data_path = Path.cwd().parent.parent / "data"
soil_df_path = data_path / "SOPA-data-TS-daily-clean-2023-04-01-to-2023-09-30.csv"
soil_df_raw = pd.read_csv(soil_df_path, index_col=0)

# Work on an independent copy: a plain assignment would make soil_df an alias,
# so in-place column assignments on soil_df would also modify soil_df_raw.
soil_df = soil_df_raw.copy()


In [None]:
def remove_microseconds(datetime_str):
    """Strip the fractional-seconds part from a timestamp string.

    Everything from the first ``.`` through the digits that follow it is
    removed; the rest of the string (for example a ``+00:00`` / ``-05:00``
    offset, a trailing ``Z``, or nothing at all) is kept unchanged.

    Args:
        datetime_str: Timestamp text such as
            ``'2023-04-01 06:00:00.123456+00:00'``. Non-string values
            (e.g. ``None`` or NaN for a missing timestamp) are returned as-is.

    Returns:
        The timestamp without its fractional seconds, or the input unchanged
        when it contains no ``.`` or is not a string.
    """
    # Missing values are not str; pass them through instead of failing on the
    # substring operations below.
    if not isinstance(datetime_str, str):
        return datetime_str

    head, dot, tail = datetime_str.partition('.')
    if not dot:
        return datetime_str

    # Drop only the digits after the dot so the result does not depend on a
    # '+' being present (naive timestamps, 'Z' and negative offsets also work).
    return head + tail.lstrip('0123456789')
    


In [None]:
# Rewrite every entry of the 'datetime' column without its fractional seconds.
timestamps_without_fraction = soil_df['datetime'].apply(remove_microseconds)
soil_df['datetime'] = timestamps_without_fraction


In [None]:
import matplotlib.pyplot as plt

# Column of soil_df to visualise.
column_name = 'SENS0015-SM-SOPA'

# Line plot (with point markers) of the chosen column against the frame's index.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(soil_df.index, soil_df[column_name], marker='o', linestyle='-')
ax.set_title(f'Daily Values of {column_name}')
ax.set_xlabel('Day')
ax.set_ylabel('Value')
ax.grid(True)
plt.show()


In [None]:
# Half-open selection window: start is inclusive, end is exclusive.
start_date = '2023-04-01 06:00:00+00:00'
end_date = '2023-09-30 23:00:00+00:00'

# NOTE(review): the bounds are plain strings; if 'datetime' also holds strings
# the comparison is lexicographic, which presumably relies on every timestamp
# sharing the same format and UTC offset — confirm.
on_or_after_start = soil_df['datetime'] >= start_date
before_end = soil_df['datetime'] < end_date
soil_df = soil_df[on_or_after_start & before_end]

# Persist the filtered rows next to the other data files.
soil_df.to_csv(data_path / 'SOPA-data-raw-2023-04-01-to-2023-09-30.csv')

In [None]:
# Discard rows that are exact duplicates of an earlier row (all columns equal).
soil_df = soil_df.drop_duplicates()

In [None]:
# Restrict the long-format readings to three stations of interest.
# NOTE(review): `df` is not created in any cell visible here — confirm where it
# comes from (presumably a long-format frame with a 'station_name' column).
specific_station_names = ['SENS0004-SM-SOPA', 'SENS0005-SM-SOPA', 'SENS0006-SM-SOPA']
is_selected_station = df['station_name'].isin(specific_station_names)
dfp = df[is_selected_station]

In [None]:
# Daily median of 'value' per station, pivoted so that each station becomes a
# column and each calendar date a row.
# `df.index.date` requires df to be indexed by datetimes.
readings_by_station_and_day = df.groupby(['station_name', df.index.date])
daily_median = readings_by_station_and_day['value'].median()
dfd = daily_median.unstack(0)

In [None]:
# Collect every column of dfd that has a run of at least three consecutive
# missing rows.
missing_sensors = []
for sensor in dfd.columns:
    gap_run = 0
    for is_missing in dfd[sensor].isnull():
        # Extend the current run on a missing value, otherwise restart it.
        gap_run = gap_run + 1 if is_missing else 0
        if gap_run >= 3:
            missing_sensors.append(sensor)
            break  # one qualifying gap is enough; move on to the next sensor

print("Sensors with missing data for 3 or more consecutive days:", missing_sensors)


In [None]:
# Build the cleaned daily table in one pass:
#   1. drop the columns listed in missing_sensors,
#   2. drop rows in which every remaining value is missing,
#   3. turn the current index into an ordinary column.
dfd_cleaned = (
    dfd.drop(columns=missing_sensors)
    .dropna(how='all')
    .reset_index()
)
dfd_cleaned.shape

In [None]:
# Write the cleaned daily table to the data directory.
daily_clean_csv = data_path / 'SOPA-data-TS-daily-clean-2023-04-01-to-2023-09-30.csv'
dfd_cleaned.to_csv(daily_clean_csv)

In [None]:
# Reload the cleaned daily table from disk (no index column is specified, so
# pandas assigns a default integer index).
daily_clean_csv = data_path / 'SOPA-data-TS-daily-clean-2023-04-01-to-2023-09-30.csv'
dfd = pd.read_csv(daily_clean_csv)

In [None]:
import matplotlib.pyplot as plt

# Grid layout: three panels per row, with enough rows to hold every column.
# NOTE(review): every column of dfd gets a panel, including any non-sensor
# columns a CSV reload may have introduced (e.g. 'Unnamed: 0') — confirm.
sensors_per_row = 3
num_sensors = len(dfd.columns)
num_rows = (num_sensors + sensors_per_row - 1) // sensors_per_row  # ceiling division

# squeeze=False always yields a 2-D axes array, so the single-row case needs
# no special handling; ravel() walks it row by row.
fig, axs = plt.subplots(num_rows, sensors_per_row, figsize=(15, 5*num_rows), squeeze=False)
panels = axs.ravel()

# One panel per column, filled left-to-right, top-to-bottom.
for panel, col in zip(panels, dfd.columns):
    panel.plot(dfd.index, dfd[col])
    panel.set_title(col)
    panel.set_xlabel('Date')
    panel.set_ylabel('Value')

# Blank out the panels left over in the last row.
for panel in panels[num_sensors:]:
    panel.axis('off')

plt.tight_layout()
plt.show()


## Clustering starts here

In [None]:
# Expose the 'Unnamed: 0' column under the name 'station_name' and promote it
# to the row index.
# NOTE(review): 'Unnamed: 0' is presumably the row counter written by to_csv
# rather than a station identifier — confirm the label is appropriate.
dfd = dfd.rename(columns={'Unnamed: 0': 'station_name'}).set_index('station_name')

# Discard a leftover 'index' column when one is present.
dfd = dfd.drop(columns='index', errors='ignore')


In [None]:
# Impute missing values down each column (axis=0): carry the last observation
# forward, then back-fill. The back-fill covers gaps at the very start of a
# series, which a forward fill cannot reach and which would otherwise be
# passed on to the clustering step as NaN.
dfd = dfd.ffill(axis=0).bfill(axis=0)


In [None]:
# KShape clustering of the daily series

import numpy as np
import pandas as pd
from tslearn.clustering import KShape
import matplotlib.pyplot as plt

# Step 1: build the input matrix — one row per column of dfd, one column per
# row of dfd (i.e. sensors as rows, time steps as columns).
# NOTE(review): k-Shape is usually applied to z-normalised series and no
# scaling is done here — confirm whether that is intended.
sensor_data = dfd.values.T

# Step 2: fit KShape with a fixed seed so the run is reproducible.
n_clusters = 30
kshape = KShape(n_clusters=n_clusters, verbose=True, random_state=42)
kshape.fit(sensor_data)

# Step 3: one figure per cluster, overlaying all of its member series.
cluster_labels = kshape.labels_
for cluster_id in range(n_clusters):
    fig, ax = plt.subplots(figsize=(10, 6))
    for member in np.flatnonzero(cluster_labels == cluster_id):
        ax.plot(sensor_data[member], label=f'Sensor {member}')
    ax.set_title(f'Cluster {cluster_id + 1}')  # titles are 1-based
    ax.set_xlabel('Time Step')
    ax.set_ylabel('Sensor Value')
    ax.legend()
    plt.show()


In [None]:
# DTW KMeans clustering of the daily series

import numpy as np
import pandas as pd
from tslearn.clustering import TimeSeriesKMeans
import matplotlib.pyplot as plt

# Step 1: build the input matrix — sensors as rows, time steps as columns.
# No imputation happens in this cell; dfd is used as it currently stands.
sensor_data = dfd.values.T

# Step 2: k-means with the DTW metric, seeded for reproducibility.
n_clusters = 30
dtw_kmeans = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", verbose=True, random_state=42)
dtw_kmeans.fit(sensor_data)

# Step 3: one figure per cluster, overlaying all of its member series.
cluster_labels = dtw_kmeans.labels_
for cluster_id in range(n_clusters):
    fig, ax = plt.subplots(figsize=(10, 6))
    for member in np.flatnonzero(cluster_labels == cluster_id):
        ax.plot(sensor_data[member], label=f'Sensor {member}')
    ax.set_title(f'Cluster {cluster_id + 1}')  # titles are 1-based
    ax.set_xlabel('Time Step')
    ax.set_ylabel('Sensor Value')
    ax.legend()
    plt.show()


In [None]:
# Hourly median of 'value' per station: readings are bucketed by the
# 'YYYY-MM-DD HH' text of 'datetime_converted', then pivoted so that each
# station becomes a column.
hour_bucket = df['datetime_converted'].dt.strftime('%Y-%m-%d %H')
hourly_median = df.groupby([hour_bucket, 'station_name'])['value'].median()
dfh = hourly_median.unstack()

In [None]:
# Write the hourly table to the data directory.
hourly_csv = data_path / 'SOPA-data-TS-hourly-2023-04-01-to-2023-09-30.csv'
dfh.to_csv(hourly_csv)


In [None]:
# Report whether any cell of dfh is missing.
has_missing = dfh.isnull().values.any()
print("DataFrame contains NaN values" if has_missing else "DataFrame does not contain NaN values")

# Show the dimensions and the first rows of dfh for a quick structural check.
print("Shape of dfh:", dfh.shape)
print("Head of dfh:", dfh.head())


In [None]:
import numpy as np
import pandas as pd
from tslearn.clustering import KShape
import matplotlib.pyplot as plt

# Step 1: replace each missing value with the mean of its own column, then
# build the input matrix with sensors as rows and time steps as columns.
column_means = dfh.mean()
dfh_imputed = dfh.fillna(column_means)
sensor_data = dfh_imputed.values.T

# Step 2: fit KShape with a fixed seed so the run is reproducible.
n_clusters = 10
kshape = KShape(n_clusters=n_clusters, verbose=True, random_state=42)
kshape.fit(sensor_data)

# Step 3: one figure per cluster, overlaying all of its member series.
cluster_labels = kshape.labels_
for cluster_id in range(n_clusters):
    fig, ax = plt.subplots(figsize=(10, 6))
    for member in np.flatnonzero(cluster_labels == cluster_id):
        ax.plot(sensor_data[member], label=f'Sensor {member}')
    ax.set_title(f'Cluster {cluster_id + 1}')  # titles are 1-based
    ax.set_xlabel('Time Step')
    ax.set_ylabel('Sensor Value')
    ax.legend()
    plt.show()
