In [None]:
# Configuration for suppressing warnings
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)  # Suppress specific categories as needed

# Importing standard libraries and configuring path
import sys
sys.path.append('..')
sys.path.append('../utils/')

# Importing third-party libraries for data manipulation, machine learning, and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import hiplot as hip
import matplotlib.pyplot as plt
from matplotlib.colors import BoundaryNorm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, scale
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from tqdm import tqdm
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score
from sklearn.datasets import make_blobs
import umap

# Importing Plotly for interactive plotting
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Importing local utilities/modules, assuming these are located in the 'utils' directory
from utils.EDA import *
from utils.Clustering import *

# IPython specific configuration to set the backend for rendering high-resolution images in Jupyter notebooks
%config InlineBackend.figure_formats = ['retina']

In [None]:
# Reset to matplotlib's stock style first so the overrides below always start
# from the same baseline.
plt.style.use('default')

plt.rcParams.update({
    # Figure
    'figure.figsize': (20, 8),

    # Fonts
    'font.family': 'Times New Roman',
    'font.size': 20,

    # Axes
    'axes.titlesize': 20,
    'axes.labelsize': 18,
    'axes.linewidth': 2,
    'axes.edgecolor': 'black',
    'axes.grid': True,

    # Ticks
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,

    # Lines and markers
    'lines.linewidth': 1.5,
    'lines.markersize': 6,

    # Legend
    'legend.fontsize': 13,
    'legend.frameon': True,
    'legend.fancybox': True,
    'legend.shadow': False,
    'legend.framealpha': 0.8,
    'legend.edgecolor': 'black',
})

In [None]:
# Parquet file with the OP6 feature table; the path is relative to this notebook's folder.
path_to_dataset = "../3.Feature_Engineering/Datasets/OP6_Features.parquet"

df = pd.read_parquet(path=path_to_dataset)

In [None]:
# Keep only the rows of machine M01 and renumber them 0..n-1.
# Chaining reset_index (instead of inplace=True on the filtered frame) keeps the
# cell idempotent and avoids mutating an object derived from `df`.
df1 = df[df['Machine'] == 'M01'].reset_index(drop=True)

In [None]:
# Feature columns used throughout the notebook (scatter matrix, scaling, UMAP, hiplot).
features = [
    'Z_D3',
    'Z_D2',
    'Y_D3',
    'Y_D2',
    'X_D3',
    'X_Rolling Energy Entropy',
    'Y_Rolling Energy Entropy',
    'Z_Rolling Energy Entropy',
    'Y_Rolling RMS',
]

# Pairwise scatter matrix drawn on a 10% sample with a fixed random_state.
plot_scatter_matrix_FE(
    df,
    machine='M01',
    process='OP06',
    cols=features,
    sample_frac=0.1,
    random_state=42,
)

- Spectral clustering algorithm (article): https://ravindranathsawane.medium.com/spectral-clustering-algorithm-b469938a8841
- drawdata (GitHub repository): https://github.com/koaning/drawdata?tab=readme-ov-file

# UMAP

In [None]:
# Feature matrix for machine M01 (all rows); only every 100th row is scaled and
# carried forward in this section.
X = df1[features]

# Labels for the same every-100th-row subsample, used to colour the embeddings.
target = df1['Label'].iloc[::100].values

# Standardise the subsample; the scaler is fitted on the subsample itself.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.iloc[::100])

In [None]:
# UMAP with library-default hyperparameters and a fixed seed.
# n_jobs=1 is passed explicitly to match the other UMAP calls in this notebook,
# which all pair random_state with n_jobs=1.
reducer = umap.UMAP(random_state=42, n_jobs=1)
X_umap = reducer.fit_transform(X_scaled)

In [None]:
# Keep a handle on the fitted reducer's embedding_ attribute; the scatter plot
# in the next cell reads it under the name `embedding`.
embedding = reducer.embedding_

In [None]:
fig, ax = plt.subplots(1, figsize=(8, 6))

# Two colour bins split at 0.5, so values of `target` on either side of 0.5 map
# to opposite ends of the colormap.
boundaries = [0, 0.5, 1]
norm = BoundaryNorm(boundaries, ncolors=256, clip=True)

scatter = ax.scatter(*embedding.T, s=0.1, c=target, cmap='Spectral', norm=norm, alpha=1.0)

# Title and axis labels so the figure can be read on its own.
ax.set_title('UMAP embedding coloured by Label')
ax.set_xlabel('UMAP 1')
ax.set_ylabel('UMAP 2')

cbar = plt.colorbar(scatter, ax=ax, ticks=[0, 1])
cbar.set_label('Target')

plt.show()

In [None]:
neighbor_values = [5, 10, 15, 20, 30]  # n_neighbors settings to compare

# squeeze=False always returns a 2-D axes array, so flattening it gives one axis
# per setting without special-casing a single-entry list.
fig, axs = plt.subplots(
    len(neighbor_values), 1,
    figsize=(8, 6 * len(neighbor_values)),
    squeeze=False,
)
axs = axs.ravel()

# Same two-bin colour mapping (split at 0.5) for every panel.
boundaries = [0, 0.5, 1]
norm = BoundaryNorm(boundaries, ncolors=256, clip=True)

for ax, n_neighbors in zip(axs, neighbor_values):
    # Refit UMAP from scratch for each neighbourhood size (fixed seed, single job).
    reducer = umap.UMAP(n_neighbors=n_neighbors, random_state=42, n_jobs=1)
    X_umap = reducer.fit_transform(X_scaled)
    embedding = reducer.embedding_

    scatter = ax.scatter(*embedding.T, s=0.1, c=target, cmap='Spectral', norm=norm, alpha=1.0)
    ax.set_title(f'UMAP with n_neighbors = {n_neighbors}')
    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')

# Only the last panel carries the colour bar, as in the loop's final iteration.
cbar = plt.colorbar(scatter, ax=ax, ticks=[0, 1])
cbar.set_label('Target')

plt.tight_layout()
plt.show()

In [None]:
def fit_gmm_evaluate(data, n_components_range, random_state=0, covariance_type='diag'):
    """
    Fit Gaussian Mixture Models for a range of component numbers and evaluate using several metrics.

    Parameters:
        data (np.ndarray): Data to fit the models on.
        n_components_range (iterable of int): Component counts to fit, one model per value.
        random_state (int): Random state for reproducibility of the models.
        covariance_type (str): Passed straight to GaussianMixture (default 'diag').

    Returns:
        dict: One list per key, each aligned with ``n_components_range``:
            "models"                     - the fitted GaussianMixture objects
            "bics"                       - ``gmm.bic(data)``
            "log_likelihoods"            - ``gmm.score(data) * len(data)``
            "davies_bouldin_indices"     - Davies-Bouldin score of the hard labels, or None
            "calinski_harabasz_indices"  - Calinski-Harabasz score of the hard labels, or None
        The two cluster-validity entries are None whenever the predicted labels
        do not form a partition those scores can evaluate (see below).
    """
    # Storage for models and metrics
    models = []
    bics = []
    log_likelihoods = []
    davies_bouldin_indices = []
    calinski_harabasz_indices = []

    n_samples = len(data)

    # Fit models and compute metrics
    for n in tqdm(n_components_range, desc='Fitting Models'):
        gmm = GaussianMixture(n_components=n, covariance_type=covariance_type, random_state=random_state).fit(data)
        models.append(gmm)
        bics.append(gmm.bic(data))
        # score() is a per-sample average, so scale it by the sample count.
        log_likelihoods.append(gmm.score(data) * n_samples)

        # Hard cluster assignment for the label-based scores
        labels = gmm.predict(data)

        # Guard on the clusters that were actually populated, not on the requested
        # component count: a mixture with n > 1 can still assign every sample to a
        # single component, and the sklearn scores raise ValueError unless
        # 2 <= n_labels <= n_samples - 1.
        n_labels_found = np.unique(labels).size
        if 1 < n_labels_found < n_samples:
            davies_bouldin_indices.append(davies_bouldin_score(data, labels))
            calinski_harabasz_indices.append(calinski_harabasz_score(data, labels))
        else:
            davies_bouldin_indices.append(None)
            calinski_harabasz_indices.append(None)

    return {
        "models": models,
        "bics": bics,
        "log_likelihoods": log_likelihoods,
        "davies_bouldin_indices": davies_bouldin_indices,
        "calinski_harabasz_indices": calinski_harabasz_indices
    }

In [None]:
# Define the range of neighbors to evaluate
neighbor_values = [5, 10, 15, 20, 30]

# Component counts tried for every embedding. Defined once so the fitting loop
# and the x-axis of the plots cannot drift apart.
n_components = range(2, 11)

# Storage for evaluation metrics, keyed by n_neighbors
all_gmm_metrics = {}

for n_neighbors in neighbor_values:
    reducer = umap.UMAP(n_neighbors=n_neighbors, random_state=42, n_jobs=1)
    X_umap = reducer.fit_transform(X_scaled)

    # Fit GMM and evaluate (fit_gmm_evaluate's default covariance_type is used)
    all_gmm_metrics[n_neighbors] = fit_gmm_evaluate(X_umap, n_components, random_state=42)

# One panel per metric: (key in the metrics dict, panel title, y-axis label)
metric_panels = [
    ("bics", 'BIC Scores for Different UMAP Embeddings', 'BIC'),
    ("log_likelihoods", 'Log Likelihood Scores for Different UMAP Embeddings', 'Log Likelihood'),
    ("davies_bouldin_indices", 'Davies-Bouldin Index for Different UMAP Embeddings', 'Davies-Bouldin Index'),
    ("calinski_harabasz_indices", 'Calinski-Harabasz Index for Different UMAP Embeddings', 'Calinski-Harabasz Index'),
]

# Plot the GMM evaluation metrics for each embedding
fig, axs = plt.subplots(len(metric_panels), 1, figsize=(10, 24))

for ax, (metric_key, panel_title, y_label) in zip(axs, metric_panels):
    # One curve per n_neighbors setting
    for n_neighbors in neighbor_values:
        ax.plot(
            n_components,
            all_gmm_metrics[n_neighbors][metric_key],
            label=f'n_neighbors = {n_neighbors}',
        )

    # Panel decoration is set once per axis rather than once per curve.
    ax.set_title(panel_title)
    ax.set_xlabel('Number of Components')
    ax.set_ylabel(y_label)
    ax.legend()

plt.tight_layout()
plt.show()


In [None]:
# Rebuild X_scaled and target on every row of df1; they replace the
# every-100th-row versions created earlier in the notebook.
X = df1[features]
target = df1['Label'].values

# A fresh scaler, fitted on the complete feature matrix this time.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# UMAP in batches

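The dataset is too large to fit UMAP on all of it at once, so UMAP is fitted on a subsample (every 100th row) and the full dataset is then projected into that embedding in batches with `transform`.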

In [None]:
# Fit UMAP on a subset of the data: every `fit_stride`-th row of the scaled matrix.
# (The previous `subset_size` constant was never used; the stride decides the subset.)
fit_stride = 100  # raise this to fit on fewer rows if memory is tight
umap_model = umap.UMAP(n_neighbors=15, random_state=42, n_jobs=1)
umap_model.fit(X_scaled[::fit_stride])

# Function to transform data in batches
def transform_in_batches(umap_model, data, batch_size=10000):
    """
    Project ``data`` with an already-fitted model, ``batch_size`` rows at a time.

    Parameters:
        umap_model: Fitted object exposing ``transform(batch)``.
        data: Sliceable, sized collection of rows (e.g. a 2-D array).
        batch_size (int): Maximum number of rows passed to ``transform`` per call.

    Returns:
        np.ndarray: The per-batch outputs stacked vertically in input order. For
        empty input, an array with zero rows whose column count is taken from
        ``umap_model.n_components`` when that attribute exists (0 otherwise).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    n_rows = len(data)
    if n_rows == 0:
        # np.vstack cannot stack an empty list, so return an empty result directly.
        return np.empty((0, getattr(umap_model, 'n_components', 0)))

    embeddings = [
        umap_model.transform(data[start:start + batch_size])
        for start in tqdm(range(0, n_rows, batch_size), desc='Transforming batches')
    ]
    return np.vstack(embeddings)

# Project every row of X_scaled through the fitted model, batch by batch
# (default batch size of transform_in_batches).
X_umap = transform_in_batches(umap_model=umap_model, data=X_scaled)

In [None]:
# Display the array returned by transform_in_batches as a quick sanity check.
X_umap

In [None]:
# Cluster the X_umap embedding computed in an earlier cell. (An alternative that
# was tried here - refitting a 15-neighbour UMAP directly on X_scaled - is
# disabled.)
#
# NOTE(review): this model uses covariance_type='full', while the model-selection
# sweep calls fit_gmm_evaluate with its default 'diag' - confirm that is intended.
n_components = 3

gmm = GaussianMixture(
    n_components=n_components,
    covariance_type='full',
    random_state=42,
).fit(X_umap)

# Hard cluster assignment for every embedded row
labels = gmm.predict(X_umap)

In [None]:
# Embedding coordinates plus the GMM cluster of each row. The cluster id is
# stored as a string column.
df_umap = (
    pd.DataFrame(X_umap, columns=['UMAP1', 'UMAP2'])
    .assign(Cluster=labels)
    .astype({'Cluster': 'str'})
)

In [None]:
# Visualize with Plotly
# 'Cluster' holds strings, so Plotly Express treats it as a categorical colour:
# the continuous colour scale / colour-bar settings have no effect, and the key
# is a legend. category_orders pins the legend order to the sorted cluster ids.
# (This figure was previously drawn by two identical consecutive cells.)
fig = px.scatter(
    df_umap,
    x='UMAP1',
    y='UMAP2',
    color='Cluster',
    title='UMAP Embedding with GMM Clustering (3 Components)',
    category_orders={'Cluster': sorted(df_umap['Cluster'].unique())},
)
fig.update_layout(legend_title_text='Cluster')
fig.show()

In [None]:
# Inspect the M01 subset of the feature table.
df1

In [None]:
# List the columns available in df1 (used to pick the extra columns below).
df1.columns

In [None]:
# Feature columns plus identifier and label, keeping every 100th row of df1.
# All columns are selected in one indexing step and copied explicitly, rather
# than adding columns to a slice of df1, which can trigger pandas'
# SettingWithCopyWarning.
df_original = df1[features + ['Unique_Code', 'Label']].iloc[::100].copy()

In [None]:
# Renumber the subsampled rows 0..n-1, discarding the old index.
df_original = df_original.reset_index(drop=True)

In [None]:
# df_umap has one row per row of df1, whereas df_original keeps only every 100th
# row and has been re-indexed 0..n-1. A plain index-aligned assignment would pair
# subsample row i with the cluster of full-data row i instead of row 100 * i, so
# take the matching rows positionally.
cluster_every_100th = df_umap['Cluster'].iloc[::100].to_numpy()
assert len(cluster_every_100th) == len(df_original), "df_umap and df_original are out of step"
df_original['Cluster'] = cluster_every_100th

In [None]:
# Confirm that the 'Cluster' column has been added to df_original.
df_original.columns

In [None]:
# Inspect the subsampled frame with features, identifier, label and cluster.
df_original

In [None]:
# Columns passed to the HiPlot view: the nine features followed by the
# identifier, the cluster and the label.
plot_vars2 = [
    'Z_D3', 'Z_D2', 'Y_D3', 'Y_D2', 'X_D3',
    'X_Rolling Energy Entropy',
    'Y_Rolling Energy Entropy',
    'Z_Rolling Energy Entropy',
    'Y_Rolling RMS',
    'Unique_Code',
    'Cluster',
    'Label',
]

# Draw a 1% random sample with a fixed seed and hand it to the HiPlot helper.
visualize_with_hiplot(
    df_original[plot_vars2].sample(frac=0.01, random_state=0)
)

# Spectral Clustering

In [None]:
# Reference: https://towardsdatascience.com/spectral-clustering-aba2640c0d5b