In [None]:
# Configuration for suppressing warnings
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)  # Suppress specific categories as needed

# Importing standard libraries and configuring path
import sys
sys.path.append('..')
sys.path.append('../utils/')

# Importing third-party libraries for data manipulation, machine learning, and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import hiplot as hip
import matplotlib.pyplot as plt
from matplotlib.colors import BoundaryNorm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, scale
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from tqdm import tqdm
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score
from sklearn.datasets import make_blobs
import umap

# Importing Plotly for interactive plotting
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Importing local utilities/modules, assuming these are located in the 'utils' directory
from utils.EDA import *
from utils.Clustering import *

# IPython specific configuration to set the backend for rendering high-resolution images in Jupyter notebooks
%config InlineBackend.figure_formats = ['retina']

In [None]:
plt.style.use('default')

plt.rcParams.update({
    'font.size': 20,
    'axes.linewidth': 2,
    'axes.titlesize': 20,
    'axes.edgecolor': 'black',
    'axes.labelsize': 18,
    'axes.grid': True,
    'lines.linewidth': 1.5,
    'lines.markersize': 6,
    'figure.figsize': (20, 8),
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
    'font.family': 'Times New Roman',
    'legend.fontsize': 13,
    'legend.framealpha': 0.8,
    'legend.edgecolor': 'black',
    'legend.shadow': False,
    'legend.fancybox': True,
    'legend.frameon': True,
})

In [None]:
path_to_dataset = "../3.Feature_Engineering/Datasets/OP6_Features.parquet"
df = pd.read_parquet(path_to_dataset)

### Create sub-categories of the data

In [None]:
# Feature groups used to select columns for the exploratory plots below.
# Axis columns use the lowercase '_axis' suffix ('X_axis', 'Y_axis', 'Z_axis'),
# the same spelling used by the column list of the correlation analysis.

# RMS only. The rolling-variance columns were part of an earlier version of
# this group and are currently left out.
variance_col = ['X_Rolling RMS', 'Y_Rolling RMS', 'Z_Rolling RMS']

non_dim_col = ['X_Rolling Impulse Factor', 'X_Rolling Margin Factor',
               'Y_Rolling Impulse Factor', 'Y_Rolling Margin Factor',
               'Z_Rolling Impulse Factor', 'Z_Rolling Margin Factor']

wavelet_col = ['X_D1', 'X_D2', 'X_D3', 'X_A3',
               'Y_D1', 'Y_D2', 'Y_D3', 'Y_A3',
               'Z_D1', 'Z_D2', 'Z_D3', 'Z_A3']

mean_col = ['X_Rolling Mean', 'X_Rolling Median',
            'Y_Rolling Mean', 'Y_Rolling Median',
            'Z_Rolling Mean', 'Z_Rolling Median']

# Max/Min for all three axes ('Z_Rolling Min' was missing from the Z pair).
min_max_col = ['X_Rolling Max', 'X_Rolling Min',
               'Y_Rolling Max', 'Y_Rolling Min',
               'Z_Rolling Max', 'Z_Rolling Min']

# Raw axis signals and their jerk.
original_col = ['X_axis', 'Y_axis', 'Z_axis', 'X_Jerk', 'Y_Jerk', 'Z_Jerk']

energy_col = ['X_Rolling Energy', 'X_Rolling Energy Entropy',
              'Y_Rolling Energy', 'Y_Rolling Energy Entropy',
              'Z_Rolling Energy', 'Z_Rolling Energy Entropy']

stat_col = ['X_Rolling Skewness', 'X_Rolling Kurtosis',
            'Y_Rolling Skewness', 'Y_Rolling Kurtosis',
            'Z_Rolling Skewness', 'Z_Rolling Kurtosis']

- To avoid redundant features, we identify linear correlations and flag feature pairs whose correlation reaches 0.8 or more as candidates for removal

In [None]:
col = ['X_axis', 'X_Rolling Mean', 'X_Rolling Std', 'X_Rolling Max',
       'X_Rolling Min', 'X_Rolling Median', 'X_Rolling Variance',
       'X_Rolling Skewness', 'X_Rolling Kurtosis', 'X_Rolling RMS',
       'X_Rolling Impulse Factor', 'X_Rolling Margin Factor', 'Y_axis',
       'Y_Rolling Mean', 'Y_Rolling Std', 'Y_Rolling Max', 'Y_Rolling Min',
       'Y_Rolling Median', 'Y_Rolling Variance', 'Y_Rolling Skewness',
       'Y_Rolling Kurtosis', 'Y_Rolling RMS', 'Y_Rolling Impulse Factor',
       'Y_Rolling Margin Factor', 'Z_axis', 'Z_Rolling Mean', 'Z_Rolling Std',
       'Z_Rolling Max', 'Z_Rolling Min', 'Z_Rolling Median',
       'Z_Rolling Variance', 'Z_Rolling Skewness', 'Z_Rolling Kurtosis',
       'Z_Rolling RMS', 'Z_Rolling Impulse Factor', 'Z_Rolling Margin Factor',
       'X_Rolling Energy', 'X_Rolling Energy Entropy',
       'X_Rolling Normalized Energy', 'Y_Rolling Energy',
       'Y_Rolling Energy Entropy', 'Y_Rolling Normalized Energy',
       'Z_Rolling Energy', 'Z_Rolling Energy Entropy',
       'Z_Rolling Normalized Energy', 'X_D1', 'X_D2', 'X_D3', 'X_A3', 'Y_D1',
       'Y_D2', 'Y_D3', 'Y_A3', 'Z_D1', 'Z_D2', 'Z_D3', 'Z_A3', 'X_Jerk',
       'Y_Jerk', 'Z_Jerk']

# Pairs whose correlation magnitude reaches this value are reported.
CORR_THRESHOLD = 0.8

correlation_matrix = df[col].corr()

# Collect every unordered pair exactly once by pairing each feature only with
# the features that come after it. The loop variables are deliberately not
# called `col`, so the column list above survives and the cell can be re-run.
high_corr_pairs = {}
feature_names = list(correlation_matrix.columns)
for position, first_feature in enumerate(feature_names):
    for second_feature in feature_names[position + 1:]:
        corr_value = correlation_matrix.at[second_feature, first_feature]
        # abs(): a strongly negative correlation is as redundant as a strongly
        # positive one. A NaN coefficient never satisfies the comparison.
        if abs(corr_value) >= CORR_THRESHOLD:
            high_corr_pairs[(second_feature, first_feature)] = corr_value

# Display the pairs of columns and their (signed) correlation coefficients
for (feature_a, feature_b), corr_value in high_corr_pairs.items():
    print(f"Pair: {feature_a}, {feature_b} - Correlation: {corr_value}")


In [None]:
plot_vars = ['Z_D3', 'Z_D2', 'Y_D3', 'Y_D2', 'X_D3','Machine','Label', 'Y_Rolling Energy Entropy'] 
visualize_with_hiplot(df[plot_vars].sample(frac=0.01,random_state=0))

Although defect signals are associated with a larger dispersion, once this dispersion is excluded it seems that the defect and normal signals can no longer be separated

In [None]:
selected_columns = energy_col

plot_scatter_matrix_FE(df, machine='M01', process='OP06', cols=selected_columns, sample_frac=0.1, random_state=42)

In [None]:
selected_columns = ['Z_D3', 'Z_D2', 'Y_D3', 'Y_D2', 'X_D3', 'X_Rolling Energy Entropy','Y_Rolling Energy Entropy','Z_Rolling Energy Entropy', 'Y_Rolling RMS']

plot_scatter_matrix_FE(df, machine='M01', process='OP06', cols=selected_columns, sample_frac=0.1, random_state=42)

In [None]:
selected_columns = ['Z_D3', 'Z_D2', 'Y_D3', 'Y_D2', 'X_D3']

plot_scatter_matrix_FE(df, machine='M02', process='OP06', cols=selected_columns, sample_frac=0.1, random_state=42)

In [None]:
selected_columns = variance_col
plot_scatter_matrix_FE(df, machine='M01', process='OP06', cols=selected_columns, sample_frac=0.1, random_state=42)

In [None]:
selected_columns = original_col
plot_scatter_matrix_FE(df, machine='M01', process='OP06', cols=selected_columns, sample_frac=0.1, random_state=42)

# Gaussian Mixture Models

- A mixture model is a probabilistic model that represents a distribution as a mixture of simpler component distributions. In the context of clustering, a mixture model can be used to represent the distribution of data points as a mixture of Gaussian distributions, with each Gaussian representing a separate cluster.<br>

References: <br>
[1] https://www.youtube.com/watch?v=5amKlNtIoT0 <br>
[2] https://behesht.medium.com/unsupervised-learning-clustering-using-gaussian-mixture-model-gmm-c788b280932b

In [None]:
# Keep only machine M01. The explicit copy matters because columns are assigned
# to df1 later on; assigning to a filtered frame without it triggers pandas'
# SettingWithCopyWarning.
df1 = df[df['Machine'] == 'M01'].copy()

In [None]:
n_components = np.arange(1, 14)

columns = ['X_axis', 'Y_axis', 'Z_axis','Z_D3', 'Z_D2', 'Y_D3', 'Y_D2', 'X_D3', 'X_Rolling Energy Entropy','Y_Rolling Energy Entropy','Z_Rolling Energy Entropy', 'Y_Rolling RMS']

results = fit_gmm_evaluate(df1, columns, n_components, random_state=0,covariance_type = 'full')

In [None]:
models = results['models']
bics = results['bics']
log_likelihoods = results['log_likelihoods']
davies_bouldin_indices = results['davies_bouldin_indices']
calinski_harabasz_indices = results['calinski_harabasz_indices']

In [None]:
# 2x2 overview of the GMM model-selection metrics.
# Each entry: (x values, y values, line label, panel title and y-axis label).
# The two cluster-separation indices are plotted without their first entry.
metric_panels = [
    (n_components, bics, 'BIC', 'Bayesian Information Criterion', 'BIC'),
    (n_components, log_likelihoods, 'Log Likelihood', 'Log Likelihood', 'Log Likelihood'),
    # Lower is better
    (n_components[1:], davies_bouldin_indices[1:], 'Davies-Bouldin Index',
     'Davies-Bouldin Index', 'Davies-Bouldin Index'),
    # Higher is better
    (n_components[1:], calinski_harabasz_indices[1:], 'Calinski-Harabasz Index',
     'Calinski-Harabasz Index', 'Calinski-Harabasz Index'),
]

fig, axes = plt.subplots(2, 2, figsize=(18, 10))
for ax, (x_vals, y_vals, line_label, panel_title, y_label) in zip(axes.flat, metric_panels):
    ax.plot(x_vals, y_vals, label=line_label)
    ax.set_title(panel_title)
    ax.set_xlabel('Number of components')
    ax.set_ylabel(y_label)

# A silhouette-coefficient panel existed here only as disabled code; it is not drawn.

fig.tight_layout()
plt.show()

In [None]:
# Fit a 2-component, full-covariance Gaussian mixture on the selected columns of df1
gmm = GaussianMixture(2, covariance_type='full', random_state=0).fit(df1[columns])
# means = gmm.means_
# covariances = gmm.covariances_

# Component index predicted for each row that was used in the fit
labels = gmm.predict(df1[columns])

# Store the cluster assignment on df1 and cast it to object dtype
# (presumably so it is treated as a category rather than a number when plotting)
df1['Clustering_Labels'] = labels
df1['Clustering_Labels'] = df1['Clustering_Labels'].astype('object')

In [None]:
accels_cols = ['Z_D3', 'Z_D2', 'Y_D3', 'Y_D2', 'X_D3', 'X_Rolling Energy Entropy','Y_Rolling Energy Entropy','Y_Rolling RMS']


plotly_scattermatrix(df=df1.sample(frac=0.10,random_state=0), cols=accels_cols,
                     color='Clustering_Labels', category_order={'Clustering_Labels':[0,1]},
                     width=1800, height=1100,
                     label_fontsize=18, legend_fontsize=26,
                     upload=False, filename=None 
                     )

In [None]:
df1['Clustering_Labels'] = labels

def classify_labels_ua(value, bad_labels=(1,)):
    """Translate a cluster label into an anomaly flag.

    Parameters
    ----------
    value
        Cluster label of a single row.
    bad_labels : collection, optional
        Labels that are treated as anomalous. Defaults to ``(1,)``, so a call
        with one argument behaves like the previous hard-coded version.

    Returns
    -------
    str
        ``'Bad'`` when ``value`` is one of ``bad_labels``, otherwise ``'Normal'``.
    """
    return 'Bad' if value in bad_labels else 'Normal'

df1['Anomalies'] = df1['Clustering_Labels'].apply(classify_labels_ua)

In [None]:
plot_vars2 = ['Anomalies','Unique_Code','Z_D3', 'Z_D2', 'Y_D3', 'Y_D2', 'X_D3','Machine','Z_Rolling Energy Entropy', 'Y_Rolling RMS']
visualize_with_hiplot(df1[plot_vars2].sample(frac=0.01,random_state=0))

- https://ravindranathsawane.medium.com/spectral-clustering-algorithm-b469938a8841
- https://github.com/koaning/drawdata?tab=readme-ov-file

What if we first try to apply PCA to separate the data?

In [None]:
features = ['X_axis', 'Y_axis', 'Z_axis','Z_D3', 'Z_D2', 'Y_D3', 'Y_D2', 'X_D3', 'X_Rolling Energy Entropy','Y_Rolling Energy Entropy','Z_Rolling Energy Entropy', 'Y_Rolling RMS']

# Feature matrix with a fresh 0..n-1 index. reset_index returns a new frame
# here instead of mutating the column selection in place.
X = df1[features].reset_index(drop=True)

# Standardise every feature before it is passed to PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
X_scaled = pd.DataFrame(X_scaled, columns=features)

In [None]:
def plot_pca_cumulative_variance(X_scaled, n_components=None):
    """
    Plot the cumulative explained variance of a PCA fitted on the given data.

    A PCA model is fitted on ``X_scaled`` and the running sum of its
    explained-variance ratios is drawn against the number of components.
    The function only draws the figure; it returns nothing.

    Parameters:
    - X_scaled: The scaled input data (e.g., a NumPy array or DataFrame).
    - n_components: Forwarded to ``PCA``; None (the default) means all components.
    """
    fitted_pca = PCA(n_components=n_components).fit(X_scaled)
    running_total = fitted_pca.explained_variance_ratio_.cumsum()
    component_counts = range(1, len(running_total) + 1)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(component_counts, running_total, marker='o', linestyle='--', color='b')
    ax.set_xlabel('Number of Components')
    ax.set_ylabel('Cumulative Explained Variance')
    ax.set_title('PCA - Cumulative Variance Explained by Components')
    ax.grid(True)
    plt.show()

In [None]:
plot_pca_cumulative_variance(X_scaled)

In [None]:
n_components = 8
# random_state pins the result when scikit-learn picks its randomized SVD
# solver, so the components are identical on every run.
pca = PCA(n_components=n_components, random_state=0)
X_pca = pca.fit_transform(X_scaled)
explained_var = pca.explained_variance_ratio_
print("Explained variance for each component:", explained_var)

In [None]:
# One column name per retained principal component: PC1 ... PC<n_components>
columns = [f'PC{i+1}' for i in range(n_components)]

# Slice by n_components rather than a literal 8, so the column names and the
# data keep the same width if the number of components is changed.
pca_df = pd.DataFrame(data=X_pca[:, :n_components], columns=columns)

In [None]:
df1.reset_index(drop=True,inplace=True)

In [None]:
pca_df['Label'] = df1['Label']

In [None]:
pca_df['Label'] = pca_df['Label'].astype('str')

In [None]:
accels_cols = columns

# 'Label' holds strings at this point (it was cast with astype('str')), so the
# category order is derived from the column itself; integer entries such as
# [0, 1] would not match any of the string categories.
label_order = sorted(pca_df['Label'].unique())

plotly_scattermatrix(df=pca_df.sample(frac=0.10,random_state=0), cols=accels_cols,
                     color='Label', category_order={'Label': label_order},
                     width=1800, height=1100,
                     label_fontsize=18, legend_fontsize=26,
                     upload=False, filename=None
                     )

In [None]:
pca_df['Unique_Code'] = df1['Unique_Code']

In [None]:
n_components = np.arange(1, 15)
# All eight principal components ('PC7' was missing and 'PC8' listed twice)
columns = ['PC1','PC2','PC3','PC4','PC5','PC6','PC7','PC8']

results = fit_gmm_evaluate(pca_df, columns, n_components, random_state=0,covariance_type = 'full')

In [None]:
models = results['models']
bics = results['bics']
log_likelihoods = results['log_likelihoods']
davies_bouldin_indices = results['davies_bouldin_indices']
calinski_harabasz_indices = results['calinski_harabasz_indices']

In [None]:
# Same 2x2 metric overview, this time for the GMMs fitted on the principal components.
# Entry layout: (x values, y values, line label, panel title, y-axis label).
panel_specs = (
    (n_components, bics, 'BIC', 'Bayesian Information Criterion', 'BIC'),
    (n_components, log_likelihoods, 'Log Likelihood', 'Log Likelihood', 'Log Likelihood'),
    # Lower is better; the first entry is not plotted
    (n_components[1:], davies_bouldin_indices[1:], 'Davies-Bouldin Index',
     'Davies-Bouldin Index', 'Davies-Bouldin Index'),
    # Higher is better; the first entry is not plotted
    (n_components[1:], calinski_harabasz_indices[1:], 'Calinski-Harabasz Index',
     'Calinski-Harabasz Index', 'Calinski-Harabasz Index'),
)

fig, axes = plt.subplots(2, 2, figsize=(18, 10))
for panel_ax, spec in zip(axes.flat, panel_specs):
    xs, ys, line_label, panel_title, y_label = spec
    panel_ax.plot(xs, ys, label=line_label)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Number of components')
    panel_ax.set_ylabel(y_label)

# A silhouette-coefficient panel existed here only as disabled code; it is not drawn.

fig.tight_layout()
plt.show()

In [None]:
columns

In [None]:
pca_df

In [None]:
# Fit a 4-component, full-covariance Gaussian mixture on the selected columns of pca_df
gmm = GaussianMixture(4, covariance_type='full', random_state=0).fit(pca_df[columns])
# means = gmm.means_
# covariances = gmm.covariances_

# Component index predicted for each row that was used in the fit
labels = gmm.predict(pca_df[columns])

# Store the cluster assignment on pca_df and cast it to object dtype
# (presumably so it is treated as a category rather than a number when plotting)
pca_df['Clustering_Labels'] = labels
pca_df['Clustering_Labels'] = pca_df['Clustering_Labels'].astype('object')

In [None]:
accels_cols = columns

# Derive the category order from the labels actually present, so that every
# mixture component gets an entry (a fixed [0, 1] covers only two clusters).
cluster_order = sorted(pca_df['Clustering_Labels'].unique())

plotly_scattermatrix(df=pca_df.sample(frac=0.10,random_state=0), cols=accels_cols,
                     color='Clustering_Labels', category_order={'Clustering_Labels': cluster_order},
                     width=1800, height=1100,
                     label_fontsize=18, legend_fontsize=26,
                     upload=False, filename=None
                     )

In [None]:
pca_df['Clustering_Labels'] = labels

def classify_labels_ua(value, bad_labels=(3,)):
    """Translate a cluster label into an anomaly flag.

    Parameters
    ----------
    value
        Cluster label of a single row.
    bad_labels : collection, optional
        Labels that are treated as anomalous. Defaults to ``(3,)``, so a call
        with one argument behaves like the previous hard-coded version.

    Returns
    -------
    str
        ``'Bad'`` when ``value`` is one of ``bad_labels``, otherwise ``'Normal'``.
    """
    return 'Bad' if value in bad_labels else 'Normal'

pca_df['Anomalies'] = pca_df['Clustering_Labels'].apply(classify_labels_ua)

In [None]:
pca_df

In [None]:
# Anomaly flag, part identifier and all eight principal components
# ('PC7' was missing and 'PC8' listed twice).
plot_vars2 = ['Anomalies','Unique_Code','PC1','PC2','PC3','PC4','PC5','PC6','PC7','PC8']
# A reproducible 1% sample is passed to hiplot
visualize_with_hiplot(pca_df[plot_vars2].sample(frac=0.01,random_state=0))

Perhaps, because of the behavior of the data, GMM is not the most suitable algorithm for this problem

In [None]:
# `df2` has to exist before the scatter plots that follow can run on a fresh
# kernel; previously it was only defined much further down the notebook.
# NOTE(review): it is built from the M01 subset (`df1`) because the titles of
# the following plots refer to "M01 - OP06" — confirm this is the intended frame.
df2 = df1.assign(Label=df1['Label'].astype('str'))

In [None]:
# Every 100th row, coloured by label. The `labels` mapping is keyed by the
# plotted column names (the previous keys 'Z_axis' / 'Y_axis' are not plotted
# here and therefore had no effect).
fig = px.scatter(df2[::100], x='Z_Rolling Impulse Factor', y='Z_Rolling Margin Factor', color='Label',
                 labels={
                     'Z_Rolling Impulse Factor': 'Z_Rolling Impulse Factor',
                     'Z_Rolling Margin Factor': 'Z_Rolling Margin Factor',
                     'Label': 'Label'
                 },
                 title='IF and MF (Z_axis) axis for M01 - OP06')

fig.update_layout(width=800, height=600)

fig.show()

In [None]:
# Every 100th row, coloured by label. The title and the `labels` mapping now
# describe the columns that are actually plotted (both had been copied from the
# Impulse Factor / Margin Factor figure).
fig = px.scatter(df2[::100], x='Y_Rolling Impulse Factor', y='Z_Rolling Variance', color='Label',
                 labels={
                     'Y_Rolling Impulse Factor': 'Y_Rolling Impulse Factor',
                     'Z_Rolling Variance': 'Z_Rolling Variance',
                     'Label': 'Label'
                 },
                 title='Y Impulse Factor vs Z Variance for M01 - OP06')

fig.update_layout(width=800, height=600)

fig.show()

In [None]:
features = ['X_Rolling Mean', 'X_Rolling Median',
       'X_Rolling Variance', 'X_Rolling Skewness', 'X_Rolling Impulse Factor',
       'X_Rolling Margin Factor', 'Y_Rolling Mean', 'Y_Rolling Median',
       'Y_Rolling Variance', 'Y_Rolling Skewness','Y_Rolling Impulse Factor',
       'Y_Rolling Margin Factor', 'Z_Rolling Mean',
       'Z_Rolling Median','Z_Rolling Variance', 'Z_Rolling Skewness', 'Z_Rolling Crest Factor',
        'Z_Rolling Impulse Factor','Z_Rolling Margin Factor']

X = df[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Two-component PCA. random_state pins the result when scikit-learn picks its
# randomized SVD solver. A stray `PCA()` expression that built an unused model
# has been dropped.
pca = PCA(n_components=2, random_state=0)
X_pca = pca.fit_transform(X_scaled)

explained_var = pca.explained_variance_ratio_
print("Explained variance for each component:", explained_var)

PCA is not well suited to describing this dataset, but let's take a look at how it performs anyway

In [None]:
pca_df = pd.DataFrame(data = X_pca[:, :2], columns = ['PC1', 'PC2'])
pca_df['Label'] = df['Label'].values 

pca_df

In [None]:
pca_df['Label'] = pca_df['Label'].astype('str')

In [None]:
# Every 100th row of the PCA projection, coloured by label
# (the PC2 axis was previously labelled 'PC1').
fig = px.scatter(pca_df[::100], x='PC1', y='PC2', color='Label',
                 labels={
                     'PC1': 'PC1',
                     'PC2': 'PC2'
                 },
                 title='PCA')
fig.show()

# U-MAP

In [None]:
features = ['X_Rolling Variance', 'X_Rolling Skewness', 'X_Rolling Impulse Factor',
       'X_Rolling Margin Factor', 'Y_Rolling Variance', 'Y_Rolling Skewness','Y_Rolling Impulse Factor',
       'Y_Rolling Margin Factor', 'Z_Rolling Variance', 'Z_Rolling Skewness', 'Z_Rolling Crest Factor',
        'Z_Rolling Impulse Factor','Z_Rolling Margin Factor']

X = df[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.iloc[::100])

In [None]:
target = df['Label'].iloc[::100].values  

In [None]:
# Passing y=target hands the labels to UMAP, i.e. a label-guided (supervised) fit.
# NOTE(review): any class separation visible in this embedding is therefore
# influenced by the labels themselves — confirm this is intended for an
# exploratory, unsupervised analysis.
reducer = umap.UMAP(random_state=42)
X_umap = reducer.fit_transform(X_scaled, y=target)

In [None]:
embedding = reducer.embedding_

In [None]:
from matplotlib.colors import BoundaryNorm

# Create a figure with a single subplot
fig, ax = plt.subplots(1, figsize=(8, 6))

# Boundaries for the colour bar (an extra boundary is included so that all values are covered)
boundaries = [0, 0.5, 1]
norm = BoundaryNorm(boundaries, ncolors=256, clip=True)

# Scatter plot of the embedding, coloured by target
scatter = ax.scatter(*embedding.T, s=0.1, c=target, cmap='Spectral', norm=norm, alpha=1.0)

# Add the colour bar with discrete boundaries
cbar = plt.colorbar(scatter, ax=ax, ticks=[0, 1])
cbar.set_label('Target')

# Show the plot
plt.show()

# Wavelet Features

In [None]:
path_to_dataset = "../3.Feature_Engineering/Datasets/DF_Features.parquet"
df = pd.read_parquet(path_to_dataset)

In [None]:
df.columns

In [None]:
# Visually selected features (free-style pick)

selected_columns = ['X_D1','X_D2', 'X_D3', 'X_A3', 'Y_D1', 'Y_D2', 'Y_D3', 'Y_A3', 'Z_D1', 'Z_D2',
       'Z_D3', 'Z_A3','Label']
# Explicit copy: the 'Label' column of df2 is reassigned afterwards, which on a
# plain column selection triggers pandas' SettingWithCopyWarning.
df2 = df[selected_columns].copy()

In [None]:
df2['Label'] = df2['Label'].astype('str')

In [None]:
# Every 100th row, coloured by label. The `labels` mapping is keyed by the
# plotted column names (the previous keys 'X_axis' / 'Y_axis' are not plotted
# here and therefore had no effect).
fig = px.scatter(df2[::100], x='Y_D3', y='Z_D3', color='Label',
                 labels={
                     'Y_D3': 'Y_D3',
                     'Z_D3': 'Z_D3',
                     'Label': 'Label'
                 },
                 title='D3 for M01 - OP06')

fig.update_layout(width=800, height=600)

fig.show()

In [None]:
scaler = StandardScaler()
features = ['X_D1','X_D2', 'X_D3', 'X_A3', 'Y_D1', 'Y_D2', 'Y_D3', 'Y_A3', 'Z_D1', 'Z_D2',
       'Z_D3', 'Z_A3', 'X_Rolling Variance', 'X_Rolling Skewness', 'X_Rolling Impulse Factor',
       'X_Rolling Margin Factor', 'Y_Rolling Variance', 'Y_Rolling Skewness','Y_Rolling Impulse Factor',
       'Y_Rolling Margin Factor', 'Z_Rolling Variance', 'Z_Rolling Skewness', 'Z_Rolling Crest Factor',
        'Z_Rolling Impulse Factor','Z_Rolling Margin Factor', 'X_Rolling Median', 'Y_Rolling Median', 'Z_Rolling Median']
X = df[features]
X_scaled = scaler.fit_transform(X)

In [None]:
# Two-component PCA. random_state pins the result when scikit-learn picks its
# randomized SVD solver. A stray `PCA()` expression that built an unused model
# has been dropped.
pca = PCA(n_components=2, random_state=0)
X_pca = pca.fit_transform(X_scaled)

explained_var = pca.explained_variance_ratio_
print("Explained variance for each component:", explained_var)

Low explained variance for the first 2 components

In [None]:
pca_df = pd.DataFrame(data = X_pca[:, :2], columns = ['PC1', 'PC2'])
pca_df['Label'] = df2['Label'].values 

pca_df

In [None]:
# Every 100th row of the PCA projection, coloured by label
# (the PC2 axis was previously labelled 'PC1').
fig = px.scatter(pca_df[::100], x='PC1', y='PC2', color='Label',
                 labels={
                     'PC1': 'PC1',
                     'PC2': 'PC2'
                 },
                 title='PCA')
fig.show()

In [None]:
pca_df['Label'] = pca_df['Label'].astype('str')

### U-MAP

In [None]:
X = df[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.iloc[::100])

In [None]:
#target = df['Label'].iloc[::100].values  

reducer = umap.UMAP(random_state=40, n_neighbors=10)
X_umap = reducer.fit_transform(X_scaled,)

embedding = reducer.embedding_

In [None]:
# Create a figure with a single subplot
fig, ax = plt.subplots(1, figsize=(8, 6))

# Labels of every 100th row, used only to colour the points
target = df['Label'].iloc[::100].values  

# Boundaries for the colour bar (an extra boundary is included so that all values are covered)
boundaries = [0, 0.5, 1]
norm = BoundaryNorm(boundaries, ncolors=256, clip=True)

# Scatter plot of the embedding, coloured by target
scatter = ax.scatter(*embedding.T, s=0.1, c=target, cmap='Spectral', norm=norm, alpha=1.0)

# Add the colour bar with discrete boundaries
cbar = plt.colorbar(scatter, ax=ax, ticks=[0, 1])
cbar.set_label('Target')

# Show the plot
plt.show()

In [None]:
umap_data = pd.DataFrame(X_umap, columns=['UMAP1', 'UMAP2'])
umap_data['Label'] = df['Label'].iloc[::100].values  
umap_data['Label'] = umap_data['Label'].astype('str')

In [None]:
fig = px.scatter(umap_data, x='UMAP1', y='UMAP2', color='Label', title='UMAP Projection - 2d')
fig.show()

In [None]:
X_scaled

In [None]:
num_clusters = 2
km = KMeans(n_clusters=num_clusters,random_state=10,n_init=1) # n_init, number of times the K-mean algorithm will run
# X_scaled is the NumPy array returned by StandardScaler.fit_transform, so it
# cannot be indexed with column names; it already contains exactly the
# `features` columns.
km.fit(X_scaled)

In [None]:
labels_pred = km.labels_

# K-means was fitted on every 100th row, while X_pca holds one row per sample
# of the full frame. Take the same stride from the PCA projection and from the
# reference labels so that every boolean mask matches the array it indexes.
X_pca_sub = X_pca[::100]
true_labels = df['Label'].iloc[::100].to_numpy()
assert len(X_pca_sub) == len(labels_pred) == len(true_labels), "subsampled arrays are misaligned"

# Plotting PCA results with K-means labels
plt.figure(figsize=(14, 6))

# First subplot: PCA projection coloured by K-means cluster
plt.subplot(1, 2, 1)
for i in range(num_clusters):
    in_cluster = labels_pred == i
    plt.scatter(X_pca_sub[in_cluster, 0], X_pca_sub[in_cluster, 1], label=f'Cluster {i}')
plt.title('PCA with K-means Clusters')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()

# Second subplot: same projection coloured by the dataset's 'Label' column
# (the frame has no 'Existing_Labels' column)
plt.subplot(1, 2, 2)
for label in pd.unique(true_labels):
    has_label = true_labels == label
    plt.scatter(X_pca_sub[has_label, 0], X_pca_sub[has_label, 1], label=f'Label {label}')
plt.title('PCA with Existing Labels')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()

plt.tight_layout()
plt.show()

K-means clustering is one of the simplest clustering algorithms.  Two of its limitations are that the result depends on the starting points of the clusters, and that the number of clusters needs to be defined beforehand.


### Cluster starting points
Let's start by creating a simple dataset.


Let's now group this data into two clusters.  We will use two different random states to initialize the algorithm. Setting the __[random state](https://numpy.org/doc/stable/reference/random/legacy.html?utm_medium=Exinfluencer&utm_source=Exinfluencer&utm_content=000026UJ&utm_term=10006555&utm_id=NA-SkillsNetwork-Channel-SkillsNetworkCoursesIBMML0187ENSkillsNetwork821-2023-01-01#numpy.random.RandomState)__ variable is useful for testing and allows us to seed the randomness (so we get the same results each time).


Clustering with a random state of 10:


In [None]:
# Toy dataset for this section: 20 points evenly spaced on the unit circle.
# NOTE(review): the text above announces a "simple dataset", but no cell in the
# notebook creates it, so X would still be the feature DataFrame of the
# previous section. This definition is a reconstruction — confirm it matches
# the intended data.
angle = np.linspace(0, 2 * np.pi, 20, endpoint=False)
X = np.column_stack([np.cos(angle), np.sin(angle)])

num_clusters = 2
km = KMeans(n_clusters=num_clusters,random_state=10,n_init=1) # n_init, number of times the K-mean algorithm will run
km.fit(X)
display_cluster(X,km,num_clusters)

Clustering with a random state of 20:


In [None]:
km = KMeans(n_clusters=num_clusters,random_state=20,n_init=1)
km.fit(X)
display_cluster(X,km,num_clusters)

## Question:

Why are the clusters different when we run  the K-means twice?



It's because the starting points of the cluster centers have an impact on where the final clusters lie.  The starting point of the clusters is controlled by the random state.


### Determining the optimum number of clusters

Let's create a new dataset that visually consists of a few clusters and try to group them.


In [None]:
n_samples = 1000
n_bins = 4  
centers = [(-3, -3), (0, 0), (3, 3), (6, 6)]
X, y = make_blobs(n_samples=n_samples, n_features=2, cluster_std=1.0,
                  centers=centers, shuffle=False, random_state=42)
display_cluster(X)

How many clusters do you observe?


Let's run K-means with seven clusters.


In [None]:
num_clusters = 7
km = KMeans(n_clusters=num_clusters)
km.fit(X)
display_cluster(X,km,num_clusters)

Now let's re-run the algorithm with four clusters.


In [None]:
num_clusters = 4
km = KMeans(n_clusters=num_clusters)
km.fit(X)
display_cluster(X,km,num_clusters)

Should we use four or seven clusters?  


- In this case it may be visually obvious that four clusters is better than seven.  
- This is because we can easily view the data in two dimensional space.  
- However, real world data usually has more than two dimensions.  
- A dataset with a higher dimensional space is hard to visualize.  
- A way of solving this is to plot the **inertia** 

**inertia**: (sum of squared error between each point and its cluster center) as a function of the number of clusters. 


In [None]:
km.inertia_

### Problem 1:

Write code that calculates the inertia for 1 to 10 clusters, and plot the inertia as a function of the number of clusters.


In [None]:
### BEGIN SOLUTION
# Inertia (within-cluster sum of squared distances) for 1 to 10 clusters.
# A fixed random_state makes the elbow curve reproducible across runs.
inertia = []
list_num_clusters = list(range(1,11))
for num_clusters in list_num_clusters:
    km = KMeans(n_clusters=num_clusters, random_state=0)
    km.fit(X)
    inertia.append(km.inertia_)

plt.plot(list_num_clusters,inertia)
plt.scatter(list_num_clusters,inertia)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia');
### END SOLUTION

Where does the elbow of the curve occur?

What do you think the inertia would be if you have the same number of clusters and data points?


### Clustering Colors from an Image


The next few exercises use an image of bell peppers. Let's start by loading it:


In [None]:
import io
import urllib.request

# Current Matplotlib releases no longer accept a URL in plt.imread, so the
# bytes are downloaded first and decoded from an in-memory buffer.
IMAGE_URL = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML0187EN-SkillsNetwork/labs/module%201/images/peppers.jpg'
with urllib.request.urlopen(IMAGE_URL) as response:
    img = plt.imread(io.BytesIO(response.read()), format='jpeg')
plt.imshow(img)
plt.axis('off');

In [None]:
img.shape

The image above has 480 pixels in height and 640 pixels in width.  Each pixel has 3 values that represent how much red, green and blue it has. Below you can play with different combinations of RGB to create different colors. In total, you can create $256^3 = 16,777,216$ unique colors.


In [None]:
# assign values for the RGB.  Each value should be between 0 and 255
R = 35
G = 95
B = 131
plt.imshow([[np.array([R,G,B]).astype('uint8')]])
plt.axis('off')

First we will reshape the image into a table that has a pixel per row and each column represents the red, green and blue channel.


In [None]:
img_flat = img.reshape(-1, 3)
img_flat[:5,:]

Since there are 480x640 pixels we get 307,200 rows! 


In [None]:
img_flat.shape

Let's run K-means with 8 clusters.


In [None]:
kmeans = KMeans(n_clusters=8, random_state=0).fit(img_flat)

Now let's replace each row with its closest cluster center.


In [None]:
# Look up the cluster centre of every pixel in a single indexing step, then
# cast back to the dtype of the flattened image (the same cast the in-place
# assignment into a copy of img_flat performs).
img_flat2 = kmeans.cluster_centers_[kmeans.labels_].astype(img_flat.dtype)

We now need to reshape the data from 307,200 x 3 to 480 x 640 x 3


In [None]:
img2 = img_flat2.reshape(img.shape)
plt.imshow(img2)
plt.axis('off');

### Problem 2:
Write a function that receives the image and number of clusters (k), and returns (1) the image quantized into k colors, and (2) the inertia.


In [None]:
### BEGIN SOLUTION
def image_cluster(img, k):
    """Quantise an image to ``k`` colours with K-means.

    Parameters
    ----------
    img : numpy.ndarray
        Image array of shape (height, width, channels). Any channel count is
        accepted; a 2-D (height, width) array is treated as a single channel.
    k : int
        Number of clusters, i.e. colours in the quantised image.

    Returns
    -------
    tuple
        ``(img2, inertia)`` where ``img2`` has the shape and dtype of ``img``
        with every pixel replaced by its cluster centre, and ``inertia`` is the
        ``inertia_`` of the fitted K-means model.
    """
    # One row per pixel, one column per channel (no hard-coded 3 channels)
    n_channels = 1 if img.ndim == 2 else img.shape[-1]
    img_flat = img.reshape(-1, n_channels)
    kmeans = KMeans(n_clusters=k, random_state=0).fit(img_flat)

    # Replace each pixel by its cluster centre in one indexing step; the cast
    # back to the image dtype matches what in-place assignment would do.
    img_flat2 = kmeans.cluster_centers_[kmeans.labels_].astype(img.dtype)

    img2 = img_flat2.reshape(img.shape)
    return img2, kmeans.inertia_
### END SOLUTION

### Problem 3:

Call the function for k between 2 and 20, and draw an inertia curve. What is the optimum number of clusters?


In [None]:
### BEGIN SOLUTION
# Even k values from 2 to 20
k_vals = list(range(2,21,2))
# image_cluster returns (quantised image, inertia) for each k; split the
# results into the two parallel lists used by the plots that follow.
clustered = [image_cluster(img, k) for k in k_vals]
img_list = [quantized for quantized, _ in clustered]
inertia = [sse for _, sse in clustered]

In [None]:
# Plot to find optimal number of clusters
plt.plot(k_vals,inertia)
plt.scatter(k_vals,inertia)
plt.xlabel('k')
plt.ylabel('Inertia');
### END SOLUTION

Sometimes, the elbow method does not yield a clear decision (for example, if the elbow is not clear and sharp, or is ambiguous).  In such cases, alternatives such as the [silhouette coefficient](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html?utm_medium=Exinfluencer&utm_source=Exinfluencer&utm_content=000026UJ&utm_term=10006555&utm_id=NA-SkillsNetwork-Channel-SkillsNetworkCoursesIBMML0187ENSkillsNetwork821-2023-01-01) can be helpful.


### Problem 4:
Plot in a grid all the images for the different k values.


In [None]:
### BEGIN SOLUTION
# Two images per row. The number of rows and the figure height follow the
# number of k values instead of being fixed to a 5x2 grid.
n_cols = 2
n_rows = -(-len(k_vals) // n_cols)  # ceiling division
plt.figure(figsize=[10, 4 * n_rows])
for i, (k, quantized) in enumerate(zip(k_vals, img_list)):
    plt.subplot(n_rows, n_cols, i + 1)
    plt.imshow(quantized)
    plt.title('k = '+ str(k))
    plt.axis('off');
### END SOLUTION

---
### Machine Learning Foundation (C) 2020 IBM Corporation
