In [3]:
#Data analysis libraries
import numpy as np 
import pandas as pd 
import itertools

#Visualization and statistics libraries
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.pyplot import style
from scipy import fftpack
import seaborn as sns
style.use('seaborn')

# Model related libraries
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

In [4]:
bearing = pd.read_csv("df_96features.csv", low_memory=False)

In [5]:
bearing.drop([''])

Index(['hert_std', 'hert_median', 'hert_max', 'hert_min', 'hert_entropy',
       'hert_impulse', 'hert_margin', 'hert_frequence', 'hert_MS_F',
       'hert_RMQ_F', 'hert_RV_F', 'hert_crest_F', 'w_std', 'w_median', 'w_max',
       'w_min', 'w_entropy', 'w_impulse', 'w_margin', 'w_frequence', 'w_MS_F',
       'w_RMQ_F', 'w_RV_F', 'w_crest_F', 'a1_x_std', 'a1_x_median', 'a1_x_max',
       'a1_x_min', 'a1_x_entropy', 'a1_x_impulse', 'a1_x_margin',
       'a1_x_frequence', 'a1_x_MS_F', 'a1_x_RMQ_F', 'a1_x_RV_F',
       'a1_x_crest_F', 'a2_x_std', 'a2_x_median', 'a2_x_max', 'a2_x_min',
       'a2_x_entropy', 'a2_x_impulse', 'a2_x_margin', 'a2_x_frequence',
       'a2_x_MS_F', 'a2_x_RMQ_F', 'a2_x_RV_F', 'a2_x_crest_F', 'a1_y_std',
       'a1_y_median', 'a1_y_max', 'a1_y_min', 'a1_y_entropy', 'a1_y_impulse',
       'a1_y_margin', 'a1_y_frequence', 'a1_y_MS_F', 'a1_y_RMQ_F', 'a1_y_RV_F',
       'a1_y_crest_F', 'a2_y_std', 'a2_y_median', 'a2_y_max', 'a2_y_min',
       'a2_y_entropy', 'a2_y_impul

In [1]:
combinations = list(itertools.combinations(bearing.columns,2))
print(combinations)


# add cluster index to dataframe

bearing = bearing[['a1_x_entropy','a1_y_entropy','a1_z_entropy', 'hert_median', 'hert_max', 'hert_min',
                  'a1_x_median', 'a1_x_max','a1_x_min', 'a1_x_entropy']]
scaler = StandardScaler()
bearing = pd.DataFrame(scaler.fit_transform(bearing), index=bearing.index, columns=bearing.columns)


for feature1,feature2 in combinations:    
    X = bearing[[feature1, feature2]].values
    range_n_clusters = [2, 3, 4, 5, 6]
    silhouette_avg_n_clusters = []

    print(f'With 2-features combination of: {feature1}, {feature2}')

    for n_clusters in range_n_clusters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot
        # The silhouette coefficient can range from -1, 1 but in this example all
        # lie within [-0.1, 1]
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=42)
        cluster_labels = clusterer.fit_predict(X)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        silhouette_avg_n_clusters.append(silhouette_avg)
        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0], X[:, 1], marker='.', s=300, lw=0, alpha=0.7,
                    c=colors, edgecolor='k')

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                    c="white", alpha=1, s=200, edgecolor='k')

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                        s=50, edgecolor='k')

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel(feature1)
        ax2.set_ylabel(feature2)

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')

    plt.show()


    style.use("fivethirtyeight")
    plt.plot(range_n_clusters, silhouette_avg_n_clusters)
    plt.xlabel("Number of Clusters (k)")
    plt.ylabel("silhouette score")
    plt.show()


NameError: name 'itertools' is not defined