## Importing libraries

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import operator
from itertools import combinations

In [2]:
# Importing clustering/ML libraries
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from yellowbrick.cluster import SilhouetteVisualizer

In [3]:
# Importing csv
# The path is relative, so the file must sit in the kernel's working directory.
# The recorded run of this cell failed with FileNotFoundError, so every later cell
# depends on this file being made available first.
bearing = pd.read_csv('data_bearings_classification.csv')
bearing.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data_bearings_classification.csv'

'Formatting' the dataframe

In [None]:
# Keep only the faulty bearings (rows with status == 0) and renumber the rows from 0.
bearing = bearing.loc[bearing['status'] == 0].reset_index(drop=True)

In [None]:
# Remove the status and bearing_id columns, and rpm_mean (hz_mean has the same values).
# Then fix the spelling mistake in the 'a1_x_ff_range' column name.
# `columns=` is used instead of the positional axis argument (`drop(labels, 1)`):
# pandas deprecated that form in 1.3 and raises TypeError for it from 2.0 onwards.
df = bearing.drop(columns=['status', 'bearing_id', 'rpm_mean'])
df = df.rename(columns={'a1_x_ff_range': 'a1_x_fft_range'})

In [None]:
# Rescale every column with MinMaxScaler (its default output range is 0-1) so that all
# features contribute on a comparable scale to the distances KMeans works with.
x = df.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# fit_transform returns a bare array, so the column names and index are re-attached here.
df_normalized = pd.DataFrame(x_scaled, columns=df.columns, index=df.index)

In [None]:
# Preview the first rows of the scaled frame.
df_normalized.head()

In [None]:
# Starting with only the 'mean' columns
# NOTE(review): this is a positional slice of the first 8 columns; it assumes those are
# exactly the '*_mean' features after the drop above - confirm against the CSV layout.
df_mean_normalized = df_normalized.iloc[:, 0: 8]
df_mean_normalized.head()

## Clustering

In [None]:
# Pairwise scatter plots of the 'mean' features only; including every feature
# at once would be too much information for one figure.
fig = px.scatter_matrix(df_mean_normalized, width=1200, height=1600)
fig.show()

We can see interesting patterns and possible clusters, especially when a2_x_mean is included.

To find the best 2 features for clustering, we'll create functions that loop over all possible feature combinations; the silhouette score of each combination then tells us which features to cluster on.

In [None]:
def combination_features(df, number_features):
    """Enumerate every way of picking ``number_features`` columns from ``df``.

    Args:
        df: DataFrame whose column labels are combined.
        number_features: Number of columns in each combination.

    Returns:
        list[tuple]: One tuple of column labels per combination, in the order
        produced by ``itertools.combinations`` (columns keep their original order).
    """
    column_names = list(df.columns)
    return list(combinations(column_names, number_features))

In [None]:
def score_features(df, list_combinations, max_number_clusters, random_state, top_n=10):
    """Rank (feature combination, number of clusters) pairs by silhouette score.

    For every combination in ``list_combinations`` and every cluster count from 2
    up to and including ``max_number_clusters``, a KMeans model is fitted on the
    selected columns and its silhouette score (euclidean metric) is recorded.

    Args:
        df: DataFrame holding the feature columns.
        list_combinations: Iterable of tuples of column labels.
        max_number_clusters: Largest number of clusters to try (inclusive).
        random_state: Seed passed to KMeans.
        top_n: How many of the best results to return. Defaults to 10;
            ``None`` returns the full ranking.

    Returns:
        list: ``((combination, n_clusters), score)`` tuples sorted by score,
        best first. Note that this is a list, not a dictionary.
    """
    score_dict = {}
    for combination in list_combinations:
        X = df[list(combination)]
        for n in range(2, max_number_clusters + 1):
            # n_init=100 restarts make each fit expensive; the cost grows with
            # len(list_combinations) * (max_number_clusters - 1).
            kmeans = KMeans(n_clusters=n, init='k-means++', n_init=100, random_state=random_state).fit(X)
            score_dict[combination, n] = silhouette_score(X, kmeans.labels_, metric='euclidean')
    ranked = sorted(score_dict.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[:top_n]

In [None]:
def plot_features(df, dictionary):
    """Refit KMeans for the best-ranked entry and plot the resulting clusters.

    Args:
        df: DataFrame holding the feature columns that were scored.
        dictionary: Ranking in the shape returned by ``score_features``. Despite
            the parameter name it is a sequence of
            ``((features, n_clusters), score)`` tuples; only the first entry
            is used.

    Shows a 2-D scatter for two features, a 3-D scatter for three features, and
    prints a message for any other count. A silhouette diagram is shown in all
    cases.

    Raises:
        IndexError: If ``dictionary`` is empty.
    """
    best_key = dictionary[0][0]
    features = list(best_key[0])
    number_clusters = best_key[1]

    X = df[features]
    kmeans = KMeans(n_clusters=number_clusters, init="k-means++", n_init=100, tol=1e-04, random_state=42)
    kmeans.fit(X)
    score = silhouette_score(X, kmeans.labels_, metric='euclidean')

    # assign() returns a new frame containing only the selected features plus the
    # labels. Building it with pd.DataFrame(X, columns=df.columns) reindexed onto
    # every column of df and filled the unselected ones with NaN.
    clusters = X.assign(label=kmeans.labels_)
    number_of_features = len(features)

    if number_of_features == 2:
        fig1 = px.scatter(clusters, x=features[0], y=features[1], color='label')
        fig1.update_layout(title=f"{features[0]} vs {features[1]} with {number_clusters} clusters and s-score = {round(score,3)}")
        fig1.show()
    elif number_of_features == 3:
        fig2 = px.scatter_3d(clusters, x=features[0], y=features[1], z=features[2], color='label', symbol='label')
        fig2.update_layout(title=f"{features[0]} vs {features[1]} vs {features[2]} with {number_clusters} clusters and s-score = {round(score,3)}")
        fig2.show()
    else:
        print("Can't see more than 3d")

    visualizer = SilhouetteVisualizer(kmeans, colors='yellowbrick')
    visualizer.fit(X)
    visualizer.show()

In [None]:
# Build every 6-feature combination of the 'mean' columns.
# NOTE(review): the text above talks about the best 2 features, and plot_features only
# draws a scatter for 2 or 3 features - confirm that 6 is the intended size here.
list_combinations = combination_features(df_mean_normalized,6)

In [None]:
# Score every combination for 2 to 5 clusters with random_state=42 and keep the best results.
# Each KMeans fit uses n_init=100, so this cell can take a while.
features_dict_top_10 = score_features(df_mean_normalized, list_combinations, 5, 42)

In [None]:
# Despite the name, this is a list of ((features, n_clusters), score) tuples, best score first.
features_dict_top_10

In [None]:
# Plot the clusters for the top-ranked entry only (plot_features reads the first element).
plot_features(df_mean_normalized, features_dict_top_10)

In [None]:
# Hand-picked triple of 'mean' features for a 3-D view.
df_mean_3d = df_mean_normalized[['hz_mean',"a2_x_mean","a2_z_mean"]]

In [None]:
# Manual run: 3-cluster KMeans on the hand-picked feature triple, shown as a 3-D scatter.
# The settings are the ones plot_features uses (100 initialisations, random_state=42).
kmeans = KMeans(n_clusters=3, init="k-means++",n_init=100, tol=1e-04, random_state=42)
X = df_mean_3d
kmeans.fit(X)
# Frame used for plotting: the three features plus the cluster label of each row.
clusters=pd.DataFrame(X,columns=df_mean_3d.columns)
clusters['label']=kmeans.labels_
score = silhouette_score(df_mean_3d, kmeans.labels_, metric='euclidean')
fig3 = px.scatter_3d(clusters, x='hz_mean', y="a2_x_mean", z="a2_z_mean", color = 'label',symbol='label')
# The score is rounded to 2 decimals here, whereas plot_features rounds to 3.
fig3.update_layout(title=f"'hz_mean' vs 'a2_x_mean' vs 'a2_z_mean' with 3 clusters and s-score = {round(score,2)}")
fig3.show()


In [None]:
#df_mean_normalized_no_outliers = df_mean_normalized[df_mean_normalized['a2_y_mean']>.5]