In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn import cluster
from sklearn import preprocessing

## Overall principal of iterated feature selection
1. Perform k-means on each of the features individually for some k. 
2. For each cluster measure some clustering performance metric like the Dunn's index or silhouette. 
3. Take the feature which gives you the best performance and add it to Sf
4. Perform k-means on Sf and each of the remaining features individually
5. Take the feature which gives you the best performance and add it to Sf
4. If you have reached the desired number of features stop, else go back to 4

In [3]:
dataset = datasets.load_iris()
X = dataset.data
y = dataset.target

In [4]:
n, m = X.shape[0], X.shape[1]
print(n, m)

150 4


In [21]:
X

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2],
       [ 5.4,  3.9,  1.7,  0.4],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 5. ,  3.4,  1.5,  0.2],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5.4,  3.7,  1.5,  0.2],
       [ 4.8,  3.4,  1.6,  0.2],
       [ 4.8,  3. ,  1.4,  0.1],
       [ 4.3,  3. ,  1.1,  0.1],
       [ 5.8,  4. ,  1.2,  0.2],
       [ 5.7,  4.4,  1.5,  0.4],
       [ 5.4,  3.9,  1.3,  0.4],
       [ 5.1,  3.5,  1.4,  0.3],
       [ 5.7,  3.8,  1.7,  0.3],
       [ 5.1,  3.8,  1.5,  0.3],
       [ 5.4,  3.4,  1.7,  0.2],
       [ 5.1,  3.7,  1.5,  0.4],
       [ 4.6,  3.6,  1. ,  0.2],
       [ 5.1,  3.3,  1.7,  0.5],
       [ 4.8,  3.4,  1.9,  0.2],
       [ 5. ,  3. ,  1.6,  0.2],
       [ 5. ,  3.4,  1.6,  0.4],
       [ 5.2,  3.5,  1.5,  0.2],
       [ 5.2,  3.4,  1.4,  0.2],
       [ 4.7,  3.2,  1.6,  0.2],
       [ 4

In [80]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [83]:
pd.value_counts(pd.Series(y))

2    50
1    50
0    50
dtype: int64

### Feature selection - code version 1
#### Organize Iris dataset into a dataframe in order to better imitate our project data format

1. Perform k-means on each of the features individually for some k. 
2. For each cluster measure some clustering performance metric like the Dunn's index or silhouette. 
3. Take the feature which gives you the best performance and add it to Sf
4. Perform k-means on Sf and each of the remaining features individually
5. Take the feature which gives you the best performance and add it to Sf
4. If you have reached the desired number of features stop, else go back to 4

In [5]:
dataset = datasets.load_iris()
X = dataset.data
y = dataset.target

In [52]:
dir(dataset)

['DESCR', 'data', 'feature_names', 'target', 'target_names']

In [6]:
iris = pd.DataFrame(X)
iris.columns = dataset.feature_names

In [54]:
iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


In [7]:
n, m = iris.shape[0], iris.shape[1]
print("number of data points:", n, ". number of variables:", m)

number of data points: 150 . number of variables: 4


In [8]:
#Let's first use all the features to perform K-Means clustering
model_test = cluster.KMeans(n_clusters=3)
model_test.fit(iris) 
pred_y=model_test.labels_
print("True class labels", "\n", pd.value_counts(pd.Series(y)))
print("Clustered class labels:", "\n", pd.value_counts(pd.Series(pred_y)))

True class labels 
 2    50
1    50
0    50
dtype: int64
Clustered class labels: 
 2    62
1    50
0    38
dtype: int64


In [9]:
# let's assume there are 3 clusters
num_of_cluster = 3
num_of_iter = 3
model = cluster.KMeans(n_clusters=num_of_cluster)
score = np.zeros([num_of_iter, m]) # the sum of squared distances of samples to their closest cluster center
exclude_columns = [] # best performed models with selected features will be added to this list after every iteration
include_columns = [i for i in range(np.shape(score)[1]) if i not in exclude_columns] # rest of the features
# Let's assume we're going to select 3 features out of 4 features, therefore we're going to iterate 3 times
for iteration in range(3):
    # The first iteration, we're going to test clustering models on each individual variables
    if iteration == 0:
        print("Now processing iteration %d" %iteration, "\n")   
        for i in range(m):
            model.fit(iris[[i]])
            pred_y = model.labels_
            print("cluster labels based on variable %s:" %iris.columns[i], "\n", pd.value_counts(pd.Series(pred_y)))
            score[iteration][i] = model.inertia_
            print("the sum of squared distances of samples to their closest cluster center based on variable %s" \
                  %iris.columns[i], "is:", score[iteration][i])   
        #score = score[:,include_columns]
        selected_feature_index = np.argmin(score[iteration], axis=0) 
        selected_feature_score = np.amin(score[iteration], axis=0) 
        selected_feature = iris[[selected_feature_index]]
        exclude_columns.append(selected_feature_index)
        print("Conclusion: cluster based on variable %s" %iris.columns[selected_feature_index], "gives the best performance", "\n") 
    #for following iteration, we're going to add the rest the feature to the selected feature and perform cluster model
    else:
        print("Now processing iteration %d" %iteration, "\n") 
        for i in range(m):
            if i not in exclude_columns:
                # Generate data with features selected from last iteration plus each individual rest of the features
                data = pd.concat([selected_feature, iris[[i]]], axis=1)
                model.fit(data)
                pred_y = model.labels_
                print("cluster labels based on variables:", data.columns, "\n", pd.value_counts(pd.Series(pred_y)))
                score[iteration][i] = model.inertia_
                print("the sum of squared distances of samples to their closest cluster center based on variables:", \
                 data.columns, "is:", score[iteration][i]) 
        include_columns = [i for i in range(np.shape(score)[1]) if i not in exclude_columns]
        selected_feature_score = np.amin(score[:,include_columns][iteration], axis=0) 
        selected_feature_index = np.where(score[iteration] == selected_feature_score)[0][0]
        selected_feature = pd.concat([selected_feature, iris[[selected_feature_index]]], axis=1)
        exclude_columns.append(selected_feature_index)
        print("Conclusion: cluster based on variable %s" %iris.columns[exclude_columns], "gives the best performance", "\n") 
print("Selected features are %s" %iris.columns[exclude_columns])

Now processing iteration 0 

cluster labels based on variable sepal length (cm): 
 2    61
0    59
1    30
dtype: int64
the sum of squared distances of samples to their closest cluster center based on variable sepal length (cm) is: 15.816622673
cluster labels based on variable sepal width (cm): 
 0    81
1    36
2    33
dtype: int64
the sum of squared distances of samples to their closest cluster center based on variable sepal width (cm) is: 5.26434343434
cluster labels based on variable petal length (cm): 
 2    54
1    50
0    46
dtype: int64
the sum of squared distances of samples to their closest cluster center based on variable petal length (cm) is: 24.5138312399
cluster labels based on variable petal width (cm): 
 2    52
0    50
1    48
dtype: int64
the sum of squared distances of samples to their closest cluster center based on variable petal width (cm) is: 4.93217435897
Conclusion: cluster based on variable petal width (cm) gives the best performance 

Now processing iteration

#### That completes three iterations of feature selection and selected three iterations for best performance

### Feature selection - code version 2

* Perform k-means on each of the features individually for some k. 
* For each cluster measure some clustering performance metric like the Dunn's index or silhouette. 
* Take the feature which gives you the best performance and add it to Sf

In [165]:
for d in range(1,5):
    exec(f'X{d} = np.zeros(n)')
    for i in range(n):
        exec(f'X{d}[i] = X[i][d-1]')

In [93]:
score = [0]*m
for i in range(1, m+1):
    print("Now processing model %d" %i, ", which uses feature # %d only" %i)
    exec(f'model_{i} = cluster.KMeans(n_clusters=3)')
    exec(f'X{i} = X{i}[:, np.newaxis]')
    exec(f'model_{i}.fit(X{i})')
    exec(f'pred_y = model_{i}.labels_')
    print("Now let's compare the true class label to the class labels obtained by clustering")
    print(pd.value_counts(pd.Series(pred_y)))
    exec(f'score[i-1] = model_{i}.inertia_')
    print("For model %d:" %i,"the sum of squared distances of samples to their closest cluster center is %s" %score[i-1])

Now processing model 1 , which uses feature # 1 only
Now let's compare the true class label to the class labels obtained by clustering
2    63
1    52
0    35
dtype: int64
For model 1: the sum of squared distances of samples to their closest cluster center is 15.7581196581
Now processing model 2 , which uses feature # 2 only
Now let's compare the true class label to the class labels obtained by clustering
1    81
0    36
2    33
dtype: int64
For model 2: the sum of squared distances of samples to their closest cluster center is 5.26434343434
Now processing model 3 , which uses feature # 3 only
Now let's compare the true class label to the class labels obtained by clustering
0    58
1    50
2    42
dtype: int64
For model 3: the sum of squared distances of samples to their closest cluster center is 24.6580407225
Now processing model 4 , which uses feature # 4 only
Now let's compare the true class label to the class labels obtained by clustering
2    52
0    50
1    48
dtype: int64
For mo

In [167]:
selected_feature = score.index(min(score)) + 1
print("The # %d feature" % (score.index(min(score)) + 1), "gives the best performance")

The # 4 feature gives the best performance
