# Simulations from shuffled vectors

Importing the needed libraries.

In [1]:
import pandas as pd
import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics

Storing the features matrix in a pandas dataframe and then converting it to a numpy array. The features matrix is the matrix in which each row represents a disease and each column a feature (whether or not a particular gene is present in a disease).

In [26]:
# Read the feature matrix from disk; the file is expected in the notebook's
# working directory.
original = pd.read_csv(filepath_or_buffer='all_feature_matrix_prots_CUI.csv')

# Plain NumPy version of the table; the simulation cells copy and shuffle this.
original_np = original.to_numpy()

### First experiment
- 100 simulations (100 different shuffled vectors)
- eps: 0.85

In [41]:
n_simulations = 100

# One entry per simulation.
clusters = list()
noise = list()
silhouettes = list()

for n in range(n_simulations):

    # Seed the global NumPy RNG with the simulation index so that every
    # shuffled matrix, and therefore every statistic derived from it, can be
    # reproduced after a kernel restart.
    np.random.seed(n)

    shuffled = np.copy(original_np)

    # Shuffling elements in each column of the matrix (in place, on the copy);
    # each column is permuted independently of the others.
    for i in range(shuffled.shape[1]):
        np.random.shuffle(shuffled[:, i])

    labels = DBSCAN(eps = 0.85, min_samples = 2, metric = 'cosine').fit_predict(shuffled)

    # DBSCAN marks noise points with the label -1.
    unique_labels = set(labels)
    n_noise = list(labels).count(-1)
    n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)

    # silhouette_score needs at least two distinct labels. With fewer (only
    # noise, or a single cluster with no noise) it raises ValueError, so -1 is
    # stored as a sentinel instead.
    # NOTE(review): the labels are passed unfiltered, so the noise points
    # (label -1) are scored as if they formed one more cluster.
    if len(unique_labels) >= 2:
        silh_coef = metrics.silhouette_score(shuffled, labels, metric = 'cosine')
    else:
        silh_coef = -1

    clusters.append(n_clusters)
    noise.append(n_noise)
    silhouettes.append(silh_coef)

Storing the obtained results in a pandas dataframe to obtain the statistics of the experiment. For each simulation, the number of clusters, the number of outliers and the value of the Silhouette coefficient are stored.

In [42]:
# NumPy versions of the three per-simulation measurement lists.
clus_np, noise_np, silh_np = map(np.array, (clusters, noise, silhouettes))

# One row per simulation: cluster count, noise-point count, silhouette value.
results = pd.DataFrame(
    data=[*zip(clusters, noise, silhouettes)],
    columns=['Cluster', 'Noise', 'Silhouette'],
)

First five results in the experiment:

In [43]:
# Preview the first five simulations.
results.head(n=5)

Unnamed: 0,Cluster,Noise,Silhouette
0,298,2617,-0.045167
1,269,2619,-0.045741
2,303,2617,-0.044875
3,257,2597,-0.046461
4,280,2638,-0.045661


Statistics of the dataframe; from these we can get the standard deviation of the three parameters under study.

In [44]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
results.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Cluster,Noise,Silhouette
count,100.0,100.0,100.0
mean,267.71,2610.67,-0.045996
std,17.015914,28.457324,0.001043
min,230.0,2549.0,-0.048304
25%,257.0,2591.5,-0.046598
50%,266.5,2611.5,-0.046174
75%,276.25,2628.25,-0.045413
max,314.0,2661.0,-0.042951


### Second experiment
- Number of simulations: 100
- eps: 0.1

In [26]:
n_simulations = 100

# One entry per simulation.
clusters = list()
noise = list()
silhouettes = list()

for n in range(n_simulations):

    # Seed the global NumPy RNG with the simulation index so that every
    # shuffled matrix, and therefore every statistic derived from it, can be
    # reproduced after a kernel restart.
    np.random.seed(n)

    shuffled = np.copy(original_np)

    # Shuffling elements in each column of the matrix (in place, on the copy);
    # each column is permuted independently of the others.
    for i in range(shuffled.shape[1]):
        np.random.shuffle(shuffled[:, i])

    # Pairwise cosine distances between rows, computed once and reused by both
    # DBSCAN and the silhouette score. The function is reached through the
    # already-imported `metrics` module because the bare name
    # `cosine_distances` is not imported in this notebook.
    distances_cos = metrics.pairwise.cosine_distances(shuffled)

    labels = DBSCAN(eps = 0.1, min_samples = 2, metric='precomputed').fit_predict(distances_cos)

    # DBSCAN marks noise points with the label -1.
    unique_labels = set(labels)
    n_noise = list(labels).count(-1)
    n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)

    # silhouette_score needs at least two distinct labels. With fewer (only
    # noise, or a single cluster with no noise) it raises ValueError, so -1 is
    # stored as a sentinel instead.
    # NOTE(review): the labels are passed unfiltered, so the noise points
    # (label -1) are scored as if they formed one more cluster.
    if len(unique_labels) >= 2:
        silh_coef = metrics.silhouette_score(distances_cos, labels, metric='precomputed')
    else:
        silh_coef = -1

    clusters.append(n_clusters)
    noise.append(n_noise)
    silhouettes.append(silh_coef)

Getting the statistics of the second experiment.

In [27]:
# NumPy versions of the three per-simulation measurement lists.
clus_np, noise_np, silh_np = map(np.array, (clusters, noise, silhouettes))

# One row per simulation: cluster count, noise-point count, silhouette value.
results = pd.DataFrame(
    data=[*zip(clusters, noise, silhouettes)],
    columns=['Cluster', 'Noise', 'Silhouette'],
)

# Summary statistics for the three columns.
results.describe()

Unnamed: 0,Cluster,Noise,Silhouette
count,100.0,100.0,100.0
mean,0.0,3671.0,-1.0
std,0.0,0.0,0.0
min,0.0,3671.0,-1.0
25%,0.0,3671.0,-1.0
50%,0.0,3671.0,-1.0
75%,0.0,3671.0,-1.0
max,0.0,3671.0,-1.0


### Third experiment
- Number of simulations: 100
- eps: 0.4

In [28]:
n_simulations = 100

# One entry per simulation.
clusters = list()
noise = list()
silhouettes = list()

for n in range(n_simulations):

    # Seed the global NumPy RNG with the simulation index so that every
    # shuffled matrix, and therefore every statistic derived from it, can be
    # reproduced after a kernel restart.
    np.random.seed(n)

    shuffled = np.copy(original_np)

    # Shuffling elements in each column of the matrix (in place, on the copy);
    # each column is permuted independently of the others.
    for i in range(shuffled.shape[1]):
        np.random.shuffle(shuffled[:, i])

    # Pairwise cosine distances between rows, computed once and reused by both
    # DBSCAN and the silhouette score. The function is reached through the
    # already-imported `metrics` module because the bare name
    # `cosine_distances` is not imported in this notebook.
    distances_cos = metrics.pairwise.cosine_distances(shuffled)

    labels = DBSCAN(eps = 0.4, min_samples = 2, metric='precomputed').fit_predict(distances_cos)

    # DBSCAN marks noise points with the label -1.
    unique_labels = set(labels)
    n_noise = list(labels).count(-1)
    n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)

    # silhouette_score needs at least two distinct labels. With fewer (only
    # noise, or a single cluster with no noise) it raises ValueError, so -1 is
    # stored as a sentinel instead.
    # NOTE(review): the labels are passed unfiltered, so the noise points
    # (label -1) are scored as if they formed one more cluster.
    if len(unique_labels) >= 2:
        silh_coef = metrics.silhouette_score(distances_cos, labels, metric='precomputed')
    else:
        silh_coef = -1

    clusters.append(n_clusters)
    noise.append(n_noise)
    silhouettes.append(silh_coef)

Getting the statistics of the third experiment.

In [29]:
# NumPy versions of the three per-simulation measurement lists.
clus_np, noise_np, silh_np = map(np.array, (clusters, noise, silhouettes))

# One row per simulation: cluster count, noise-point count, silhouette value.
results = pd.DataFrame(
    data=[*zip(clusters, noise, silhouettes)],
    columns=['Cluster', 'Noise', 'Silhouette'],
)

# Summary statistics for the three columns.
results.describe()

Unnamed: 0,Cluster,Noise,Silhouette
count,100.0,100.0,100.0
mean,0.0,3671.0,-1.0
std,0.0,0.0,0.0
min,0.0,3671.0,-1.0
25%,0.0,3671.0,-1.0
50%,0.0,3671.0,-1.0
75%,0.0,3671.0,-1.0
max,0.0,3671.0,-1.0
