# *Getting DBSCAN clusters on given parameters*

**Author: Lucía Prieto Santamaría** (lucia.prieto.santamaria@alumnos.upm.es)

This notebook runs a complete DBSCAN clustering analysis on similarity matrices stored in CSV files. It returns the clusters that DBSCAN generates for a previously estimated combination of parameters, which must be set in the variables declaration cell.

In [1]:
# Import of the needed libraries
import conexion_edsssdb # This is a module already scripted that makes connection automatized
import csv # Module to get the data from the similarity matrices stored in csv files
import numpy as np # Library needed to structure the data before implementing the algorithm
from sklearn.cluster import DBSCAN # Extension of scikit-learn that implements the algorithm DBSCAN
from sklearn import metrics # Extension of scikit-learn that will be used to compute silhouette coefficient

In [2]:
# VARIABLES DECLARATION

# Folder and group-name prefix used to build the path of the similarity matrix file.
# NOTE: the folder is not created by this notebook; the user must create it beforehand.
directory_name = 'weighted_score/wesco'

# Total number of diseases in the subset under analysis
number_of_diseases = 3671

# DBSCAN settings
metric = 'prot_cos'  # similarity metric name; also part of the input and output file names
epsilon = 0.4        # passed to DBSCAN as eps
ms = 2               # passed to DBSCAN as min_samples

# Maps each cluster identifier to the list of diseases that belong to that cluster
clusters_diseases = dict()

In [3]:
# FUNCTIONS DECLARATION

def get_diseases_names(cnx_obj):
    """Return a dictionary mapping disease IDs to disease names.

    The correspondences are read from the DisGeNET disease table of EDSSSDB
    (``edsssdb.disgenet_disease``).

    Parameters
    ----------
    cnx_obj : database connection object
        Open connection exposing ``cursor(buffered=...)``. The query is run on
        this object, not on any global connection.

    Returns
    -------
    dict
        Keys are the values of the ``umls_id`` column and values are the
        matching ``dis_name`` values. If an ID appears in several rows, the
        last row read wins.
    """

    mysql_query = ("""
                   SELECT 
                        dd.umls_id, dd.dis_name
                   FROM
                        edsssdb.disgenet_disease dd;
                   """)

    # Use the connection received as argument; the previous version silently
    # relied on a global variable named `cnx` being defined elsewhere.
    cursor = cnx_obj.cursor(buffered = True)
    try:
        cursor.execute(mysql_query)
        data = cursor.fetchall()
    finally:
        # Release the cursor even if the query fails
        cursor.close()

    return {row[0]: row[1] for row in data}



In [4]:
# RETRIEVING THE NEEDED DATA FROM THE DATABASE

# Connection object creation with connection variables. EDSSSDB is the database to connect to.
# NOTE(review): the arguments below look like placeholders; real credentials should be
# supplied at run time (e.g. from environment variables) and never committed in the notebook.
cnx = conexion_edsssdb.conectar('user', 'password', 'host', 'port', 'db')

try:
    # We get the names of the diseases from the database in order to later know which
    # diseases are in each cluster.
    diseases_dict = get_diseases_names(cnx)
finally:
    # Close the connection even if the query fails, so it is never leaked
    cnx.close()

Conectado a edsssdb


In [5]:
# Getting the list of diseases we are going to work with

# Path of the similarity matrix file: 'excels/' + folder/group prefix + 'S_' + metric + '.csv'
file_name = 'excels/' + directory_name + 'S_' + metric + '.csv'

# Only the header row is needed here. newline='' is the mode the csv module expects for
# files handed to csv.reader. The `with` block closes the file, so no explicit close()
# is needed afterwards.
with open(file_name, newline='') as f:
    reader = csv.reader(f, delimiter = ",")
    header = next(reader)

# The first header cell is dropped; the remaining cells are the disease identifiers
# (presumably the first cell is the label of the row-names column -- confirm against the file).
diseases = header[1:]

In [6]:
# RUNNING DBSCAN AND GROUPING THE DISEASES BY CLUSTER

# String versions of the parameters, used in the printed messages and in the output file name
epsilon_str = str(epsilon)
ms_str = str(ms)

# Columns to read from the similarity matrix file. Column 0 is skipped
# (presumably it holds the row labels -- confirm against the file).
columns = tuple(range(1, number_of_diseases + 1))

# Getting the data from the similarity matrix file (the header row is skipped)
my_data = np.genfromtxt(file_name, delimiter= ",", skip_header = 1, usecols = columns)

# Convert similarity measure into distance
X = 1 - my_data

labels = DBSCAN(eps=epsilon, min_samples=ms, metric='precomputed').fit_predict(X)

# NOTE(review): `labels` includes the noise label -1, so the silhouette coefficient below
# treats all noise points as if they formed one more cluster -- confirm this is intended.
# NOTE(review): silhouette_score raises ValueError when fewer than 2 distinct labels exist.
silh_coef = metrics.silhouette_score(X, labels, metric='precomputed')

# Counting the number of clusters in model labels, ignoring noise if present.
n_clusters =  len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)

print("Number of clusters for %s" %metric, "MS %s" %ms, "eps %s: " %epsilon_str, n_clusters)
print("Number of outliers  for %s" %metric, "MS %s" %ms, "eps %s: " %epsilon_str, n_noise)

# Rebuild the cluster -> diseases mapping from scratch on every run. Appending to a
# dictionary created in another cell duplicated every disease each time this cell was
# re-executed (e.g. after changing epsilon or ms).
clusters_diseases = {}
for i in range(len(diseases)):
    label = labels[i]
    if label != -1: # -1 marks noise points, which belong to no cluster
        clusters_diseases.setdefault(label, []).append(diseases[i])

Number of clusters for prot_cos MS 2 eps 0.4:  460
Number of outliers  for prot_cos MS 2 eps 0.4:  1801


In [7]:
# WRITING THE RESULTS IN FILES

# The output goes into the 'clusters/' folder; the file name encodes metric, MS and eps.
foutname = "clusters/weighted_score_clust_" + metric + "_ms" + ms_str + "_eps" + epsilon_str + ".txt"

print('Writing the results in file: ', foutname)

# The context manager guarantees the file is flushed and closed even if writing fails
# midway (for instance, a disease ID missing from diseases_dict raises KeyError).
with open(foutname, "w") as fileout:

    # Summary header
    fileout.write("CLUSTERS for [weigthed] " + metric + ", MS " + ms_str + ", eps " + epsilon_str + "\n")
    fileout.write("\t--> Number of clusters: " + str(n_clusters) + "\n")
    fileout.write("\t--> Number of outliers: " + str(n_noise) + "\n")
    fileout.write("\t--> Silhouette coefficient: " + str(silh_coef) + "\n\n")

    # One section per cluster: its diseases (ID and name) in sorted order, then the size
    for clus, diseases_list in clusters_diseases.items():

        fileout.write("Cluster " + str(clus) + ":\n")

        # sorted() leaves the lists stored in clusters_diseases untouched
        for dis in sorted(diseases_list):
            fileout.write("\t" + str(dis) + "\t" + str(diseases_dict[dis]) + "\n")

        fileout.write("\tNumber of diseases: " + str(len(diseases_list)) + "\n\n\n")

print('\tDONE!')

Writing the results in file:  clusters/weighted_score_clust_prot_cos_ms2_eps0.4.txt
	DONE!
