In [20]:
import os
import sys

import numpy as np
import scipy.sparse.csgraph

from Cluster import Cluster
from sklearn import metrics
from ast import literal_eval

from Visualization import plot_clusters


# Inserts joined item into candidates list only if its dimensionality fits
def insert_if_join_condition(candidates, item, item2, current_dim):
    joined = []
    for i in range(len(item)):
        joined.append(item[i])
    for i in range(len(item2)):
        joined.append(item2[i])

    # Count number of dimensions
    dims = set()
    for i in range(len(joined)):
        dims.add(int(joined[i][0]))

    # Insert if it fits
    if len(dims) == current_dim:
        candidates.append(joined)


# Prune all candidates, which has a (k-1) dimensional projection not in (k-1) dim dense units
def prune(candidates, prev_dim_dense_units):
    for i in range(len(candidates)):
        for j in range(len(candidates[i])):
            if not prev_dim_dense_units.__contains__([candidates[i][j]]):
                candidates.remove(candidates[i])
                break


def self_join(prev_dim_dense_units, dim):
    candidates = []
    for i in range(len(prev_dim_dense_units)):
        for j in range(i + 1, len(prev_dim_dense_units)):
            insert_if_join_condition(
                candidates, prev_dim_dense_units[i], prev_dim_dense_units[j], dim)
    return candidates


def is_data_in_projection(tuple, candidate, xsi):
    for dim in candidate:
        element = tuple[dim[0]]
        if int(element * xsi % xsi) != dim[1]:
            return False
    return True


def get_dense_units_for_dim(data, prev_dim_dense_units, dim, xsi, tau):
    candidates = self_join(prev_dim_dense_units, dim)
    prune(candidates, prev_dim_dense_units)

    # Count number of elements in candidates
    projection = np.zeros(len(candidates))
    number_of_data_points = np.shape(data)[0]
    for dataIndex in range(number_of_data_points):
        for i in range(len(candidates)):
            if is_data_in_projection(data[dataIndex], candidates[i], xsi):
                projection[i] += 1
    print("projection: ", projection)

    # Return elements above density threshold
    is_dense = projection > tau * number_of_data_points
    print("is_dense: ", is_dense)
    return np.array(candidates)[is_dense]


def build_graph_from_dense_units(dense_units):
    graph = np.identity(len(dense_units))
    for i in range(len(dense_units)):
        for j in range(len(dense_units)):
            graph[i, j] = get_edge(dense_units[i], dense_units[j])
    return graph


def get_edge(node1, node2):
    dim = len(node1)
    distance = 0

    for i in range(dim):
        if node1[i][0] != node2[i][0]:
            return 0
        distance += abs(node1[i][1] - node2[i][1])
        if distance > 1:
            return 0
    return 1


def save_to_file(clusters, file_name):
    file = open(os.path.join(os.path.abspath(os.path.dirname(
        __file__)), file_name), encoding='utf-8', mode="w+")
    for i, c in enumerate(clusters):
        c.id = i
        file.write("Cluster " + str(i) + ":\n" + str(c))
    file.close()


def get_cluster_data_point_ids(data, cluster_dense_units, xsi):
    point_ids = set()

    # Loop through all dense unit
    for i in range(np.shape(cluster_dense_units)[0]):
        tmp_ids = set(range(np.shape(data)[0]))
        # Loop through all dimensions of dense unit
        for j in range(np.shape(cluster_dense_units)[1]):
            feature_index = cluster_dense_units[i][j][0]
            range_index = cluster_dense_units[i][j][1]
            tmp_ids = tmp_ids & set(
                np.where(np.floor(data[:, feature_index] * xsi % xsi) == range_index)[0])
        point_ids = point_ids | tmp_ids

    return point_ids


def get_clusters(dense_units, data, xsi):
    graph = build_graph_from_dense_units(dense_units)
    number_of_components, component_list = scipy.sparse.csgraph.connected_components(
        graph, directed=False)

    dense_units = np.array(dense_units)
    clusters = []
    # For every cluster
    for i in range(number_of_components):
        # Get dense units of the cluster
        cluster_dense_units = dense_units[np.where(component_list == i)]
        print("cluster_dense_units: ", cluster_dense_units.tolist())

        # Get dimensions of the cluster
        dimensions = set()
        for j in range(len(cluster_dense_units)):
            for k in range(len(cluster_dense_units[j])):
                dimensions.add(cluster_dense_units[j][k][0])

        # Get points of the cluster
        cluster_data_point_ids = get_cluster_data_point_ids(
            data, cluster_dense_units, xsi)
        # Add cluster to list
        clusters.append(Cluster(cluster_dense_units,
                                dimensions, cluster_data_point_ids))
    #print(clusters)
    return clusters


def get_one_dim_dense_units(data, tau, xsi):
    number_of_data_points = np.shape(data)[0]
    number_of_features = np.shape(data)[1]
    projection = np.zeros((xsi, number_of_features))
    for f in range(number_of_features):
        for element in data[:, f]:
            projection[int(element * xsi % xsi), f] += 1
    print("1D projection:\n", projection, "\n")
    is_dense = projection > tau * number_of_data_points
    print("is_dense:\n", is_dense)
    one_dim_dense_units = []
    for f in range(number_of_features):
        for unit in range(xsi):
            if is_dense[unit, f]:
                one_dim_dense_units.append([[f, unit]])
    return one_dim_dense_units


# Normalize data in all features (1e-5 padding is added because clustering works on [0,1) interval)
def normalize_features(data):
    normalized_data = data
    number_of_features = np.shape(normalized_data)[1]
    for f in range(number_of_features):
        normalized_data[:, f] -= min(normalized_data[:, f]) - 1e-5
        normalized_data[:, f] *= 1 / (max(normalized_data[:, f]) + 1e-5)
    return normalized_data


def evaluate_clustering_performance(df, clusters, labels):
    set_of_dimensionality = set()
    for cluster in clusters:
        set_of_dimensionality.add(frozenset(cluster.dimensions))
    max_f1 =[]
    max_p = []
    # Evaluating performance in all dimensionality
    for dim in set_of_dimensionality:
        print("\nEvaluating clusters in dimension: ", list(dim))
        # Finding clusters with same dimensions
        clusters_in_dim = []
        for c in clusters:
            if c.dimensions == dim:
                clusters_in_dim.append(c)
        clustering_labels = np.zeros(np.shape(labels))
        for i, c in enumerate(clusters_in_dim):
            clustering_labels[list(c.data_point_ids)] = i + 1
            clustering_labels1 = clustering_labels.astype(int)
            #clustering_labels1 = clustering_labels1.tolist()
            #print("clusterssssss", clustering_labels1)
        print("Number of clusters: ", len(clusters_in_dim))
        
        #print(dim)
        #if (list(dim) == [40]):
        #    print("label ",labels)
        #    print("predicts",clustering_labels1)
        orig_l = list(set(clustering_labels1))
        #print(orig_l)
        d = len(list(set(clustering_labels1)))
        #print("predicts labels size: ",d)
        pred_l = (np.arange(d)).tolist()
        for i in range(d):
            #print(orig_l[i])
            #print(pred_l[i])
            clustering_labels1[clustering_labels1 == orig_l[i]] = pred_l[i]
        #print(pred_l)
        #print("clusterssssss", clustering_labels1)
        best_k = d
        print(best_k)
        maxi,clus = max_ele(labels,clustering_labels1,best_k) 
        print("max index is ",maxi)
        print("max nos in ach aray",clus)
        precision,recall,f1_score = metrics(maxi,labels,clus,best_k) # get metrics
        
        f1_value = np.average(f1_score)

        print("F1-Value of the clusters is: ",f1_value)
        purity = purity_fn(df,clus,maxi,best_k)
        
        max_f1.append(f1_value)
        print(max_f1)
        max_p.append(purity)
        
    return max_f1,max_p
        #print("Number of clusters: ", len(clusters_in_dim))
        #print("Adjusted Rand index: ", metrics.adjusted_rand_score(
        #    labels, clustering_labels))
        #print("Mutual Information: ", metrics.adjusted_mutual_info_score(
        #    labels, clustering_labels))
        
        #print("Homogeneity, completeness, V-measure: ",
        #      metrics.homogeneity_completeness_v_measure(labels, clustering_labels))

        #print("Fowlkes-Mallows: ",
        #      metrics.fowlkes_mallows_score(labels, clustering_labels))


def run_clique(data, xsi, tau):
    # Finding 1 dimensional dense units
    dense_units = get_one_dim_dense_units(data, tau, xsi)

    # Getting 1 dimensional clusters
    clusters = get_clusters(dense_units, data, xsi)

    # Finding dense units and clusters for dimension > 2
    current_dim = 2
    number_of_features = np.shape(data)[1]
    while (current_dim <= number_of_features) & (len(dense_units) > 0):
        print("\n", str(current_dim), " dimensional clusters:")
        dense_units = get_dense_units_for_dim(
            data, dense_units, current_dim, xsi, tau)
        for cluster in get_clusters(dense_units, data, xsi):
            clusters.append(cluster)
        current_dim += 1

    return clusters


def read_labels(delimiter, label_column, path):
    return np.genfromtxt(path, dtype="U10", delimiter=delimiter, usecols=[label_column])


def read_data(delimiter, feature_columns, path):
    return np.genfromtxt(path, dtype=float, delimiter=delimiter, usecols=feature_columns)

import arffreader as ar



In [21]:
from collections import Counter

def max_ele(label,predicts,k):
    a = Counter(label) #label's count
    s = len(predicts)
    
    x = len(a)
    #print(x)
    ocr = []
    for i in range(x):
        temp = []
        for j in range(s):
            if label[j] == i:
                temp.append(predicts[j])
        ocr.append(temp)
            
    clus = []
    for i in range(x):
        y =  Counter(ocr[i])
        #print(y)
        clus_ocr = []
        for j in range(k):
            clus_ocr.append(y[j])
        
        clus.append(clus_ocr)
    #print(clus)
    
    maxi = []
    maxi_count =[]
    idx = 0
    for j in range(k):
        ma = 0 
        for i in range(x):
            if (ma < clus[i][j]):
                idx = i
            ma = max(clus[i][j],ma)
        maxi.append(idx)
        maxi_count.append(ma)
    #print(maxi)
    return maxi,clus


def metrics(maxi,label,clus,k):
    prec = []
    rec = []
    f1 = []
    ct = Counter(label)
    #print(clus)
    for j in range(k):
        x = (clus[maxi[j]][j])
        y = (sum([item[j] for item in clus]))
        z = float(x)/float(y)
        #print(z)
        prec.append(z)
        rc = (clus[maxi[j]][j])
        rec.append(float(rc)/ct[maxi[j]])
        f = (2*prec[j]*rec[j])/(prec[j]+rec[j])
        f1.append(f)

    print("precision : ",prec)
    print("recall are: ",rec)
    print("f1-score is: ",f1)
    return prec,rec,f1

def purity_fn(df,clus,maxi,k):
    shape = df.shape
    r_len = shape[0]
    num = 0
    for i in range(k):
        num = num + float(clus[maxi[i]][i])
    purity = num/r_len
    print("Purity is: ",purity)
    return purity


In [None]:
import arffreader as ar
import numpy as np
import pandas as pd
# Sample run: python Clique.py mouse.csv [0,1] 2 3 0.3 " " output_clusters.txt
if __name__ == "__main__":
    # Clustering with command line parameters
    '''if len(sys.argv) > 7:
        file_name = sys.argv[1]
        feature_columns = literal_eval(sys.argv[2])
        label_column = int(sys.argv[3])
        xsi = int(sys.argv[4])
        tau = float(sys.argv[5])
        delimiter = sys.argv[6]
        output_file = sys.argv[7]'''
    # Sample clustering with default parameters
    #else:
    #file_name = "datasets/Archive/diabetes.csv"
    #original_data , labels = ar.readarff("datasets/Archive/glass.arff") #read from arff
    #print(original_data)
    #original_data = np.genfromtxt('/home/munindra/Major_proj/datasets/Archive/B-cell1.csv',delimiter=',',skip_header=1 ) #path to csv file
    #print(original_data)
    # change column value here
    
    df = pd.read_csv('/home/munindra/Major_proj/datasets/Archive/B-cell3.csv',delimiter=',') #path to csv file
    df = df.replace({'ACL': 0, 'GCL': 1, 'CLL': 2, 'ABB': 3, 'FL': 4, 'TCL': 5, 'RAT': 6, 'RBB': 7, 'GCB': 8, 'NIL': 9, 'DLBCL': 10})
    original_data = df.as_matrix()
    
    sup = [item[-1] for item in original_data]
    sup = np.array(sup)
    labels = sup.ravel()
    labels = labels.astype(int)
    labels = labels.tolist()
    print(original_data)
    #feature_columns = [0, 1]
    #label_column = 74
    xsi = 10
    tau = 0.2
    delimiter = ','
    #output_file = "synthetic.txt"

    #print("Running CLIQUE algorithm on dataset, feature columns = " +
    #      str(feature_columns) + ", label column = " + str(label_column) + ", xsi = " +
    #      str(xsi) + ", tau = " + str(tau) + "\n")

    # Read in data with labels
    #path = os.path.join(os.path.abspath(os.path.dirname(__file__)), file_name)
    #original_data = read_data(delimiter, feature_columns, file_name)
    #labels = read_labels(delimiter, label_column, path)

    # Normalize each dimension to the [0,1] range
    data = normalize_features(original_data)
    #data = data[~np.isnan(data)]
    #data = data[np.isfinite(data)]
    #print(np.shape(data))
    clusters = run_clique(data=data,
                          xsi=xsi,
                          tau=tau)
    #save_to_file(clusters, output_file)
    #print("\nClusters exported to " + output_file)
    #print(clusters)
    # Evaluate results
    f1 , p = evaluate_clustering_performance(original_data, clusters, labels)
    print("f1_value is :",np.average(f1))
    print("purity is :",np.average(p))

    # Visualize clusters
    #title = ("DS: " + file_name + " - Params: Tau=" +
    #         str(tau) + " Xsi=" + str(xsi))
    #if len(feature_columns) <= 2:
    #    plot_clusters(data, clusters, title, xsi)

[[ 0.46      0.7       0.67     ... -0.04      0.16      0.      ]
 [ 0.02      0.59      0.45     ... -0.14     -1.15      0.      ]
 [-0.32     -0.63     -0.46     ...  0.29      0.25      0.      ]
 ...
 [ 0.09     -0.078375  0.85     ...  0.09     -0.53      2.      ]
 [ 0.34     -0.078375  0.36     ... -0.33      0.01061   2.      ]
 [ 0.37      0.38      0.3      ...  0.46      0.8       1.      ]]
('1D projection:\n', array([[ 1.,  1.,  1., ...,  3.,  2., 23.],
       [ 5.,  1.,  4., ...,  4., 11., 22.],
       [ 3.,  2.,  7., ...,  4., 21., 11.],
       ...,
       [19., 19.,  1., ...,  2.,  1.,  2.],
       [ 9., 10.,  0., ...,  1.,  0.,  2.],
       [ 3.,  3.,  1., ...,  1.,  1.,  1.]]), '\n')
('is_dense:\n', array([[False, False, False, ..., False, False,  True],
       [False, False, False, ..., False, False,  True],
       [False, False, False, ..., False,  True, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., F

('cluster_dense_units: ', [[[355, 6]]])
('cluster_dense_units: ', [[[356, 6]], [[356, 7]], [[356, 8]]])
('cluster_dense_units: ', [[[357, 7]], [[357, 8]]])
('cluster_dense_units: ', [[[358, 7]], [[358, 8]]])
('cluster_dense_units: ', [[[359, 7]], [[359, 8]]])
('cluster_dense_units: ', [[[360, 7]], [[360, 8]]])
('cluster_dense_units: ', [[[361, 5]], [[361, 6]]])
('cluster_dense_units: ', [[[362, 6]], [[362, 7]], [[362, 8]]])
('cluster_dense_units: ', [[[363, 7]], [[363, 8]]])
('cluster_dense_units: ', [[[364, 7]], [[364, 8]]])
('cluster_dense_units: ', [[[365, 7]], [[365, 8]]])
('cluster_dense_units: ', [[[366, 6]], [[366, 7]]])
('cluster_dense_units: ', [[[367, 7]], [[367, 8]]])
('cluster_dense_units: ', [[[368, 7]], [[368, 8]]])
('cluster_dense_units: ', [[[369, 7]], [[369, 8]]])
('cluster_dense_units: ', [[[370, 7]], [[370, 8]]])
('cluster_dense_units: ', [[[371, 6]], [[371, 7]], [[371, 8]]])
('cluster_dense_units: ', [[[372, 6]], [[372, 7]]])
('cluster_dense_units: ', [[[373, 6]], [

('cluster_dense_units: ', [[[1269, 4]]])
('cluster_dense_units: ', [[[1270, 6]]])
('cluster_dense_units: ', [[[1271, 5]]])
('cluster_dense_units: ', [[[1272, 2]]])
('cluster_dense_units: ', [[[1275, 2]]])
('cluster_dense_units: ', [[[1276, 2]]])
('cluster_dense_units: ', [[[1277, 2]]])
('cluster_dense_units: ', [[[1277, 4]]])
('cluster_dense_units: ', [[[1278, 2]], [[1278, 3]]])
('cluster_dense_units: ', [[[1280, 1]]])
('cluster_dense_units: ', [[[1281, 1]]])
('cluster_dense_units: ', [[[1282, 4]]])
('cluster_dense_units: ', [[[1283, 4]]])
('cluster_dense_units: ', [[[1284, 4]]])
('cluster_dense_units: ', [[[1285, 2]], [[1285, 3]]])
('cluster_dense_units: ', [[[1286, 3]]])
('cluster_dense_units: ', [[[1287, 5]]])
('cluster_dense_units: ', [[[1287, 7]]])
('cluster_dense_units: ', [[[1288, 5]]])
('cluster_dense_units: ', [[[1291, 4]]])
('cluster_dense_units: ', [[[1293, 3]]])
('cluster_dense_units: ', [[[1294, 5]]])
('cluster_dense_units: ', [[[1295, 5]], [[1295, 6]]])
('cluster_dense_un

('cluster_dense_units: ', [[[1537, 4]], [[1537, 5]]])
('cluster_dense_units: ', [[[1538, 5]]])
('cluster_dense_units: ', [[[1539, 3]]])
('cluster_dense_units: ', [[[1540, 3]]])
('cluster_dense_units: ', [[[1541, 3]], [[1541, 4]]])
('cluster_dense_units: ', [[[1542, 3]]])
('cluster_dense_units: ', [[[1543, 3]], [[1543, 4]]])
('cluster_dense_units: ', [[[1544, 4]], [[1544, 5]]])
('cluster_dense_units: ', [[[1545, 4]], [[1545, 5]]])
('cluster_dense_units: ', [[[1547, 4]], [[1547, 5]]])
('cluster_dense_units: ', [[[1548, 3]], [[1548, 4]]])
('cluster_dense_units: ', [[[1549, 4]]])
('cluster_dense_units: ', [[[1549, 6]]])
('cluster_dense_units: ', [[[1550, 4]], [[1550, 5]]])
('cluster_dense_units: ', [[[1551, 2]], [[1551, 3]]])
('cluster_dense_units: ', [[[1552, 4]]])
('cluster_dense_units: ', [[[1553, 5]]])
('cluster_dense_units: ', [[[1554, 5]]])
('cluster_dense_units: ', [[[1555, 5]], [[1555, 6]]])
('cluster_dense_units: ', [[[1556, 4]]])
('cluster_dense_units: ', [[[1557, 5]]])
('cluster

('cluster_dense_units: ', [[[2128, 4]]])
('cluster_dense_units: ', [[[2129, 4]]])
('cluster_dense_units: ', [[[2130, 1]]])
('cluster_dense_units: ', [[[2131, 3]], [[2131, 4]]])
('cluster_dense_units: ', [[[2132, 5]], [[2132, 6]]])
('cluster_dense_units: ', [[[2133, 5]]])
('cluster_dense_units: ', [[[2134, 5]]])
('cluster_dense_units: ', [[[2135, 4]]])
('cluster_dense_units: ', [[[2136, 5]]])
('cluster_dense_units: ', [[[2137, 6]]])
('cluster_dense_units: ', [[[2138, 4]], [[2138, 5]]])
('cluster_dense_units: ', [[[2139, 6]], [[2139, 7]]])
('cluster_dense_units: ', [[[2140, 4]], [[2140, 5]]])
('cluster_dense_units: ', [[[2141, 4]]])
('cluster_dense_units: ', [[[2142, 5]]])
('cluster_dense_units: ', [[[2143, 2]], [[2143, 3]], [[2143, 4]]])
('cluster_dense_units: ', [[[2144, 3]], [[2144, 4]]])
('cluster_dense_units: ', [[[2145, 2]], [[2145, 3]]])
('cluster_dense_units: ', [[[2146, 2]], [[2146, 3]]])
('cluster_dense_units: ', [[[2147, 2]]])
('cluster_dense_units: ', [[[2148, 2]], [[2148, 3]

('cluster_dense_units: ', [[[2707, 6]]])
('cluster_dense_units: ', [[[2710, 5]]])
('cluster_dense_units: ', [[[2712, 6]]])
('cluster_dense_units: ', [[[2713, 4]], [[2713, 5]]])
('cluster_dense_units: ', [[[2717, 5]], [[2717, 6]]])
('cluster_dense_units: ', [[[2719, 6]]])
('cluster_dense_units: ', [[[2721, 4]]])
('cluster_dense_units: ', [[[2722, 5]]])
('cluster_dense_units: ', [[[2723, 5]], [[2723, 6]]])
('cluster_dense_units: ', [[[2724, 4]]])
('cluster_dense_units: ', [[[2725, 5]]])
('cluster_dense_units: ', [[[2726, 4]]])
('cluster_dense_units: ', [[[2727, 4]]])
('cluster_dense_units: ', [[[2728, 5]], [[2728, 6]]])
('cluster_dense_units: ', [[[2729, 5]]])
('cluster_dense_units: ', [[[2731, 5]]])
('cluster_dense_units: ', [[[2732, 5]], [[2732, 6]], [[2732, 7]]])
('cluster_dense_units: ', [[[2734, 5]], [[2734, 6]]])
('cluster_dense_units: ', [[[2735, 6]]])
('cluster_dense_units: ', [[[2736, 6]]])
('cluster_dense_units: ', [[[2737, 7]]])
('cluster_dense_units: ', [[[2742, 5]]])
('clust

('cluster_dense_units: ', [[[3986, 1]], [[3986, 2]], [[3986, 3]]])
('cluster_dense_units: ', [[[3987, 2]], [[3987, 3]]])
('cluster_dense_units: ', [[[3988, 5]]])
('cluster_dense_units: ', [[[3992, 4]], [[3992, 5]]])
('cluster_dense_units: ', [[[3993, 6]], [[3993, 7]]])
('cluster_dense_units: ', [[[3994, 3]], [[3994, 4]]])
('cluster_dense_units: ', [[[3995, 5]]])
('cluster_dense_units: ', [[[3996, 4]], [[3996, 5]]])
('cluster_dense_units: ', [[[3997, 2]]])
('cluster_dense_units: ', [[[3998, 2]]])
('cluster_dense_units: ', [[[3998, 4]]])
('cluster_dense_units: ', [[[3999, 5]]])
('cluster_dense_units: ', [[[4000, 5]]])
('cluster_dense_units: ', [[[4001, 4]], [[4001, 5]]])
('cluster_dense_units: ', [[[4002, 4]]])
('cluster_dense_units: ', [[[4003, 5]]])
('cluster_dense_units: ', [[[4004, 5]]])
('cluster_dense_units: ', [[[4005, 4]], [[4005, 5]]])
('cluster_dense_units: ', [[[4006, 4]]])
('cluster_dense_units: ', [[[4007, 4]], [[4007, 5]]])
('cluster_dense_units: ', [[[4008, 4]], [[4008, 5]