In [32]:
#imports
import numpy as np
from scipy.stats import chisquare
from scipy.stats import poisson
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import math
from sklearn.mixture import GaussianMixture # For Expectation maximisation algorithm

In [33]:
# Load Data
data = np.genfromtxt('scale-d-10d.csv', delimiter=' ')
data = data[:,0:9]

labels = np.genfromtxt('scale-d-10d.csv', delimiter=' ',dtype="|U5")
labels = labels[:,10]

In [34]:
#dependencies: numpy, scipy (for scipy.stats.chisquare)
class P3C:
    
    #class variables
    _alpha = 0.001 #alpha for chi-squared-test

    #Poisson threshold is the only parameter for P3C 
    def __init__(self, poisson_threshold): 
        
        #sklearn-like attributes
        self.labels_ = None
        self.cluster_centers_ = None #"cluster cores"
        
        #internally used variables
        self._poisson_threshold = poisson_threshold
        self._support_set = []
        self._supports = []
        self._approx_proj = []   #nested list: 1st level: attributes, 2nd level:
                                 #different intervals, 3rd level: start and end of inteval.
                                 #used as interface between part 3.1 and 3.2
    
    
    # Methods for 3.1: Projections of true p-signatures (Mahdi and Robert)
    
    def __compute_support(self, M):
        '''Computes Supports of each bin
        This function computes support set of each interval S and its support.
        then assigns the values to self._support_set = [] and self._supports = []
        SupportSet(S)= {x ∈ D | x.aj ∈ S }
        Support(S) = |SupportSet(S)|
        
            
        Parameters
        ----------
        self, 
        
        M : numpy.array 
        
        Returns
        -------
        
        '''
        n = M.shape[0] # n = number of data objects
        attribute_number = M.shape[1] # number of attributes 
        bin_number = int(1 + math.log(n,2)) # number of bins

        for i in range(attribute_number):
            supp_set = [[]for i in range(bin_number)] # a set containing supports of an attr
                                                      # in different bins

            aj_max = 1
            aj_min = -1

            interval_length = (aj_max - aj_min ) / bin_number

            bins = np.zeros([bin_number])

            for k in range(bin_number):
                bins[k] = aj_min + (k+1)*interval_length

            for j in range(n):
                supp_set_index = 0
                for k in range(len(bins)):
                    if M[j,i] > bins[k]:
                        supp_set_index += 1

                
                supp_set[supp_set_index].append(M[j])
                


            supp = []
            for supp_pts in supp_set:
                supp.append (len(supp_pts))
            self._supports.append (supp)
            self._support_set.append(supp_set)
            
            
        # Uncomment the part below for projections
#         for i in range (len(self._supports)):
#             for j in range(len(self._supports[i])):
#                 print(len(self._supports[i][j])) 
                
        return
    
    def __approximate_projection(self):
        
        bins = np.zeros((len(self._supports), len(self._supports[0])), dtype=int)
        for attr_number in range(len(self._supports)): #loop over all attributes
                
            # part 1: create bin array            
            supp = self._supports[attr_number].copy() #make a copy of the supports of the current attribute
            while self.__uniformity_test(supp) == False: #if not uniform find highest element
                max_index = supp.index(max(supp))
                supp.pop(max_index) #remove highest element from list
                
                i = 1 
                while i <= max_index: #loop to adjust max_index according to previousely deleted elements
                    max_index += bins[attr_number, i]
                    i += 1
                bins[attr_number, max_index]  = 1 #mark highest bin
            
            #part 2: create _approx_proj list from bin array
            interval_list = [] #2d list for current attribute
            interval = [] #current interval
            open_interval = False
            
            for i in range(len(bins[attr_number])):
                if open_interval == False: #open new interval
                    if bins[attr_number, i] == 1:
                        interval.append(i)
                        open_interval = True
                if open_interval == True: #close current interval
                    if bins[attr_number, i] == 0:
                        interval.append(i)
                        interval_list.append (interval)
                        interval = []
                        open_interval = False
                    if (i == len(bins[attr_number])-1) and (bins[attr_number, i] == 1): #last bin marked 1
                        interval.append (len(bins[attr_number]))
                        interval_list.append (interval)                
        
            self._approx_proj.append (interval_list)
        
    def __uniformity_test(self, attr):
        if chisquare(attr)[0] < self._alpha:
            #print ("uniform")
            return True
        #print ("non-uniform")
        return False  
    
    # Methods for 3.3: Cluster Cores (Akshey and Jonas)
    
    def __compute_support_sig(p_signature, dataset):
        '''Computes support for p-signature
        This function computes the support by removing data points
        that do not lie in any of the intervals of the given p-signature
            
        Parameters
        ----------
        p_signature : dictronary e.g. {0:[0,0.1], 3:[0.1,0.2]} -> Intervals for attributes 0 and 3
        
        dataset : numpy.ndarray 
        
        Returns
        -------
        data.shape[0] : number of points in p-signature
        
        '''
        
        data = np.copy(dataset)
        for attribute in p_signature:
            interval = p_signature[attribute]
            remove = []
            for i, point in enumerate(data):
                if  interval[0] > point[attribute] or point[attribute] > interval[1]:
                    remove.append(i)
            data = np.delete(data, remove, 0)
        return data.shape[0]
    
    
    def __compute_exp_support(p_signature, interval, data):
        ''' Computes expected support for a p-signature
            
        Parameters
        ----------
        p-signature : dictronary e.g. {0:[0,0.1], 3:[0.1,0.2]} -> Intervals for attributes 0 and 3
            
        interval : list with start and end value of interval
        
        data : normalized np.ndarray
    
        Returns
        -------
        support * width : 
        
        '''
        
        support = __compute_support_sig(p_signature, data)
        width = abs(interval[0] - interval[1])
        return support*width
    
    
    def __diff_interval(p_signature, pplus1_signature):
        '''Helper function to compute difference in interval for two p-signatures.
           Used for possion threshold
           
        Parameters
        ---------- 
        p_signature : dictronary e.g. {0:[0,0.1], 3:[0.1,0.2]} -> Intervals for attributes 0 and 3
         
        pplus1_signature : dictronary e.g. {0:[0,0.1], 3:[0.1,0.2]} -> Intervals for attributes 0 and 3
           
        Returns
        -------
        interval : 
        
        '''
        
        diff = list(set(pplus1_signature) - set(p_signature))
        interval = pplus1_signature[diff[0]]
        return interval

 
    def __check_core_condition(p_signature, pplus1_signature, dataset, threshold=1e-20):
        ''' Checks if probability is smaller than possion threshold: 
        Possion(Supp(k+1 signature), ESupp(k+1 signature)) < possion_threshold
        Returns True is poisson value is smaller than threshold

        and

        Checks if support is larger than expected support: 
        Supp(k+1 signature) > ESupp(k+1 siganature)
        ESupp = Supp(S) * width(S')
        
        Parameters
        ----------
        p-signatue : dictronary e.g. {0:[0,0.1], 3:[0.1,0.2]} -> Intervals for attributes 0 and 3
        
        pplus1_signature : dictronary e.g. {0:[0,0.1], 3:[0.1,0.2]} -> Intervals for attributes 0 and 3
        
        dataset : numpy.ndarray
        
        threshold : poisson_threshold -> defined by user. default: 1e-20
        
        Returns
        -------
        true/false : 
        
        '''
        
        interval = diff_interval(p_signature, pplus1_signature)
        support = __compute_support_sig(pplus1_signature, dataset)
        expected_support = __compute_exp_support(pplus1_signature, interval, dataset)
        base_condition = support > expected_support
        if base_condition:
            possion_value = poisson.pmf(support, expected_support) 
            if poisson_value < threshold:
                return True
    
    
    def __apriori_cores(approx_proj, supports):
        ''' Computes cluster cores in apriori fashion. 
        The function computes maximal p-signatures that fulfill 
        two conditions. 

        Parameters
        ----------
        approx_proj :

        supports : 

        Returns
        -------
        max_p_signatures : 

        '''
        
          # Loop through attributes and intervals (ignore same dimensions)

            # Compute k+1 signatures from valid k signatures

            # Check condition 1 for each signature (check_supp_expected_supp())

            # Check condition 2 for each signature

            # Prune away infrequent k+1 signatures

          # Select maximal p-signatures

        pass
    
     # Methods for 3.3: Computing projected clusters (Manju)
    def __fuzzy_membership_matrix(cluster_core_i,data): 
        '''
        Refines the cluster cores into projected clusters
        Parameters
        ----------
        cluster_core_i:
        
        data:
        
        Returns
        -------
        fuzzy_membership_matrix:
        '''
        fuzzy_membership_matrix=[]
        for i in range(1,n):
            for l in range(1,k):
                if (i in data and l in support_set(cluster_core_i)):
                    if(i not in support_set(cluster_core_i)): 
                        fuzzy_membership_matrix=0
                       # unassigned_datapoints=cluster_core_i.append(i)
                    elif (i in support_set(cluster_core_i)):
                        fuzzy_membership_matrix=(1/support_set(cluster_core_i))[data]
        
        return fuzzy_membership_matrix   
    
                
    def __probability_of_datapoint(fuzzy_membership_matrix,max_iterations):
        '''
          For each data point compute the probability of belonging to each projected cluster using Expectation 
          Maximization(EM)algorithm.
          Parameters
          ----------
          fuzzy_membership_matrix:
          
          max_iterations:
          
          Returns
          -------
          probability_matrix:
        
        '''
        # initialise EM with fuzzy_membership_matrix.cluster members have shorter mahalanobis distances to cluster
        # means than non-cluster members
        max_iterations=10
        gaussian_mixture = GaussianMixture(n_components=2, covariance_type='full').fit(fuzzy_membership_matrix)
        gaussian_mixture.means_
        gaussian_mixture.fit()
        label=gaussian_mixture.predict(fuzzy_membership_matrix)
        
        return gaussian_mixture
        


    #data X is numpy.ndarray with samples x features (no label!)  
    def fit (self, X):
        #all the method calls of 3.1 and 3.2 we have to implement go here...
        self.__compute_support(X)
        self.__approximate_projection()
        
        #self.cluster_center_ = ... #used as interface between part 3.2. and 3.3
        
    #data X is numpy.ndarray with samples x features (no label!)  
    def predict (self, X):
        pass #remove pass when implementing predict (X)
        
        #all the method calls of 3.3, 3.4 and 3.5 we have to implement go here...
        
        #self.labels_ = #final result of the algorithm.
        
    #data X is numpy.ndarray with samples x features (no label!)  
    def fit_predict (self, X):
        self.fit (X)
        self.predict (X)

In [29]:
p3c = P3C (10)
p3c.fit (data)
print (p3c._approx_proj)

[[[1, 14]], [[1, 14]], [[0, 1], [2, 14]], [[0, 13]], [[1, 14]], [[1, 13]], [[1, 14]], [[2, 14]], [[1, 14]]]


  terms = (f_obs.astype(np.float64) - f_exp)**2 / f_exp
  f_exp = f_obs.mean(axis=axis, keepdims=True)


In [35]:
def preprocess(data_add):
    normalized_data = []
    labels = []
    
    
    csv = np.genfromtxt(data_add, delimiter=' ')
    num_rows = csv.shape[0]
    num_cols = csv.shape[1]
    data = csv[:,0:num_cols-1]

    normalized_data = preprocessing.normalize(data)
    labels = np.loadtxt(data_add, delimiter=' ', dtype=str, usecols = range(num_cols-1,num_cols))
    
    return normalized_data, labels

In [36]:
normalized_data, labels = preprocess('scale-d-10d.csv')
p3c = P3C (10)
p3c.fit (normalized_data)
print (p3c._approx_proj)

[[[0, 1], [2, 14]], [[0, 14]], [[0, 14]], [[0, 1], [2, 13]], [[0, 14]], [[0, 14]], [[1, 14]], [[0, 14]], [[0, 1], [2, 13]], [[0, 14]]]


  terms = (f_obs.astype(np.float64) - f_exp)**2 / f_exp
  f_exp = f_obs.mean(axis=axis, keepdims=True)
