In [2]:
from sklearn import datasets
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from mpl_toolkits import mplot3d
import numpy as np
import time
import copy
import math
from sklearn.decomposition import PCA
import itertools

In [3]:
# Notebook-wide plotting defaults: (16, 9) figures drawn with the 'fast' style sheet.
plt.rcParams.update({'figure.figsize': (16, 9)})
plt.style.use('fast')

In [5]:
# --- Synthetic data sets ---------------------------------------------------
n_samples = 1500

# Three blobs of 500 samples each, every blob generated with cluster_std 1.5.
blobs = datasets.make_blobs(
    n_samples=[500] * 3,
    cluster_std=[1.5] * 3,
    random_state=3,
)

# Anisotropically distributed data: default blobs pushed through a fixed
# 2x2 linear map so the clusters are stretched and sheared.
random_state = 1
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.5, -0.6], [-0.4, 0.7]]
X_aniso = X.dot(transformation)
aniso = (X_aniso, y)
# skewed norm -- placeholder heading only; no code follows it in this cell

In [4]:
#Euclidean distance
def dist(a, b):
    """Return the norm of ``a - b`` using numpy's default ``ord``.

    With the default ``ord`` numpy computes the 2-norm for 1-D input and the
    Frobenius norm for 2-D input, i.e. the square root of the sum of squared
    element-wise differences in both cases.

    Parameters
    ----------
    a, b : numpy.ndarray
        Arrays that can be subtracted from one another.

    Returns
    -------
    float
        The distance between ``a`` and ``b``.
    """
    difference = a - b
    return np.linalg.norm(difference)

#Tau-distance
def dist_fun_vtau(a, m, tau):
    """Asymmetrically weighted squared distance from one point to every centre.

    For centre ``i`` and coordinate ``j`` the residual
    ``r = a[j] - m[i, j]`` contributes ``tau[i, j] * r**2`` when ``r >= 0``
    and ``(1 - tau[i, j]) * r**2`` when ``r < 0``. The contributions are
    summed over ``j``.

    Parameters
    ----------
    a : array-like, shape (n_features,)
        A single data point.
    m : array-like, shape (n_centres, n_features)
        Cluster centres.
    tau : array-like, shape (n_centres, n_features)
        Asymmetry weight for every centre and coordinate.

    Returns
    -------
    numpy.ndarray, shape (n_centres,)
        The weighted squared distance from ``a`` to each centre.
    """
    if len(m) == 0:
        # No centres: nothing to measure against.
        return np.zeros(0)

    point = np.asarray(a, dtype=float)
    centres = np.asarray(m, dtype=float)
    tau = np.asarray(tau, dtype=float)

    # Broadcasting gives one residual row per centre.
    residuals = point - centres
    # Non-negative residuals are weighted by tau, negative ones by 1 - tau.
    weights = np.where(residuals >= 0, tau, 1.0 - tau)
    return np.sum(weights * residuals ** 2, axis=1)

#Expectile estimation
def expectile_fun(group, tau, max_iter=1000):
    """Estimate the per-coordinate ``tau``-expectile of a set of points.

    Starting from the column means, the estimate is refined by repeatedly
    taking a weighted mean of every column in which points at or above the
    current estimate get weight ``tau[i]`` and points below it get weight
    ``1 - tau[i]``. Iteration stops when an update leaves the estimate
    unchanged or after ``max_iter`` updates.

    Unlike a loop that compares the estimate against an all-zero start
    value, the update is always performed at least once, so data whose mean
    is exactly zero is handled correctly.

    Parameters
    ----------
    group : array-like, shape (n_points, n_features)
        Points belonging to one cluster; must contain at least one point.
    tau : array-like, shape (n_features,)
        Expectile level for every coordinate.
    max_iter : int, optional
        Upper bound on the number of updates, so the loop always terminates
        (also when NaNs appear, e.g. because all weights in a column are 0).

    Returns
    -------
    numpy.ndarray, shape (n_features,)
        The estimated expectile of every coordinate (the last iterate if
        ``max_iter`` was reached first).

    Raises
    ------
    ValueError
        If ``group`` is not a non-empty 2-D array.
    """
    group = np.asarray(group, dtype=float)
    if group.ndim != 2 or group.shape[0] == 0:
        raise ValueError(
            "group must be a non-empty 2-D array of shape (n_points, n_features)"
        )
    tau = np.asarray(tau, dtype=float)

    estimate = group.mean(axis=0)
    for _ in range(max_iter):
        # Residuals are taken against the most recent estimate.
        weights = np.where(group - estimate >= 0, tau, 1.0 - tau)
        updated = (weights * group).sum(axis=0) / weights.sum(axis=0)
        converged = np.array_equal(updated, estimate)
        estimate = updated
        if converged:
            break
    return estimate

#Estimate optimal taus
def tau_fun(points, mu):
    """Estimate an asymmetry level ``tau`` for every coordinate of a cluster.

    For coordinate ``i`` the residuals ``points[:, i] - mu[i]`` are split
    into negative and non-negative ones. With ``e_neg`` the mean magnitude
    of the negative residuals and ``e_pos`` the mean of the non-negative
    ones, the estimate is ``c / (1 + c)`` with ``c = e_neg / e_pos``. It is
    evaluated in the equivalent form ``e_neg / (e_neg + e_pos)``, which stays
    finite when ``e_pos`` is 0.

    If all residuals of a coordinate lie on one side of ``mu[i]`` (for
    example in a single-point cluster) the ratio is undefined; the neutral
    value 0.5 is returned for that coordinate instead of dividing by zero.

    Parameters
    ----------
    points : array-like, shape (n_points, n_features)
        Points assigned to the cluster.
    mu : array-like, shape (n_features,)
        Current centre of the cluster.

    Returns
    -------
    list of float
        One tau per coordinate of ``mu``.
    """
    residuals = np.asarray(points, dtype=float) - np.asarray(mu, dtype=float)
    tau_list = []
    for i in range(len(mu)):
        res = residuals[:, i]
        below = res[res < 0]
        above = res[res >= 0]
        if below.size == 0 or above.size == 0:
            # One-sided residuals carry no information about the asymmetry.
            tau_list.append(0.5)
            continue
        e_neg = -below.sum() / below.size
        e_pos = above.sum() / above.size
        # e_neg > 0 here, so the denominator is never zero.
        tau_list.append(e_neg / (e_neg + e_pos))
    return tau_list

In [7]:
# K expectile clustering with unknown taus
def k_expectile_vtau(X, k, tol=0.05, max_iter=300, random_state=None, verbose=True):
    """K-expectile clustering in which the asymmetry levels (taus) are estimated.

    The centres are initialised with K-means and every tau with 0.5. Each
    iteration then

    1. assigns every point to the centre with the smallest value of
       ``dist_fun_vtau``,
    2. re-estimates, for every non-empty cluster, its taus with ``tau_fun``
       and its centre with ``expectile_fun``,
    3. measures how far the centres moved with ``dist``.

    Iteration stops once the movement is no longer ``>= tol`` or after
    ``max_iter`` iterations.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Data to cluster.
    k : int
        Number of clusters.
    tol : float, optional
        Convergence threshold on the movement of the centres.
    max_iter : int, optional
        Upper bound on the number of iterations.
    random_state : int or None, optional
        Passed to ``KMeans`` so the initialisation can be made reproducible.
    verbose : bool, optional
        If true, print the tau matrix and the centre movement after every
        iteration.

    Returns
    -------
    C : numpy.ndarray, shape (k, n_features)
        Final cluster centres.
    clusters : numpy.ndarray, shape (n_samples,)
        Index of the cluster every point was assigned to, stored as floats.
    """
    X = np.array(X)

    # Initialise cluster centres as K-means cluster centres. Work on a copy
    # so the fitted estimator's attribute is not modified in place.
    kmeans = KMeans(n_clusters=k, random_state=random_state)
    kmeans.fit(X)
    C = kmeans.cluster_centers_.copy()

    n_features = C.shape[1]
    clusters = np.zeros(len(X))
    # Initialise tau = 0.5 for every cluster and coordinate.
    tau_list = np.full((k, n_features), 0.5)

    for _ in range(max_iter):
        # Assign each point to its closest cluster (one evaluation per point).
        for i in range(len(X)):
            clusters[i] = np.argmin(dist_fun_vtau(X[i], C, tau_list))

        # Remember the centres so their movement can be measured.
        C_old = C.copy()

        # Update taus and centres cluster by cluster.
        for d in range(k):
            points = X[clusters == d]
            if len(points) == 0:
                # An empty cluster keeps its previous centre and taus.
                continue
            tau = tau_fun(points, C[d])
            C[d] = expectile_fun(points, tau)
            tau_list[d] = tau

        error = dist(C, C_old)
        if verbose:
            print(tau_list)
            print(error)
        # Written as a negation so a NaN error also ends the loop.
        if not (error >= tol):
            break
    return C, clusters

In [6]:
# Scratch check: np.sum adds up the entries of a plain Python list.
p = [2, 3, 4]
np.sum(p)

9