In [9]:
import pandas as pd
import numpy as np

import time
import copy

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# from sklearn.cluster import Birch
# import matplotlib.pyplot as plt




In [10]:
def k_means_model_with_best_sil_score(X, random_seed = 0, kmax = 10):
  highest_score = 0
  k_means_model = None

  for ks in range(2, kmax+1):
    kmeans = KMeans(n_clusters = ks, random_state=random_seed).fit(X)
    cur_score = silhouette_score(X, kmeans.labels_, metric = 'euclidean')
    if cur_score > highest_score:
      highest_score = cur_score
      k_means_model = kmeans

  return k_means_model

In [11]:
# data structure
class node:
    def __init__(self, id = None, entity = None, center = None, index = None, up_lv = None, down_lv = None):
        self.id = id # unique id for node
        self.entity = entity # sorted center by value with column name
        self.center = center # cluster center (vector)
        self.index = index # cashe of search result, first 100? result close to center (list of index)
        self.up_lv = up_lv # super-cluster (node)
        self.down_lv = down_lv # sub-cluster (lsit of node)

class sim:
    def l1(vector_1, vector_2):
        return sum(abs(vector_1-vector_2))

    def l2(vector_1, vector_2):
        return sum((vector_1-vector_2)**2)
             

# graph tree (cluster)
class graph_tree:
    def __init__(self, attr_name, X, X_maxmin, min_cluster_size, searhing_query = None, recom = None):
        self.attr_name = attr_name
        self.X = X
        self.X_maxmin = X_maxmin
        self.min_cluster_size = min_cluster_size
        self.searhing_query = searhing_query
        self.recom = recom
        
    
        # create root node
        init_center = np.mean(X, axis=0)
        init_idx = np.array([i for i in range(len(X))])
        self.root = node(id=0, entity=sorted(list(zip(self.attr_name, init_center))), center=init_center, index=init_idx[np.argsort(np.sum(abs(X-init_center), axis=1))])
        del (init_center, init_idx)
        print("root node is created")
    
    def search(self, query_vector):
        self.searhing_query = query_vector
        cur = self.root
        
        while cur.down_lv:
            cur_center_distance = sim.l2(query_vector, cur.center)
            # print(cur_center_distance)

            closest = cur_center_distance
            closest_node_position = -1

            for node in range(len(cur.down_lv)):
                if sim.l2(query_vector, cur.down_lv[node].center) < closest:
                    closest = sim.l2(query_vector, cur.down_lv[node].center)
                    closest_node_position = node

            if closest_node_position < 0:
                break
            
            cur = cur.down_lv[closest_node_position]

        return cur
        


    def build_tree(self):
        
        recom_max = self.root.center.copy()
        recom_min = self.root.center.copy()
        recom = [[self.root, self.root] for _ in range(len(self.attr_name))]
        print(recom_max, recom_min, recom)
        
        queue = []
        queue.append(self.root)
        
        counting_id = 1
        lv = 0

        while queue:
            size = len(queue)
            print("queue", size, queue)
            print(f"\n---------------------------------- level {lv} no of node {size} ----------------------------------------")
            for node_in_this_lv in range(size):
                print(f"\nhandling node {node_in_this_lv}, info:")
                # k means required parameter and train the model
                cur = queue.pop(0) # get cur node
                print(f"cur node {cur}")
                print(f"no. of cur node data {len(cur.index)}")
                if len(cur.index) > self.min_cluster_size:
                    cur_X = self.X[cur.index] # get cur data by index
                    k_means_model = k_means_model_with_best_sil_score(cur_X) # get kmeans model
                    print(f"optimal K {k_means_model.n_clusters}\n")

                    # # relationship
                    # attr_no = 3
                    # largest_indices = np.argpartition(np.std(k_means_model.cluster_centers_, axis=0), -attr_no)[-attr_no:]
                    # print(largest_indices)

                    # relationship = {}
                    # np.argpartition(np.std(k_means_model.cluster_centers_, axis=0), -attr_no)[-attr_no:]
                    # for i in largest_indices:
                    #     tmp_entity[self.attr_name[i]]=k_means_model.cluster_centers_[groups][i]

                    # create required node
                    new_down_lv = [node(id=i+counting_id, entity=sorted(list(zip(self.attr_name, k_means_model.cluster_centers_[i])), key=lambda item: item[1], reverse=True), center=k_means_model.cluster_centers_[i], up_lv=cur) for i in range(k_means_model.n_clusters)]
                    
                    # sorting the index
                    #cal the distance between each data to coresponding center
                    distance_to_self_center = []
                    for i in range(len(cur_X)):
                        distance_to_self_center.append(sim.l1(cur_X[i], k_means_model.cluster_centers_[k_means_model.labels_[i]]))
                    
                    sorted_indices = np.argsort(np.array(distance_to_self_center))
                    sorted_index = cur.index[sorted_indices]
                    sorted_labels = k_means_model.labels_[sorted_indices]

                    for i in range(k_means_model.n_clusters):
                        new_down_lv[i].index = sorted_index[np.where(sorted_labels == i)[0]]

                    new_down_lv.sort(key=lambda x: sim.l1(x.center, cur.center))
                    cur.down_lv = new_down_lv
                    
                    queue += new_down_lv
                    # print(f"down lv of this node {new_down_lv}")

                    counting_id += k_means_model.n_clusters
                else:
                    for i in range(len(cur.center)):
                        if cur.center[i] >= recom_max[i]:
                            recom_max[i] = cur.center[i]
                            recom[i][0] = cur
                            print(f"in {self.attr_name[i]} max")
                            print(recom)
                        elif cur.center[i] <= recom_min[i]:
                            recom_min[i] = cur.center[i]
                            recom[i][1] = cur
                            print(f"in {self.attr_name[i]} min")
                            print(recom)
                    

            lv += 1
            
        self.recom = recom

        return 0
    
    def print_tree(self, simple=1):
        queue = [self.root]
        lv = 0
        while queue:
            print(f"---------- lv: {lv}, node num.: {len(queue)} ----------")
            new_queue = []
            for n in queue:
                if simple == 0:
                    print(n.id)
                if n.down_lv:
                    new_queue += n.down_lv
            queue = new_queue
            lv += 1

    # def flip(self):
        



In [12]:
raw = pd.read_csv(r'../../Data/train.csv')
# print(raw)

random_seed = 42

# print(list(raw.columns)[:])

In [13]:
data = raw.iloc[:,2:-1]
data = data.fillna(0)
data = data[:5000]
print(data.shape)
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(data)

# label
y = raw.iloc[:,-1]
# print(y)

col_name = list(raw.columns)[2:-1]
print(col_name)
# print(normalized_data)

(5000, 14)
['Popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_in min/ms', 'time_signature']


In [14]:
data_min = scaler.data_min_
data_max = scaler.data_max_

print(data_min, data_max)

[ 0.0000e+00  5.9900e-02  3.0100e-03  0.0000e+00 -3.6214e+01  0.0000e+00
  2.2500e-02  0.0000e+00  0.0000e+00  1.3600e-02  2.5900e-02  3.4132e+01
  5.0165e-01  1.0000e+00] [9.900000e+01 9.820000e-01 9.980000e-01 1.100000e+01 1.355000e+00
 1.000000e+00 9.370000e-01 9.960000e-01 9.960000e-01 9.920000e-01
 9.850000e-01 2.160910e+02 1.412451e+06 5.000000e+00]


In [15]:
test_tree = graph_tree(attr_name = col_name, X = normalized_data, X_maxmin=[scaler.data_max_,scaler.data_min_], min_cluster_size=200)
# test_tree.show_root()
test_tree.build_tree()

root node is created
[0.43906465 0.52589225 0.66144496 0.47741818 0.7539839  0.639
 0.06208811 0.25001819 0.13598254 0.1839972  0.47853415 0.48415781
 0.14202648 0.7291    ] [0.43906465 0.52589225 0.66144496 0.47741818 0.7539839  0.639
 0.06208811 0.25001819 0.13598254 0.1839972  0.47853415 0.48415781
 0.14202648 0.7291    ] [[<__main__.node object at 0x0000020D5968F5C0>, <__main__.node object at 0x0000020D5968F5C0>], [<__main__.node object at 0x0000020D5968F5C0>, <__main__.node object at 0x0000020D5968F5C0>], [<__main__.node object at 0x0000020D5968F5C0>, <__main__.node object at 0x0000020D5968F5C0>], [<__main__.node object at 0x0000020D5968F5C0>, <__main__.node object at 0x0000020D5968F5C0>], [<__main__.node object at 0x0000020D5968F5C0>, <__main__.node object at 0x0000020D5968F5C0>], [<__main__.node object at 0x0000020D5968F5C0>, <__main__.node object at 0x0000020D5968F5C0>], [<__main__.node object at 0x0000020D5968F5C0>, <__main__.node object at 0x0000020D5968F5C0>], [<__main__.nod

0

In [16]:
import pickle

with open("graph_tree.pkl", 'wb') as f:
    pickle.dump(test_tree, f)

In [17]:
seaching = np.mean(normalized_data[100:101], axis=0)
print(seaching)
cur = test_tree.search(seaching)

[0.24242424 0.46751979 0.5035126  0.36363636 0.60243818 0.
 0.03728814 0.62751004 0.79718876 0.21300082 0.12417892 0.69124913
 0.18320677 0.75      ]


In [18]:
def super_cluster(cur):
    # super cluster
    print(f"super cluster")
    if cur.up_lv:
        print(cur.up_lv.id)
    else:
        print(f"there is no super cluster")
    print("\n", end="")

def neibour_cluster(cur):
    # neibour in the same lv
    print(f"neibour in the same lv")
    if cur.up_lv:
        if len(cur.up_lv.down_lv) > 1:
            for neibour in cur.up_lv.down_lv:
                if neibour != cur:
                    print(neibour.id)
        else:
            print(f"there is no neibour")
    else:
        print(f"there is no neibour")
    print("\n", end="")

def sub_cluster(cur):
    # sub cluster
    print(f"sub cluster")
    if cur.down_lv:
        for sub in cur.down_lv:
            print(sub.id)
    else:
        print(f"there is no sub cluster")
    print("\n", end="")


def suround_cluster(cur):
    # cur cluster
    print(f"current cluster")
    print(cur.id)
    print("\n", end="")
    
    super_cluster(cur)
    neibour_cluster(cur)
    sub_cluster(cur)
    

In [19]:
# specific searching
# cur = cur.down_lv[0]

#generic searching
# cur = cur.up_lv

#simular searching
# cur = cur.up_lv.down_lv[1]

suround_cluster(cur)

current cluster
15

super cluster
6

neibour in the same lv
16

sub cluster
there is no sub cluster



In [20]:
print(normalized_data.shape)
print(np.mean(normalized_data[:100], axis=0))

print(test_tree.root.center)

(5000, 14)
[0.42919192 0.53548422 0.6287219  0.45818182 0.74335117 0.72
 0.05481575 0.24029304 0.13223035 0.16829518 0.49376186 0.47735809
 0.13475585 0.72      ]
[0.43906465 0.52589225 0.66144496 0.47741818 0.7539839  0.639
 0.06208811 0.25001819 0.13598254 0.1839972  0.47853415 0.48415781
 0.14202648 0.7291    ]


In [21]:
from collections import Counter

"""
create a new index (something like tf-idf)
data reduction rate
vs
accurate
"""
def score(search_data, acc, smoothing = 0.001, lam = 0.5):
    return lam*(acc/100)+(1-lam)*(1-(search_data/(1000+smoothing)))

my_score = []
normal_kmeans_score = []

tmp = [i*100 for i in range(10)]
for i in tmp:
    b = i
    u = i+100
    # print(normalized_data[b:u])
    print(f"-- mine result{i}")
    seaching = np.mean(normalized_data[b:u], axis=0)
    result = test_tree.search(seaching)
    
    print("search number: ", len(result.index))
    right = np.where((result.index>=b) & (result.index<u))[0] # return index
    print("hitted right result:", len(right))
    my_score.append(score(len(result.index), len(right)))

    ############
    print("-- kmeans result")
    checking = KMeans(n_clusters=10, random_state=0).fit(normalized_data)

    counts1 = Counter(checking.labels_)
    print(counts1)

    counts2 = Counter(checking.labels_[b:u])
    print(counts2)
    belongs_group = counts2.most_common()[0][0]
    print("belongs to group: ", belongs_group, "\n")
    # print(counts1.get(belongs_group), counts2.get(belongs_group))
    normal_kmeans_score.append(score(counts1.get(belongs_group), counts2.get(belongs_group)))

    

-- mine result0
search number:  5000
hitted right result: 100
-- kmeans result
Counter({np.int32(9): 944, np.int32(0): 659, np.int32(3): 656, np.int32(1): 619, np.int32(4): 607, np.int32(7): 577, np.int32(5): 299, np.int32(8): 281, np.int32(6): 191, np.int32(2): 167})
Counter({np.int32(9): 22, np.int32(1): 17, np.int32(0): 13, np.int32(4): 12, np.int32(7): 10, np.int32(3): 9, np.int32(8): 6, np.int32(5): 5, np.int32(2): 5, np.int32(6): 1})
belongs to group:  9 

-- mine result100
search number:  5000
hitted right result: 100
-- kmeans result
Counter({np.int32(9): 944, np.int32(0): 659, np.int32(3): 656, np.int32(1): 619, np.int32(4): 607, np.int32(7): 577, np.int32(5): 299, np.int32(8): 281, np.int32(6): 191, np.int32(2): 167})
Counter({np.int32(3): 17, np.int32(9): 15, np.int32(4): 15, np.int32(7): 11, np.int32(5): 10, np.int32(1): 10, np.int32(0): 9, np.int32(6): 6, np.int32(8): 5, np.int32(2): 2})
belongs to group:  3 

-- mine result200
search number:  5000
hitted right result: 100

In [22]:
print(my_score)
print(normal_kmeans_score)

print(sum(my_score))
print(sum(normal_kmeans_score))

[-1.4999975000024999, -1.4999975000024999, -1.4999975000024999, -1.4999975000024999, -1.4999975000024999, -1.4999975000024999, -1.4999975000024999, -1.4999975000024999, -1.4999975000024999, -1.4999975000024999]
[0.13800047199952797, 0.25700032799967204, 0.13300047199952797, 0.13300047199952797, 0.27550032949967046, 0.12300047199952799, 0.2965002884997115, 0.28650030349969646, 0.12300047199952799, 0.14300047199952798]
-14.999975000025
1.9085040814959182
