# Home

In [1]:
import warnings
import numpy as np 
import pandas as pd
from sklearn import datasets
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.cluster import contingency_matrix

In [2]:
warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True, linewidth=150, precision=2)

In [3]:
df = pd.read_csv("vk.csv")

In [4]:
df.columns

Index(['ID', 'countFriends', 'countFollowers', 'boolComments',
       'countOwnerPosts', 'countOwnerReposts', 'countPhotos', 'countVideos',
       'countLikesPhotoes', 'sex'],
      dtype='object')

In [5]:
X = np.array([df.countFriends, df.countFollowers, df.countOwnerReposts, df.countOwnerReposts, df.countPhotos, df.countVideos]).transpose()
y = np.array(df.sex)

# Normalization

In [16]:
# Where axis 0 columns and 1 rows
me = np.mean(X, axis=0) # mean 
ra = np.ptp(X, axis=0)  # max - min # range 
Y = np.divide(np.subtract(X, me), ra) # normilizad
print(f"mean:{me}")
print(f"range: {ra}")
print(f"Y: {Y}")

mean:[1459.55  807.1     6.15    6.15   16.69  361.58]
range: [ 9996 57037    20    20   275  7719]
Y: [[-0.14 -0.01  0.14  0.14 -0.05 -0.04]
 [-0.13 -0.01  0.39  0.39 -0.03 -0.04]
 [-0.11 -0.   -0.31 -0.31  0.08 -0.04]
 ...
 [-0.14 -0.01 -0.16 -0.16 -0.05 -0.02]
 [ 0.1  -0.01 -0.31 -0.31 -0.06 -0.05]
 [-0.14 -0.01 -0.11 -0.11 -0.05 -0.03]]


In [7]:
batch_size = 10
centers = np.array([Y[1, :], Y[52,:]]) # Выюираем рандом
n_clusters = len(centers)
n_clusters

2

# Apply K-Means (batch)

К = 5

К = 9

In [8]:
for i in range(5):
    
    mbk = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size, init=centers)
    mbk.fit(Y)  # Compute K-mean
    ms = mbk.labels_
    
    cluster1 = X[np.where(ms==0)]
    cluster2 = X[np.where(ms==1)]
    cluster3 = X[np.where(ms==2)]
    
    mc1 = np.mean(cluster1, axis=0)
    mc2 = np.mean(cluster2, axis=0)
    mc3 = np.mean(cluster3, axis=0)
    
    #center of class normalize
    d1 = 100*(np.divide(np.subtract(mc1, me), me))
    d2 = 100*(np.divide(np.subtract(mc2, me), me))
    d3 = 100*(np.divide(np.subtract(mc3, me), me))
    
    
    print("delta1:", d1, "#element:", cluster1.shape[0]) 
    print("delta2:", d2, "#element:", cluster2.shape[0])
    print("delta3:", d3, "#element:", cluster3.shape[0]) 
    print("inertia:", mbk.inertia_) # сумма квадратных ошибок чем меньше чем лучше
    cont_tb = contingency_matrix(labels_true=y, labels_pred=ms) #Как на самом деле (строки) и как мой алгоритм (столбцы) 
    print(cont_tb)
    
    print(" ")

delta1: [  3.09 -41.75 137.23 137.23   9.32  14.29] #element: 322
delta2: [ -1.47  19.89 -65.37 -65.37  -4.44  -6.81] #element: 676
delta3: [nan nan nan nan nan nan] #element: 0
inertia: 118.73563366321248
[[124 287]
 [198 389]]
 
delta1: [ -2.19 -42.91 127.28 127.28  11.11  10.93] #element: 355
delta2: [  1.21  23.69 -70.27 -70.27  -6.13  -6.04] #element: 643
delta3: [nan nan nan nan nan nan] #element: 0
inertia: 118.37173804757661
[[140 271]
 [215 372]]
 
delta1: [ -3.49 -42.53 138.92 138.92  10.57  15.04] #element: 317
delta2: [  1.62  19.8  -64.66 -64.66  -4.92  -7.  ] #element: 681
delta3: [nan nan nan nan nan nan] #element: 0
inertia: 117.45342228152116
[[123 288]
 [194 393]]
 
delta1: [ -5.41 -43.93 127.83 127.83  11.57  11.11] #element: 353
delta2: [  2.96  24.04 -69.96 -69.96  -6.33  -6.08] #element: 645
delta3: [nan nan nan nan nan nan] #element: 0
inertia: 118.84516910868788
[[140 271]
 [213 374]]
 
delta1: [ -3.49 -42.53 138.92 138.92  10.57  15.04] #element: 317
delta2: [ 

In [9]:
for i in range(9):
    
    mbk = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size, init=centers)
    mbk.fit(Y)  # Compute K-mean
    ms = mbk.labels_
    centers = mbk.cluster_centers_
    
    cluster1 = X[np.where(ms==0)]
    cluster2 = X[np.where(ms==1)]
    cluster3 = X[np.where(ms==2)]
    
    mc1 = np.mean(cluster1, axis=0)
    mc2 = np.mean(cluster2, axis=0)
    mc3 = np.mean(cluster3, axis=0)
    
    #center of class normalize
    d1 = 100*(np.divide(np.subtract(mc1, me), me))
    d2 = 100*(np.divide(np.subtract(mc2, me), me))
    d3 = 100*(np.divide(np.subtract(mc3, me), me))
    
    
    print("delta1:", d1, "#element:", cluster1.shape[0]) 
    print("delta2:", d2, "#element:", cluster2.shape[0])
    print("delta3:", d3, "#element:", cluster3.shape[0]) 
    print("inertia:", mbk.inertia_) # сумма квадратных ошибок чем меньше чем лучше
    cont_tb = contingency_matrix(labels_true=y, labels_pred=ms) #Как на самом деле (строки) и как мой алгоритм (столбцы) 
    print(cont_tb)
    print(f"center: \n{centers}")
    print()
    
    print(" ")

delta1: [ -3.49 -42.53 138.92 138.92  10.57  15.04] #element: 317
delta2: [  1.62  19.8  -64.66 -64.66  -4.92  -7.  ] #element: 681
delta3: [nan nan nan nan nan nan] #element: 0
inertia: 117.58776481310824
[[123 288]
 [194 393]]
center: 
[[ 0.   -0.01  0.44  0.44  0.02  0.01]
 [ 0.    0.   -0.2  -0.2   0.   -0.  ]]

 
delta1: [ -5.46 -43.83 128.1  128.1    9.31  11.42] #element: 352
delta2: [  2.98  23.88 -69.8  -69.8   -5.08  -6.22] #element: 646
delta3: [nan nan nan nan nan nan] #element: 0
inertia: 119.33390869978231
[[140 271]
 [212 375]]
center: 
[[-0.03 -0.01  0.4   0.4  -0.01  0.01]
 [-0.03 -0.   -0.22 -0.22  0.   -0.01]]

 
delta1: [ -2.19 -42.91 127.28 127.28  11.11  10.93] #element: 355
delta2: [  1.21  23.69 -70.27 -70.27  -6.13  -6.04] #element: 643
delta3: [nan nan nan nan nan nan] #element: 0
inertia: 118.56802406650945
[[140 271]
 [215 372]]
center: 
[[ 0.01 -0.    0.38  0.38  0.    0.01]
 [ 0.02  0.   -0.22 -0.22 -0.    0.  ]]

 
delta1: [ -3.49 -42.53 138.92 138.92  10

In [33]:
np.ptp([
        [1, 8, 6, 4],
        [10, 1, 3, 7],
        [2, 2, 3, 2],
            [2, 2, 3, 2],
            [2, 2, 3, 2],
            [2, 2, 3, 2],
            [2, 2, 3, 2],
       ], axis = 0)

array([9, 7, 3, 5])

In [34]:
import cv2 