# Home

In [1]:
import warnings
import numpy as np 
import pandas as pd
from sklearn import datasets
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.cluster import contingency_matrix

In [2]:
warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True, linewidth=150, precision=2)

In [3]:
df = pd.read_csv("vk.csv")

In [4]:
df.columns

Index(['ID', 'countFriends', 'countFollowers', 'boolComments',
       'countOwnerPosts', 'countOwnerReposts', 'countPhotos', 'countVideos',
       'countLikesPhotoes', 'sex'],
      dtype='object')

In [5]:
X = np.array([df.countFriends, df.countFollowers, df.countOwnerReposts, df.countOwnerReposts, df.countPhotos, df.countVideos]).transpose()
y = np.array(df.sex)

# Normalization

In [6]:
# Where axis 0 columns and 1 rows
me = np.mean(X, axis=0) # mean 
ra = np.ptp(X, axis=0)  # range 
Y = np.divide(np.subtract(X, me), ra) # normilizad
Y.shape

(998, 6)

In [7]:
batch_size = 10
centers = np.array([Y[1, :], Y[52,:]]) # Выюираем рандом
n_clusters = len(centers)
n_clusters

2

# Apply K-Means (batch)

К = 5

К = 9

In [8]:
for i in range(5):
    
    mbk = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size, init=centers)
    mbk.fit(Y)  # Compute K-mean
    ms = mbk.labels_
    
    cluster1 = X[np.where(ms==0)]
    cluster2 = X[np.where(ms==1)]
    cluster3 = X[np.where(ms==2)]
    
    mc1 = np.mean(cluster1, axis=0)
    mc2 = np.mean(cluster2, axis=0)
    mc3 = np.mean(cluster3, axis=0)
    
    #center of class normalize
    d1 = 100*(np.divide(np.subtract(mc1, me), me))
    d2 = 100*(np.divide(np.subtract(mc2, me), me))
    d3 = 100*(np.divide(np.subtract(mc3, me), me))
    
    
    print("delta1:", d1, "#element:", cluster1.shape[0]) 
    print("delta2:", d2, "#element:", cluster2.shape[0])
    print("delta3:", d3, "#element:", cluster3.shape[0]) 
    print("inertia:", mbk.inertia_) # сумма квадратных ошибок чем меньше чем лучше
    cont_tb = contingency_matrix(labels_true=y, labels_pred=ms) #Как на самом деле (строки) и как мой алгоритм (столбцы) 
    print(cont_tb)
    
    print(" ")

delta1: [ -3.49 -42.53 138.92 138.92  10.57  15.04] #element: 317
delta2: [  1.62  19.8  -64.66 -64.66  -4.92  -7.  ] #element: 681
delta3: [nan nan nan nan nan nan] #element: 0
inertia: 117.80346282351572
[[123 288]
 [194 393]]
 
delta1: [ -4.38 -42.64 139.21 139.21   6.41  15.39] #element: 316
delta2: [  2.03  19.76 -64.5  -64.5   -2.97  -7.13] #element: 682
delta3: [nan nan nan nan nan nan] #element: 0
inertia: 118.42685598514242
[[123 288]
 [193 394]]
 
delta1: [ -3.43 -42.66 138.58 138.58  13.07  14.68] #element: 318
delta2: [  1.61  19.95 -64.8  -64.8   -6.11  -6.86] #element: 680
delta3: [nan nan nan nan nan nan] #element: 0
inertia: 118.22005239869996
[[123 288]
 [195 392]]
 
delta1: [ -6.85 -42.41 140.1  140.1    7.2   14.99] #element: 313
delta2: [  3.13  19.38 -64.02 -64.02  -3.29  -6.85] #element: 685
delta3: [nan nan nan nan nan nan] #element: 0
inertia: 119.05297759155808
[[123 288]
 [190 397]]
 
delta1: [ -7.29 -43.14 142.24 142.24   0.08  14.45] #element: 306
delta2: [ 

In [14]:
for i in range(9):
    
    mbk = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size, init=centers)
    mbk.fit(Y)  # Compute K-mean
    ms = mbk.labels_
    centers = mbk.cluster_centers_
    
    cluster1 = X[np.where(ms==0)]
    cluster2 = X[np.where(ms==1)]
    cluster3 = X[np.where(ms==2)]
    
    mc1 = np.mean(cluster1, axis=0)
    mc2 = np.mean(cluster2, axis=0)
    mc3 = np.mean(cluster3, axis=0)
    
    #center of class normalize
    d1 = 100*(np.divide(np.subtract(mc1, me), me))
    d2 = 100*(np.divide(np.subtract(mc2, me), me))
    d3 = 100*(np.divide(np.subtract(mc3, me), me))
    
    
    print("delta1:", d1, "#element:", cluster1.shape[0]) 
    print("delta2:", d2, "#element:", cluster2.shape[0])
    print("delta3:", d3, "#element:", cluster3.shape[0]) 
    print("inertia:", mbk.inertia_) # сумма квадратных ошибок чем меньше чем лучше
    cont_tb = contingency_matrix(labels_true=y, labels_pred=ms) #Как на самом деле (строки) и как мой алгоритм (столбцы) 
    print(cont_tb)
    print(f"center: \n{centers}")
    print()
    
    print(" ")

delta1: [ -2.39 -42.92 126.96 126.96  12.55  15.43] #element: 356
delta2: [  1.32  23.8  -70.4  -70.4   -6.96  -8.56] #element: 642
delta3: [nan nan nan nan nan nan] #element: 0
inertia: 120.17489454585954
[[141 270]
 [215 372]]
center: 
[[ 0.   -0.    0.35  0.35  0.01  0.02]
 [ 0.   -0.   -0.23 -0.23 -0.02  0.01]]

 
delta1: [ -8.23 -43.63 128.66 128.66  12.27  11.56] #element: 350
delta2: [  4.44  23.57 -69.49 -69.49  -6.63  -6.24] #element: 648
delta3: [nan nan nan nan nan nan] #element: 0
inertia: 118.32526561879196
[[139 272]
 [211 376]]
center: 
[[-0.02 -0.01  0.39  0.39  0.02 -0.01]
 [ 0.    0.01 -0.22 -0.22 -0.   -0.  ]]

 
delta1: [ -3.49 -42.53 138.92 138.92  10.57  15.04] #element: 317
delta2: [  1.62  19.8  -64.66 -64.66  -4.92  -7.  ] #element: 681
delta3: [nan nan nan nan nan nan] #element: 0
inertia: 118.08119829156618
[[123 288]
 [194 393]]
center: 
[[-0.02 -0.01  0.45  0.45 -0.   -0.  ]
 [ 0.    0.   -0.2  -0.2  -0.01 -0.01]]

 
delta1: [ -2.19 -42.91 127.28 127.28  11