In [1]:
import numpy as np
import scipy
import scipy.sparse
from sklearn.cluster import MiniBatchKMeans
import time
import datetime
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances

In [2]:
#params
skip_preprocessing = True
input_data = '/Users/lo/Programming/emoji_scripts/m2v_specific_true_sep.csv'
file_name = 'm2v_specific_true'

data = pd.read_csv(input_data, sep=';')
number_of_cols = len(data.columns)
number_of_rows = len(data)
print('cols', number_of_cols, 'rows', number_of_rows)

number_of_clusters = (int)(number_of_rows/1000)
batch_size = (int)(0.3*number_of_rows)
maximum_kmeans_iterations = 100
number_of_initialisations = 10

output_file = str(file_name + '_medoids_clus_' + str(number_of_clusters) + '_b_size_' + str(batch_size) + '_iter_' + str(maximum_kmeans_iterations) + '_init_' + str(number_of_initialisations) + '.csv')
print(output_file)

cols 302 rows 2003064
m2v_specific_true_medoids_clus_2003_b_size_600919_iter_100_init_10.csv


In [3]:
#remove one value columns
if not skip_preprocessing:
    start_time = time.time()
    print('start time', time.ctime())
    columns_to_remove = []

    for col in data.columns:
        column_data = data.loc[:,col]

        if len(column_data.unique()) == 1:
            print('only one value in column', col)
            columns_to_remove.append(col)

    print('before dropping empty columns', len(data.columns))
    data.drop(columns=columns_to_remove)
    print('after dropping empty columns', len(data.columns))
    
    elapsed_time = time.time() - start_time
    print('end time', str(datetime.timedelta(seconds=elapsed_time)))

In [4]:
# def find_nan_in_csr(X, verbose=False):
#     num_of_nans = 0
#     num_of_infs = 0
    
#     X = scipy.sparse.coo_matrix(X)
#     for i, j, v in zip(X.row, X.col, X.data):
#         if np.isnan(v):
#             num_of_nans += 1
#             if verbose:
#                 print(i, j, v)
#         elif np.isinf(v):
#             num_of_infs += 1
#             if verbose:
#                 print(i, j, v)
            
#     print('num of nans', num_of_nans)
#     print('num of infs', num_of_infs)
#     return None

In [5]:
# find_nan_in_csr(data_sparse, verbose=True)
# np.nan_to_num(data_sparse.data, copy=False)
# data_sparse.eliminate_zeros()
# find_nan_in_csr(data_sparse)

In [6]:
start_time = time.time()
print('start time', time.ctime())
print('number_of_clusters', number_of_clusters)
kmeans = MiniBatchKMeans(n_clusters=number_of_clusters, init='k-means++', max_iter=maximum_kmeans_iterations, batch_size=batch_size, verbose=True, compute_labels=True, n_init=number_of_initialisations)
kmeans = kmeans.fit(data)
elapsed_time = time.time() - start_time
print('inertia value', kmeans.inertia_)
print('end time', str(datetime.timedelta(seconds=elapsed_time)))

start time Fri Mar  8 01:43:28 2019
number_of_clusters 2003
Init 1/10 with method: k-means++
Inertia for init 1/10: 162184296452.050385
Init 2/10 with method: k-means++
Inertia for init 2/10: 162090686218.869629
Init 3/10 with method: k-means++
Inertia for init 3/10: 161544993206.261810
Init 4/10 with method: k-means++
Inertia for init 4/10: 162277957095.464111
Init 5/10 with method: k-means++
Inertia for init 5/10: 162247280894.825684
Init 6/10 with method: k-means++
Inertia for init 6/10: 162166584337.349335
Init 7/10 with method: k-means++
Inertia for init 7/10: 161323039375.063141
Init 8/10 with method: k-means++
Inertia for init 8/10: 162252580534.059967
Init 9/10 with method: k-means++
Inertia for init 9/10: 161734563642.818512
Init 10/10 with method: k-means++
Inertia for init 10/10: 161729225779.395081
Minibatch iteration 1/400: mean batch inertia: 89566.666839, ewa inertia: 89566.666839 
Minibatch iteration 2/400: mean batch inertia: 89166.587468, ewa inertia: 89326.619416 
Mi

In [7]:
# (kmeans.cluster_centers_.tolist())
# print(kmeans.labels_)

cluster_assignment_row_indices = {}

# print('max cluster label', kmeans.labels_.max())

for i in range(0, kmeans.labels_.max()+1):
    cluster_assignment_row_indices[i] = []
    

for idx, val in enumerate(kmeans.labels_):
    cluster_assignment_row_indices[val].append(idx)
    
# print(cluster_assignment_row_indices)

medoid_indices = []
empty_clusters = 0
print('number of centres', len(kmeans.cluster_centers_))
for key in cluster_assignment_row_indices:
#     print(key)
    assigned_points = cluster_assignment_row_indices[key]
    
    if len(assigned_points) != 0:
#         print('points', assigned_points)
        points_data = data.iloc[assigned_points, :]
#         print('points data len', len(points_data))
    #     print(points_data)
        centre = kmeans.cluster_centers_[key]
    #     print(type(centre))
        centre_sparse = scipy.sparse.csr_matrix(centre)
    #     print('centre:\n', centre_sparse)
        distances = euclidean_distances(X=[centre], Y=points_data)
    #     print(distances)
        centroid_closes_point_index = np.argmin(distances)
    #     print('index of closest point', centroid_closes_point_index)
        medoid_indices.append(assigned_points[centroid_closes_point_index])
    else:
        empty_clusters += 1
    
print('number of medoids', len(medoid_indices))
print('number of empty clusters', empty_clusters)
medoid_data = data.iloc[medoid_indices, :]

number of centres 2003
number of medoids 2003
number of empty clusters 0


In [8]:
display(medoid_data.head())
print('number of columns', len(medoid_data.columns))

cols_to_remove = []
for col in medoid_data.columns:
#     print(col)
#     print(medoid_data_dense[col])
    if len(medoid_data.loc[:, col].unique()) == 1:
        cols_to_remove.append(col)
        
print('number_of_columns_to_remove', len(cols_to_remove))
# print(cols_to_remove)
medoid_data.drop(columns=cols_to_remove, inplace=True)
display(medoid_data.head())
print('number of rows', len(medoid_data), 'columns', len(medoid_data.columns))

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,291,292,293,294,295,296,297,298,299,300
1476549,1476549,-0.037389,-0.161603,0.00251,-0.084047,-0.046386,0.051671,-0.177254,-0.05338,-0.021205,...,-0.045205,-0.039144,-0.060718,0.072623,0.013894,0.071807,-0.015601,-0.094214,0.080718,1.0
556515,556515,0.04881,-0.059084,0.028075,-0.108739,0.009216,0.037656,0.021714,0.0504,-0.075699,...,-0.043923,-0.0208,-0.16736,0.076087,-0.037813,0.064219,-0.030496,-0.007044,0.059909,0.0
158581,158581,0.040506,-0.11648,0.050503,-0.061919,-0.067655,0.019092,-0.111906,0.050519,0.028107,...,-0.017497,-0.039151,-0.04537,0.003349,-0.063243,-0.052938,-0.037063,-0.058466,0.036524,0.0
974756,974756,0.072404,-0.141246,0.01183,-0.010276,-0.02961,-0.04387,-0.102447,0.013691,0.003988,...,-0.036338,-0.163747,-0.032808,-0.005834,-0.07033,-0.006092,0.035623,-0.076001,0.07644,0.0
1843195,1843195,0.056374,-0.100911,0.039099,-0.030588,-0.004959,0.002027,-0.181162,0.054123,-0.044202,...,-0.06788,-0.063911,-0.053409,0.067435,-0.049773,-0.033076,0.0134,-0.034776,-0.021918,0.0


number of columns 302
number_of_columns_to_remove 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,291,292,293,294,295,296,297,298,299,300
1476549,1476549,-0.037389,-0.161603,0.00251,-0.084047,-0.046386,0.051671,-0.177254,-0.05338,-0.021205,...,-0.045205,-0.039144,-0.060718,0.072623,0.013894,0.071807,-0.015601,-0.094214,0.080718,1.0
556515,556515,0.04881,-0.059084,0.028075,-0.108739,0.009216,0.037656,0.021714,0.0504,-0.075699,...,-0.043923,-0.0208,-0.16736,0.076087,-0.037813,0.064219,-0.030496,-0.007044,0.059909,0.0
158581,158581,0.040506,-0.11648,0.050503,-0.061919,-0.067655,0.019092,-0.111906,0.050519,0.028107,...,-0.017497,-0.039151,-0.04537,0.003349,-0.063243,-0.052938,-0.037063,-0.058466,0.036524,0.0
974756,974756,0.072404,-0.141246,0.01183,-0.010276,-0.02961,-0.04387,-0.102447,0.013691,0.003988,...,-0.036338,-0.163747,-0.032808,-0.005834,-0.07033,-0.006092,0.035623,-0.076001,0.07644,0.0
1843195,1843195,0.056374,-0.100911,0.039099,-0.030588,-0.004959,0.002027,-0.181162,0.054123,-0.044202,...,-0.06788,-0.063911,-0.053409,0.067435,-0.049773,-0.033076,0.0134,-0.034776,-0.021918,0.0


number of rows 2003 columns 302


In [9]:
medoid_data.to_csv(output_file)