In [35]:
import csv
import sys
import random
import os
import math
import itertools
import numpy as np
import numpy.linalg as npla
import scipy
from scipy import sparse
from scipy import linalg
import scipy.sparse.linalg as spla
from scipy.spatial import distance
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.mlab as mlab
from mpl_toolkits.mplot3d import axes3d
%matplotlib tk

In [2]:
sys.path.append("../Python_code")
from reddit import *
from path import *
from project_data_analysis import *
from generate_proj_to_remove import *

In [3]:
projects_to_remove = get_list_of_removed_proj(output_filename = "", writeto_file = False)

In [4]:
project_dict = dict()
with open('../data/atlas_complete.json') as f:
    data = json.load(f)
projects_count = 0
for element in data['atlas']:
    pic_id = int(element["id"])
    if pic_id not in projects_to_remove:
        project_dict[pic_id] = projects_count
        projects_count += 1
print(projects_count)

1920


In [20]:
users_dict = dict()
users_count = 0
with open("../data/sorted_tile_placements_proj.csv",'r') as file:
    # Skip first line (header row)
    next(file, None)

    reader = csv.reader(file)
    
    for r in reader:
        user = r[1]
        if user not in users_dict:
            users_dict[user] = users_count
            users_count += 1

In [11]:

def distance_per_project(input_file_proj, projects_to_remove,sample_size):
    #input_file_proj= "../data/sorted_tile_placements_proj.csv"

    user_project_mat,users_dict,projects_dict = user_project_matrix_pixel_color(input_file_proj, projects_to_remove)

    proj_euc_dist = dict()  # Average euclidean distance between sampled users
    proj_cos_dist = dict()  # Average cosine distance between sampled users
    sample_size = 100

    users_per_proj=users_per_project_list(input_file_proj, projects_to_remove)

    for pic_id in users_per_proj:
        all_users = list(users_per_proj[pic_id]) # make a list
        if len(all_users) > sample_size: # sample from it
            all_users = random.sample(all_users, sample_size)

        count=0
        euc_dis=0.0
        cos_dis=0.0
        for i in range(len(all_users)):
            user1 = all_users[i]

            # If the user has no contributions that made it to the final canvas, then skip him
            if np.sum(user_project_mat[users_dict[user1], : ]) == 0:
                continue

            for j in range(i + 1, len(all_users)):

                # If the user has no contributions that made it to the final canvas, then skip him
                user2 = all_users[j]
                if np.sum(user_project_mat[users_dict[user2], : ]) == 0:
                    continue
                count=count+1
                vec1 = user_project_mat[users_dict[user1], : ].toarray()
                vec2 = user_project_mat[users_dict[user2], :].toarray()
                euc_dis = euc_dis+distance.euclidean( vec1 , vec2 )
                cos_dis = cos_dis+distance.cosine( vec1 , vec2 )   

        proj_euc_dist[pic_id]=euc_dis/count
        proj_cos_dist[pic_id]=cos_dis/count
    return proj_euc_dist, proj_cos_dist

Euc_dist_proj, Cos_dist_proj = distance_per_project("../data/sorted_tile_placements_proj.csv", projects_to_remove,100)


KeyboardInterrupt: 

In [None]:
sorted_euc_dist_per_proj = sorted(Euc_dist_proj.items(), key=operator.itemgetter(1), reverse=True)
#Top-10
for i in range(10):
    proj = sorted_euc_dist_per_proj[i][0]
    n = sorted_euc_dist_per_proj[i][1]
    name = names[int(proj)]
    desc = descriptions[int(proj)]
    
    print("#", i, ", project: ", name, ", distance: ", n, ", \ndesc: ", desc, "\n")

In [22]:
%%time
"""
    Type1 matrix: position (i,j) is the number of pixels by user i in project j 
"""

# Syntax:
# euc_dist = distance.euclidean(a,b)
# cos_dist = distance.cosine(a,b)

"""
    user_euc_dist and user_cos_dist is a dictionary of the following format:
        pic_id: user_distance list
    Create a new list for every project where every element is the euclidean distance or
    cosine distance between two users in a project.
    
    user_distances is a dictionary that stores the distance between two user vectors so we do not have to recompute it.
"""

users_euc_dist = dict()  # Position (i,j) is the euclidean distance between user vectors i and j
users_cos_dist = dict()  # Position (i,j) is the cosine distance between user vectors i and j
user_distances = dict()

# Calculating distance between every pair of users takes too long. (Several years)
# Sample at most 100 users for each project
sample_size = 100

for pic_id in users_per_project:
    all_users = list(users_per_project.get(pic_id))
#     print(len(all_users))
    
    
    if len(all_users) > sample_size:
        all_users = random.sample(all_users, sample_size)

    users_euc_dist[pic_id] = []
    users_cos_dist[pic_id] = []
#     print(user_matrix[pic_id].shape())
    
    for i in range(len(all_users)):
        user1 = all_users[i]
        
        # If the user has no contributions that made it to the final canvas, then skip him
        if np.sum(type1[users_dict[user1], : ]) == 0:
            continue
        
        for j in range(i + 1, len(all_users)):
            
            # If the user has no contributions that made it to the final canvas, then skip him
            user2 = all_users[j]
            if np.sum(type1[users_dict[user2], : ]) == 0:
                continue
            
            
            # Make sure user1 is the smaller value
            if user2 < user1:
                temp = user2
                user2 = user1
                user1 = temp
                
            if (user1, user2) not in user_distances:
                
                vec1 = type1[users_dict[user1], : ].toarray()
                vec2 = type1[users_dict[user2], :].toarray()
                euc_dist = distance.euclidean( vec1 , vec2 )
                cos_dist = distance.cosine( vec1 , vec2 )

                user_distances[(user1, user2)] = (euc_dist, cos_dist)

            users_euc_dist[pic_id].append(user_distances[(user1, user2)][0])
            users_cos_dist[pic_id].append(user_distances[(user1, user2)][1])
    
    

CPU times: user 11min 5s, sys: 7.27 s, total: 11min 12s
Wall time: 11min 8s


In [6]:
users_count

1166924

In [21]:
def distance_users(project_user,users_project, source_user, target_user):
    dict_source=users_project[source_user]
    dict_target=users_project[target_user]
    dist=0
    for pic_id in project_user:
        if pic_id in dict_source and pic_id in dict_target:
            dist=dist+0.0
        elif pic_id not in dict_source and pic_id not in dict_target:
            dist=dist+0.0
        else:
            dist=dist+1.0
    return math.sqrt(dist)      

In [26]:
def cosine_distance_users(project_user,users_project, source_user, target_user):
    dict_source=users_project[source_user]
    dict_target=users_project[target_user]
    dist=0
    dist_s= len(dict_source)
    dist_t= len(dict_source)
    for pic_id in project_user:
        if pic_id in dict_source and pic_id in dict_target:
            dist=dist+1.0          
        else:
            dist=dist+0.0
        
    return dist/(math.sqrt(dist_s*dist_t))

In [28]:

#1 - Position (i,j) is the number of pixels by user i in project j (final)
#type1 = np.zeros((users_count, projects_count))
#2 - Position (i,j) is the number of updates by user i that agree with pixels in project j.
#type2 = np.zeros((users_count, projects_count))
#3 - Position (i,j) is the number of updates by user i that in the bounding area of project j and disagree with that pixel in j. These are updates likely against the project.
#type3 = np.zeros((users_count, projects_count))
    
users_project=dict()
project_user=dict()

user_dist_proj=dict()
#with open("../data/sorted_tile_placements_proj.csv", "r") as file:
with open("../data/sorted_tile_placements_proj.csv", "r") as file:
    reader = csv.reader(file, delimiter = ",")
     # Skip first line (header row)
    next(file, None)
    # user:1, pic_id: 5, pixel: 6, pixel_color: 7
    for r in reader:
        #print(users_project)
        #print(project_user)
        #print("-------------")
        user = r[1]
        pic_id = int(r[5])
        pixel = int(r[6])
        pixel_color = int(r[7])
        if pixel==1:
            
            if pic_id not in projects_to_remove:
                if pixel_color == 1:
                    if pic_id not in project_user:
                        project_user[pic_id]=[]
                        project_user[pic_id].append(user)
                        #print("hey")
                    else:
                        if user not in project_user[pic_id]:
                            project_user[pic_id].append(user)
                        
                            
                            
                    if user not in users_project:
                        #print("hey")
                        proj_dic=dict()
                        proj_dic[pic_id]=1
                        users_project[user]=proj_dic
                    else:
                        if pic_id not in users_project[user]:
                            users_project[user][pic_id]=1
                        else:
                            users_project[user][pic_id]=users_project[user][pic_id]+1
#print(project_user) 
#print(users_project)
sample_size=10
for pic_id in project_user:
        all_users_source = list(project_user[pic_id])
        all_users_target = list(project_user[pic_id])
        
        if(sample_size<len(all_users_source)):
            all_users_source = random.sample(all_users_source, sample_size)
            all_users_target = random.sample(all_users_target, sample_size)
        
        
        avg_dist=0.0;
        for i in range(0,len(all_users_target)):
            avg_dist=avg_dist+ distance_users(project_user,users_project, all_users_source[i], all_users_target[i])
        
        user_dist_proj[pic_id]=avg_dist/sample_size
        print(user_dist_proj[pic_id])
        #print(pic_id)


2.1829159797129507
2.799284524163065
2.3221422880844313
1.5543203766865055
2.3380771057732685
2.3819864582044072
2.5968004053396796
1.4886349517372675
1.9662545694057534
1.9822587087510009
2.0938166082500502
2.177010214117348
2.798341803751984
1.9675013582531957
1.1114383155010639
1.6009967675098244
2.1776033787853604
2.0957527767097925
1.4376589663793085
1.4624973034561046
1.4617187870101171
0.3863703305156273
2.4267978754625332
1.006449510224598
1.616551208609136
1.3292528739883944
2.167047088834458
2.1394596501626415
2.882805295727564
1.8045243855508328
1.5032613887314645
1.4562660442953432
2.7137822616260663
2.546206526108746
1.8911475186413973
1.9882496571450452
1.7497731865508839
1.5945844149342396
3.1539817885622012
1.8073327514014683
2.1980334518870093
2.433604947384244
2.031461385106796
1.9125752247980716
1.8341276706445804
2.1158419378551843
2.155372180999993
1.9295923705362696
2.113477072952194
2.7888636566418703
2.0904160263422056
2.090628969044242
3.057132722470787
1.59269

2.704615399192739
2.4516953784128384
2.593932729825736
1.642757620295009
1.8074462777344227
2.6177411804141038
1.9758922011234847
1.238572731292051
2.8183591692967696
1.2545246247676256
1.2260647524952613
1.4437580654985536
1.1705529847416658
1.6482103630097824
1.9930494886488659
2.362659490727286
1.9450162202795993
2.275892201123485
2.0079539127740995
3.3503296503844338
0.8242640687119286
1.487831517751085
3.0727372339598293
1.4674861087325706
2.3468594373196985
2.0439949502709522
2.3142466813516993
2.728155850634143
1.1135047463066146
2.4241391455341708
1.6310535577717271
0.0
2.179119602824481
2.3396602205500416
2.335439646602581
2.2116671280635525
1.4830609635384557
2.337659809643324
2.224635183959092
1.7593714556303925
2.4462841798905672
2.460611647319714
1.8977045168377864
1.4765681057939963
1.7164460170806748
3.193489777947238
1.6883627011906739
1.6228766310021279
1.6496715502452404
1.9375030679963252
1.4448869930498944
2.1535855578418874
2.0951908578754717
1.6750052822528947
1.7

In [26]:
%%time 
#1 - Position (i,j) is the number of pixels by user i in project j (final)
type1 = np.zeros((users_count, projects_count))
#2 - Position (i,j) is the number of updates by user i that agree with pixels in project j.
#type2 = np.zeros((users_count, projects_count))
#3 - Position (i,j) is the number of updates by user i that in the bounding area of project j and disagree with that pixel in j. These are updates likely against the project.
#type3 = np.zeros((users_count, projects_count))
    

users_per_project = dict()
with open("../data/sorted_tile_placements_proj.csv", "r") as file:
    reader = csv.reader(file, delimiter = ",")
     # Skip first line (header row)
    next(file, None)
    # user:1, pic_id: 5, pixel: 6, pixel_color: 7
    for r in reader:
        user = r[1]
        pic_id = int(r[5])
        pixel = int(r[6])
        pixel_color = int(r[7])

        try:
            if pic_id not in projects_to_remove:

                if pic_id not in users_per_project:
                    users_per_project[pic_id] = set()

                users_per_project[pic_id].add(user)

                if pixel == 1:
                    type1[users_dict[user],project_dict[pic_id]] += 1

                #if pixel_color == 1:
                    #type2[users_dict[user],project_dict[pic_id]] += 1

                #elif pixel_color == 0:
                    #type3[users_dict[user],project_dict[pic_id]] += 1
        except Exception as e:
            print(e)
            print(pic_id)
            print(user)
            break
 
    type1 = sparse.csr_matrix(type1)
    #type2 = sparse.csr_matrix(type2)
    #type3 = sparse.csr_matrix(type3)


MemoryError: 

In [15]:
# Calculate average users per project
average_num_users = int(users_count / len(users_per_project))
print(average_num_users)

615


In [None]:
%matplotlib inline
# plot the user distances of a single project given the pic_id
# Testing on some pic_ids
def plot_distances_icdf(pic_id):
    euc_dict = dict()
    cos_dict = dict()
    for i in range(len(users_euc_dist[pic_id])):
        euc_dict[i] = int(users_euc_dist[pic_id][i]*1000)
        cos_dict[i] = int(users_cos_dist[pic_id][i]*1000)
        
    plt.clf()

    #Computing ICDF
    count_euc = icdf(euc_dict)
    count_cos = icdf(cos_dict)
    ax = plt.subplot(111)
    ax.plot(np.arange(count_euc.shape[0])/1000, count_euc, color="red", linewidth=4, label="euclidean distance")
    ax.plot(np.arange(count_cos.shape[0])/1000, count_cos, color="blue", linewidth=4, label="cosine distance")
    ax.set_ylabel('ICDF', fontsize=30)
    ax.set_xlabel('distance between users', fontsize=30)
    ax.tick_params(labelsize=23)
    #ax.set_xlim(1, 1000000)
    ax.legend()
    ax.set_title('user distances plot for pic_id: '+str(pic_id))
    #plt.savefig(output_file_name, dpi=300, bbox_inches='tight')

plot_distances_icdf(23)

In [None]:
#This part is not necessary

def distance_users(project_user,users_project, source_user, target_user):
	dict_source=users_project[source_user]
	dict_target=users_project[target_user]
	dist=0
	for pic_id in project_user:
		if pic_id in dict_source and pic_id in dict_target:
			dist=dist+0.0
		elif pic_id not in dict_source and pic_id not in dict_target:
			dist=dist+0.0
		else:
			dist=dist+1.0
	return math.sqrt(dist)



def user_distance_per_project(input_file_proj, projects_to_remove,sample_size):

 	"""
		#1 - Position (i,j) is the number of pixels by user i in project j (final)
		#type1 = np.zeros((users_count, projects_count))
		#We only consider type1 here

		#2 - Position (i,j) is the number of updates by user i that agree with pixels in project j.
		#type2 = np.zeros((users_count, projects_count))
		#3 - Position (i,j) is the number of updates by user i that in the bounding area of project j and disagree with that 			pixel in j. These are updates likely against the project.
		#type3 = np.zeros((users_count, projects_count))
	"""    
	users_project=dict()
	project_user=dict()

	user_dist_proj=dict()
#with open("../data/sorted_tile_placements_proj.csv", "r") as file:
	with open(input_file_proj, "r") as file:
		reader = csv.reader(file, delimiter = ",")
     # Skip first line (header row)
		next(file, None)
	# user:1, pic_id: 5, pixel: 6, pixel_color: 7
		for r in reader:
		#print(users_project)
		#print(project_user)
		#print("-------------")
			user = r[1]
			 pic_id = int(r[5])
			pixel = int(r[6])
			pixel_color = int(r[7])
			if pixel==1:
            
				if pic_id not in projects_to_remove:
					if pixel_color == 1:
						if pic_id not in project_user:
							project_user[pic_id]=[]
							project_user[pic_id].append(user)
                        #print("hey")
						else:
							if user not in project_user[pic_id]:
								project_user[pic_id].append(user)
                        
                            
                            
						if user not in users_project:
                        #print("hey")
							proj_dic=dict()
							proj_dic[pic_id]=1
							users_project[user]=proj_dic
						else:
							if pic_id not in users_project[user]:
								users_project[user][pic_id]=1
							else:
								users_project[user][pic_id]=users_project[user][pic_id]+1

	for pic_id in project_user:
		all_users_source = list(project_user[pic_id])
		all_users_target = list(project_user[pic_id])
        
		if(sample_size<len(all_users_source)):
			all_users_source = random.sample(all_users_source, sample_size)
			all_users_target = random.sample(all_users_target, sample_size)
        
        
		avg_dist=0.0;
		for i in range(0,len(all_users_target)):
			avg_dist=avg_dist+ distance_users(project_user,users_project, all_users_source[i], all_users_target[i])
        
		user_dist_proj[pic_id]=avg_dist/sample_size
        #print(user_dist_proj[pic_id])
        #print(pic_id)
	return user_dist_proj

In [55]:
from sklearn.cluster.bicluster import SpectralCoclustering

input_file_proj = "../data/sorted_tile_placements_proj.csv"
user_proj_matrix_pixel, users_dict, proj_dict = user_project_matrix_pixel(input_file_proj, projects_to_remove)

n_clusters = 500

cocluster = SpectralCoclustering(n_clusters=n_clusters,
                                 svd_method='arpack', random_state=0)

cocluster.fit(user_proj_matrix_pixel)
y_cocluster = cocluster.row_labels_

x_cocluster = cocluster.column_labels_


In [56]:
locations = store_locations("../data/atlas_complete.json")
for project in proj_dict:
    print(project, locations[project].get_name(), " in cluster ",x_cocluster[ proj_dict[project] ])

230 Nine Inch Nails  in cluster  313
1031 University of Colorado Boulder  in cluster  324
118 In the Aeroplane Over the Sea  in cluster  374
748 Totoro  in cluster  224
206 R.I.P. Steve Irwin  in cluster  162
1038 Rainbow Grid  in cluster  273
1030 University of Pennsylvania  in cluster  284
212 AniList  in cluster  304
440 Android Gaming  in cluster  355
283 Poké balls  in cluster  241
83 DotA 2  in cluster  155
437 Professor Layton  in cluster  252
378 Nintendo Switch  in cluster  103
474 Urho Kekkonen  in cluster  486
398 Link  in cluster  252
380 Skeletor  in cluster  63
154 Radiohead  in cluster  313
115 Portland Trail Blazers  in cluster  258
145 Parahumans  in cluster  127
1618 /r/parahumans worm  in cluster  127
1329 Free Tyler1  in cluster  84
572 Bisexual pride flag  in cluster  194
179 He-Man  in cluster  150
382 AvoArmy  in cluster  433
1187 Vermont  in cluster  339
2024 Tatlin's Tower  in cluster  196
426 Wally  in cluster  262
41 Flag of India  in cluster  325
1058 The Al

1779 Azerbaijan  in cluster  254
754 Red-White-Blue Ribbon  in cluster  383
69 Anarcho-Communist Flag  in cluster  53
225 Tampa Bay Lightning  in cluster  228
479 Game Attack  in cluster  126
372 Flag of Russia  in cluster  97
1756 Coat of friendship between Serbia and Russia  in cluster  400
1267 FREESM!  in cluster  0
1476 Memorial to SUP  in cluster  493
84 Creeper  in cluster  392
324 Coat of arms of the Netherlands  in cluster  64
615 Hunter × Hunter  in cluster  69
634 Deccepticon Insignia  in cluster  49
455 Wall St. Bets  in cluster  471
730 Flag of Devon  in cluster  375
922 Devon and Dorset  in cluster  375
815 Space Invaders on the Rainbow Road  in cluster  298
1854 Argentina Heart  in cluster  144
1223 Flag of the Second Spanish Republic heart  in cluster  306
406 Chicago  in cluster  284
894 K-On!  in cluster  246
1173 Russian heart  in cluster  362
953 Redox OS  in cluster  386
204 The Queen  in cluster  165
1933 South Korean Avocado  in cluster  433
816 Flag of Artsakh  

1915 The Sovereign Triangle  in cluster  419
1297 Erase The Place  in cluster  324
117 Stardew Valley  in cluster  419
1436 Mini flag of Estonia  in cluster  193
1932 Brady Haran  in cluster  413
508 Beer  in cluster  89
1697 Australian Drop Bear  in cluster  404
579 Australian Aboriginal Flag  in cluster  303
375 The Trees Network icon  in cluster  419
499 Asriel Dreemurr  in cluster  418
338 Magic: The Gathering  in cluster  320
1288 r/wine  in cluster  120
1850 ZeldaPower  in cluster  53
719 Taiwan R.O.C.  in cluster  478
607 Fox  in cluster  348
1774 Gmail logo  in cluster  90
374 Flag of Acadia  in cluster  80
570 University of Oklahoma Logo  in cluster  324
1274 camu.  in cluster  249
1309 lofi.hiphop  in cluster  70
223 Day[9]TV Logo  in cluster  120
1768 Brd  in cluster  108
912 Belgium-Pakistan-Bakchodi war  in cluster  0
642 A-NI  in cluster  135
1106 Tsundere shark  in cluster  298
1180 Voids second core  in cluster  147
447 D2 Highway  in cluster  486
1865 TriHard  in clust

In [62]:
# Group all projects in the same cluster together in a dictionary
# Key is cluster number
# Value is list of projects
proj_clusters = dict()
for project in proj_dict:
    if proj_clusters.get( x_cocluster[ proj_dict[project] ] ) == None:
        proj_clusters[ x_cocluster[ proj_dict[project] ] ] = list()
    
    proj_clusters[ x_cocluster[ proj_dict[project] ] ].append( project + " " + locations[project].get_name() )

In [65]:
# Group all users in the same cluster together in a dictionary
# Key is cluster number
# Value is list of users
user_clusters = dict()
for user in users_dict:
    if user_clusters.get( y_cocluster[ users_dict[user] ] ) == None:
        user_clusters[ y_cocluster[ users_dict[user] ] ] = list()
        
    user_clusters[ y_cocluster[ users_dict[user] ] ].append( user )

In [90]:
# Get all the clusters that only contain users and no projects
clusters_with_only_users = set(user_clusters.keys()) - set( proj_clusters.keys() )
print(len(clusters_with_only_users), "clusters with no projects")

# Get all the clusters that only contain projects and no users
clusters_with_only_projects =  set( proj_clusters.keys() ) - set(user_clusters.keys())
print(len(clusters_with_only_projects), "clusters with no users")


145 clusters with no projects
6 clusters with no users


In [78]:
def print_top_k_clustsers(proj_clusters, user_clusters, k = 10):
    '''
        Print the k clusters that have the largest number of projects in 
        If k >= number of clusters, then print out every cluster.
        Also print the number of users in the same cluster
    '''
    if k >= len(proj_clusters):
        k = len(proj_clusters)
    sorted_by_longest_clusters = sorted([(len(v), k) for k, v in proj_clusters.items()], reverse = True)
    
    for i in range(k):
        cluster_size, cluster_num = sorted_by_longest_clusters[i]
        print("Cluster:",cluster_num, ", Size:", cluster_size)
        if user_clusters.get(cluster_num) != None:
            print("Number users:", len(user_clusters[cluster_num]))
        else:
            print("Number users: 0")
        for project in proj_clusters[cluster_num]:
            print("\t",project)
    

In [79]:
def print_bottom_k_clustsers(proj_clusters, user_clusters, k = 10):
    '''
        Print the k clusters that have the lowest number of projects in order.
        If k >= number of clusters, then print out every cluster
    '''
    if k >= len(proj_clusters):
        k = len(proj_clusters)
    
    sorted_by_longest_clusters = sorted([(len(v), k) for k, v in proj_clusters.items()])
    
    for i in range(k):
        cluster_size, cluster_num = sorted_by_longest_clusters[i]
        print("Cluster:",cluster_num, ", Size:", cluster_size)
        if user_clusters.get(cluster_num) != None:
            print("Number users:", len(user_clusters[cluster_num]))
        else:
            print("Number users: 0")
        for project in proj_clusters[cluster_num]:
            print("\t",project)
    

In [80]:
print_top_k_clustsers(proj_clusters, user_clusters, 1000)

Cluster: 120 , Size: 28
Number users: 4321
	 456 Dwarf Fortress
	 397 Flag of the EU
	 20 Flag of Germany
	 1469 Heart of Bangladesh
	 1090 Asexual Heart
	 1558 Welsh Heart
	 320 The Kekistan/Rainbow Flag
	 350 FIFA World Cup Trophy
	 1135 KAKAO FRIENDS Heart
	 516 The flag of Vietnam in /r/placehearts
	 587 Kanye Heart
	 1781 Flag of El Salvador
	 1464 Galicia Heart
	 1182 Colombian Heart
	 1385 Dominican Republic heart
	 475 Flag of Spain
	 1437 Luxembourg heart
	 868 Fernsehturm Berlin
	 19 European Union
	 1157 dale
	 1957 Malta's Heart
	 1714 /u/loulan's croissant
	 1050 American Flag Banner
	 1288 r/wine
	 223 Day[9]TV Logo
	 613 Battle with the Void/Relocated Indonesian Flag
	 1018 #1 in moon landings
	 503 Terraria's guide
Cluster: 144 , Size: 24
Number users: 1370
	 950 South Korea heart
	 1235 /r/kpop banner
	 1317 Mamamoo
	 657 Irish Fighter
	 1079 Denver Broncos Logo
	 1399 Lipstick Lesbian pride flag
	 119 Doctor Who
	 1462 The TARDIS
	 1693 Egypt flag heart
	 1854 Argenti

	 121 RURURURURU
	 1356 Nebraska Huskers
	 822 Last_Grey_Wolf Icon
	 328 Hugo Chavez
	 402 University Of Texas at Austin
Cluster: 487 , Size: 5
Number users: 300
	 833 Apocalypse Rising
	 784 Temple University
	 1426 /r/counting
	 1494 Seattle Mariners logo
	 812 Goomy (Pokémon)
Cluster: 477 , Size: 5
Number users: 101
	 727 Coldplay: Parachutes
	 1538 Coldplay: A Rush of Blood to the Head
	 1537 Coldplay: X&Y
	 1602 TOPS
	 1592 Make Trade Fair
Cluster: 470 , Size: 5
Number users: 1117
	 288 New Zealand
	 1458 Snoo
	 1785 Danish heart
	 780 Kingdom Hearts
	 1503 Australia flag
Cluster: 430 , Size: 5
Number users: 952
	 191 Wario
	 453 Undertale corner
	 1442 Greek heart
	 1151 Maltese Flag Heart
	 687 Kingdom Key
Cluster: 426 , Size: 5
Number users: 164
	 114 Liquicity logo
	 1174 Remnants of the /r/PictureGame Kiwi
	 1411 The r/picturegame logo
	 1879 Picture Game
	 364 Joey's Face
Cluster: 413 , Size: 5
Number users: 113
	 1712 Nail and Gear
	 1962 Gear and Flask
	 1566 CGP Grey
	 19

Number users: 315
	 1756 Coat of friendship between Serbia and Russia
Cluster: 396 , Size: 1
Number users: 0
	 932 Toucan
Cluster: 392 , Size: 1
Number users: 84
	 84 Creeper
Cluster: 382 , Size: 1
Number users: 103
	 1049 University of Central Florida
Cluster: 381 , Size: 1
Number users: 62
	 839 Risk of Rain - Commando
Cluster: 378 , Size: 1
Number users: 75
	 583 The heart of Esperanto
Cluster: 373 , Size: 1
Number users: 103
	 238 Spicy Chef and Sad Crab
Cluster: 372 , Size: 1
Number users: 6
	 1376 Sydney FC
Cluster: 368 , Size: 1
Number users: 54
	 1142 Gomez (Fez Game)
Cluster: 366 , Size: 1
Number users: 65
	 740 Sheffield
Cluster: 363 , Size: 1
Number users: 230
	 496 The Rise of MMA
Cluster: 361 , Size: 1
Number users: 520
	 1956 Avacado
Cluster: 353 , Size: 1
Number users: 353
	 316 Summit1G
Cluster: 352 , Size: 1
Number users: 10
	 36 Mozilla
Cluster: 346 , Size: 1
Number users: 663
	 245 The Rolling Stones
Cluster: 339 , Size: 1
Number users: 37
	 1187 Vermont
Cluster: 335

In [61]:
print_bottom_k_clustsers(clusters)

Cluster: 1 , Size: 1
	 1520 The R
Cluster: 17 , Size: 1
	 1491 Fishy
Cluster: 21 , Size: 1
	 845 Ashley
Cluster: 23 , Size: 1
	 77 Flag of Peru
Cluster: 25 , Size: 1
	 1095 Century Club
Cluster: 30 , Size: 1
	 112 Austyn
Cluster: 33 , Size: 1
	 986 Surprised Patrick
Cluster: 39 , Size: 1
	 153 One-Punch Man
Cluster: 42 , Size: 1
	 35 Rust
Cluster: 46 , Size: 1
	 1445 Frame
