In [None]:


import numpy as np # linear algebra


import os
for dirname, _, filenames in os.walk(''):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

from sklearn.cluster import KMeans
from collections import defaultdict
import io

In [2]:
def file_preprocess(file_name, nwords):
  '''
  function to readtext file and process it to create data for clustering
  
  RETURNS: numpy array of vector dimension and list of words
  '''
  np_arrays = []
  wordlist = []
  with io.open(file_name, mode='r', encoding='utf-8') as f:
    #print("len" , len(f.readlines()))
    for index, line in enumerate(f):
      tokens = line.split()
      wordlist.append(tokens[0])
      np_arrays.append( np.array([float(i) for i in tokens[1:]]) )

      if index == nwords:
        return np.array( np_arrays ), wordlist

  return np.array( np_arrays ), wordlist

In [3]:
def assign_word2cluster(word_list, cluster_labels):
  '''
  RETURNS: dict {"cluster":[words  assigend to cluster]}
  '''
  cluster_to_words = defaultdict(list)
  for index, cluster in enumerate(cluster_labels):
    cluster_to_words[ cluster ].append( word_list[index] )
  return cluster_to_words
  
  

In [5]:
if __name__ == "__main__":
    

  cluster_data_file = "clustering_data.txt" 

  #Number of words to analyse according to memory availability
  n_words = 3000 # Number of words to analyse according to memory availability
  reduction_factor =.1  # Amount of dimension reduction {0,1}
  n_clusters = int( n_words * reduction_factor ) # Number of clusters to make
  cluster_data, wordlist = file_preprocess(cluster_data_file, nwords = n_words)
  
  kmeans_model = KMeans(init='k-means++', n_clusters=n_clusters, n_init=15,random_state=1,max_iter=500,verbose=0)
  kmeans_model.fit(cluster_data)

  cluster_labels  = kmeans_model.labels_ #returns all cluster number assigned to each word respectively
  cluster_to_words  = assign_word2cluster(wordlist, cluster_labels)
  #saving output in outut.text file
  with io.open("output.txt",mode='w+',encoding="UTF-8") as file:
    for key in sorted(cluster_to_words.keys()) :
        file.writelines("Cluster "+str(key) +" :: "+ "|".join( k for k in cluster_to_words[key])+"\n")
        print("Cluster "+str(key) , " :: " , "|".join( k for k in cluster_to_words[key]))
    file.close()

Cluster 0  ::  china|chinese|korea|taiwan|beijing|vietnam
Cluster 1  ::  older|generation
Cluster 2  ::  cut|fall|raised|drop|reserve|grew|fed|drew|marks|falls|sharp
Cluster 3  ::  lead|key|ahead|failed|helped|effort|chance|reach|success|hopes|challenge|target|opportunity|managed|advantage|advance|closer|leads|surprise|secure|chances|crucial
Cluster 4  ::  %|40|60|70|80|35|45|90|65|32|33|36|75|34|44|38|37|48
Cluster 5  ::  i|my|'re|me|'m|am|'d|guy|guys|everybody
Cluster 6  ::  interest|job|gain|retirement|returns|employment
Cluster 7  ::  title|grand|super|classic|titles
Cluster 8  ::  already|remain|remains|throughout|remained|mostly|largely|mainly|presence|increasingly|heavily|abroad|elsewhere
Cluster 9  ::  live|!|love|fans|watch|happy|kids|walk|watching|dream|fun
Cluster 10  ::  its|itself|whole|create|entire|complex|fully|structure|completely|creating|core|creation
Cluster 11  ::  held|moved|division|served|opened|joined|camp|formed|placed|entered|serving|headed|branch|newly|settl