In [1]:
import pandas as pd
import numpy as np
import nltk #NATURAL LANGUAGE PROCESSING TOOLKIT
from nltk.tokenize import wordpunct_tokenize
import json
import string

In [2]:
# Load the tweet corpus: 'tweets.json' holds one JSON object per line, and we
# keep only a mapping from each tweet's 'id' to its 'text'.
tweets = {}
# The context manager guarantees the file handle is closed (the bare open()
# in a for-loop header never was); the explicit encoding avoids depending on
# the platform default when tweets contain non-ASCII characters.
with open('tweets.json', 'r', encoding='utf-8') as tweets_file:
    for line in tweets_file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        tweet = json.loads(line)
        tweets[tweet['id']] = tweet['text']

In [3]:
# Read the comma-separated list of initial centroid tweet ids.
# `with` closes the file even if the read raises.
with open("initial_centroids.txt", "r") as txt_file:
    file_content = txt_file.read()

# Split on commas and let int() discard surrounding whitespace/newlines; empty
# fragments (e.g. after a trailing comma) are skipped instead of crashing int().
centroids_init = [int(c) for c in file_content.split(",") if c.strip()]
centroids_init

[323906397735641088,
 323906483584655360,
 323906657333682176,
 323907258301939713,
 323909308188344320,
 323913403460636673,
 324067437886713856,
 324117950774775809,
 324138055772561408,
 324219503401644033,
 324320247018573824,
 324346553835868161,
 324372750330363904,
 324408472441585664,
 324422817565257728,
 324448013999304704,
 324785120085176320,
 325059351209443329,
 325060324992643072,
 325162944931438592,
 325253327048822784,
 325337623910559745,
 325409910642835456,
 325701934273134594,
 325946633986641920]

In [4]:
def jaccardDistance(setA, setB):
  """Return the Jaccard distance ``1 - |A ∩ B| / |A ∪ B|`` between two sets.

  Args:
    setA: A set; ``setB`` may be any iterable accepted by ``set.union``.
    setB: The collection to compare against ``setA``.

  Returns:
    A float in ``[0.0, 1.0]``: 0.0 for identical sets, 1.0 for disjoint ones.
    Two empty sets are treated as identical (distance 0.0) rather than
    raising ``ZeroDivisionError``.
  """
  union_size = len(setA.union(setB))
  if union_size == 0:
    # Both inputs are empty: nothing distinguishes them.
    return 0.0
  return 1.0 - float(len(setA.intersection(setB))) / float(union_size)

In [5]:
def tokenize(tweet_text):
  """Return the set of distinct tokens found in ``tweet_text``.

  Every character listed in ``string.punctuation`` is deleted first; the
  remaining text is split with NLTK's ``wordpunct_tokenize``. Letter case is
  left untouched and duplicate tokens collapse because the result is a set.
  """
  punctuation_remover = str.maketrans('', '', string.punctuation)
  cleaned_text = tweet_text.translate(punctuation_remover)
  tokens = wordpunct_tokenize(cleaned_text)
  return set(tokens)

In [6]:
def assign_cluster(data, centroids):
    """Assign every item in ``data`` to its nearest centroid.

    Args:
        data: Mapping of item id -> text. Every centroid id must be a key of
            this mapping, because the centroid's text is looked up here (the
            same convention ``errors`` uses).
        centroids: Sequence of item ids acting as cluster centers.

    Returns:
        Dict mapping each item id to the id of the centroid with the smallest
        Jaccard distance between token sets. Ties go to the centroid that
        appears first in ``centroids`` (``np.argmin`` returns the first
        minimum).
    """
    if not data:
        return {}

    # Look centroids up in `data` (not the notebook-level `tweets`) so the
    # function works on whatever corpus it is given, and tokenize each
    # centroid once instead of once per data point.
    centroid_words = [tokenize(data[centroid]) for centroid in centroids]

    assignments = {}
    for key, val in data.items():
        words_data = tokenize(val)
        dist_point_clust = [
            jaccardDistance(words_centroid, words_data)
            for words_centroid in centroid_words
        ]
        assignments[key] = centroids[int(np.argmin(dist_point_clust))]

    return assignments

In [7]:
# First assignment pass: map every tweet id to the nearest initial centroid.
a = assign_cluster(tweets, centroids_init)

In [8]:
def new_centroids(data, centroids, assignments):
    """Pick a new representative for every cluster.

    Args:
        data: Mapping of item id -> text; its iteration order defines the
            order of members inside each cluster.
        centroids: Current centroid ids, one per cluster.
        assignments: Mapping of item id -> centroid id the item belongs to.

    Returns:
        List with one id per entry of ``centroids`` (same order): the member
        sitting at the middle position of that cluster's member list. A
        cluster with no members keeps its current centroid instead of raising
        ``IndexError``.

    NOTE(review): the positional middle is not a distance-based medoid; the
    selection rule itself is kept unchanged here.
    """
    # Group members in a single pass over the data instead of rescanning the
    # whole dataset once per centroid.
    members_by_centroid = {}
    for key in data:
        members_by_centroid.setdefault(assignments[key], []).append(key)

    updated = []
    for centroid in centroids:
        pt_cluster = members_by_centroid.get(centroid)
        if pt_cluster:
            updated.append(pt_cluster[len(pt_cluster) // 2])
        else:
            # Empty cluster: nothing to choose from, so keep the old center.
            updated.append(centroid)

    return updated

In [9]:
# Sanity check: centroids produced by one update step from the initial assignment.
print(new_centroids(tweets, centroids_init, a))

[323906398993932289, 323916051614138368, 323906653651079168, 323910330457669633, 324229792834674689, 323922279182520320, 324210976629067776, 324117952695771137, 324160230760005632, 324375472681148416, 324425542629732352, 324310958418232934, 324418112688623616, 324439832715747328, 324423160525103105, 324448711860158464, 324785131569168384, 325059996704452608, 325060154087309312, 325171288526163968, 325253670746849280, 325337640322871296, 325409996529602560, 325702375430057984, 325946918842818560]


In [10]:
def errors(data, assignments, centroids):
    """Return the sum of squared Jaccard distances from items to their centroid.

    Args:
        data: Mapping of item id -> text; centroid texts are looked up here too.
        assignments: Mapping of item id -> id of the centroid it is assigned to.
        centroids: Accepted for interface compatibility; not used.

    Returns:
        The total of ``distance ** 2`` over every item in ``data``.
    """
    def squared_distance(item_id, item_text):
        # Distance between the assigned centroid's tokens and the item's tokens.
        center_id = assignments[item_id]
        center_tokens = tokenize(data[center_id])
        item_tokens = tokenize(item_text)
        return jaccardDistance(center_tokens, item_tokens) ** 2

    return sum(
        squared_distance(item_id, item_text)
        for item_id, item_text in data.items()
    )

In [11]:
def KMeans(data, K, max_iter = 100, tol = pow(10,-3), init_centroids = None):
    """Cluster ``data`` by alternating assignment and centroid-update steps.

    Args:
        data: Mapping of item id -> text.
        K: Number of clusters; the first ``K`` initial centroids are used.
        max_iter: Maximum number of assignment/update passes.
        tol: Stop once the relative change of the total error between two
            consecutive passes drops below this value.
        init_centroids: Optional sequence of item ids used as starting
            centroids. Defaults to the notebook-level ``centroids_init``.

    Returns:
        Tuple ``(assignments, centroids, data, n_iter)`` where ``assignments``
        maps every item id to one of the returned ``centroids`` and ``n_iter``
        is the number of passes performed.
    """
    if init_centroids is None:
        # Backward-compatible default: the seeds loaded earlier in the notebook.
        init_centroids = centroids_init
    # Honor K instead of silently ignoring it.
    centroids = list(init_centroids)[:K]

    es = []
    assignments = {}
    assigned_against = None  # centroids that `assignments` currently refers to
    n_iter = 0

    while n_iter < max_iter:
        assignments = assign_cluster(data, centroids)
        assigned_against = centroids

        centroids = new_centroids(data, centroids, assignments)

        es.append(errors(data, assignments, centroids))
        n_iter += 1

        if len(es) >= 2:
            previous, current = es[-2], es[-1]
            if previous == 0:
                # Avoid dividing by zero: a zero error that stays zero has
                # converged; anything else is an unbounded relative change.
                converged = (current == 0)
            else:
                converged = abs(current - previous) / previous < tol
            if converged:
                break

    # The last update step may have moved the centroids after the assignments
    # were computed; re-assign so both returned values describe the same
    # clustering. Skipped when the centroids did not change.
    if assigned_against != centroids:
        assignments = assign_cluster(data, centroids)

    return (assignments, centroids, data, n_iter)

In [12]:
# Run the full clustering; the result is the tuple (assignments, centroids, data, iterations).
kmeans = KMeans(tweets, K = 25)

In [13]:
# Unpack the first two entries of the KMeans result tuple:
# the id -> centroid assignments and the list of final centroids.
final_assigns, final_centroids = kmeans[0], kmeans[1]

In [14]:
# Invert the assignment map: for every final centroid, collect the ids of the
# tweets assigned to it (in the iteration order of `final_assigns`).
clustering = {}
for cent in final_centroids:
  clustering[cent] = [
    tweet_id
    for tweet_id, assigned_centroid in final_assigns.items()
    if assigned_centroid == cent
  ]

In [15]:
# Show the complete centroid -> member-ids mapping.
print(clustering)

{323906398993932289: [323906397609791488, 323906397618196483, 323906397735641088, 323906397853073410, 323906397962121216, 323906398012461057, 323906398230544385, 323906398314438656, 323906398352195585, 323906398826164225, 323906398993932289, 323906399149109248, 323906399295926273, 323906399300100096, 323906656318676993, 323907087551836160, 323907771256938496, 323908455545049088, 323908795254312962, 323908795396943872], 323916051614138368: [323906483584655360, 323906485249789952, 323911610236293120, 323915000567697409, 323916051614138368, 323920146454425600, 323921510282702848, 323923559799996417, 323925264352546816], 323906653651079168: [323906650987692034, 323906651209994241, 323906653651079168, 323906657333682176], 323910330457669633: [323906398176030720, 323906567294562306, 323907258301939713, 323910330315075584, 323910330457669633, 323955716392112128, 323963901769297921, 324226045052071936, 323932094190439874], 324229792834674689: [323909308188344320, 324229792834674689], 323922279

In [17]:
# Persist the clustering, one "centroid:[member ids]" line per cluster.
with open('clustering.txt', 'w') as f:
    f.writelines('%s:%s\n' % cluster for cluster in clustering.items())