In [None]:
import pandas as pd
import numpy as np
import nltk #NATURAL LANGUAGE PROCESSING TOOLKIT
from nltk.tokenize import wordpunct_tokenize
import json
import string

In [None]:
# Load the tweets: the file holds one JSON object per line; build a
# mapping of tweet id -> tweet text.
tweets = {}
# The context manager closes the file even if a line fails to parse.
# JSON is UTF-8 by specification, so do not depend on the platform default.
with open('tweets.json', 'r', encoding='utf-8') as tweets_file:
    for line in tweets_file:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        tweet = json.loads(line)
        tweets[tweet['id']] = tweet['text']

In [None]:
# Read the initial centroid tweet ids from a comma-separated text file.
# The context manager closes the file even if reading raises.
with open("initial_centroids.txt", "r") as txt_file:
    file_content = txt_file.read()

# Split on commas and let int() absorb surrounding whitespace/newlines;
# empty fragments (trailing comma, blank last line) are skipped instead of
# crashing on int("").
centroids_init = [int(c) for c in file_content.split(",") if c.strip()]
centroids_init

In [None]:
def jaccardDistance(setA, setB):
  """Return the Jaccard distance between two sets.

  The distance is 1 - |intersection| / |union|: 0.0 for identical sets and
  1.0 for sets with nothing in common.

  Parameters
  ----------
  setA, setB : set
      The two sets to compare.

  Returns
  -------
  float
      A value in [0.0, 1.0]. Two empty sets are treated as identical
      (distance 0.0) rather than raising ZeroDivisionError.
  """
  union_size = len(setA.union(setB))
  if union_size == 0:
    # Both sets are empty: the ratio is undefined, but the sets are equal.
    return 0.0
  return 1 - len(setA.intersection(setB)) / union_size

In [None]:
def tokenize(tweet_text):
  """Turn a tweet's text into the set of distinct tokens it contains.

  Every character listed in ``string.punctuation`` is deleted first, then
  the remaining text is split with NLTK's ``wordpunct_tokenize``. Duplicates
  collapse because the result is a set; case is left untouched.

  Parameters
  ----------
  tweet_text : str
      Raw tweet text.

  Returns
  -------
  set
      The unique tokens of the cleaned text.
  """
  # Translation table that maps each punctuation character to "delete".
  punctuation_remover = str.maketrans('', '', string.punctuation)
  stripped_text = tweet_text.translate(punctuation_remover)
  return set(wordpunct_tokenize(stripped_text))

In [None]:
def assign_cluster(data, centroids):
    """Assign every item in ``data`` to its nearest centroid.

    Distance is the Jaccard distance between the token sets of the item's
    text and the centroid's text.

    Parameters
    ----------
    data : dict
        Mapping of item key -> text.
    centroids : sequence
        Keys of ``data`` that currently act as cluster centres.

    Returns
    -------
    dict
        Mapping of item key -> key of the closest centroid. On ties the
        centroid that appears first in ``centroids`` wins (``np.argmin``
        returns the first minimum).
    """
    # Centroid texts are looked up in `data` (not a notebook-level global) so
    # the function works on whatever mapping it is given, and each centroid
    # is tokenized once instead of once per data point.
    centroid_words = [tokenize(data[centroid]) for centroid in centroids]

    assignments = {}
    for key, val in data.items():
        words_data = tokenize(val)  # tokenized once per point, not per centroid
        dist_point_clust = [
            jaccardDistance(words_centroid, words_data)
            for words_centroid in centroid_words
        ]
        assignments[key] = centroids[np.argmin(dist_point_clust)]

    return assignments

In [None]:
def new_centroids(data, centroids, assignments):
    """Pick a replacement centroid for every cluster.

    For each current centroid, the keys assigned to it are collected in the
    iteration order of ``data`` and the key at the middle position
    (``len(cluster) // 2``) becomes the new centroid.

    NOTE(review): the middle element by position is not a true medoid (it
    does not minimise distance to the other members); kept as-is to preserve
    the existing clustering behaviour.

    Parameters
    ----------
    data : dict
        Mapping of item key -> text; only the keys and their order are used.
    centroids : sequence
        Current centroid keys.
    assignments : dict
        Mapping of item key -> centroid key, as produced for ``data``.

    Returns
    -------
    list
        One new centroid key per entry of ``centroids``, in the same order.
        A centroid whose cluster is empty is carried over unchanged instead
        of raising IndexError.
    """
    # Group the keys by assigned centroid in a single pass over the data
    # rather than rescanning every point once per centroid.
    members = {}
    for key in data:
        members.setdefault(assignments[key], []).append(key)

    updated_centroids = []
    for centroid in centroids:
        pt_cluster = members.get(centroid)
        if pt_cluster:
            updated_centroids.append(pt_cluster[len(pt_cluster) // 2])
        else:
            # Nothing was assigned here (e.g. a tie went to an identical
            # centroid earlier in the list): keep the previous centre.
            updated_centroids.append(centroid)

    return updated_centroids

In [None]:
def errors(data, assignments, centroids):
    """Return the sum of squared Jaccard distances of points to their centroids.

    Parameters
    ----------
    data : dict
        Mapping of item key -> text. Centroid keys found in ``assignments``
        must also be keys of ``data``.
    assignments : dict
        Mapping of item key -> key of the centroid the item belongs to.
    centroids : sequence
        Not used by the computation; kept so existing callers keep working.

    Returns
    -------
    float
        Total squared distance (the integer 0 when ``data`` is empty).
    """
    # Each centroid's token set is built once and reused for all its members.
    centroid_words = {}
    squared_distances = []

    for key, val in data.items():
        centroid = assignments[key]
        if centroid not in centroid_words:
            centroid_words[centroid] = tokenize(data[centroid])

        distance = jaccardDistance(centroid_words[centroid], tokenize(val))
        squared_distances.append(distance ** 2)

    return sum(squared_distances)

In [None]:
def KMeans(data, K, max_iter = 100, tol = pow(10,-3)):
    """Cluster ``data`` by alternating assignment and centroid update.

    Each iteration assigns every item to its nearest centroid, picks new
    centroids, and records the clustering error. Iteration stops when the
    relative change in error between two consecutive iterations drops below
    ``tol`` or after ``max_iter`` iterations, whichever comes first.

    Parameters
    ----------
    data : dict
        Mapping of item key -> text.
    K : int
        Currently unused: the starting centroids (and therefore the number
        of clusters) come from the notebook-level ``centroids_init``.
        NOTE(review): confirm whether K should select/limit the centroids.
    max_iter : int
        Upper bound on the number of iterations actually executed.
    tol : float
        Relative error change below which the run counts as converged.

    Returns
    -------
    tuple
        ``(assignments, centroids, data, iterations)``. ``assignments`` maps
        each key to the centroid it was assigned to in the last iteration,
        i.e. the centroids in use *before* that iteration's update, while
        ``centroids`` is the list after the update. ``iterations`` is the
        number of iterations executed.
    """
    centroids = centroids_init
    assignments = {}
    previous_error = None
    iterations = 0

    # Counting executed iterations directly makes max_iter a real upper bound.
    while iterations < max_iter:
        iterations += 1
        assignments = assign_cluster(data, centroids)
        centroids = new_centroids(data, centroids, assignments)
        kmeans_error = errors(data, assignments, centroids)

        # Convergence needs two error values, so the first pass never stops.
        if previous_error is not None:
            change = np.absolute(kmeans_error - previous_error)
            if previous_error == 0:
                # Avoid dividing by zero: unchanged zero error is converged,
                # any move away from zero is treated as an unbounded change.
                converged = change == 0
            else:
                converged = change / previous_error < tol
            if converged:
                break

        previous_error = kmeans_error

    return (assignments, centroids, data, iterations)

In [None]:
def initialize_centroids(data, k, random_state=42):
    """Choose ``k`` starting centroids with k-means++-style seeding.

    The first centroid is the first key of ``data``. Every further centroid
    is drawn at random with probability proportional to the squared Jaccard
    distance between an item and its closest already-chosen centroid.

    Parameters
    ----------
    data : dict
        Mapping of item key -> text.
    k : int
        Number of centroids to return; must satisfy 1 <= k <= len(data).
    random_state : int
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        global random generator as a side effect.

    Returns
    -------
    numpy.ndarray
        The ``k`` selected keys of ``data``, in selection order.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of items.
    """
    keys = list(data.keys())
    if k < 1 or k > len(keys):
        raise ValueError(
            "k must be between 1 and the number of items in data"
        )

    np.random.seed(random_state)

    # Tokenize every item once; all distances below reuse these sets.
    token_sets = [tokenize(data[key]) for key in keys]

    # Positions (into `keys`) of the chosen centroids; start with the first.
    chosen = [0]
    # Distance from every item to its nearest chosen centroid so far.
    min_dist = np.array(
        [jaccardDistance(token_sets[0], words) for words in token_sets],
        dtype=float,
    )

    while len(chosen) < k:
        dist_sq = min_dist ** 2
        total = dist_sq.sum()

        if total > 0:
            # Inverse-CDF sampling. Normalising by the last cumulative value
            # makes the final entry exactly 1.0, so a draw from [0, 1) always
            # lands on a valid index, and zero-weight items (already chosen
            # centroids and their duplicates) can never be selected.
            cumulative = dist_sq.cumsum()
            cumulative = cumulative / cumulative[-1]
            r = np.random.rand()
            i = int(np.searchsorted(cumulative, r, side="right"))
        else:
            # Every item coincides with a chosen centroid, so the weights
            # carry no information: fall back to a uniform pick among the
            # items that have not been selected yet.
            already = set(chosen)
            remaining = [j for j in range(len(keys)) if j not in already]
            i = remaining[np.random.randint(len(remaining))]

        chosen.append(i)

        # Fold the new centroid into the running nearest-centroid distances.
        dist_to_new = np.array(
            [jaccardDistance(token_sets[i], words) for words in token_sets],
            dtype=float,
        )
        min_dist = np.minimum(min_dist, dist_to_new)

    return np.array([keys[j] for j in chosen])
