In [54]:
# general imports
import pandas as pd
import numpy as np
import re
import itertools
from tabulate import tabulate

In [48]:
# Load the LA Times Health tweets. The file is '|'-delimited with no header row;
# the three columns are named here and only the tweet text column is kept
# (id and timestamp are dropped).
tweets_df = pd.read_csv(
  'https://raw.githubusercontent.com/nikhilmanda9/Tweets-Similarity/main/latimeshealth.txt',
  header=None,
  names=('id', 'timestamp', 'tweet'),
  usecols=['tweet'],
  delimiter='|',
)

# Nested list with one single-element row per tweet: [[text], [text], ...]
tweets = tweets_df.to_numpy().tolist()

In [51]:
# function to perform data preprocessing
def preprocess_tweets(tweets):
  """Clean the text of every tweet in place.

  Args:
    tweets: nested list whose rows hold the tweet text at index 0. Each
      row is updated in place; nothing is returned.

  For each tweet, in this order: URLs are deleted, the leading '@' of
  mentions and the leading '#' of hashtags are dropped (the word itself is
  kept), surrounding whitespace is stripped, and the text is lowercased.
  """
  # (pattern, replacement) pairs, applied to each tweet in this order
  substitutions = (
    # URLs are removed entirely
    (re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'), ''),
    # '@name' -> 'name'
    (re.compile(r'@(\w+)'), r'\1'),
    # '#topic' -> 'topic'
    (re.compile(r'#(\w+)'), r'\1'),
  )

  for row in tweets:
    text = row[0]
    for pattern, replacement in substitutions:
      text = pattern.sub(replacement, text)

    # strip() trims whitespace at both ends before lowercasing
    row[0] = text.strip().lower()

In [52]:
# function computes the jaccard distance metric
def jaccard_distance(a,b):
  """Return the Jaccard distance 1 - |a ∩ b| / |a ∪ b| between two sets.

  Args:
    a: first set.
    b: second set.

  Returns:
    A value between 0 (identical sets) and 1 (disjoint sets). Two empty
    sets are treated as identical and give 0.0.
  """
  union = len(a.union(b))
  if union == 0:
    # Both sets are empty (e.g. tweets that contained only a URL): they are
    # identical, so return 0 instead of dividing by zero.
    return 0.0
  intersection = len(a.intersection(b))
  return 1 - (intersection / union)

In [55]:
# function used to update and recompute centroid for a cluster
def compute_centroid(cluster):
  """Return the tweet with the smallest total Jaccard distance to the others.

  Args:
    cluster: non-empty list of tweets in the form accepted by
      jaccard_distance.

  Returns:
    The element of `cluster` whose summed distance to every other element
    is minimal. On ties the earliest such element is returned.

  Raises:
    ValueError: if `cluster` is empty.
  """
  if len(cluster) == 0:
    raise ValueError("cannot compute the centroid of an empty cluster")

  # A single running minimum is required here: comparing each candidate
  # against its own fresh `inf` would accept every candidate and always
  # return the last tweet.
  best_index = 0
  best_total = float('inf')

  for k, tweet in enumerate(cluster):
    # compare by position, not value, so duplicate tweets still count
    distance_sum = sum(jaccard_distance(tweet, other) for j, other in enumerate(cluster) if k != j)
    if distance_sum < best_total:
      best_total = distance_sum
      best_index = k

  return cluster[best_index]

In [None]:
# function to perform the k means clustering algorithm
def k_means_clustering(tweets, k, seed=None):
  """Cluster tweets around k centroid tweets using Jaccard distance.

  Args:
    tweets: list of tweets in the form accepted by jaccard_distance.
      The list itself is not modified.
    k: number of clusters; must satisfy 1 <= k <= len(tweets) unless
      `tweets` is empty.
    seed: optional seed for the random choice of initial centroids. When
      None, NumPy's global random state is used.

  Returns:
    A list of k clusters, each a list of tweets. A cluster can be empty
    when no tweet is closest to its centroid (e.g. duplicate centroids).

  Raises:
    ValueError: if `tweets` is non-empty and k is outside 1..len(tweets).
  """
  if len(tweets) == 0:
    # nothing to assign: every cluster is empty
    return [[] for _ in range(k)]
  if not 1 <= k <= len(tweets):
    raise ValueError(f"k must be between 1 and the number of tweets ({len(tweets)}), got {k}")

  # Initialize centroids randomly. A shuffled copy is used so the caller's
  # list keeps its order between calls.
  rng = np.random if seed is None else np.random.RandomState(seed)
  shuffled = [tweets[i] for i in rng.permutation(len(tweets))]
  centroids = shuffled[:k]

  # Initialize clusters
  clusters = [[] for _ in range(k)]

  # repeat until convergence
  while True:
    # Assign tweets to the nearest cluster (first centroid wins on ties)
    new_clusters = [[] for _ in range(k)]
    for tweet in shuffled:
      distances = [jaccard_distance(tweet, centroid) for centroid in centroids]
      nearest_cluster = int(np.argmin(distances))
      new_clusters[nearest_cluster].append(tweet)

    # check for convergence
    if new_clusters == clusters:
      break

    # Update centroids
    clusters = new_clusters
    for i in range(k):
      # An empty cluster has no medoid; keep its previous centroid rather
      # than failing.
      if clusters[i]:
        centroids[i] = compute_centroid(clusters[i])

  return clusters

In [None]:
def compute_sse(clusters):
  """Return the sum of squared Jaccard distances from tweets to their centroid.

  Args:
    clusters: list of clusters, each a list of tweets.

  Returns:
    The total over all clusters of jaccard_distance(tweet, centroid) ** 2,
    where the centroid is recomputed with compute_centroid. Empty clusters
    contribute nothing.
  """
  sse = 0
  for cluster in clusters:
    if not cluster:
      # an empty cluster has no centroid and no members to measure
      continue
    centroid = compute_centroid(cluster)
    sse += sum(jaccard_distance(tweet, centroid) ** 2 for tweet in cluster)
  return sse

In [None]:
# preprocess the tweets data (the nested `tweets` list is cleaned in place)
preprocess_tweets(tweets)

# convert tweets to sets of words in order to compute jaccard distance
tweet_sets = [set(tweet[0].split()) for tweet in tweets]

k_values = [2,4,6,8,10]

# seed NumPy's global random state so the random initial centroids, and
# therefore the logged numbers, are reproducible between runs
np.random.seed(42)

# Cluster exactly once per k so the SSE and the cluster sizes reported for
# that k describe the same clustering; re-running k-means would start from
# different random centroids.
results = []
for k in k_values:
  clusters = k_means_clustering(tweet_sets, k)
  results.append((compute_sse(clusters), [len(cluster) for cluster in clusters]))

# the log file is opened in append mode, so repeated runs accumulate
with open('k_means_logs.txt', "a") as logfile:
  # one block per k: the SSE followed by the size of each cluster
  for k, (sse, cluster_sizes) in zip(k_values, results):
    logfile.write(f"k = {k}\n")
    logfile.write(f"SSE: {sse}\n")
    for i, size in enumerate(cluster_sizes):
      logfile.write(f"Cluster {i + 1} Size: {size}\n")
    logfile.write('\n')

  # summary table built from the same results that were logged above
  values = [[k, sse, {f"Cluster {i + 1}": size for i, size in enumerate(cluster_sizes)}]
            for k, (sse, cluster_sizes) in zip(k_values, results)]

  logfile.write(tabulate(values, headers=['k', 'SSE', 'Size of each cluster']))
  # end the table with a newline so the next appended run starts on its own line
  logfile.write('\n')