In [1]:
# General Imports
import pandas as pd
import numpy as np
import re
import itertools
from tabulate import tabulate
from collections import defaultdict

In [2]:
# Load the Fox News Health dataset into a dataframe. Each line of the file is
# 'id|timestamp|tweet'; only the tweet text column is kept.
tweets_df = pd.read_csv(
    'https://raw.githubusercontent.com/nikhilmanda9/Tweets-Similarity/main/foxnewshealth.txt',
    sep='|',
    header=None,
    names=('id', 'timestamp', 'tweet'),
    usecols=['tweet'],
    encoding='cp1252',
)
print(tweets_df.head())

# Pull the tweet column out as a plain Python list
tweets = tweets_df['tweet'].tolist()

                                               tweet
0  Injury prevention programs unpopular with high...
1  6 dietary changes to make midlife http://ow.ly...
2  Massachusetts governor gets head shaved to sup...
3  Dad wins 3 marathons in 8 days; winnings to he...
4     Possible cure for melanoma? http://ow.ly/LlLg8


In [3]:
# Function to perform data preprocessing
def preprocess_tweets(tweets):
  """Clean raw tweet strings so they can be compared as sets of words.

  Each tweet goes through these steps, in order:
    1. every word that starts with '@' is removed entirely,
    2. the '#' in front of a hashtag is dropped (the word itself is kept),
    3. URLs are removed,
    4. the text is lowercased.

  Args:
    tweets: iterable of tweet strings. When it is a list it is also updated
      in place, so callers that ignore the return value still see the
      cleaned text.

  Returns:
    The list of cleaned tweets (the same list object when `tweets` is a list).
  """
  # Matches a whole mention such as '@AnnaMedaris'
  mention_pattern = re.compile(r'@\w+')
  # Captures the word after '#' so only the symbol is stripped
  hashtag_pattern = re.compile(r'#(\w+)')
  url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

  cleaned = []
  for tweet in tweets:
    tweet = mention_pattern.sub('', tweet)
    tweet = hashtag_pattern.sub(r'\1', tweet)
    tweet = url_pattern.sub('', tweet)
    cleaned.append(tweet.lower())

  if isinstance(tweets, list):
    # Slice assignment replaces the contents of the caller's list
    tweets[:] = cleaned
    return tweets
  return cleaned

In [4]:
# Function to compute the Jaccard distance assuming each tweet is
# considered as an unordered set of words
def jaccard_distance(a, b):
  """Return the Jaccard distance between two sets.

  The distance is 1 - |a & b| / |a | b|: 0.0 for identical sets and 1.0 for
  sets with nothing in common.

  Args:
    a: first set of words.
    b: second set of words.

  Returns:
    A float in [0, 1]. Two empty sets are treated as identical (0.0)
    instead of dividing by zero.
  """
  union_size = len(a.union(b))
  if union_size == 0:
    # Both sets are empty, e.g. tweets that held nothing but a URL
    return 0.0
  return 1 - (float(len(a.intersection(b))) / union_size)

In [5]:
# Function used to update centroid for a cluster
def update_centroids(cluster):
  """Pick the new centroid of a cluster.

  The centroid is the member whose sum of squared Jaccard distances to every
  member of the cluster is smallest. Ties go to the earliest member.

  Args:
    cluster: non-empty list of tweets, each a set of words.

  Returns:
    The member of `cluster` chosen as centroid.

  Raises:
    ValueError: if `cluster` is empty, since there is no member to choose.
  """
  if len(cluster) == 0:
    raise ValueError("cannot pick a centroid for an empty cluster")

  # One total per candidate: the sum of its squared distances to all members
  totals = [
      sum(jaccard_distance(candidate, other) ** 2 for other in cluster)
      for candidate in cluster
  ]
  return cluster[int(np.argmin(totals))]

In [6]:
# Function to perform the k-means clustering algorithm
def k_means_clustering(tweets, k):
  """Cluster tweets with k-means using Jaccard distance.

  Centroids are always actual tweets: the initial ones are k tweets drawn at
  random, and each update picks a member of the cluster through
  update_centroids. Iteration stops when an assignment pass produces the same
  clusters as the previous pass.

  Args:
    tweets: list of tweets, each a set of words. The list is not modified.
    k: number of clusters, between 1 and len(tweets).

  Returns:
    A tuple (clusters, centroids) where clusters is a list of k lists of
    tweets and centroids[i] is the centroid of clusters[i].

  Raises:
    ValueError: if k is smaller than 1 or larger than the number of tweets.
  """
  if not 1 <= k <= len(tweets):
    raise ValueError(
        "k must be between 1 and the number of tweets ({}), got {}".format(len(tweets), k))

  # Shuffle a copy so the caller's list keeps its order, then take the
  # first k tweets as the initial centroids
  points = list(tweets)
  np.random.shuffle(points)
  centroids = points[:k]

  # Initialize clusters
  clusters = [[] for _ in range(k)]

  # Repeat until convergence (when clusters no longer change)
  while True:
    new_clusters = [[] for _ in range(k)]

    # Assign every tweet to the cluster whose centroid is nearest
    # (the first such cluster on ties)
    for tweet in points:
      cluster_distances = [jaccard_distance(tweet, centroid) for centroid in centroids]
      nearest_cluster = int(np.argmin(cluster_distances))
      new_clusters[nearest_cluster].append(tweet)

    # Check for convergence (if the new clusters are the same as the previous)
    if new_clusters == clusters:
      return clusters, centroids

    # Update centroids
    clusters = new_clusters
    for i in range(k):
      # A cluster can end up empty, e.g. when two centroids are identical
      # tweets; it has no member to promote, so its centroid is kept
      if clusters[i]:
        centroids[i] = update_centroids(clusters[i])

In [7]:
# Sum of Squared Error: for every cluster, add up the squared Jaccard distance
# from each of its members to that cluster's centroid
def compute_sse(clusters, centroids):
  """Return the total squared distance of all tweets to their centroids.

  Args:
    clusters: list of clusters, each a list of tweets (sets of words).
    centroids: list of centroids; centroids[i] belongs to clusters[i].

  Returns:
    The sum over all centroids of the squared distances of the matching
    cluster's members to that centroid.
  """
  return sum(
      sum(jaccard_distance(member, centroid) ** 2 for member in clusters[idx])
      for idx, centroid in enumerate(centroids)
  )

In [8]:
# Preprocess the tweets data
preprocess_tweets(tweets)

# Convert tweets to sets in order to compute jaccard distance.
# A separate name is used so `tweets` stays a list of strings and this cell
# can be re-run without calling string methods on sets.
tweet_sets = [set(tweet.split()) for tweet in tweets]

# Test with 5 different values of K
k_values = [5, 10, 15, 20, 25]

# Run the k-means algorithm for each value of k and build one table row per
# cluster. Only the first row of each k shows the k value and the SSE.
values = []
for k in k_values:
  clusters, centroids = k_means_clustering(tweet_sets, k)

  # Get the SSE
  sse = compute_sse(clusters, centroids)

  # Label each cluster with its 1-based number and its size
  for cluster_number, cluster in enumerate(clusters, start=1):
    size_label = "{}: {}".format(cluster_number, len(cluster))
    if cluster_number == 1:
      values.append([k, sse, size_label])
    else:
      values.append(["", "", size_label])

# Append the results to the log file once all runs have finished. The
# trailing newline keeps tables from repeated runs from running together.
with open('k_means_logs.txt', "a") as logfile:
  logfile.write(tabulate(values,
                headers=['Value of K', 'SSE', 'Size of each cluster']))
  logfile.write("\n")

In [9]:
# Load the US News Health dataset into a dataframe. Each line of the file is
# 'id|timestamp|tweet'; only the tweet text column is kept.
tweets_df = pd.read_csv(
    'https://raw.githubusercontent.com/nikhilmanda9/Tweets-Similarity/main/usnewshealth.txt',
    sep='|',
    header=None,
    names=('id', 'timestamp', 'tweet'),
    usecols=['tweet'],
    encoding='utf-8',
)
print(tweets_df.head())

# Pull the tweet column out as a plain Python list
tweets = tweets_df['tweet'].tolist()

                                               tweet
0  Planning to hire a personal trainer? Read thes...
1  RT @AnnaMedaris: Any dads out their who strugg...
2  America's problem with diabetes in one map: ht...
3  Think water &amp; fiber will cure your constip...
4  About to lose it? Here, try one of these offic...


In [10]:
# Preprocess the tweets data
preprocess_tweets(tweets)

# Convert tweets to sets in order to compute jaccard distance.
# A separate name is used so `tweets` stays a list of strings and this cell
# can be re-run without calling string methods on sets.
tweet_sets = [set(tweet.split()) for tweet in tweets]

# Test with 5 different values of K
k_values = [5, 10, 15, 20, 25]

# Run the k-means algorithm for each value of k and build one table row per
# cluster. Only the first row of each k shows the k value and the SSE.
values = []
for k in k_values:
  clusters, centroids = k_means_clustering(tweet_sets, k)

  # Get the SSE
  sse = compute_sse(clusters, centroids)

  # Label each cluster with its 1-based number and its size
  for cluster_number, cluster in enumerate(clusters, start=1):
    size_label = "{}: {}".format(cluster_number, len(cluster))
    if cluster_number == 1:
      values.append([k, sse, size_label])
    else:
      values.append(["", "", size_label])

# Append the results to the log file once all runs have finished. The
# trailing newline keeps tables from repeated runs from running together.
with open('k_means_logs1.txt', "a") as logfile:
  logfile.write(tabulate(values,
                headers=['Value of K', 'SSE', 'Size of each cluster']))
  logfile.write("\n")

In [11]:
# Load the LA Times Health dataset into a dataframe. Each line of the file is
# 'id|timestamp|tweet'; only the tweet text column is kept.
tweets_df = pd.read_csv(
    'https://raw.githubusercontent.com/nikhilmanda9/Tweets-Similarity/main/latimeshealth.txt',
    sep='|',
    header=None,
    names=('id', 'timestamp', 'tweet'),
    usecols=['tweet'],
    encoding='utf-8',
)
print(tweets_df.head())

# Pull the tweet column out as a plain Python list
tweets = tweets_df['tweet'].tolist()

                                               tweet
0  Five new running shoes that aim to go the extr...
1  Gym Rat: Disq class at Crunch is intense worko...
2  Noshing through thousands of ideas at Natural ...
3  Natural Products Expo also explores beauty, su...
4  Free Fitness Weekends in South Bay beach citie...


In [12]:
# Preprocess the tweets data
preprocess_tweets(tweets)

# Convert tweets to sets in order to compute jaccard distance.
# A separate name is used so `tweets` stays a list of strings and this cell
# can be re-run without calling string methods on sets.
tweet_sets = [set(tweet.split()) for tweet in tweets]

# Test with 5 different values of K
k_values = [5, 10, 15, 20, 25]

# Run the k-means algorithm for each value of k and build one table row per
# cluster. Only the first row of each k shows the k value and the SSE.
values = []
for k in k_values:
  clusters, centroids = k_means_clustering(tweet_sets, k)

  # Get the SSE
  sse = compute_sse(clusters, centroids)

  # Label each cluster with its 1-based number and its size
  for cluster_number, cluster in enumerate(clusters, start=1):
    size_label = "{}: {}".format(cluster_number, len(cluster))
    if cluster_number == 1:
      values.append([k, sse, size_label])
    else:
      values.append(["", "", size_label])

# Append the results to the log file once all runs have finished. The
# trailing newline keeps tables from repeated runs from running together.
with open('k_means_logs2.txt', "a") as logfile:
  logfile.write(tabulate(values,
                headers=['Value of K', 'SSE', 'Size of each cluster']))
  logfile.write("\n")