In [1]:
# Root of the project tree on the mounted Drive; change this one line to relocate every path below.
PROJECT_DIR = "/content/drive/My Drive/project"

# CSV read by pd.read_csv further down; column 0 of each row is used as the tweet text.
TWEETS_FILE = f"{PROJECT_DIR}/dataset/preprocessed/tweets/dataset_cluster.csv"
# .npy array that is passed to the clustering model as a precomputed metric.
METRIC_FILE = f"{PROJECT_DIR}/tmp/cluster_metric.npy"

In [2]:
import pandas as pd
import tensorflow_hub as hub
import numpy as np

import nltk
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' corpus so that stopwords.words('english') can be read later in the notebook.
# As the last expression of the cell, the call's return value is also displayed as the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from collections import defaultdict
import re

# English stopwords as a set for fast membership tests; count_words skips any lower-cased token found here.
english_stopwords = {stopword for stopword in stopwords.words('english')}

def count_words(sentences, top_n=10, stop_words=None):
  """Return the most frequent non-stopword words across `sentences`.

  Each sentence is split into tokens that are either a run of ASCII letters
  or one single character that is neither an ASCII letter nor a space.
  Tokens are lower-cased; a token is counted when its first character is
  alphabetic and it is not a stopword.

  Args:
    sentences: iterable of strings to scan.
    top_n: maximum number of (word, count) pairs to return. Defaults to 10.
    stop_words: container of lower-case words to ignore. When None, the
      module-level `english_stopwords` set is used.

  Returns:
    A list of (word, count) tuples sorted by count, highest first. Words with
    equal counts keep the order in which they were first seen.
  """
  if stop_words is None:
    stop_words = english_stopwords

  frequencies = defaultdict(int)
  for sentence in sentences:
    for token in re.findall(r'[A-Za-z]+|[^A-Za-z ]', sentence):
      token = token.lower()
      # Digits and punctuation arrive as one-character tokens and are dropped here.
      # NOTE(review): a non-ASCII letter (e.g. an accented character) also arrives as a
      # one-character token and passes isalpha(), so it is counted as its own "word".
      if token[0].isalpha() and token not in stop_words:
        frequencies[token] += 1

  # sorted() is stable, so ties stay in first-seen order even with reverse=True.
  ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)
  return ranked[:top_n]

In [4]:
# Array stored in the .npy file; handed to the clustering model below as its precomputed input.
metrics = np.load(METRIC_FILE)
# CSV rows as a plain ndarray; later cells read element 0 of each row as the tweet text.
tweets = pd.read_csv(TWEETS_FILE).values

In [5]:
import inspect

from sklearn.cluster import AgglomerativeClustering

# Linkage distance at or above which two clusters are no longer merged.
DISTANCE_THRESHOLD = 0.5

# scikit-learn 1.2 renamed the `affinity` argument to `metric` and 1.4 removed `affinity`,
# so pick whichever keyword the installed version accepts instead of hard-coding one.
_precomputed_kwarg = (
  "metric"
  if "metric" in inspect.signature(AgglomerativeClustering.__init__).parameters
  else "affinity"
)

model = AgglomerativeClustering(
  n_clusters=None,  # let distance_threshold decide how many clusters come out
  distance_threshold=DISTANCE_THRESHOLD,
  linkage="average",
  **{_precomputed_kwarg: "precomputed"},
)

# `metrics` is consumed as a precomputed pairwise matrix; `u` holds one cluster label per row.
u = model.fit_predict(metrics)

# result[0] = distinct labels (sorted), result[1] = number of members of each label.
result = np.unique(u, return_counts=True)
cluster_sizes = result[1]
multi_member_sizes = cluster_sizes[cluster_sizes > 1]

print("Clusters > 1:", len(multi_member_sizes), "Total:", len(cluster_sizes))
print("Total:", sum(cluster_sizes))
print("Total clustered:", sum(multi_member_sizes))

for size in cluster_sizes:
  print(size, end=" ")

Clusters > 1: 549 Total: 1428
Total: 3000
Total clustered: 2121
3 5 6 4 4 2 3 2 8 7 2 3 3 2 2 3 3 7 2 4 3 2 2 2 4 12 3 2 2 2 4 4 29 3 3 4 6 17 3 16 3 3 15 2 3 58 18 4 2 7 3 9 3 4 9 25 2 9 3 2 3 2 28 69 2 6 5 2 4 2 6 23 4 2 8 5 2 4 7 5 2 22 13 20 9 6 2 3 2 2 2 2 4 6 8 5 2 3 4 3 2 3 22 4 6 2 4 5 4 3 3 2 5 2 2 2 6 11 2 6 4 2 3 4 2 6 3 2 27 7 3 2 2 2 2 3 3 3 2 3 2 11 3 2 2 5 3 2 8 2 2 2 2 5 3 8 19 2 17 3 2 2 2 12 2 3 2 2 2 2 2 5 4 8 4 3 2 3 1 2 4 3 1 9 2 2 2 2 5 3 2 2 2 6 2 8 3 8 2 2 2 7 2 8 8 2 2 2 2 1 5 2 2 3 2 2 2 2 2 2 6 2 2 1 2 11 2 2 2 2 3 3 2 2 1 2 2 2 6 6 4 2 2 2 2 4 5 2 8 16 6 3 1 14 2 5 2 2 3 2 2 5 2 1 5 1 2 2 2 2 3 2 1 2 3 4 3 2 2 1 3 2 5 2 1 5 3 14 2 2 3 2 2 2 3 2 2 2 3 3 2 4 3 1 2 2 2 2 1 4 2 2 2 2 2 2 2 2 3 2 2 4 2 2 2 5 3 5 4 2 2 6 2 3 2 2 2 2 3 2 2 2 3 2 1 1 3 3 1 1 2 2 2 2 2 3 1 1 1 1 1 4 2 4 6 1 1 4 1 2 2 1 1 1 1 1 1 2 9 2 2 2 2 2 1 1 2 2 2 2 3 2 4 1 1 1 2 2 2 4 1 2 1 6 1 2 2 1 3 1 2 2 2 2 1 3 2 1 1 1 1 21 2 2 1 1 1 2 1 1 2 2 1 2 1 2 1 1 3 3 2 1 2 2 4 1 2 1 1 1 1 2 1 1 2 

In [6]:
# Dump every (position, member count) pair; result[1] is the counts array returned by np.unique.
print([pair for pair in enumerate(result[1])])

[(0, 3), (1, 5), (2, 6), (3, 4), (4, 4), (5, 2), (6, 3), (7, 2), (8, 8), (9, 7), (10, 2), (11, 3), (12, 3), (13, 2), (14, 2), (15, 3), (16, 3), (17, 7), (18, 2), (19, 4), (20, 3), (21, 2), (22, 2), (23, 2), (24, 4), (25, 12), (26, 3), (27, 2), (28, 2), (29, 2), (30, 4), (31, 4), (32, 29), (33, 3), (34, 3), (35, 4), (36, 6), (37, 17), (38, 3), (39, 16), (40, 3), (41, 3), (42, 15), (43, 2), (44, 3), (45, 58), (46, 18), (47, 4), (48, 2), (49, 7), (50, 3), (51, 9), (52, 3), (53, 4), (54, 9), (55, 25), (56, 2), (57, 9), (58, 3), (59, 2), (60, 3), (61, 2), (62, 28), (63, 69), (64, 2), (65, 6), (66, 5), (67, 2), (68, 4), (69, 2), (70, 6), (71, 23), (72, 4), (73, 2), (74, 8), (75, 5), (76, 2), (77, 4), (78, 7), (79, 5), (80, 2), (81, 22), (82, 13), (83, 20), (84, 9), (85, 6), (86, 2), (87, 3), (88, 2), (89, 2), (90, 2), (91, 2), (92, 4), (93, 6), (94, 8), (95, 5), (96, 2), (97, 3), (98, 4), (99, 3), (100, 2), (101, 3), (102, 22), (103, 4), (104, 6), (105, 2), (106, 4), (107, 5), (108, 4), (109

In [7]:
# One (cluster size, "word1 word2 word3") summary per cluster with more than one member.
results = []

cluster_labels = np.asarray(model.labels_)

# Walk the (label, size) pairs exactly as np.unique produced them, so a size is always
# paired with its own label rather than looked up by assuming labels are 0..max(u).
for current_label, cluster_size in zip(*result):
  if cluster_size <= 1:
    continue  # single-member clusters are not reported
  print("----------------------------")
  print("Label:", current_label)
  # Vectorised membership lookup instead of rescanning every label in a Python loop.
  member_rows = np.flatnonzero(cluster_labels == current_label)
  sentences = [tweets[row][0] for row in member_rows]
  for sentence in sentences:
    print(sentence)
  c = count_words(sentences)
  print(c)
  print("Total items:", len(sentences))
  # Join only the words that exist, so clusters with fewer than three counted
  # words no longer produce summaries padded with stray spaces.
  results.append((len(sentences), " ".join(word for word, _ in c[:3])))

----------------------------
Label: 0
Great things coming to Arkansas! $40M to be awarded to @myARDOT from @USDOT to reconstruct and improve US 67 in @PulaskiAR and Lonoke Counties. Will reduce congestion and travel time – a huge boost for economic growth!
$40M proposed to @AnokaCounty in Minnesota from @USDOT for the Ramsey Gateway Project on US Highway 10/169. Will help congestion and make travel safer and more efficient in the area!
I’m proposing a $50M award to @nevadadot from @USDOT to reconstruct the Tropicana Avenue/I-15 interchange. So important for improving and reducing traffic in this booming area!
[('usdot', 3), ('reconstruct', 2), ('us', 2), ('congestion', 2), ('travel', 2), ('area', 2), ('great', 1), ('things', 1), ('coming', 1), ('arkansas', 1)]
Total items: 3
----------------------------
Label: 1
....They want to Impeach me (I’m not worried!), and yet they were all breaking the law in so many ways. How can they do that and yet impeach a very successful (Economy Plus) Pr

In [9]:
# Display the per-cluster summaries ordered from the largest cluster to the smallest.
sorted(results, reverse=True, key=lambda summary: summary[0])

[(69, 'endorsement total complete'),
 (58, 'collusion trump mueller'),
 (29, 'america great make'),
 (28, 'obamacare repeal replace'),
 (27, 'comey fbi mccabe'),
 (25, 'impeachment democrats nothing'),
 (23, 'cnn news fake'),
 (22, 'fake news media'),
 (22, 'pelosi nancy democrats'),
 (21, 'approval rating thank'),
 (20, 'ballots mail election'),
 (19, 'maga thank kag'),
 (18, 'border crime democrats'),
 (17, 'bus usdot service'),
 (17, 'portland left radical'),
 (16, 'fbi hillary clinton'),
 (16, 'drug prices big'),
 (15, 'interviewed enjoy foxnews'),
 (14, 'fake polls news'),
 (14, 'nation today honor'),
 (13, 'korea north kim'),
 (12, 'campaign history spied'),
 (12, 'biden would joe'),
 (11, 'china trade deal'),
 (11, 'fed federal reserve'),
 (11, 'great congratulations win'),
 (11, 'mexico border u'),
 (10, 'law order '),
 (9, 'impeachment impeachable impeached'),
 (9, 'thank rnc maga'),
 (9, 'iran deal nuclear'),
 (9, 'join tickets pm'),
 (9, 'whistleblower call scam'),
 (9, 'rus

In [None]:
import os

# NOTE(review): `results_directory` was never assigned anywhere in this notebook, so this
# cell raised NameError on a fresh kernel. Defaulting to the directory that holds
# METRIC_FILE -- TODO confirm the intended output location.
results_directory = os.path.dirname(METRIC_FILE)
os.makedirs(results_directory, exist_ok=True)

# Build the frame straight from the (size, top words) tuples: going through np.asarray
# first coerced the integer sizes to strings. Column labels stay the default 0 and 1.
df = pd.DataFrame(results)

df.to_csv(os.path.join(results_directory, "agglomerative_clustering.csv"))