In [1]:
TWEETS_FILE = "/content/drive/My Drive/project/dataset/preprocessed/tweets/dataset_cluster.csv"
METRIC_FILE = "/content/drive/My Drive/project/tmp/cluster_metric.npy"

In [2]:
import pandas as pd
import tensorflow_hub as hub
import numpy as np

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from collections import defaultdict
import re

english_stopwords = set(stopwords.words('english'))

def count_words(sentences):
  word_count = defaultdict(lambda: 0)
  for sentence in sentences:
    words = re.findall(r'[A-Za-z]+|[^A-Za-z ]', sentence)
    for word in words:
      word = word.lower()
      if word[0].isalpha() and word not in english_stopwords:
        word_count[word] += 1
  word_count = word_count.items()
  word_count = sorted(word_count, key=lambda x: x[1], reverse=True)
  return word_count[0:10]

In [4]:
tweets = pd.read_csv(TWEETS_FILE).values
metrics = np.load(METRIC_FILE)

metrics[metrics[:,:] < 0] = 1

In [5]:
from sklearn.cluster import DBSCAN

model = DBSCAN(
  metric='precomputed',
  min_samples=2,
  eps=0.35
)

u = model.fit_predict(metrics)

result = np.unique(u, return_counts=True)
print("Clusters > 1:", len(result[1][result[1] > 1]), "Total:", len(result[1]))
print("Total:", sum(result[1]))
print("Total clustered:", sum(result[1][result[1] > 1]))

for i in result[1]:
  print(i, end=" ")

Clusters > 1: 170 Total: 170
Total: 3000
Total clustered: 3000
2063 2 3 231 37 74 5 22 7 2 10 7 12 13 9 8 10 2 2 3 2 6 4 2 2 3 2 2 2 16 3 5 6 5 2 2 5 2 4 2 6 2 6 4 2 5 2 8 2 3 5 2 2 5 2 2 3 2 6 2 10 3 2 3 2 2 2 2 2 2 3 2 2 3 3 7 4 2 21 2 2 2 2 3 3 4 3 2 3 4 2 2 2 22 2 3 4 8 4 2 2 2 2 2 7 2 3 5 2 2 5 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 6 3 5 2 2 2 2 2 2 3 2 3 2 3 2 5 2 2 2 3 2 3 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 

In [6]:
print(list(enumerate(result[1], start=-1)))

[(-1, 2063), (0, 2), (1, 3), (2, 231), (3, 37), (4, 74), (5, 5), (6, 22), (7, 7), (8, 2), (9, 10), (10, 7), (11, 12), (12, 13), (13, 9), (14, 8), (15, 10), (16, 2), (17, 2), (18, 3), (19, 2), (20, 6), (21, 4), (22, 2), (23, 2), (24, 3), (25, 2), (26, 2), (27, 2), (28, 16), (29, 3), (30, 5), (31, 6), (32, 5), (33, 2), (34, 2), (35, 5), (36, 2), (37, 4), (38, 2), (39, 6), (40, 2), (41, 6), (42, 4), (43, 2), (44, 5), (45, 2), (46, 8), (47, 2), (48, 3), (49, 5), (50, 2), (51, 2), (52, 5), (53, 2), (54, 2), (55, 3), (56, 2), (57, 6), (58, 2), (59, 10), (60, 3), (61, 2), (62, 3), (63, 2), (64, 2), (65, 2), (66, 2), (67, 2), (68, 2), (69, 3), (70, 2), (71, 2), (72, 3), (73, 3), (74, 7), (75, 4), (76, 2), (77, 21), (78, 2), (79, 2), (80, 2), (81, 2), (82, 3), (83, 3), (84, 4), (85, 3), (86, 2), (87, 3), (88, 4), (89, 2), (90, 2), (91, 2), (92, 22), (93, 2), (94, 3), (95, 4), (96, 8), (97, 4), (98, 2), (99, 2), (100, 2), (101, 2), (102, 2), (103, 7), (104, 2), (105, 3), (106, 5), (107, 2), (108

In [7]:
results = []

for current_label in range(0, max(u)+1):
# for current_label in [7,12,13]:
  current_label += 1
  if result[1][current_label] > 1:
    print("----------------------------")
    print("Label:", current_label)
    sentences = list()
    for index, label in enumerate(model.labels_):
      if label == current_label:
        print(tweets[index][0])
        sentences.append(tweets[index][0])
    c = count_words(sentences)
    print(c)
    print("Total items:", len(sentences))
    results.append(
      (len(sentences),
        (c[0][0] if len(c) > 0 else "") 
        + " " + 
        (c[1][0] if len(c) > 1 else "")
        + " " +
        (c[2][0] if len(c) > 2 else "")
      )
    )

----------------------------
Label: 1
Join me in Clive, Iowa tomorrow at noon! #AmericaFirst #MAGA Tickets:
Join me in Cedar Rapids, Iowa tomorrow at 7:00pm! #MAGA
Heading to Iowa- join me today at noon! #MakeAmericaGreatAgain Tickets:
[('join', 3), ('iowa', 3), ('tomorrow', 2), ('noon', 2), ('maga', 2), ('tickets', 2), ('clive', 1), ('americafirst', 1), ('cedar', 1), ('rapids', 1)]
Total items: 3
----------------------------
Label: 2
The White House is running very smoothly and the results for our Nation are obviously very good. We are the envy of the world. But anytime I even think about making changes, the FAKE NEWS MEDIA goes crazy, always seeking to make us look as bad as possible! Very dishonest!
Wow! Really bad TV Ratings for Morning Joe ( @JoeNBC). @foxandfriends doing great, leading all others by far. @CNN not a factor!!!
Can’t believe how badly @CNN has done in the newly released TV ratings. They are so far below @FoxNews (thank you President Trump!) that you can barely find 

In [10]:
sorted(results, key=lambda x: x[0])

[(0, '  '),
 (2, 'police chant pigs'),
 (2, 'minneapolis national guard'),
 (2, 'portland federal government'),
 (2, 'joe sleepy biden'),
 (2, 'full measure sheryl'),
 (2, 'fake news goes'),
 (2, 'stock market time'),
 (2, 'great guard national'),
 (2, 'corrupt joe biden'),
 (2, 'trump rally gives'),
 (2, 'convention post office'),
 (2, 'nasa thank great'),
 (2, 'one united states'),
 (2, 'would ever joe'),
 (2, 'police great bill'),
 (2, 'democrat suburbs live'),
 (2, 'god pledge allegiance'),
 (2, 'seattle mayor anarchists'),
 (2, 'ronny know well'),
 (2, 'instead driving jobs'),
 (2, 'sending area cares'),
 (2, 'international holocaust remembrance'),
 (2, 'fema louisiana texas'),
 (2, 'cnn foxnews poll'),
 (2, 'bring men women'),
 (2, 'federal crime government'),
 (2, 'fisa bill abuse'),
 (2, 'great national champions'),
 (2, 'left fake news'),
 (2, 'unfair adam schiff'),
 (2, 'women canadian pm'),
 (2, 'scam call lawyer'),
 (2, 'fire james mattis'),
 (2, 'crazy nancy pelosi'),
 (2,

In [None]:
df = pd.DataFrame(np.asarray(results))

df.to_csv(results_directory + "/agglomerative_clustering.csv")