In [2]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import Word2Vec

import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.decomposition import PCA
from sklearn import cluster

import community # for Louvain
import networkx as nx

# for plotting k-means
from sklearn.manifold import TSNE

In [3]:
###### import data ######
data_dir = "../data/cleaned_tweets.csv"

# keep tweets in array:
tweet_text = []

with open(data_dir) as csv_file:
    readCSV = csv.reader(csv_file, delimiter = ',')

    # iterate by row
    for row in readCSV:
        # append to array that holds tweets
        tweet_text.append(row)

In [4]:
# get flattened tweets (make it all just one list)
flattened_text = [val for sublist in tweet_text for val in sublist]

### Clustering Algorithm 1: K-Means Clustering

In [5]:
# initializer vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(flattened_text)

In [10]:
# run with k = 10
true_k = 10
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=10, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [11]:
# see results
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

Top terms per cluster:
Cluster 0:
 rt
 climatechange
 climatestrike
 today
 people
 new
 amp
 climateemergency
 ha
 fridayforfuture
Cluster 1:
 climateemergency
 rt
 climatechange
 climatestrike
 climateaction
 fridaysforfuture
 action
 today
 globalclimatestrike
 fridayforfuture
Cluster 2:
 catania
 scicily
 fridaysforfuture
 climatestrike
 rt
 17vlr
 climateacti
 oggi
 climateactionweek
 climateactionnow
Cluster 3:
 montreal
 street
 000
 time
 wow
 official
 500
 thousand
 rt
 massive
Cluster 4:
 change
 spoken
 continue
 speak
 listen
 leader
 coming
 people
 rt
 comin
Cluster 5:
 climate
 change
 strike
 rt
 million
 climatestrike
 people
 action
 global
 say
Cluster 6:
 future
 strike
 friday
 continues
 climatestrike
 presente
 school
 fridaysforfuture
 klimatstrejk
 torino
Cluster 7:
 climatecrisis
 rt
 action
 ha
 million
 climate
 scientist
 respected
 explain
 world
Cluster 8:
 fridaysforfuture
 climatestrike
 rt
 italy
 santiago
 schoolstrike4climate
 people
 chile
 vancouv

In [12]:
# get cluster labels
model.labels_

array([0, 0, 0, ..., 8, 0, 2], dtype=int32)

#### Now that I see that it works for k = 10, let's see what topics it comes up for other values of k

#### For k = 5

In [10]:
# for k = 5
model_5 = KMeans(n_clusters = 5, init='k-means++', max_iter=100, n_init=1)
model_5.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=5, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [11]:
# see results
print("Top terms per cluster:")
order_centroids = model_5.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(5):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

Top terms per cluster:
Cluster 0:
 rt
 climatestrike
 climatechange
 climate
 today
 climatecrisis
 montreal
 people
 street
 time
Cluster 1:
 fridaysforfuture
 climatestrike
 rt
 klimatstrejk
 italy
 change
 xe9al
 montr
 coming
 fridayforfuture
Cluster 2:
 climateemergency
 rt
 climatechange
 climatecrisis
 climatestrike
 climateaction
 fridaysforfuture
 action
 today
 globalclimatestrike
Cluster 3:
 wellington
 nz
 fridaysforfuture
 climatestrike
 begin
 rt
 growing
 auckland
 demand
 thousand
Cluster 4:
 city
 iowa
 fridaysforfuture
 climatestrike
 strike
 59
 schoolstr
 happy
 school
 rt


In [12]:
# get cluster labels
model_5.labels_

array([0, 0, 0, ..., 1, 0, 1], dtype=int32)

#### For k = 15

In [6]:
# for k = 15
model_15 = KMeans(n_clusters = 15, init='k-means++', max_iter=100, n_init=1)
model_15.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=15, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [7]:
# see results
print("Top terms per cluster:")
order_centroids = model_15.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(15):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

Top terms per cluster:
Cluster 0:
 climateemergency
 rt
 climatechange
 climatecrisis
 climatestrike
 climateaction
 fridaysforfuture
 action
 today
 globalclimatestrike
Cluster 1:
 change
 xe9al
 montr
 coming
 klimatstrejk
 fridayforfuture
 500
 climatestrike
 spoken
 continue
Cluster 2:
 climatechange
 rt
 ha
 climateaction
 preach
 world
 speech
 like
 gretathunberg
 real
Cluster 3:
 hull
 acto
 headed
 quebec
 hill
 wait
 parliament
 incredible
 wow
 planet
Cluster 4:
 fridaysforfuture
 climatestrike
 rt
 italy
 people
 vancouver
 schoolstrike4climate
 million
 torino
 picture
Cluster 5:
 amp
 rt
 climatechange
 30
 cambie
 50
 climatestrike
 vancouver
 000
 watch
Cluster 6:
 climatestrike
 rt
 today
 climatecrisis
 people
 climateemergency
 new
 fridaysforfuture
 fridayforfuture
 million
Cluster 7:
 newfoundland
 john
 st
 future
 fridaysforfuture
 strike
 climatestrike
 friday
 continues
 presente
Cluster 8:
 thousand
 street
 milan
 massive
 demand
 growing
 action
 time
 clima

In [8]:
# get cluster labels
model_15.labels_

array([6, 6, 6, ..., 4, 6, 4], dtype=int32)