# Week11 - Clustering Lab

* Create clusters for the following dataset
* Choose a meaningful cluster count
* Identify potential labels for each cluster

In [18]:
from zipfile import ZipFile
from io import BytesIO
import urllib.request as urllib2
import pandas as pd

def get_data(url="https://github.com/msaricaumbc/DS_data/blob/master/ds602/people_wiki.zip?raw=true"):
    """Download the people-wiki archive and return its contents as a DataFrame.

    Parameters
    ----------
    url : str, optional
        Location of a zip archive that contains ``people_wiki.csv``.
        Defaults to the course dataset hosted on GitHub.

    Returns
    -------
    pandas.DataFrame
        The rows of ``people_wiki.csv`` with the ``URI`` column removed.

    Raises
    ------
    urllib.error.URLError
        If the archive cannot be fetched.
    KeyError
        If the archive has no ``people_wiki.csv`` member or the CSV has no
        ``URI`` column.
    """
    # Context managers guarantee the HTTP response and the archive handles
    # are released even if reading or parsing fails.
    with urllib2.urlopen(url) as response:
        payload = response.read()
    with ZipFile(BytesIO(payload)) as archive, archive.open("people_wiki.csv") as csv_file:
        people = pd.read_csv(csv_file)
    # Return a new frame instead of mutating in place.
    return people.drop(columns="URI")

# Fetch the dataset once and preview its first five rows.
people = get_data()
people.head(n=5)

Unnamed: 0,name,text
0,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts with English stop words removed and the vocabulary
# capped at 10,000 terms.
vec = CountVectorizer(max_features=10000, stop_words='english')
# NOTE(review): densifying yields a (documents x 10,000) array, which is
# memory-hungry; kept because the following cells consume `corpus` as a
# dense array.
corpus = vec.fit_transform(people['text']).toarray()

In [21]:
# Vocabulary learned by the count vectorizer. Only the last expression of a
# cell is displayed, so this cell shows the feature names and nothing else.
vec.get_feature_names_out()

array(['01', '10', '100', ..., 'zoology', 'zrich', 'zurich'], dtype=object)

In [22]:
# Document-term counts as a labelled frame: one column per vocabulary term.
pd.DataFrame(data=corpus, columns=vec.get_feature_names_out())

Unnamed: 0,01,10,100,1000,10000,100000,100m,100th,101,102,...,zhang,zhou,zimbabwe,zimbabwean,zombie,zone,zoo,zoology,zrich,zurich
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42781,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42782,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42783,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42784,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn import metrics

# Text preprocessing and vectorization: TF-IDF weights, English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(people['text'])

# Clustering using k-means.
# n_init is pinned explicitly: relying on the library default triggers a
# scikit-learn warning and makes results depend on the installed version
# (10 is the default this notebook was run with).
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0).fit(X)
labels = kmeans.labels_

# Attach each person's cluster id to the frame.
people['cluster'] = labels

print(people[['name', 'cluster']])

  super()._check_params_vs_input(X, default_n_init=10)


                                     name  cluster
0                           Digby Morrell        1
1                          Alfred J. Lewy        4
2                           Harpdog Brown        3
3                     Franz Rottensteiner        4
4                                  G-Enka        3
...                                   ...      ...
42781                  Motoaki Takenouchi        3
42782  Alan Judge (footballer, born 1960)        1
42783                        Eduardo Lara        1
42784                Tatiana Faberg%C3%A9        4
42785                      Kenneth Thomas        4

[42786 rows x 2 columns]


In [26]:
import numpy as np

def get_top_terms_per_cluster(tfidf_matrix, clusters, terms, n_terms=10, centroids=None):
    """Return the highest-weighted terms of every cluster centroid.

    The function relies only on its arguments; it does not read any
    notebook-level model object.

    Parameters
    ----------
    tfidf_matrix : array-like or scipy sparse matrix, shape (n_docs, n_features)
        Document-term weights. Ignored when ``centroids`` is given.
    clusters : sequence, length n_docs
        Cluster id of each row of ``tfidf_matrix``. Ignored when
        ``centroids`` is given.
    terms : sequence, length n_features
        Term for each column of ``tfidf_matrix``.
    n_terms : int, default 10
        Number of top terms to return per cluster.
    centroids : array-like, shape (n_clusters, n_features), optional
        Precomputed centroids (for example a fitted model's
        ``cluster_centers_``). When omitted, each centroid is computed as
        the mean of the rows assigned to that cluster.

    Returns
    -------
    dict
        Maps each cluster id to a list of its ``n_terms`` terms, ordered by
        descending centroid weight. With ``centroids`` supplied the ids are
        the row positions ``0..n_clusters-1``.
    """
    if centroids is None:
        cluster_ids = np.asarray(clusters)
        unique_ids = np.unique(cluster_ids)
        if unique_ids.size == 0:
            # No documents, hence no clusters to describe.
            return {}
        # CSR supports row selection for sparse input; dense input becomes an ndarray.
        rows = tfidf_matrix.tocsr() if hasattr(tfidf_matrix, "tocsr") else np.asarray(tfidf_matrix)
        # Sparse .mean() may return a 2-D matrix; asarray + ravel flattens both cases.
        centroids = np.vstack([
            np.asarray(rows[np.flatnonzero(cluster_ids == cid)].mean(axis=0)).ravel()
            for cid in unique_ids
        ])
        keys = [cid.item() for cid in unique_ids]
    else:
        centroids = np.atleast_2d(np.asarray(centroids))
        keys = list(range(centroids.shape[0]))

    # Column indices sorted by descending weight, truncated to the top n_terms.
    ordered = np.argsort(centroids, axis=1)[:, ::-1][:, :n_terms]
    return {key: [terms[ind] for ind in row] for key, row in zip(keys, ordered)}

# Feature names that correspond to the columns of X.
terms = vectorizer.get_feature_names_out()
top_terms_per_cluster = get_top_terms_per_cluster(X, labels, terms, n_terms=10)

# Print the top terms for each cluster. The loop uses its own names so the
# notebook-level `terms` (the full vocabulary) is not overwritten by one
# cluster's top-term list.
for cluster_id, cluster_terms in top_terms_per_cluster.items():
    print(f"Cluster {cluster_id}: {', '.join(cluster_terms)}")

Cluster 0: party, served, law, election, minister, member, president, elected, university, state
Cluster 1: league, season, played, football, team, coach, games, club, player, baseball
Cluster 2: film, theatre, television, films, series, award, actor, role, directed, festival
Cluster 3: music, album, band, released, orchestra, jazz, song, records, songs, albums
Cluster 4: university, new, art, world, book, research, work, published, american, professor


In [34]:
# Human-readable label for each cluster id, chosen from the clusters' top terms.
cluster_labels = {
    0: "Politicians",
    1: "Athletes",
    2: "Actors",
    3: "Musicians",
    4: "Scholars",
}

# Series.map() silently produces NaN for ids missing from the dict, so fail
# loudly if the clustering yields an id that has no label.
unmapped = sorted(int(c) for c in set(people['cluster'].unique()) - set(cluster_labels))
assert not unmapped, f"No label defined for cluster id(s): {unmapped}"

# Add the descriptive labels to the DataFrame
people['cluster_label'] = people['cluster'].map(cluster_labels)

# Show the DataFrame with labels
print(people[['name', 'text', 'cluster_label']].head(20))

                                       name  \
0                             Digby Morrell   
1                            Alfred J. Lewy   
2                             Harpdog Brown   
3                       Franz Rottensteiner   
4                                    G-Enka   
5                             Sam Henderson   
6                             Aaron LaCrate   
7                           Trevor Ferguson   
8                              Grant Nelson   
9                              Cathy Caruth   
10                             Sophie Crumb   
11                            Jenn Ashworth   
12                         Jonathan Hoefler   
13  Anthony Gueterbock, 18th Baron Berkeley   
14                       David Chernushenko   
15                           Joerg Steineck   
16                           Andrew Pinsent   
17          Paddy Dunne (Gaelic footballer)   
18                        Alexandros Mouzas   
19                      John Angus Campbell   

            