<h1> <center> Clustering Wikipedia Documents </h1> <center>
    <h2> Using scikit-learn </h2>
    
Source: [ML Foundations, Washington University-Coursera](https://www.coursera.org/learn/ml-foundations/supplement/6DeQc/retrieving-wikipedia-articles-assignment)

    
Other useful links: <br>
https://investigate.ai/text-analysis/a-simple-explanation-of-tf-idf/ <br>
https://scikit-learn.org/stable/modules/neighbors.html

In [1]:
# imports

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.neighbors import NearestNeighbors, KDTree
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# display setup
#pd.set_option("display.max_rows", None, "display.max_columns", None)

<br>

# <center> Load and Explore Data <center>

### <center>Load product review data <center>

In [3]:
people_df = pd.read_csv('./data/people_wiki.csv')

In [4]:
people_df

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...
...,...,...,...
59066,<http://dbpedia.org/resource/Olari_Elts>,Olari Elts,olari elts born april 27 1971 in tallinn eston...
59067,<http://dbpedia.org/resource/Scott_F._Crago>,Scott F. Crago,scott francis crago born july 26 1963 twin bro...
59068,<http://dbpedia.org/resource/David_Cass_(footb...,David Cass (footballer),david william royce cass born 27 march 1962 in...
59069,<http://dbpedia.org/resource/Keith_Elias>,Keith Elias,keith hector elias born february 3 1972 in lac...


### <center> Explore <center> 

__Taking a look at the entry for President Obama__

In [5]:
obama_df = people_df[people_df['name'] == 'Barack Obama']

In [6]:
obama_df

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [7]:
obama_df['text']

35817    barack hussein obama ii brk husen bm born augu...
Name: text, dtype: object

<br>

__Explore the entry for actor George Clooney__

In [8]:
clooney_df = people_df[people_df['name'] == 'George Clooney']
clooney_df['text']

38514    george timothy clooney born may 6 1961 is an a...
Name: text, dtype: object

<br>

# <center> Build word count vectors <center> 
    
Using CountVectorizer from sklearn

__Find most common words in article on Obama__

1. Get word counts for Obama acticle

In [9]:
vectorizer_obama = CountVectorizer()

matrix_obama = vectorizer_obama.fit_transform(obama_df['text'])
matrix_obama
#obama['word_count'] = turicreate.text_analytics.count_words(obama['text'])

<1x270 sparse matrix of type '<class 'numpy.int64'>'
	with 270 stored elements in Compressed Sparse Row format>

2. visualize the matrix (as a dict and as a DataFrame)

In [10]:
# as a dict
counts_obama_dict = dict( zip(vectorizer_obama.get_feature_names_out()
                              , matrix_obama.toarray()[0]
                             )
                        )
counts_obama_dict

{'13th': 1,
 '1961': 1,
 '1992': 1,
 '1996': 1,
 '1997': 1,
 '20': 2,
 '2000in': 1,
 '2004': 3,
 '2007': 1,
 '2008': 1,
 '2009': 3,
 '2010': 2,
 '2011': 3,
 '2012': 1,
 '2012obama': 1,
 '2013': 1,
 '44th': 1,
 '63': 1,
 'act': 8,
 'address': 1,
 'administration': 1,
 'affordable': 1,
 'afghanistan': 2,
 'african': 1,
 'after': 4,
 'against': 1,
 'american': 3,
 'americans': 1,
 'and': 21,
 'arms': 1,
 'as': 6,
 'ask': 1,
 'at': 2,
 'attention': 1,
 'attorney': 1,
 'august': 1,
 'barack': 1,
 'before': 1,
 'began': 1,
 'bin': 1,
 'bm': 1,
 'born': 2,
 'briefs': 1,
 'brk': 1,
 'budget': 1,
 'by': 1,
 'californias': 1,
 'called': 1,
 'campaign': 3,
 'care': 1,
 'chicago': 2,
 'civil': 1,
 'clinton': 1,
 'close': 1,
 'columbia': 1,
 'combat': 1,
 'community': 1,
 'constitutional': 1,
 'consumer': 1,
 'continued': 1,
 'control': 4,
 'convention': 1,
 'court': 1,
 'creation': 1,
 'cuba': 1,
 'current': 1,
 'death': 1,
 'debate': 1,
 'debt': 1,
 'defeated': 1,
 'defeating': 1,
 'defense': 1,


In [11]:
# as a DataFrame
counts_obama_df = pd.DataFrame(matrix_obama.toarray()
                               , columns=vectorizer_obama.get_feature_names()
                              )
counts_obama_df



Unnamed: 0,13th,1961,1992,1996,1997,20,2000in,2004,2007,2008,...,was,where,whether,which,while,with,withdrawal,won,worked,years
0,1,1,1,1,1,2,1,3,1,1,...,5,1,1,1,1,3,1,1,1,1


<br>

3. Find most common words (in dict or in DataFrame)

In [12]:
# find most common in dict
max(counts_obama_dict.items(), key=lambda x: x[1])

('the', 40)

In [13]:
# find most common in DataFrame
counts_obama_df.sort_values(by=0, axis=1, ascending=False)

Unnamed: 0,the,in,and,of,to,his,obama,act,he,as,...,hawaii,hillary,hold,honolulu,hook,husen,hussein,ii,inaugurated,years
0,40,30,21,18,14,11,9,8,7,6,...,1,1,1,1,1,1,1,1,1,1


<br>


# <center> Compute TF-IDF <center>

### <center> Compute TF-IDF for articles on Obama, Clooney, and Beckham <center> 

In [14]:
fourpeople_df = people_df[(people_df['name'].isin(['Barack Obama', 'Bill Clinton', 'David Beckham', 'George Clooney']))]
fourpeople_df.set_index('name',inplace=True,drop=False)
fourpeople_df

Unnamed: 0_level_0,URI,name,text
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
David Beckham,<http://dbpedia.org/resource/David_Beckham>,David Beckham,david robert joseph beckham obe bkm born 2 may...
Barack Obama,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...
Bill Clinton,<http://dbpedia.org/resource/Bill_Clinton>,Bill Clinton,william jefferson bill clinton born william je...
George Clooney,<http://dbpedia.org/resource/George_Clooney>,George Clooney,george timothy clooney born may 6 1961 is an a...


In [15]:
tfidf_vectorizer_4 = TfidfVectorizer(stop_words='english')

matrix_tfidf_4 = tfidf_vectorizer_4.fit_transform(fourpeople_df['text'].astype('U'))
matrix_tfidf_4

<4x658 sparse matrix of type '<class 'numpy.float64'>'
	with 799 stored elements in Compressed Sparse Row format>

In [16]:
counts_tfidf_4_df = pd.DataFrame(matrix_tfidf_4.toarray()
                                  , columns=tfidf_vectorizer_4.get_feature_names_out()
                                 , index=fourpeople_df.index
                                 )
counts_tfidf_4_df

Unnamed: 0_level_0,100,115,12,13th,17,18,19,1946,1960,1961,...,work,worked,working,world,worlds,writer,yale,year,years,york
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
David Beckham,0.081479,0.051673,0.051673,0.0,0.051673,0.051673,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.098946,0.051673,0.0,0.0,0.040739,0.032982,0.0
Barack Obama,0.0,0.0,0.0,0.042206,0.0,0.0,0.0,0.0,0.0,0.033276,...,0.0,0.033276,0.0,0.0,0.0,0.0,0.0,0.0,0.02694,0.0
Bill Clinton,0.0,0.0,0.0,0.0,0.0,0.0,0.048556,0.048556,0.0,0.0,...,0.038282,0.0,0.0,0.030992,0.0,0.0,0.048556,0.0,0.12397,0.048556
George Clooney,0.042372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053743,0.042372,...,0.084744,0.042372,0.053743,0.068607,0.0,0.053743,0.0,0.042372,0.0,0.0


<br>

__Find most common words in article on Obama__

In [17]:
counts_tfidf_4_df.sort_values(by='Barack Obama'
                               , axis=1
                               , ascending=False)

Unnamed: 0_level_0,obama,act,law,military,iraq,president,control,democratic,involvement,ordered,...,football,fiveyear,firstteam,finding,final,films,film,fifa,february,york
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
David Beckham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.103346,0.051673,0.051673,0.0,0.103346,0.0,0.0,0.155018,0.051673,0.0
Barack Obama,0.379854,0.337648,0.199654,0.168824,0.168824,0.133103,0.133103,0.133103,0.126618,0.126618,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bill Clinton,0.0,0.0,0.114846,0.0,0.0,0.267973,0.038282,0.114846,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.048556
George Clooney,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.053743,0.0,0.053743,0.214973,0.0,0.0,0.0


<br>

__Find most common words in article on Clooney__

In [18]:
counts_tfidf_4_df.sort_values(by='George Clooney'
                                    , axis=1
                                    , ascending=False)

Unnamed: 0_level_0,drama,film,actor,thriller,clooney,academy,2005,awards,political,award,...,form,footballer,football,fiveyear,firstteam,final,filed,fifa,federal,york
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
David Beckham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.103346,0.103346,0.051673,0.051673,0.103346,0.0,0.155018,0.0,0.0
Barack Obama,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033276,0.0,0.0,0.0,0.0,0.0,0.042206,0.0,0.042206,0.0
Bill Clinton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.038282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.048556
George Clooney,0.214973,0.214973,0.214973,0.214973,0.214973,0.214973,0.16123,0.16123,0.16123,0.16123,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<br>

# <center> Manually evaluate the distance between certain people's articles <center>

In [19]:
counts_tfidf_4_df.loc[['Bill Clinton']]

Unnamed: 0_level_0,100,115,12,13th,17,18,19,1946,1960,1961,...,work,worked,working,world,worlds,writer,yale,year,years,york
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bill Clinton,0.0,0.0,0.0,0.0,0.0,0.0,0.048556,0.048556,0.0,0.0,...,0.038282,0.0,0.0,0.030992,0.0,0.0,0.048556,0.0,0.12397,0.048556


__Is Obama closer to Clinton, to Clooney or to Beckham?__

In [20]:
cosine_similarity(counts_tfidf_4_df.loc[['Barack Obama']], counts_tfidf_4_df.loc[['Bill Clinton']])

array([[0.23276838]])

In [21]:
cosine_similarity(counts_tfidf_4_df.loc[['Barack Obama']], counts_tfidf_4_df.loc[['George Clooney']])

array([[0.05292066]])

In [22]:
cosine_similarity(counts_tfidf_4_df.loc[['Barack Obama']], counts_tfidf_4_df.loc[['David Beckham']])

array([[0.03288187]])

<br>
<br>


# <center> Apply nearest neighbors to Wikipedia articles <center>



### <center> Compute TF-IDF for the entire corpus of articles <center>


In [23]:
# unfortunately this seems to take more memory than the laptop can handle
# so emptying the memory
import gc
gc.collect()

0

In [24]:
vectorizer_alltexts = CountVectorizer()

matrix_df = vectorizer_alltexts.fit_transform(people_df['text'])
matrix_df

#people['word_count'] = turicreate.text_analytics.count_words(people['text'])

<59071x548429 sparse matrix of type '<class 'numpy.int64'>'
	with 10244028 stored elements in Compressed Sparse Row format>

In [25]:
# Cells below may give memory error
# check overcommit mode
# cat /proc/sys/vm/overcommit_memory

# to fix it do as root:
# echo 1 > /proc/sys/vm/overcommit_memory


In [26]:
all_array = matrix_df.toarray()

In [27]:
words = vectorizer_alltexts.get_feature_names_out()

In [28]:
counts_all_df = pd.DataFrame(all_array
                             , columns=words
                             , index=people_df['name']
                            )


In [29]:
counts_all_df.shape

(59071, 548429)

In [30]:
# unfortunately if using pd.set_option("display.max_rows", None, "display.max_columns", None)
# this seems to take more memory than the laptop can handle
counts_all_df.head(1)

Unnamed: 0_level_0,00,000,0000,00000,00000van,0001,00014338,0001sec,0002,00026,...,zyx,zyzzyva,zyzzyza,zz,zzap64,zzb,zzebra,zzran,zzt,zzts
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Digby Morrell,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<br>
<br>

### <center> Build a nearest neighbors model <center>


In [31]:
# The NearestNeighbors class is unsupervised and can not be used for classification but only for nearest neighbour searches.
nbrs = NearestNeighbors(n_neighbors=2).fit(counts_all_df)
nbrs

NearestNeighbors(n_neighbors=2)

In [32]:
# this takes too much memory
# nbrs.kneighbors(counts_all_df)

<br>
<br>

### <center> Build a kmeans  model <center>
    
KMeans clustering is a method of clustering.


In [33]:
km = KMeans(n_clusters=2)


In [34]:
# this one unfortunately is too much for the memory of the laptop
# km.fit(counts_all_df)

<br>
<br>
<br>
<br>
<br>
<br>

# <center> Other things <center>

### <center> Compute TF-IDF ONLY for article on Obama <center> 


In [35]:
tfidf_vectorizer_obama = TfidfVectorizer(stop_words='english')

matrix_tfidf_obama = tfidf_vectorizer_obama.fit_transform(obama_df['text'])
matrix_tfidf_obama

<1x229 sparse matrix of type '<class 'numpy.float64'>'
	with 229 stored elements in Compressed Sparse Row format>

In [36]:
counts_tfidf_obama_df = pd.DataFrame(matrix_tfidf_obama.toarray()
                                  , columns=tfidf_vectorizer_obama.get_feature_names()
                                 )
counts_tfidf_obama_df



Unnamed: 0,13th,1961,1992,1996,1997,20,2000in,2004,2007,2008,...,university,unsuccessfully,urged,victory,wall,war,withdrawal,won,worked,years
0,0.037987,0.037987,0.037987,0.037987,0.037987,0.075974,0.037987,0.113961,0.037987,0.037987,...,0.075974,0.037987,0.037987,0.037987,0.037987,0.037987,0.037987,0.037987,0.037987,0.037987


<br>

### <center> Compute TF-IDF ONLY for article on Clooney <center> 


In [37]:
tfidf_vectorizer_clooney = TfidfVectorizer(stop_words='english', use_idf=True)

matrix_tfidf_clooney = tfidf_vectorizer_clooney.fit_transform(clooney_df['text'])
matrix_tfidf_clooney

<1x191 sparse matrix of type '<class 'numpy.float64'>'
	with 191 stored elements in Compressed Sparse Row format>

In [38]:
counts_tfidf_clooney_df = pd.DataFrame(matrix_tfidf_clooney.toarray()
                                    , columns=tfidf_vectorizer_clooney.get_feature_names_out()
                                   )
counts_tfidf_clooney_df

Unnamed: 0,100,1960,1961,1978,1994,1997,1998,1999,2001,2004,...,wellreceived,wide,widened,won,work,worked,working,world,writer,year
0,0.050572,0.050572,0.050572,0.050572,0.050572,0.050572,0.050572,0.101144,0.050572,0.050572,...,0.050572,0.050572,0.050572,0.050572,0.101144,0.050572,0.050572,0.101144,0.050572,0.050572


# <center> ASSIGNMENT <center>

1. Take a particular famous person, 'Elton John'. 

    - What are the 3 words in his articles with highest word counts?  
    - What are the 3 words in his articles with highest TF-IDF?  

    These results illustrate why TF-IDF is useful for finding important words.

In [39]:
elton_df = people_df[people_df['name'] == 'Elton John']

### <center> Find most common words in article on Elton John <center>



In [40]:
# 1. Get word counts for acticle on Elton
vectorizer_elton = CountVectorizer()

matrix_elton = vectorizer_elton.fit_transform(elton_df['text'])
matrix_elton


<1x252 sparse matrix of type '<class 'numpy.int64'>'
	with 252 stored elements in Compressed Sparse Row format>

In [41]:
# 2. visualize the matrix as a DataFrame
counts_elton_df = pd.DataFrame(matrix_elton.toarray()
                               , columns=vectorizer_elton.get_feature_names()
                              )
counts_elton_df



Unnamed: 0,10,100,1947,1967,19702000,1976,1980s,1988,1992,1994,...,who,wind,winning,with,worked,world,worldwide,year,years,you
0,1,3,1,1,1,1,1,1,1,1,...,1,1,1,2,1,1,2,1,1,1


In [42]:
# 3. Find most common words in DataFrame
counts_elton_df.sort_values(by=0, axis=1, ascending=False)

Unnamed: 0,the,in,and,of,has,he,john,on,since,for,...,england,english,entered,era,established,events,fellow,fifty,fight,you
0,27,18,15,13,9,7,7,6,5,5,...,1,1,1,1,1,1,1,1,1,1


<br>

### <center> Compute TF-IDF ONLY for article on Elton John <center> 


In [43]:
tfidf_vectorizer_elton = TfidfVectorizer(stop_words='english', use_idf=True)

matrix_tfidf_elton = tfidf_vectorizer_elton.fit_transform(elton_df['text'])
matrix_tfidf_elton

<1x198 sparse matrix of type '<class 'numpy.float64'>'
	with 198 stored elements in Compressed Sparse Row format>

In [44]:
counts_tfidf_elton_df = pd.DataFrame(matrix_tfidf_elton.toarray()
                                    , columns=tfidf_vectorizer_elton.get_feature_names_out()
                                   )
counts_tfidf_elton_df

Unnamed: 0,10,100,1947,1967,19702000,1976,1980s,1988,1992,1994,...,way,wed,westminster,wind,winning,worked,world,worldwide,year,years
0,0.048168,0.144505,0.048168,0.048168,0.048168,0.048168,0.048168,0.048168,0.048168,0.048168,...,0.048168,0.048168,0.048168,0.048168,0.048168,0.048168,0.048168,0.096337,0.048168,0.048168


In [45]:
counts_tfidf_elton_df.sort_values(by=0
                               , axis=1
                               , ascending=False)

Unnamed: 0,john,award,billboard,academy,million,awards,elton,british,music,100,...,england,datein,david,diamond,diana,disney,dwight,elizabeth,empire,years
0,0.337178,0.240842,0.192673,0.144505,0.144505,0.144505,0.144505,0.144505,0.144505,0.144505,...,0.048168,0.048168,0.048168,0.048168,0.048168,0.048168,0.048168,0.048168,0.048168,0.048168
