# Retrieving Wikipedia articles

### Fire up GraphLab Create

In [19]:
import graphlab

### Load some text data: Wikipedia pages about people

In [20]:
# Load the saved SFrame from the 'people_wiki.gl/' directory. The path is
# relative, so it is resolved against the notebook's working directory.
people = graphlab.SFrame('people_wiki.gl/')

The data contains three columns: a link to the Wikipedia article (`URI`), the name of the person (`name`), and the text of the article (`text`).

In [21]:
# Preview the first rows of the table to check its columns (URI, name, text).
people.head()

URI,name,text
<http://dbpedia.org/resource/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...
<http://dbpedia.org/resource/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...
<http://dbpedia.org/resource/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...
<http://dbpedia.org/resource/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...
<http://dbpedia.org/resource/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...
<http://dbpedia.org/resource/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...
<http://dbpedia.org/resource/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...
<http://dbpedia.org/resource/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...
<http://dbpedia.org/resource/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...
<http://dbpedia.org/resource/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...


# Compare the top words by raw word count with the top words by TF-IDF

In [22]:
# Add a 'word_count' column built from each article's text; the preview below
# shows each entry as a dictionary mapping a word to its count.
people['word_count'] = graphlab.text_analytics.count_words(people['text'])
# Preview again so the new column is visible next to the original ones.
people.head()

URI,name,text,word_count
<http://dbpedia.org/resource/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,"{'since': 1L, 'carltons': 1L, 'being': 1L, '2005': ..."
<http://dbpedia.org/resource/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...,"{'precise': 1L, 'thomas': 1L, 'closely': 1L, ..."
<http://dbpedia.org/resource/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...,"{'just': 1L, 'issued': 1L, 'mainly': 1L, ..."
<http://dbpedia.org/resource/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...,"{'all': 1L, 'bauforschung': 1L, ..."
<http://dbpedia.org/resource/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'legendary': 1L, 'gangstergenka': 1L, ..."
<http://dbpedia.org/resource/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...,"{'now': 1L, 'currently': 1L, 'less': 1L, 'being': ..."
<http://dbpedia.org/resource/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...,"{'exclusive': 2L, 'producer': 1L, 'tribe': ..."
<http://dbpedia.org/resource/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...,"{'taxi': 1L, 'salon': 1L, 'gangs': 1L, 'being': ..."
<http://dbpedia.org/resource/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...,"{'houston': 1L, 'frankie': 1L, 'labels': ..."
<http://dbpedia.org/resource/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...,"{'phenomenon': 1L, 'deborash': 1L, ..."


In [23]:
# Add a 'tfidf' column computed from the 'word_count' column; both
# nearest-neighbour models and the distance cells below read from `people`.
people['tfidf'] = graphlab.text_analytics.tf_idf(people['word_count'])

In [24]:
# Keep only the row(s) whose 'name' is exactly 'Elton John' (boolean-mask
# selection). Later cells index the result with [0], i.e. use the first match.
elton = people[people['name'] == 'Elton John']

In [25]:
# Top 3 words in Elton John's article by raw count: unpack the word_count
# dictionary into (word, count) rows, then order by count, largest first.
(
    elton[['word_count']]
    .stack('word_count', new_column_name=['word', 'count'])
    .sort('count', ascending=False)
    .head(3)
)

word,count
the,27
in,18
and,15


In [26]:
# Top 3 words in Elton John's article by TF-IDF weight: unpack the tfidf
# dictionary into (word, tfidf) rows, then order by weight, largest first.
(
    elton[['tfidf']]
    .stack('tfidf', new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False)
    .head(3)
)

word,tfidf
furnish,18.38947184
elton,17.48232027
billboard,17.3036809575


# Measuring distance

In [27]:
# Keep only the row(s) whose 'name' is exactly 'Victoria Beckham'; used below
# for the distance comparison and as a nearest-neighbour query.
victoria = people[people['name'] == 'Victoria Beckham']

In [28]:
# Keep only the row(s) whose 'name' is exactly 'Paul McCartney'; used below
# for the distance comparison against Elton John.
paul = people[people['name'] == 'Paul McCartney']

In [29]:
# Cosine distance between the TF-IDF entries of the first 'Elton John' row
# and the first 'Victoria Beckham' row.
graphlab.distances.cosine(
    elton['tfidf'][0],
    victoria['tfidf'][0]
)

0.9567006376655429

In [30]:
# Cosine distance between the TF-IDF entries of the first 'Elton John' row
# and the first 'Paul McCartney' row, for comparison with the previous cell.
graphlab.distances.cosine(
    elton['tfidf'][0],
    paul['tfidf'][0]
)

0.8250310029221779

# Building nearest neighbors models with different input features and setting the distance metric

In [31]:
# Nearest-neighbour model over every row of `people`, comparing articles by
# their 'tfidf' column with cosine distance and labelling results by 'name'.
knn_model_tfidf = graphlab.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
    distance='cosine'
)

In [32]:
# Same model setup as the TF-IDF one, but comparing articles by their raw
# 'word_count' column instead (cosine distance, results labelled by 'name').
knn_model_word_count = graphlab.nearest_neighbors.create(
    people,
    features=['word_count'],
    label='name',
    distance='cosine'
)

### Who is the nearest neighbor to 'Elton John' using raw word counts?

In [33]:
# Look up the articles closest to Elton John's under the raw word-count model.
# In the output below, rank 1 is the query article itself (distance ~ 0).
knn_model_word_count.query(elton)

query_label,reference_label,distance,rank
0,Elton John,2.22044604925e-16,1
0,Cliff Richard,0.16142415259,2
0,Sandro Petrone,0.16822542751,3
0,Rod Stewart,0.168327165587,4
0,Malachi O'Doherty,0.177315545979,5


### Who is the nearest neighbor to 'Elton John' using TF-IDF?

In [34]:
# Look up the articles closest to Elton John's under the TF-IDF model.
# In the output below, rank 1 is the query article itself (distance ~ 0).
knn_model_tfidf.query(elton)

query_label,reference_label,distance,rank
0,Elton John,-2.22044604925e-16,1
0,Rod Stewart,0.717219667893,2
0,George Michael,0.747600998969,3
0,Sting (musician),0.747671954431,4
0,Phil Collins,0.75119324879,5


### Who is the nearest neighbor to 'Victoria Beckham' using raw word counts?

In [35]:
# Look up the articles closest to Victoria Beckham's under the raw word-count
# model. In the output below, rank 1 is the query article itself (distance ~ 0).
knn_model_word_count.query(victoria)

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.22044604925e-16,1
0,Mary Fitzgerald (artist),0.207307036115,2
0,Adrienne Corri,0.214509782788,3
0,Beverly Jane Fry,0.217466468741,4
0,Raman Mundair,0.217695474992,5


### Who is the nearest neighbor to 'Victoria Beckham' using TF-IDF?

In [36]:
# Look up the articles closest to Victoria Beckham's under the TF-IDF model.
# In the output below, rank 1 is the query article itself (distance ~ 0).
knn_model_tfidf.query(victoria)

query_label,reference_label,distance,rank
0,Victoria Beckham,1.11022302463e-16,1
0,David Beckham,0.548169610263,2
0,Stephen Dow Beckham,0.784986706828,3
0,Mel B,0.809585523409,4
0,Caroline Rush,0.819826422919,5
