# Demo: Nearest-Neighbor Document Retrieval on Wikipedia Articles

In [1]:
import graphlab as gl

## Load the Wikipedia people data

In [2]:
# Read the saved SFrame of Wikipedia biographies; the path is relative to
# the notebook's working directory.
people = gl.SFrame(
    'people_wiki.gl/'
)

This non-commercial license of GraphLab Create for academic use is assigned to nikki12345001@gmail.com and will expire on July 31, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1502878245.log


In [3]:
# Preview the first rows (URI, name, text) to confirm the data loaded.
people_preview = people.head()
people_preview

URI,name,text
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...
<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...
<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...
<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...
<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...
<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...


In [5]:
# Boolean-mask the corpus down to the row(s) whose name is 'Barack Obama'.
obama_mask = people['name'] == 'Barack Obama'
obama = people[obama_mask]
obama

URI,name,text
<http://dbpedia.org/resou rce/Barack_Obama> ...,Barack Obama,barack hussein obama ii brk husen bm born august ...


In [7]:
# Bag-of-words for the Obama article, stored as a new 'word_count' column.
obama_text = obama['text']
obama['word_count'] = gl.text_analytics.count_words(obama_text)

In [8]:
# Unpack the word_count column into one (word, count) row per word.
obama_word_count_table = obama[['word_count']].stack(
    'word_count',
    new_column_name=['word', 'count'],
)
obama_word_count_table

word,count
cuba,1
relations,1
sought,1
combat,1
ending,1
withdrawal,1
state,1
islamic,1
by,1
gains,1


In [10]:
# Rank words from most to least frequent. The top of the list is dominated by
# common, uninformative words, which motivates TF-IDF weighting below.
obama_words_by_count = obama_word_count_table.sort('count', ascending=False)
obama_words_by_count

word,count
the,40
in,30
and,21
of,18
to,14
his,11
obama,9
act,8
he,7
a,7


# Compute TF-IDF for the corpus

In [11]:
# Build the bag-of-words representation for every article in the corpus.
corpus_text = people['text']
people['word_count'] = gl.text_analytics.count_words(corpus_text)

In [18]:
# Re-weight the raw word counts with TF-IDF, computed over the whole corpus.
# NOTE(review): older GraphLab Create releases reportedly returned an SFrame
# (with a 'docs' column) here instead of a single column -- confirm against
# the installed version before relying on the result type.
tfidf_people = gl.text_analytics.tf_idf(
    people['word_count']
)

In [14]:
# Attach the TF-IDF result to the corpus as a new column named 'tfidf'.
# Later cells (per-person lookups, distance calls, KNN models) read this column.
people['tfidf'] = tfidf_people

## Evaluate the TF-IDF Model on Obama article

In [19]:
# Re-select Obama's row now that `people` carries the 'tfidf' column.
obama_tfidf = people[people['name'] == 'Barack Obama']
# Unpack the tfidf column into (word, tfidf) rows, then show the highest
# weighted words first.
obama_tfidf_rows = obama_tfidf[['tfidf']].stack(
    'tfidf',
    new_column_name=['word', 'tfidf'],
)
obama_tfidf_rows.sort('tfidf', ascending=False)

word,tfidf
obama,43.2956530721
act,27.678222623
iraq,17.747378588
control,14.8870608452
law,14.7229357618
ordered,14.5333739509
military,13.1159327785
involvement,12.7843852412
response,12.7843852412
democratic,12.4106886973


## Demo Similarity Computing Process

### Create two more examples: Bill Clinton and David Beckham

In [21]:
# Select the rows for the two comparison articles by exact name match.
name_column = people['name']
clinton = people[name_column == 'Bill Clinton']
beckham = people[name_column == 'David Beckham']

### Cosine distance: smaller => closer

In [22]:
# Cosine distance between the Obama and Clinton TF-IDF vectors
# (first row of each single-person selection).
# print(...) with a single argument behaves the same on Python 2 and, unlike
# the bare `print x` statement, is also valid syntax on Python 3.
print(gl.distances.cosine(obama_tfidf['tfidf'][0], clinton['tfidf'][0]))

0.833985493688


In [23]:
# Cosine distance between the Obama and Beckham TF-IDF vectors
# (first row of each single-person selection).
# print(...) with a single argument behaves the same on Python 2 and, unlike
# the bare `print x` statement, is also valid syntax on Python 3.
print(gl.distances.cosine(obama_tfidf['tfidf'][0], beckham['tfidf'][0]))

0.979130584475


# KNN Model

In [24]:
# Fit a nearest-neighbours model over the 'tfidf' column; results are
# labelled with each article's name.
# NOTE(review): no `distance` is passed, so the library chooses the metric
# itself -- it may not be cosine, unlike the explicit distance='cosine'
# models built later in this notebook. Confirm which metric is in effect.
knn_model = gl.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
)

## Evaluation & Example

In [26]:
# Who is closest to Obama? The query row is itself part of the reference
# data, so it is expected to come back as its own nearest neighbour.
obama_neighbors = knn_model.query(obama_tfidf)
obama_neighbors

query_label,reference_label,distance,rank
0,Barack Obama,0.0,1
0,Joe Biden,0.794117647059,2
0,Joe Lieberman,0.794685990338,3
0,Kelly Ayotte,0.811989100817,4
0,Bill Clinton,0.813852813853,5


In [27]:
# Who is closest to Taylor Swift?
swift_mask = people['name'] == 'Taylor Swift'
swift = people[swift_mask]
knn_model.query(swift)

query_label,reference_label,distance,rank
0,Taylor Swift,0.0,1
0,Carrie Underwood,0.76231884058,2
0,Alicia Keys,0.764705882353,3
0,Jordin Sparks,0.769633507853,4
0,Leona Lewis,0.776119402985,5


In [28]:
# Who is closest to Arnold Schwarzenegger?
arnold_mask = people['name'] == 'Arnold Schwarzenegger'
arnold = people[arnold_mask]
knn_model.query(arnold)

query_label,reference_label,distance,rank
0,Arnold Schwarzenegger,0.0,1
0,Jesse Ventura,0.818918918919,2
0,John Kitzhaber,0.824615384615,3
0,Lincoln Chafee,0.833876221498,4
0,Anthony Foxx,0.833910034602,5


# HW4

In [30]:
# Select the row(s) for 'Elton John'; the selection carries the word_count
# and tfidf columns already added to `people`.
elton_mask = people['name'] == 'Elton John'
elton = people[elton_mask]

In [34]:
# Explore the TF-IDF weights: one (word, tfidf) row per word, highest first.
elton_tfidf_stacked = elton[['tfidf']].stack(
    'tfidf',
    new_column_name=['word', 'tfidf'],
)
elton_tfidf_table = elton_tfidf_stacked.sort('tfidf', ascending=False)
elton_tfidf_table

word,tfidf
furnish,18.38947184
elton,17.48232027
billboard,17.3036809575
john,13.9393127924
songwriters,11.250406447
tonightcandle,10.9864953892
overallelton,10.9864953892
19702000,10.2933482087
fivedecade,10.2933482087
aids,10.262846934


In [35]:
# Explore the raw word counts: one (word, count) row per word, most frequent first.
elton_word_count_stacked = elton[['word_count']].stack(
    'word_count',
    new_column_name=['word', 'count'],
)
elton_word_count_table = elton_word_count_stacked.sort('count', ascending=False)
elton_word_count_table

word,count
the,27
in,18
and,15
of,13
a,10
has,9
john,7
he,7
on,6
award,5


In [41]:
# Measuring distance: compare Elton John's TF-IDF vector with two other articles.
victoria_beckham = people[people['name'] == 'Victoria Beckham']
paul_mccartney = people[people['name'] == 'Paul McCartney']

# print(...) with a single argument behaves the same on Python 2 and, unlike
# the bare `print x` statement, is also valid syntax on Python 3.
print(gl.distances.cosine(elton['tfidf'][0], victoria_beckham['tfidf'][0]))
print(gl.distances.cosine(elton['tfidf'][0], paul_mccartney['tfidf'][0]))

0.956700637666
0.825031002922


In [42]:
# Build two models that differ only in the feature column: tfidf VS. word count.
# Both are labelled by name and use cosine distance.
shared_knn_options = dict(label='name', distance='cosine')
knn_model_tfidf = gl.nearest_neighbors.create(
    people, features=['tfidf'], **shared_knn_options
)
knn_model_word_count = gl.nearest_neighbors.create(
    people, features=['word_count'], **shared_knn_options
)

In [44]:
# Nearest neighbours of Elton John according to the TF-IDF model.
elton_neighbors_tfidf = knn_model_tfidf.query(elton)
elton_neighbors_tfidf

query_label,reference_label,distance,rank
0,Elton John,-2.22044604925e-16,1
0,Rod Stewart,0.717219667893,2
0,George Michael,0.747600998969,3
0,Sting (musician),0.747671954431,4
0,Phil Collins,0.75119324879,5


In [45]:
# Nearest neighbours of Elton John according to the raw word-count model.
elton_neighbors_word_count = knn_model_word_count.query(elton)
elton_neighbors_word_count

query_label,reference_label,distance,rank
0,Elton John,2.22044604925e-16,1
0,Cliff Richard,0.16142415259,2
0,Sandro Petrone,0.16822542751,3
0,Rod Stewart,0.168327165587,4
0,Malachi O'Doherty,0.177315545979,5


In [47]:
# Query both models for Victoria Beckham and stack the two result tables:
# TF-IDF model rows first, followed by the word-count model rows.
victoria_by_tfidf = knn_model_tfidf.query(victoria_beckham)
victoria_by_word_count = knn_model_word_count.query(victoria_beckham)
victoria_by_tfidf.append(victoria_by_word_count)

query_label,reference_label,distance,rank
0,Victoria Beckham,1.11022302463e-16,1
0,David Beckham,0.548169610263,2
0,Stephen Dow Beckham,0.784986706828,3
0,Mel B,0.809585523409,4
0,Caroline Rush,0.819826422919,5
0,Victoria Beckham,-2.22044604925e-16,1
0,Mary Fitzgerald (artist),0.207307036115,2
0,Adrienne Corri,0.214509782788,3
0,Beverly Jane Fry,0.217466468741,4
0,Raman Mundair,0.217695474992,5
