## Load pandas

In [1]:
import pandas as pd

## Load some text data - from wikipedia, pages on people

In [2]:
people = pd.read_csv('people_wiki.csv')

In [3]:
people.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [4]:
len(people)

59071

## Explore the dataset and checkout the text it contains

# Exploring the entry for president Obama

In [5]:
obama = people[people['name'] == 'Barack Obama']

In [6]:
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [7]:
text = obama['text']
print(text)

35817    barack hussein obama ii brk husen bm born augu...
Name: text, dtype: object


# Hand-on exercise 1
### Exploring the entry for actor George Clooney

In [None]:
# Fill in the answer here

# Get the word counts for Obama article

In [8]:
from collections import Counter
word_count_obama = [Counter(" ".join(obama['text'].values.tolist()).split(" "))]

In [9]:
print word_count_obama

[Counter({'the': 40, 'in': 30, 'and': 21, 'of': 18, 'to': 14, 'his': 11, 'obama': 9, 'act': 8, 'he': 7, 'a': 7, 'us': 6, 'as': 6, 'law': 6, 'was': 5, 'for': 4, 'iraq': 4, 'military': 4, 'control': 4, 'has': 4, 'after': 4, 'president': 4, 'democratic': 4, '2011': 3, 'term': 3, 'school': 3, 'january': 3, 'response': 3, 'senate': 3, 'election': 3, 'american': 3, 'first': 3, 'campaign': 3, 'from': 3, 'with': 3, 'signed': 3, 'states': 3, 'ordered': 3, 'united': 3, 'party': 3, 'involvement': 3, '2009': 3, '2004': 3, 'office': 2, 'domestic': 2, '20': 2, 'policy': 2, '2010': 2, 'during': 2, 'republican': 2, 'presidential': 2, 'university': 2, 'served': 2, 'on': 2, 'house': 2, 'national': 2, 'born': 2, 'second': 2, 'afghanistan': 2, 'chicago': 2, 'representatives': 2, 'harvard': 2, 'november': 2, 'foreign': 2, 'protection': 2, 'dont': 2, 'primary': 2, 'nominee': 2, 'is': 2, 'at': 2, 'illinois': 2, 'relief': 2, 'operations': 1, 'represent': 1, 'unemployment': 1, 'administration': 1, 'over': 1, '

# Sort the word counts for the Obama article


### Sorting the word counts to show top common words at the top

In [143]:
import operator
def cal_word_count(each_people):
    each_words = [Counter(" ".join(each_people['text'].values.tolist()).split(" "))]
    sorted_words = sorted(each_words[0].items(),key=operator.itemgetter(1))
    sorted_words.reverse()
    return sorted_words

In [144]:
cal_word_count(each_people=obama)

[('the', 40),
 ('in', 30),
 ('and', 21),
 ('of', 18),
 ('to', 14),
 ('his', 11),
 ('obama', 9),
 ('act', 8),
 ('a', 7),
 ('he', 7),
 ('law', 6),
 ('as', 6),
 ('us', 6),
 ('was', 5),
 ('democratic', 4),
 ('president', 4),
 ('after', 4),
 ('has', 4),
 ('control', 4),
 ('military', 4),
 ('iraq', 4),
 ('for', 4),
 ('2004', 3),
 ('2009', 3),
 ('involvement', 3),
 ('party', 3),
 ('united', 3),
 ('ordered', 3),
 ('states', 3),
 ('signed', 3),
 ('with', 3),
 ('from', 3),
 ('campaign', 3),
 ('first', 3),
 ('american', 3),
 ('election', 3),
 ('senate', 3),
 ('response', 3),
 ('january', 3),
 ('school', 3),
 ('term', 3),
 ('2011', 3),
 ('relief', 2),
 ('illinois', 2),
 ('at', 2),
 ('is', 2),
 ('nominee', 2),
 ('primary', 2),
 ('dont', 2),
 ('protection', 2),
 ('foreign', 2),
 ('november', 2),
 ('harvard', 2),
 ('representatives', 2),
 ('chicago', 2),
 ('afghanistan', 2),
 ('second', 2),
 ('born', 2),
 ('national', 2),
 ('house', 2),
 ('on', 2),
 ('served', 2),
 ('university', 2),
 ('presidential'

Most common words include uninformative words like "the", "in", "and",...

# Hand-on exercise 2
### Exploring sorting the word counts for Elton John

In [None]:
# 

### Compute TF-IDF for the corpus 


To give more weight to informative words, we weigh them by their TF-IDF scores.

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

CountVectorizer converts a collection of text documents to a matrix of token counts

In [55]:
vectorizer = CountVectorizer(analyzer="word", tokenizer = None, preprocessor = None, 
                             stop_words = "english")

fit_transform will learn the vocabulary dictionary and return term-document matrix. (tf calculation)

In [56]:
word_counts = vectorizer.fit_transform(people['text'])

get_feature_names returns indeces and feature names 

In [101]:
features_names = vectorizer.get_feature_names()

TfidfTransformer will transform a count matrix to a normalized tf or tf-idf representation

In [58]:
transformer = TfidfTransformer(norm=None)

In [59]:
people_tfidf = transformer.fit_transform(word_counts)

#### Words with highest TF-IDF are much more informative.

In [64]:
obama_tfidf = people_tfidf[obama.index]

In [106]:
obama_features = map(lambda x: features_names[x], obama_ifidf.indices)

In [108]:
obama_tfidf_vec = dict(zip(obama_features, obama_ifidf.data))

In [110]:
import operator

In [111]:
sorted_obama_tfidf = sorted(obama_tfidf_vec.items(), key=operator.itemgetter(1))

In [113]:
sorted_obama_tfidf.reverse()

In [118]:
sorted_obama_tfidf

[(u'obama', 52.277113834307315),
 (u'act', 35.674051187909924),
 (u'iraq', 21.741727931276476),
 (u'law', 20.721855882367674),
 (u'control', 18.884330378434285),
 (u'ordered', 17.526980051210632),
 (u'military', 17.114203144108135),
 (u'democratic', 16.409249536745939),
 (u'response', 15.780836746511332),
 (u'involvement', 15.780836746511332),
 (u'senate', 13.162835582973344),
 (u'term', 12.318257920710373),
 (u'campaign', 12.077519128423219),
 (u'nominee', 11.427270622632713),
 (u'afghanistan', 11.41598178739336),
 (u'laureateduring', 11.293365137302578),
 (u'2012obama', 11.293365137302578),
 (u'husen', 11.293365137302578),
 (u'president', 11.2265246134842),
 (u'domestic', 10.964448288651756),
 (u'normalize', 10.887900029194414),
 (u'brk', 10.887900029194414),
 (u'relief', 10.839679776118567),
 (u'election', 10.712062862503725),
 (u'protection', 10.695889057392007),
 (u'doddfrank', 10.600217956742634),
 (u'signed', 10.552378505322665),
 (u'dont', 10.279138356554288),
 (u'party', 10.06

In [136]:
def cal_top_freq_features(each_person):
    
    each_tfidf = people_tfidf[each_person.index]
    each_features = map(lambda x: features_names[x], each_tfidf.indices)
    each_tfidf_vec = dict(zip(each_features, each_tfidf.data))
    each_sorted_tfidf = sorted(each_tfidf_vec.items(), key=operator.itemgetter(1))
    each_sorted_tfidf.reverse()
    return each_sorted_tfidf

If you want to reproduce obama again with cal_top_freq_features, 

In [138]:
cal_top_freq_features(each_person=obama)

[(u'obama', 52.277113834307315),
 (u'act', 35.674051187909924),
 (u'iraq', 21.741727931276476),
 (u'law', 20.721855882367674),
 (u'control', 18.884330378434285),
 (u'ordered', 17.526980051210632),
 (u'military', 17.114203144108135),
 (u'democratic', 16.409249536745939),
 (u'response', 15.780836746511332),
 (u'involvement', 15.780836746511332),
 (u'senate', 13.162835582973344),
 (u'term', 12.318257920710373),
 (u'campaign', 12.077519128423219),
 (u'nominee', 11.427270622632713),
 (u'afghanistan', 11.41598178739336),
 (u'laureateduring', 11.293365137302578),
 (u'2012obama', 11.293365137302578),
 (u'husen', 11.293365137302578),
 (u'president', 11.2265246134842),
 (u'domestic', 10.964448288651756),
 (u'normalize', 10.887900029194414),
 (u'brk', 10.887900029194414),
 (u'relief', 10.839679776118567),
 (u'election', 10.712062862503725),
 (u'protection', 10.695889057392007),
 (u'doddfrank', 10.600217956742634),
 (u'signed', 10.552378505322665),
 (u'dont', 10.279138356554288),
 (u'party', 10.06

# Hand-on exercise 3
### Exploring top TF-IDF for other people

###  Elton John

In [None]:
# Fill in here

## Manually compute distances between a few people

Let's manually compare the distances between the articles for a few famous people.

In [48]:
from sklearn.metrics.pairwise import cosine_similarity  

In [49]:
clinton = people[people['name'] == 'Bill Clinton']
print clinton

                                              URI          name  \
36452  <http://dbpedia.org/resource/Bill_Clinton>  Bill Clinton   

                                                    text  
36452  william jefferson bill clinton born william je...  


In [50]:
beckham = people[people['name'] == 'David Beckham']
print beckham

                                               URI           name  \
23386  <http://dbpedia.org/resource/David_Beckham>  David Beckham   

                                                    text  
23386  david robert joseph beckham obe bkm born 2 may...  


# Is Obama closer to Clinton than to Beckham?

We will use cosine distance, which is given by
(1-cosine_similarity)
and find that the article about president Obama is closer to the one about former president Clinton than that of footballer David Beckham.

In [51]:
beckham.index

Int64Index([23386], dtype='int64')

In [61]:
1-cosine_similarity(people_tfidf[obama.index], people_tfidf[clinton.index])

array([[ 0.81103282]])

In [62]:
1-cosine_similarity(people_tfidf[obama.index], people_tfidf[beckham.index])

array([[ 0.97443419]])

# Hand-on exercise 4
###  Calculate distance between Elton John and Victoria Beckham using TF-IDF

In [None]:
# Fill in here

###  Calculate distance between Elton John and Paul McCartney using TF-IDF

In [None]:
# Fill in here

### Who is closer to Elton John? Victoria Beckham or Paul McCartney

Answer:

# Build a nearest neighbor model for document retrieval

We now create a nearest-neighbors model and apply it to document retrieval.  

In [30]:
from sklearn.neighbors import NearestNeighbors

In [146]:
nbrs = NearestNeighbors(n_neighbors=5, algorithm='brute', metric=cosine_similarity)

In [153]:
nbrs.fit(people_tfidf.toarray().reshape(1,-1))

MemoryError: 

In [None]:
### Need to find a way to query 

# Applying the nearest-neighbors model for retrieval

## Who is closest to Obama?

In [None]:
# knn_model.query(obama)

As we can see, president Obama's article is closest to the one about his vice-president Biden, and those of other politicians.

# Other examples of document retrieval

In [None]:
swift = people[people['name'] == 'Taylor Swift']

In [None]:
# knn_model.query(swift)

In [None]:
jolie = people[people['name'] == 'Angelina Jolie']

In [None]:
# knn_model.query(jolie)

In [None]:
arnold = people[people['name'] == 'Arnold Schwarzenegger']

In [None]:
# knn_model.query(arnold)

# Hand-on exercise 5
##  Building Nearest Neighbor using word count 

###  Elton John 

In [None]:
# 

### Victoria Beckham

# Hand-on exercise 6
## Building Nearest Neighbor using TF-IDF

###  Elton John 

In [None]:
#

### Victoria Beckham

In [None]:
#