## Load pandas

In [None]:
import pandas as pd

## Load some text data - from wikipedia, pages on people

In [None]:
people = pd.read_csv('people_wiki.csv')

In [None]:
people.head()

In [None]:
len(people)

## Explore the dataset and check out the text it contains

# Exploring the entry for president Obama

In [None]:
obama = people[people['name'] == 'Barack Obama']

In [None]:
obama

In [None]:
text = obama['text']
print(text)

# Hands-on exercise 1
### Exploring the entry for actor George Clooney

In [None]:
# Fill in the answer here

# Get the word counts for Obama article

In [None]:
from collections import Counter
# Join every selected article into one string, split it on single spaces, and
# count the resulting tokens. The Counter is wrapped in a one-element list.
word_count_obama = [Counter(" ".join(obama['text'].values.tolist()).split(" "))]

In [None]:
# Call print as a function so the cell runs on Python 3 as well as Python 2.
print(word_count_obama)

# Sort the word counts for the Obama article


### Sorting the word counts to show top common words at the top

In [None]:
import operator
def cal_word_count(each_people):
    """Count the words in the ``text`` column and rank them by frequency.

    Parameters
    ----------
    each_people : pandas.DataFrame
        Rows whose ``text`` entries are joined (with single spaces) into one
        document before counting.

    Returns
    -------
    list of (str, int)
        ``(word, count)`` pairs sorted by ascending count, so the most
        frequent words are at the end of the list.
    """
    combined_text = " ".join(each_people['text'].values.tolist())
    # split() with no argument splits on any run of whitespace; split(" ")
    # would emit empty-string "words" for repeated, leading or trailing spaces.
    counts = Counter(combined_text.split())
    return sorted(counts.items(), key=operator.itemgetter(1))

In [None]:
sorted_words_obama = cal_word_count(each_people=obama)
sorted_words_obama[:30]

In [None]:
# Slice a reversed copy instead of reversing in place, so re-running this
# cell always shows the most frequent words rather than flipping the order.
sorted_words_obama[::-1][:10]

Most common words include uninformative words like "the", "in", "and", etc.

Common words in a language that provide little information about the text are called "stop words".

# Hands-on exercise 2
### Exploring sorting the word counts for Elton John

In [None]:
# 

# Count words and compute TF-IDF for the corpus (collection of articles)


To give more weight to informative words, we weigh them by their TF-IDF scores. TF-IDF is term frequency-inverse document frequency.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

CountVectorizer converts a collection of text documents to a matrix of token counts

In [None]:
vectorizer = CountVectorizer(analyzer="word", stop_words = "english")

fit_transform learns the vocabulary dictionary and returns the document-term matrix of raw token counts (the term-frequency, or tf, part of TF-IDF).

In [None]:
word_counts = vectorizer.fit_transform(people['text'])

In [None]:
word_counts

get_feature_names returns the feature names (the vocabulary words), ordered by their column indices in the count matrix.

In [None]:
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); prefer the new method and fall back to the old one.
# list() keeps features_names a plain list regardless of which method is used.
if hasattr(vectorizer, 'get_feature_names_out'):
    features_names = list(vectorizer.get_feature_names_out())
else:
    features_names = list(vectorizer.get_feature_names())

TfidfTransformer transforms a count matrix to a tf or tf-idf representation. With norm=None, as used below, the resulting rows are not length-normalized.

In [None]:
transformer = TfidfTransformer(norm=None)

In [None]:
people_tfidf = transformer.fit_transform(word_counts)

In [None]:
people_tfidf

#### Words with highest TF-IDF are much more informative.

In [None]:
# Select Obama's row(s) from the TF-IDF matrix. obama.index holds DataFrame
# index labels, which are used here as row positions of the matrix.
# NOTE(review): assumes `people` still has the default 0..n-1 index from read_csv — confirm.
obama_tfidf = people_tfidf[obama.index]

In [None]:
# Build a real list rather than a map object: on Python 3 map() is a one-shot
# iterator, so re-running the cell that zips it would yield an empty dict.
obama_features = [features_names[x] for x in obama_tfidf.indices]

In [None]:
obama_tfidf_vec = dict(zip(obama_features, obama_tfidf.data))

In [None]:
import operator

In [None]:
sorted_obama_tfidf = sorted(obama_tfidf_vec.items(), key=operator.itemgetter(1))

In [None]:
# Rebuild the descending list from obama_tfidf_vec rather than reversing in
# place, so running this cell twice does not flip the order back to ascending.
sorted_obama_tfidf = sorted(obama_tfidf_vec.items(), key=operator.itemgetter(1))[::-1]

In [None]:
sorted_obama_tfidf[:10]

In [None]:
def cal_top_freq_features(each_person):
    """Rank the words of the selected article(s) by TF-IDF weight.

    Reads the notebook globals ``people_tfidf`` (TF-IDF matrix) and
    ``features_names`` (vocabulary indexed by matrix column).

    Parameters
    ----------
    each_person : pandas.DataFrame
        Row(s) of ``people``; their index labels are used as row positions
        into ``people_tfidf``.

    Returns
    -------
    list of (str, float)
        ``(word, weight)`` pairs ordered from highest to lowest weight.
    """
    person_rows = people_tfidf[each_person.index]

    # Map each stored column index to its vocabulary word; a word that occurs
    # more than once keeps the weight seen last.
    weight_by_word = {}
    for column, weight in zip(person_rows.indices, person_rows.data):
        weight_by_word[features_names[column]] = weight

    ascending = sorted(weight_by_word.items(), key=lambda pair: pair[1])
    # Reverse the ascending sort (instead of sorting with reverse=True) so that
    # equal weights come out in the same order as before.
    return ascending[::-1]

To reproduce the Obama result with cal_top_freq_features:

In [None]:
cal_top_freq_features(each_person=obama)[:10]

# Hands-on exercise 3
### Exploring top TF-IDF for other people

###  Elton John

In [None]:
# Fill in here

## Manually compute distances between a few people

Let's manually compare the distances between the articles for a few famous people.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity  

In [None]:
# Select the row for Bill Clinton's article.
clinton = people[people['name'] == 'Bill Clinton']
# print() as a function works on both Python 2 and Python 3.
print(clinton)

In [None]:
# Select the row for David Beckham's article.
beckham = people[people['name'] == 'David Beckham']
# print() as a function works on both Python 2 and Python 3.
print(beckham)

# Is Obama closer to Clinton than to Beckham?

We will use cosine distance, which is given by
(1-cosine_similarity)
and find that the article about president Obama is closer to the one about former president Clinton than that of footballer David Beckham.

In [None]:
beckham.index

In [None]:
1-cosine_similarity(people_tfidf[obama.index], people_tfidf[clinton.index])

In [None]:
1-cosine_similarity(people_tfidf[obama.index], people_tfidf[beckham.index])

# Hands-on exercise 4
###  Calculate distance between Elton John and Victoria Beckham using TF-IDF

In [None]:
# Fill in here

###  Calculate distance between Elton John and Paul McCartney using TF-IDF

In [None]:
# Fill in here

### Who is closer to Elton John: Victoria Beckham or Paul McCartney?

Answer:

# Build a nearest neighbor model for document retrieval

We now create a nearest-neighbors model and apply it to document retrieval.  

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import DistanceMetric

In [None]:
# Brute-force nearest-neighbor model using Euclidean distance, a common choice.
nbrs1 = NearestNeighbors(n_neighbors=5, algorithm='brute', metric='euclidean')

# Same model with no metric specified, so scikit-learn's default metric is used.
# NOTE(review): the documented default is minkowski with p=2, i.e. Euclidean,
# so this should behave like nbrs1 — confirm for the installed version.
nbrs2 = NearestNeighbors(n_neighbors=5, algorithm='brute')

In [None]:
nbrs1.fit(people_tfidf)

In [None]:
nbrs2.fit(people_tfidf)

In [None]:
#try another distance metric
nbrs3 = NearestNeighbors(n_neighbors=5, algorithm='brute', metric='manhattan')

In [None]:
nbrs3.fit(people_tfidf)

# Applying the nearest-neighbors model for retrieval

## Who is closest to Obama?

In [None]:
import numpy as np

In [None]:
# Find the 10 nearest neighbors of the Obama article with the Euclidean model.
# n_neighbors=10 overrides the n_neighbors=5 the model was constructed with;
# return_distance=True makes the call return distances along with the indices.

dist1, index1 = nbrs1.kneighbors(X=people_tfidf[obama.index], n_neighbors=10, return_distance=True)

In [None]:
# index1 and dist1 hold one row per query; take row 0 (the single Obama query)
# and pair each neighbor's name with its distance. list() materializes the
# pairs so they display on Python 3, where zip() is lazy.
list(zip(people['name'].iloc[index1[0]], dist1[0]))

In [None]:
dist2, index2 = nbrs2.kneighbors(X=people_tfidf[obama.index], n_neighbors=10, return_distance=True)

In [None]:
# index2 and dist2 hold one row per query; take row 0 (the single Obama query)
# and pair each neighbor's name with its distance. list() materializes the
# pairs so they display on Python 3, where zip() is lazy.
list(zip(people['name'].iloc[index2[0]], dist2[0]))

In [None]:
#manhattan distance metric
dist3, index3 = nbrs3.kneighbors(X=people_tfidf[obama.index], n_neighbors=10, return_distance=True)

In [None]:
# index3 and dist3 hold one row per query; take row 0 (the single Obama query)
# and pair each neighbor's name with its distance. list() materializes the
# pairs so they display on Python 3, where zip() is lazy.
list(zip(people['name'].iloc[index3[0]], dist3[0]))

# Other examples of document retrieval

In [None]:
swift = people[people['name'] == 'Taylor Swift']

In [None]:
# knn_model.query(swift)

In [None]:
jolie = people[people['name'] == 'Angelina Jolie']

In [None]:
# knn_model.query(jolie)

In [None]:
arnold = people[people['name'] == 'Arnold Schwarzenegger']

In [None]:
# knn_model.query(arnold)

# Hands-on exercise 5
##  Building Nearest Neighbor using word count 

###  Elton John 

In [None]:
# 

### Victoria Beckham

# Hands-on exercise 6
## Building Nearest Neighbor using TF-IDF

###  Elton John 

In [None]:
#

### Victoria Beckham

In [None]:
#