In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the Wikipedia people corpus (one row per article) from the working directory.
df = pd.read_csv(filepath_or_buffer='people_wiki.csv')

## Explore

In [3]:
# Peek at the first five rows to confirm the expected columns were loaded.
df.head(n=5)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [4]:
# Number of articles (rows) in the corpus.
df.shape[0]

59071

## Count words

In [5]:
# Tokenize on runs of word characters, then tally how often each token occurs.
# The tally is stored as a plain dict per article in a new `word_count` column.
tokenizer = RegexpTokenizer(r'\w+')
df['word_count'] = [
    dict(Counter(tokenizer.tokenize(article_text)))
    for article_text in df['text']
]
df.head()

Unnamed: 0,URI,name,text,word_count
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{'was': 5, '21': 1, 'until': 1, 'acted': 1, 'i..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{'thomas': 1, '1981': 1, 'follows': 1, 'modali..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'promoter': 1, 'best': 4, 'combos': 1, 'socie..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"{'continental': 1, 'stated': 1, 'until': 1, 'l..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'homophobes': 1, 'famous': 3, 'paul': 2, 'sta..."


## Compute TF-IDF weights

In [6]:
# Fit TF-IDF on the full corpus. Terms appearing in fewer than 2 articles, or in
# more than 95% of them, are excluded from the vocabulary.
vectorizer = TfidfVectorizer(min_df=2, max_df=0.95)
X = vectorizer.fit_transform(df.text)
idf = vectorizer.idf_
# Map every vocabulary term to its IDF weight. `vocabulary_` (term -> column
# index into `idf`) is used rather than `get_feature_names()`, which was removed
# in scikit-learn 1.2, so this cell runs on both old and current releases.
idf_dict = {term: idf[col] for term, col in vectorizer.vocabulary_.items()}
# Human-readable per-article weights: raw count * idf for each counted word.
# Words that are not in the fitted vocabulary are skipped. These dicts are for
# inspection only; the similarity cells below use the vectorizer's own output.
df['tfidf'] = df['word_count'].apply(lambda x : {k: v * idf_dict[k] for (k,v) in x.items() if k in idf_dict})
df.head()

Unnamed: 0,URI,name,text,word_count,tfidf
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{'was': 5, '21': 1, 'until': 1, 'acted': 1, 'i...","{'2002': 2.87532951742, 'afl': 5.6986537577, '..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{'thomas': 1, '1981': 1, 'follows': 1, 'modali...","{'totally': 6.80472876757, 'jet': 6.8625483384..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'promoter': 1, 'best': 4, 'combos': 1, 'socie...","{'promoter': 6.63465418439, 'extensively': 4.7..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"{'continental': 1, 'stated': 1, 'until': 1, 'l...","{'was': 3.23809316195, 'stated': 4.74945329174..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'homophobes': 1, 'famous': 3, 'paul': 2, 'sta...","{'homophobes': 10.8879000292, 'famous': 12.643..."


## Get the word counts for the Obama article

In [7]:
# Select the Barack Obama article; `obama` is reused by later cells.
obama = df[df['name'] == "Barack Obama"]

# Turn the {word: count} dict into a one-column frame (column label 0) indexed by word.
wc = obama.iloc[0]["word_count"]
wc_df = pd.DataFrame.from_dict(wc, orient='index')
# `DataFrame.sort` was removed from pandas; `sort_values` is the supported call.
wc_df = wc_df.sort_values(by=0, ascending=False)
wc_df.head(10)

Unnamed: 0,0
the,40
in,30
and,21
of,18
to,14
his,11
obama,9
act,8
a,7
he,7


## Get the TF-IDF weights for the Obama article

In [8]:
# Turn the Obama article's {word: weight} dict into a one-column frame
# (column label 0) indexed by word, then rank words by descending weight.
tfidf = obama.iloc[0]["tfidf"]
tfidf_df = pd.DataFrame.from_dict(tfidf, orient='index')
# `DataFrame.sort` was removed from pandas; `sort_values` is the supported call.
tfidf_df = tfidf_df.sort_values(by=0, ascending=False)
tfidf_df.head(10)

Unnamed: 0,0
obama,52.277114
act,35.674051
iraq,21.741728
law,20.721856
control,18.88433
us,17.592044
ordered,17.52698
military,17.114203
democratic,16.40925
involvement,15.780837


## Cosine similarity

In [9]:
# Look up the two comparison articles by exact name.
clinton = df[df['name'] == 'Bill Clinton']
beckham = df[df['name'] == 'David Beckham']

# Project each article's text into the fitted TF-IDF space and keep its first row.
obama_vec = vectorizer.transform(obama['text'])[0]
clinton_vec = vectorizer.transform(clinton['text'])[0]
beckham_vec = vectorizer.transform(beckham['text'])[0]

# Print pairwise cosine similarities in the order:
# Obama/Obama, Obama/Clinton, Obama/Beckham, Clinton/Beckham.
comparison_pairs = [
    (obama_vec, obama_vec),
    (obama_vec, clinton_vec),
    (obama_vec, beckham_vec),
    (clinton_vec, beckham_vec),
]
for left_vec, right_vec in comparison_pairs:
    print(cosine_similarity(left_vec, right_vec))

[[ 1.]]
[[ 0.22615606]]
[[ 0.05833899]]
[[ 0.0832253]]


## k-Nearest Neighbors

In [10]:
# Build the neighbor index over the corpus TF-IDF matrix. The default of 4
# neighbors set here can be overridden on each `kneighbors` call.
nn = NearestNeighbors(n_neighbors=4, algorithm='auto')
nn.fit(X)

In [11]:
# Query the 11 closest articles to the Obama vector; both results are 2-D with
# one row per query, so row 0 holds the answer for this single query.
distances, indices = nn.kneighbors(obama_vec, n_neighbors=11)
# Pair each neighbor's row position with its article name and distance.
nbrs = [
    (row_pos, df['name'].iloc[row_pos], dist)
    for row_pos, dist in zip(indices[0], distances[0])
]
# Show floats with three decimals (this display option is notebook-wide).
pd.set_option('display.float_format', '{:,.3f}'.format)
pd.DataFrame(nbrs, columns=['index', 'name', 'distance'])

Unnamed: 0,index,name,distance
0,35817,Barack Obama,0.0
1,24478,Joe Biden,1.138
2,38376,Samantha Power,1.191
3,38714,Eric Stern (politician),1.204
4,57108,Hillary Rodham Clinton,1.206
5,2412,Joe the Plumber,1.212
6,46140,Robert Gibbs,1.222
7,6796,Eric Holder,1.223
8,7914,Phil Schiliro,1.225
9,39357,John McCain,1.229
