In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load some text data with pandas: Wikipedia pages about people

In [80]:
# Read the Wikipedia people corpus; the path is relative to the working directory.
people = pd.read_csv(filepath_or_buffer='people_wiki.csv')

In [81]:
# Each row holds a link to the Wikipedia article (URI), the person's name, and the article text.
people.head(n=5)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [82]:
# Number of records (rows) in the dataset
people.shape[0]

59071

# Explore the dataset and check out the text it contains
### Exploring the entry for President Obama

In [127]:
# NOTE(review): disabled exploratory snippets; none of the lines below execute.
#people['name'].value_counts()
#people.name = people.name.str.replace(' *', '')
#people.sort_values('name')

In [128]:
# Keep only the row(s) whose name is exactly 'Barack Obama'.
obama = people.loc[people['name'] == 'Barack Obama']

In [129]:
# Display the selected Obama row.
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [130]:
# Full article text for Obama, as a NumPy array
obama.loc[:, 'text'].values

array([ 'barack hussein obama ii brk husen bm born august 4 1961 is the 44th and current president of the united states and the first african american to hold the office born in honolulu hawaii obama is a graduate of columbia university and harvard law school where he served as president of the harvard law review he was a community organizer in chicago before earning his law degree he worked as a civil rights attorney and taught constitutional law at the university of chicago law school from 1992 to 2004 he served three terms representing the 13th district in the illinois senate from 1997 to 2004 running unsuccessfully for the united states house of representatives in 2000in 2004 obama received national attention during his campaign to represent illinois in the united states senate with his victory in the march democratic party primary his keynote address at the democratic national convention in july and his election to the senate in november he began his presidential campaign in 2007 

In [131]:
# (rows, columns) of the full dataset
people.shape

(59071, 3)

In [132]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for the Obama article: fitting learns the article's
# vocabulary and transforming produces a sparse matrix of per-word counts.
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(obama.loc[:, 'text'])

In [133]:
# Show the sparse count matrix summary (shape and number of stored entries).
features

<1x270 sparse matrix of type '<class 'numpy.int64'>'
	with 270 stored elements in Compressed Sparse Row format>

In [134]:
# Words of the fitted vocabulary, in column order of the count matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older installs.
# The result is converted to a plain list so `vocab` keeps its original type.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

In [135]:
# Inspect the vocabulary learned from the Obama article.
vocab

['13th',
 '1961',
 '1992',
 '1996',
 '1997',
 '20',
 '2000in',
 '2004',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2012obama',
 '2013',
 '44th',
 '63',
 'act',
 'address',
 'administration',
 'affordable',
 'afghanistan',
 'african',
 'after',
 'against',
 'american',
 'americans',
 'and',
 'arms',
 'as',
 'ask',
 'at',
 'attention',
 'attorney',
 'august',
 'barack',
 'before',
 'began',
 'bin',
 'bm',
 'born',
 'briefs',
 'brk',
 'budget',
 'by',
 'californias',
 'called',
 'campaign',
 'care',
 'chicago',
 'civil',
 'clinton',
 'close',
 'columbia',
 'combat',
 'community',
 'constitutional',
 'consumer',
 'continued',
 'control',
 'convention',
 'court',
 'creation',
 'cuba',
 'current',
 'death',
 'debate',
 'debt',
 'defeated',
 'defeating',
 'defense',
 'degree',
 'delegates',
 'democratic',
 'district',
 'doddfrank',
 'domestic',
 'dont',
 'down',
 'during',
 'earning',
 'economic',
 'election',
 'elementary',
 'ended',
 'ending',
 'equality',
 'federal',
 'filed',

In [136]:
# Column-wise totals of the dense count matrix: one count per vocabulary word.
dist = features.toarray().sum(axis=0)
dist

array([ 1,  1,  1,  1,  1,  2,  1,  3,  1,  1,  3,  2,  3,  1,  1,  1,  1,
        1,  8,  1,  1,  1,  2,  1,  4,  1,  3,  1, 21,  1,  6,  1,  2,  1,
        1,  1,  1,  1,  1,  1,  1,  2,  1,  1,  1,  1,  1,  1,  3,  1,  2,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  4,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  4,  1,  1,  2,  2,  1,  2,  1,  1,  3,  1,
        1,  1,  1,  1,  1,  3,  4,  2,  1,  3,  1,  1,  1,  1,  1,  1,  2,
        4,  1,  7,  1, 11,  1,  1,  1,  2,  1,  1,  1,  2, 30,  1,  1,  1,
        1,  1,  1,  3,  4,  2,  1,  3,  1,  1,  1,  1,  1,  1,  6,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  4,  1,  1,  1,  2,  1,  1,
        1,  1,  1,  2,  1,  1,  2,  9,  1, 18,  2,  1,  2,  1,  1,  1,  3,
        1,  1,  1,  1,  3,  1,  1,  1,  2,  4,  2,  1,  2,  1,  1,  1,  1,
        2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  1,  1,
        2,  1,  2,  1,  3,  1,  1,  1,  1,  1,  1,  1,  1,  3,  1,  2,  3,
        2,  1,  3,  1,  1

In [137]:
# Pair every token with its count in a DataFrame, ordered by ascending count.
obama_word_count_table = pd.DataFrame(
    {'token': vocab, 'count': dist}
).sort_values(by='count')

In [138]:
# Ten most frequent words in the Obama article
obama_word_count_table.sort_values(by='count', ascending=False).head(n=10)

Unnamed: 0,count,token
242,40,the
115,30,in
28,21,and
162,18,of
245,14,to
106,11,his
160,9,obama
18,8,act
104,7,he
133,6,law


# Compute TF-IDF for the corpus
### To give more weight to informative words, we weigh them by their TF-IDF scores.
### We'll vectorize the entire corpus and compute the TF-IDF in one step

In [139]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings (stop words are not removed yet).
vect = TfidfVectorizer()

In [140]:
# Fit on every article and build the TF-IDF document-term matrix in one pass.
dtm = vect.fit_transform(raw_documents=people['text'])

In [141]:
# (number of articles, number of distinct terms)
dtm.shape

(59071, 548429)

In [142]:
# Term for each column of `dtm`, in column order.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older installs.
# Kept as a plain list because later cells call `features.index(...)`.
if hasattr(vect, 'get_feature_names_out'):
    features = list(vect.get_feature_names_out())
else:
    features = vect.get_feature_names()

In [143]:
# Preview only the first few terms: the full list has one entry per column of
# `dtm` (hundreds of thousands), which would flood the notebook output.
features[:10]

['00',
 '000',
 '0000',
 '00000',
 '00000van',
 '0001',
 '00014338',
 '0001sec',
 '0002',
 '00026',
 '0003',
 '0005',
 '000577',
 '0005sec',
 '0006',
 '0007',
 '0007105916',
 '0007200374',
 '0007207328',
 '0007213506',
 '000721426xhe',
 '0007a',
 '000he',
 '000in',
 '000m',
 '000seelenprojekt',
 '000tnmickushina',
 '001',
 '0017',
 '001cd',
 '001ehebbm',
 '002',
 '0020849605',
 '0024',
 '0026183900',
 '002864574x',
 '0028659287',
 '003',
 '0033',
 '0034',
 '0036',
 '004',
 '0043',
 '0046',
 '004erdemir',
 '005',
 '006',
 '0060222425',
 '0060628227',
 '0060628464',
 '0060669667',
 '006074393x',
 '0064',
 '0066',
 '007',
 '0070710481',
 '0071357440',
 '0071375627',
 '0072131772',
 '0072131896',
 '0072222611',
 '0072225351',
 '0072438886',
 '007all',
 '008',
 '0080',
 '0080357547',
 '008after',
 '009',
 '00906603',
 '0091',
 '0091857112',
 '0091900255she',
 '0099416689',
 '009at',
 '00a10',
 '00g',
 '00s',
 '00sex',
 '00sin',
 '01',
 '010',
 '0100',
 '01000400',
 '01011001',
 '01011001i',

In [144]:
# Examine the TF-IDF scores for the Obama article.
# Reminder: the Obama article sits at row index 35817.
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [145]:
# Pull the TF-IDF score of every word of the Obama article out of the
# document-term matrix and collect the scores in a {word: score} dictionary.
#
# `vect.vocabulary_` maps each term directly to its column in `dtm`; looking a
# word up there is a constant-time dict access, whereas `features.index(word)`
# rescans the whole feature list for every single word.
term_to_col = vect.vocabulary_
# Row of the Obama article (35817 in this dataset), taken from the selection
# itself instead of being hard-coded.
obama_row = obama.index[0]
word_scores = {word: dtm[obama_row, term_to_col[word]] for word in vocab}

In [146]:
# One row per (word, TF-IDF score) pair
obama_word_scores = pd.DataFrame(
    data=list(word_scores.items()),
    columns=['word', 'tfidf'],
)

In [148]:
# Ten highest-scoring words for the Obama article
obama_word_scores.sort_values(by='tfidf', ascending=False).head(n=10)

Unnamed: 0,word,tfidf
140,obama,0.365018
156,the,0.279323
157,act,0.249089
209,in,0.209673
76,iraq,0.151809
82,and,0.146739
216,law,0.144687
34,control,0.131857
67,of,0.126205
184,us,0.122834


In [149]:
# Refit TF-IDF with scikit-learn's built-in English stop-word list so that very
# common words no longer dominate the scores.
# NOTE: this rebinds `vect`, `dtm`, and `features`; re-running earlier cells
# after this one will see the stop-word-filtered versions.
vect = TfidfVectorizer(stop_words='english')
dtm = vect.fit_transform(people['text'])
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older installs.
# Kept as a plain list because other cells call `features.index(...)`.
if hasattr(vect, 'get_feature_names_out'):
    features = list(vect.get_feature_names_out())
else:
    features = vect.get_feature_names()

In [150]:
# Same number of rows as before, but fewer columns now that stop words are dropped.
dtm.shape

(59071, 548115)

In [151]:
# Rebuild the {word: TF-IDF score} dictionary for the Obama article against the
# stop-word-filtered matrix. `vocab` still contains stop words that are no
# longer columns of `dtm`, so those words are skipped.
#
# `vect.vocabulary_` maps each term to its column in `dtm`; membership tests
# and lookups on it are constant-time, whereas `word in features` followed by
# `features.index(word)` scans the whole feature list twice per word.
term_to_col = vect.vocabulary_
# Row of the Obama article (35817 in this dataset), taken from the selection
# itself instead of being hard-coded.
obama_row = obama.index[0]
word_scores = {
    word: dtm[obama_row, term_to_col[word]]
    for word in vocab
    if word in term_to_col
}

In [152]:
# One row per (word, TF-IDF score) pair, stop words excluded
obama_word_scores = pd.DataFrame(
    data=list(word_scores.items()),
    columns=['word', 'tfidf'],
)

In [153]:
# Ten highest-scoring words once stop words are removed
obama_word_scores.sort_values(by='tfidf', ascending=False).head(n=10)

Unnamed: 0,word,tfidf
119,obama,0.413495
134,act,0.28217
61,iraq,0.17197
181,law,0.163903
113,control,0.149369
196,ordered,0.138633
217,military,0.135368
30,democratic,0.129792
66,response,0.124821
31,involvement,0.124821


# Manually compute similarity and distances between a few people
### Let's manually compare the similarity and distances between the articles for a few famous people.


In [154]:
# Is Obama closer to Clinton than to Clooney?
# Note that we're computing the cosine similarity (which ranges from 0 to 1, where 1 is most similar), not the cosine distance, which we can use as well.
# cosine_distance = (1 - cosine_similarity)
# We should find that the article about President Obama is closer to the one about former President Clinton than to the one about George Clooney.


from sklearn.metrics.pairwise import cosine_similarity
clinton = people[people['name'] == 'Bill Clinton']
clooney = people[people['name'] == 'George Clooney']

In [155]:
# Cosine similarity between the Obama and Clinton TF-IDF rows
cosine_similarity(X=dtm[obama.index], Y=dtm[clinton.index])

array([[ 0.18896718]])

In [156]:
# Cosine similarity between the Obama and Clooney TF-IDF rows
cosine_similarity(X=dtm[obama.index], Y=dtm[clooney.index])

array([[ 0.0376872]])

In [157]:
# Cosine distance (1 - cosine similarity) between the Obama and Clinton articles
from sklearn.metrics.pairwise import pairwise_distances
pairwise_distances(X=dtm[obama.index], Y=dtm[clinton.index], metric='cosine')

array([[ 0.81103282]])

In [158]:
# Cosine distance between the Obama and Clooney articles
pairwise_distances(X=dtm[obama.index], Y=dtm[clooney.index], metric='cosine')

array([[ 0.9623128]])

In [159]:
# Build a nearest neighbor model for document retrieval.
# First, confirm the shape of the matrix the model will be fitted on.
dtm.shape

(59071, 548115)

In [160]:
from sklearn.neighbors import NearestNeighbors
# Return the 10 nearest articles per query. No metric is passed, so the
# estimator's default applies (the fitted repr below shows metric='minkowski'
# with p=2, i.e. Euclidean distance rather than cosine distance).
neighbors = NearestNeighbors(n_neighbors=10)

In [161]:
# Index every article's TF-IDF row for neighbor queries.
neighbors.fit(X=dtm)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=10, p=2, radius=1.0)

In [162]:
# Who is closest to Obama? Query the model with the Obama article's row.
distances, indices = neighbors.kneighbors(X=dtm[obama.index], return_distance=True)

In [163]:
# Distances to the 10 nearest articles; the first is 0 because the query article matches itself.
distances

array([[ 0.        ,  1.16514466,  1.2073695 ,  1.21963978,  1.22250901,
         1.23617841,  1.24305659,  1.24466747,  1.24829554,  1.25160692]])

In [164]:
# Row positions of the 10 nearest articles, in the same order as `distances`.
indices

array([[35817, 24478, 38376, 57108, 38714, 46140, 18827, 44681,  6796,
         2412]])

In [167]:
# Pair each neighbor's name with its distance from the Obama article.
# Materialised as a list: a bare zip() iterator is exhausted after one pass, so
# reading `closest_people` a second time would yield nothing.
closest_people = list(zip(people.iloc[indices[0]]['name'], distances[0]))

In [168]:
# Print the (name, distance) pairs for Obama's nearest neighbors.
print([*closest_people])

[('Barack Obama', 0.0), ('Joe Biden', 1.16514466168904), ('Samantha Power', 1.2073694972352877), ('Hillary Rodham Clinton', 1.2196397836767039), ('Eric Stern (politician)', 1.2225090075525111), ('Robert Gibbs', 1.2361784064861678), ('Henry Waxman', 1.2430565883007181), ('Jesse Lee (politician)', 1.244667471314294), ('Eric Holder', 1.248295535663575), ('Joe the Plumber', 1.2516069176926294)]
