In [1]:
import pandas as pd
import numpy as np

In [2]:
data_set = pd.read_csv('data/people_wiki.csv')

In [3]:
print(data_set.shape)

(59071, 3)


In [4]:
print(data_set.head(10))

                                                 URI                 name  \
0        <http://dbpedia.org/resource/Digby_Morrell>        Digby Morrell   
1       <http://dbpedia.org/resource/Alfred_J._Lewy>       Alfred J. Lewy   
2        <http://dbpedia.org/resource/Harpdog_Brown>        Harpdog Brown   
3  <http://dbpedia.org/resource/Franz_Rottensteiner>  Franz Rottensteiner   
4               <http://dbpedia.org/resource/G-Enka>               G-Enka   
5        <http://dbpedia.org/resource/Sam_Henderson>        Sam Henderson   
6        <http://dbpedia.org/resource/Aaron_LaCrate>        Aaron LaCrate   
7      <http://dbpedia.org/resource/Trevor_Ferguson>      Trevor Ferguson   
8         <http://dbpedia.org/resource/Grant_Nelson>         Grant Nelson   
9         <http://dbpedia.org/resource/Cathy_Caruth>         Cathy Caruth   

                                                text  
0  digby morrell born 10 october 1979 is a former...  
1  alfred j lewy aka sandy lewy graduated 

In [5]:
obama = data_set.loc[data_set['name'] == 'Barack Obama']
print(obama.text)

35817    barack hussein obama ii brk husen bm born augu...
Name: text, dtype: object


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words='english')
word_counts = count_vect.fit_transform(data_set.text)
print(type(word_counts))

<class 'scipy.sparse.csr.csr_matrix'>


In [7]:
word_counts

<59071x548115 sparse matrix of type '<class 'numpy.int64'>'
	with 8078359 stored elements in Compressed Sparse Row format>

In [8]:
## get frequency of the word 'the'

In [9]:
count_vect.vocabulary_['obama']

358378

In [10]:
# show all uniq words

In [11]:
count_vect.get_feature_names()

['00',
 '000',
 '0000',
 '00000',
 '00000van',
 '0001',
 '00014338',
 '0001sec',
 '0002',
 '00026',
 '0003',
 '0005',
 '000577',
 '0005sec',
 '0006',
 '0007',
 '0007105916',
 '0007200374',
 '0007207328',
 '0007213506',
 '000721426xhe',
 '0007a',
 '000he',
 '000in',
 '000m',
 '000seelenprojekt',
 '000tnmickushina',
 '001',
 '0017',
 '001cd',
 '001ehebbm',
 '002',
 '0020849605',
 '0024',
 '0026183900',
 '002864574x',
 '0028659287',
 '003',
 '0033',
 '0034',
 '0036',
 '004',
 '0043',
 '0046',
 '004erdemir',
 '005',
 '006',
 '0060222425',
 '0060628227',
 '0060628464',
 '0060669667',
 '006074393x',
 '0064',
 '0066',
 '007',
 '0070710481',
 '0071357440',
 '0071375627',
 '0072131772',
 '0072131896',
 '0072222611',
 '0072225351',
 '0072438886',
 '007all',
 '008',
 '0080',
 '0080357547',
 '008after',
 '009',
 '00906603',
 '0091',
 '0091857112',
 '0091900255she',
 '0099416689',
 '009at',
 '00a10',
 '00g',
 '00s',
 '00sex',
 '00sin',
 '01',
 '010',
 '0100',
 '01000400',
 '01011001',
 '01011001i',

In [12]:
from itertools import islice
list(islice(count_vect.vocabulary_.items(), 20))

[('digby', 160586),
 ('morrell', 336006),
 ('born', 96562),
 ('10', 704),
 ('october', 359398),
 ('1979', 11143),
 ('australian', 72855),
 ('rules', 427109),
 ('footballer', 202134),
 ('played', 386788),
 ('kangaroos', 272389),
 ('carlton', 111892),
 ('football', 202093),
 ('league', 293372),
 ('aflfrom', 48412),
 ('western', 529065),
 ('australia', 72702),
 ('early', 172590),
 ('senior', 442626),
 ('west', 528963)]

In [13]:
# ordered count of all words

In [14]:
occ = np.asarray(word_counts.sum(axis=0)).ravel().tolist()
counts_df = pd.DataFrame({'term': count_vect.get_feature_names(), 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(20)

Unnamed: 0,occurrences,term
509445,59644,university
96562,54633,born
349506,49604,new
340895,29888,music
345486,29758,national
57247,28804,american
541245,28802,years
436036,28432,school
323328,28403,member
535617,27284,world


In [15]:
# use trained vectorizer to count the 'Obama' word occurences

In [16]:
obama_count = count_vect.transform(obama.text)
print(obama_count)

  (0, 3270)	1
  (0, 8159)	1
  (0, 14807)	1
  (0, 16121)	1
  (0, 16484)	1
  (0, 18130)	2
  (0, 18413)	1
  (0, 20271)	3
  (0, 22237)	1
  (0, 23034)	1
  (0, 23789)	3
  (0, 24571)	2
  (0, 25312)	3
  (0, 26011)	1
  (0, 26449)	1
  (0, 26651)	1
  (0, 34697)	1
  (0, 37524)	1
  (0, 45070)	8
  (0, 46357)	1
  (0, 46832)	1
  (0, 48228)	1
  (0, 48258)	2
  (0, 48631)	1
  (0, 57247)	3
  :	:
  (0, 473301)	1
  (0, 475253)	1
  (0, 477301)	1
  (0, 481988)	1
  (0, 482176)	1
  (0, 482236)	1
  (0, 485165)	1
  (0, 486237)	3
  (0, 486344)	1
  (0, 496705)	1
  (0, 499831)	1
  (0, 501497)	1
  (0, 507829)	1
  (0, 508423)	1
  (0, 509144)	3
  (0, 509445)	2
  (0, 510860)	1
  (0, 511694)	1
  (0, 518492)	1
  (0, 524331)	1
  (0, 524897)	1
  (0, 533155)	1
  (0, 534477)	1
  (0, 535182)	1
  (0, 541245)	1


In [17]:
# use trained vectorizer to decipher which word occurs 14 times in the 'Obama' text

In [18]:
print (count_vect.get_feature_names()[494541])

todayrob


In [19]:
# get top 10 obama words

In [20]:
obama_occ = np.asarray(obama_count.sum(axis=0)).ravel().tolist()
obama_counts_df = pd.DataFrame({'term': count_vect.get_feature_names(), 'occurrences': obama_occ})
obama_counts_df.sort_values(by='occurrences', ascending=False).head(20)

Unnamed: 0,occurrences,term
358378,9,obama
45070,8,act
292368,6,law
328423,4,military
259085,4,iraq
395397,4,president
155872,4,democratic
138523,4,control
436036,3,school
20271,3,2004


In [21]:
# From the vectorized count, we can use the TfidfTransformer to calculate the tf-idf weights
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer()
transformed_weights = transformer.fit_transform(word_counts)
transformed_weights

<59071x548115 sparse matrix of type '<class 'numpy.float64'>'
	with 8078359 stored elements in Compressed Sparse Row format>

In [22]:
# Let's examine the top 20 ngrams by average tf-idf weight
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': count_vect.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
509445,university,0.017893
349506,new,0.014489
340895,music,0.01266
96562,born,0.011502
197102,film,0.011009
386788,played,0.01086
293372,league,0.010484
439001,season,0.010132
345486,national,0.010078
323328,member,0.010029


In [23]:
# use TFIDF on Obama text

In [24]:
obama_tf_idf = transformer.transform(obama_count)
print(obama_tf_idf)

  (0, 541245)	0.01641457583591241
  (0, 535182)	0.020200567551102604
  (0, 534477)	0.01885339848983865
  (0, 533155)	0.057487564796564326
  (0, 524897)	0.030132338063191502
  (0, 524331)	0.039710388381786775
  (0, 518492)	0.03467685716561149
  (0, 511694)	0.056876213404974736
  (0, 510860)	0.04610966636712518
  (0, 509445)	0.02922337824867251
  (0, 509144)	0.06093269911705297
  (0, 508423)	0.06034920350773561
  (0, 507829)	0.061791306275852116
  (0, 501497)	0.06505947899937921
  (0, 499831)	0.05451945180683298
  (0, 496705)	0.03382416456099088
  (0, 486344)	0.03815101177628357
  (0, 486237)	0.09743331594028297
  (0, 485165)	0.04589476146829438
  (0, 482236)	0.06646487791129124
  (0, 482176)	0.04385097256997385
  (0, 481988)	0.030438293702223316
  (0, 477301)	0.04712764416750055
  (0, 475253)	0.03886211196888937
  (0, 473301)	0.0587068453138822
  :	:
  (0, 57247)	0.05047970987121912
  (0, 48631)	0.036239125199723665
  (0, 48258)	0.09029661234723273
  (0, 48228)	0.05636982183302688
  (0,

In [25]:
# get top 10 words by TFIDF for Obama

In [26]:
obama_weights = np.asarray(obama_tf_idf.sum(axis=0)).ravel().tolist()
obama_tf_idf_df = pd.DataFrame({'term': count_vect.get_feature_names(), 'weight': obama_weights})
obama_tf_idf_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
358378,obama,0.413495
45070,act,0.28217
259085,iraq,0.17197
292368,law,0.163903
138523,control,0.149369
365249,ordered,0.138633
328423,military,0.135368
155872,democratic,0.129792
258530,involvement,0.124821
417604,response,0.124821
