In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Fitting and transforming a list of documents (in this case a list of sentences)

In [2]:
# Toy corpus: every sentence is treated as one document.
sent = [
    "The sky is blue.",
    "The sun is bright today.",
    "The sun in the sky is bright.",
    "We can see the shining sun, the bright sun.",
]

# fit() learns the vocabulary (word -> column id) and returns the vectorizer itself.
v = CountVectorizer().fit(sent)

# transform() maps the sentences to a document-by-word count matrix.
transformed = v.transform(sent)

sky_id = v.vocabulary_['sky']
print(v.vocabulary_, '\nWord sky has id %d' % sky_id)

{'can': 2, 'sun': 8, 'today': 10, 'see': 5, 'is': 4, 'bright': 1, 'we': 11, 'in': 3, 'shining': 6, 'the': 9, 'blue': 0, 'sky': 7} 
Word sky has id 7


### Printing the vocabulary of the fitted vectorizer and the term frequency matrix

In [3]:
# vocabulary_ maps word -> word id; order the (word, id) pairs by id so that
# they line up with the columns of the matrix printed below.
vocab_by_id = sorted(v.vocabulary_.items(), key=lambda pair: pair[1])
print(vocab_by_id)

# Dense term-frequency matrix: one row per sentence, one column per word id.
dense_counts = transformed.toarray()
print(dense_counts)

[('blue', 0), ('bright', 1), ('can', 2), ('in', 3), ('is', 4), ('see', 5), ('shining', 6), ('sky', 7), ('sun', 8), ('the', 9), ('today', 10), ('we', 11)]
[[1 0 0 0 1 0 0 1 0 1 0 0]
 [0 1 0 0 1 0 0 0 1 1 1 0]
 [0 1 0 1 1 0 0 1 1 2 0 0]
 [0 1 1 0 0 1 1 0 2 2 0 1]]


### The sparse matrix: each entry is a (doc_id, word_id) pair followed by the corresponding word count

In [4]:
# Printing the matrix directly (without toarray()) lists only its non-zero
# entries, one per line, as "(doc_id, word_id)  count".
print(transformed)

  (0, 0)	1
  (0, 4)	1
  (0, 7)	1
  (0, 9)	1
  (1, 1)	1
  (1, 4)	1
  (1, 8)	1
  (1, 9)	1
  (1, 10)	1
  (2, 1)	1
  (2, 3)	1
  (2, 4)	1
  (2, 7)	1
  (2, 8)	1
  (2, 9)	2
  (3, 1)	1
  (3, 2)	1
  (3, 5)	1
  (3, 6)	1
  (3, 8)	2
  (3, 9)	2
  (3, 11)	1


### Transforming a new sentence using the same vectorizer (will maintain word ids)

In [5]:
# A sentence that was not part of the data passed to fit().
sent2 = ['The moon is bright today']

# Reuse the already-fitted vectorizer so the word ids stay the same.
sent2_counts = v.transform(sent2)
print(sent2_counts)

  (0, 1)	1
  (0, 4)	1
  (0, 9)	1
  (0, 10)	1


### The word "moon" is not in the fitted vocabulary, so it is ignored when the new sentence is transformed

In [6]:
# Membership test on the fitted vocabulary dict: 'moon' does not occur in the
# sentences that were passed to fit(), so this prints False.
print('moon' in v.vocabulary_)

False


### Generating the Tfidf Matrix

In [8]:
# norm=None turns off row normalisation, so the printed values are the
# un-normalised tf * idf weights.
vv = TfidfVectorizer(norm=None)
tfidf = vv.fit_transform(sent)

# (word, id) pairs ordered by id, i.e. in column order of the matrix.
tfidf_vocab_by_id = sorted(vv.vocabulary_.items(), key=lambda entry: entry[1])
print(tfidf_vocab_by_id)

# One line per sentence, every weight formatted to four decimals.
for doc_weights in tfidf.toarray():
    print(list(map(lambda weight: "%.4f" % weight, doc_weights)))

[('blue', 0), ('bright', 1), ('can', 2), ('in', 3), ('is', 4), ('see', 5), ('shining', 6), ('sky', 7), ('sun', 8), ('the', 9), ('today', 10), ('we', 11)]
['1.9163', '0.0000', '0.0000', '0.0000', '1.2231', '0.0000', '0.0000', '1.5108', '0.0000', '1.0000', '0.0000', '0.0000']
['0.0000', '1.2231', '0.0000', '0.0000', '1.2231', '0.0000', '0.0000', '0.0000', '1.2231', '1.0000', '1.9163', '0.0000']
['0.0000', '1.2231', '0.0000', '1.9163', '1.2231', '0.0000', '0.0000', '1.5108', '1.2231', '2.0000', '0.0000', '0.0000']
['0.0000', '1.2231', '1.9163', '0.0000', '0.0000', '1.9163', '1.9163', '0.0000', '2.4463', '2.0000', '0.0000', '1.9163']


### Building the Tfidf matrix for the first 200 documents in the simple-wiki

In [9]:
import os

MAX_DOCS = 200  # upper bound on the number of documents loaded from the corpus

vv = TfidfVectorizer()

docs = []   # text content of every loaded document
names = []  # file name of every loaded document (parallel to docs)

# NOTE: os.walk does not sort file names, so which files end up among the
# first MAX_DOCS depends on the directory listing order of the file system.
for root, dirs, files in os.walk('single-docs'):
    # Enforce the cap across the whole walk; slicing each directory's list
    # on its own would allow MAX_DOCS files per sub-directory.
    remaining = MAX_DOCS - len(docs)
    if remaining <= 0:
        break
    for file in files[:remaining]:
        with open(os.path.join(root, file), 'r', encoding='utf8') as f:
            docs.append(f.read())
        names.append(file)

# fit_transform learns the vocabulary/idf weights and builds the matrix in a
# single pass; the result is the same as fit() followed by transform().
transformed = vv.fit_transform(docs)

print(transformed.shape) # number of documents by the number of unique words
print(transformed.toarray()) # will be mostly zeros as a document contains only a small subset of the number of words

(200, 6401)
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


### Example for cosine similarity and argmax functions

In [10]:
from scipy.spatial.distance import cosine, cdist
import numpy as np

# scipy's cosine() returns a distance, so the similarity is 1 - distance.
example_distance = cosine([1, 2], [1, 3])
print(1 - example_distance)

# argmax returns the position of the largest value in the sequence.
example_scores = [0.8, 0.85, 0.9]
print(np.argmax(example_scores))

0.989949493661
2


### Transforming a query to tf-idf (using the same vectorizer)

In [11]:
# The query is vectorised with the vectorizer that was fitted on the
# documents, so its columns line up with the document matrix.
q = ['president of']
tq = vv.transform(q)

# Also show the word id of "president" for comparison with the printed entries.
president_id = vv.vocabulary_['president']
print(tq, '\n', president_id)

  (0, 4371)	0.940348435107
  (0, 3938)	0.3402129048 
 4371


### Printing file content having maximum similarity with query

In [12]:
import numpy as np

# Cosine similarity between the query tf-idf vector and every document tf-idf vector.
# cdist returns a (n_queries, n_documents) distance matrix; there is only one
# query, so take row 0 and turn the distances into similarities (1 - distance).
# Element i of the resulting array is the similarity between the query and document i.
sims = 1 - cdist(tq.toarray(), transformed.toarray(), metric='cosine')[0]

# Find the best position once and read the score from it, instead of scanning
# the array a second time with the Python builtin max().
max_pos = np.argmax(sims)
max_sim = sims[max_pos]
print('Maximum cosine similarity is', max_sim)
print('Document index with max similarity is %d, with file name %s' % (max_pos, names[max_pos]))

print('\nFile:\n')

# Read with the same encoding that was used when the corpus was loaded, and
# use a context manager so the file handle is closed after reading.
with open('single-docs/'+names[max_pos], 'r', encoding='utf8') as best_file:
    print(best_file.read())
print(names[max_pos])

Maximum cosine similarity is 0.295128719449
Document index with max similarity is 121, with file name sw_100427.txt

File:

Donald Tusk

Donald Tusk (, born 22 April 1957) is the President of the European Council. Before that he was the Prime Minister of Poland from 2007 to 2014. He was the leader of the biggest Polish political party, Platforma Obywatelska (Civic Platform). In August 2014, Tusk was elected to become the next President of the European Council.

In March 2017, Tusk was reelected as President of the European Council.




sw_100427.txt
