In [1]:
import numpy as np
import re

from collections import defaultdict

corpus = ["aaa bbb! ccc aaa eee aaa ggg.", "ggg fff eee.", "ddd aaa ggg aaa eee."]

def split_into_words(d):
    """Split the string ``d`` into a list of words.

    Every run of non-word characters (anything matched by ``\\W``, e.g.
    punctuation and whitespace) is collapsed to a single space, then the
    result is split on whitespace.  Digits and underscores count as word
    characters, so a token such as ``d1`` is kept whole.

    Returns an empty list when ``d`` contains no word characters.
    """
    non_word_run = re.compile(r"\W+")
    return non_word_run.sub(" ", d).split()

# Word -> integer id.  Looking up a word that is not yet present inserts it
# with the current size of the mapping as its id, so ids are handed out as
# 0, 1, 2, ... in first-lookup order.  The lambda resolves ``vocabulary`` only
# when it is called, so it can be passed straight to the constructor.
vocabulary = defaultdict(lambda: len(vocabulary))

def docs2num(corpus, voc):
    """Lazily encode each document of ``corpus`` as a list of word ids.

    Parameters
    ----------
    corpus : iterable of str
        The documents to encode; each one is tokenised with
        ``split_into_words``.
    voc : mapping
        Word -> id mapping, indexed as ``voc[word]``.  When it is a
        ``defaultdict`` whose factory assigns fresh ids, unseen words are
        added to it as a side effect of the lookup.

    Yields
    ------
    list of int
        The ids of the document's words, in word order (one list per
        document).
    """
    for doc in corpus:
        # Use the mapping that was passed in rather than the module-level
        # ``vocabulary``, so callers can encode against any vocabulary.
        yield [voc[word] for word in split_into_words(doc)]

# Consume the generator once and keep the encoded documents as a list, so they
# can be measured with len() and iterated again in later cells.
docs = list(docs2num(corpus,vocabulary))

In [2]:
# Sanity check: tokenise the first document (punctuation is dropped).
split_into_words(corpus[0])

['aaa', 'bbb', 'ccc', 'aaa', 'eee', 'aaa', 'ggg']

In [3]:
# Inspect the word -> id mapping built while encoding the corpus; ids reflect
# the order in which words were first looked up, not alphabetical order.
vocabulary

defaultdict(<function __main__.<lambda>>,
            {'aaa': 0,
             'bbb': 1,
             'ccc': 2,
             'ddd': 6,
             'eee': 3,
             'fff': 5,
             'ggg': 4})

In [4]:
# Allocate the document-term matrix, initially all zeros:
# one row per document and one column per vocabulary id.
nbDocs, nbWords = len(docs), len(vocabulary)
docTermMatrix = np.zeros(shape=(nbDocs, nbWords), dtype=int)

In [5]:
# The matrix right after allocation, before any counts are added.
docTermMatrix

array([[0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0]])

In [6]:
# Count how often each vocabulary id occurs in each document.
# The matrix is reset to zero first so that re-running this cell recomputes
# the counts instead of adding them a second time on top of the old ones.
docTermMatrix.fill(0)
for doc_idx, word_ids in enumerate(docs):
    for word_id in word_ids:
        # Row = document index, column = vocabulary id of the word.
        docTermMatrix[doc_idx, word_id] += 1

In [7]:
# Resulting counts: row = document, column = vocabulary id.
docTermMatrix

array([[3, 1, 1, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 1, 0],
       [2, 0, 0, 1, 1, 0, 1]])

In [8]:
# The encoded corpus: one list of vocabulary ids per document, in word order.
docs

[[0, 1, 2, 0, 3, 0, 4], [4, 5, 3], [6, 0, 4, 0, 3]]

In [9]:
# The same substitute-then-split steps as split_into_words, applied to a
# literal string: "d1" stays a single token and the trailing "." is removed.
re.sub(r"\W+", " ", "a b c d1 e f g.").split()

['a', 'b', 'c', 'd1', 'e', 'f', 'g']

In [10]:
# Toy example with three count vectors.  On the raw counts d1 is much closer
# to d3 (no terms in common) than to d2 (same terms, larger counts); after
# dividing every column by its standard deviation, d1-d2 becomes the smaller
# of the two distances.
doc_1 = np.array([1, 2, 0, 0])
doc_2 = np.array([50, 20, 0, 0])
doc_3 = np.array([0, 0, 2, 3])


def _print_distances(title, first, second, third):
    """Print ``title``, then the Euclidean distances first-second and first-third."""
    print(title)
    print("distance between d1 and d2: {:.3f}".format(np.linalg.norm(first - second)))
    print("distance between d1 and d3: {:.3f}".format(np.linalg.norm(first - third)))
    print()


_print_distances("before standardization", doc_1, doc_2, doc_3)

# Stack the documents as rows so that each column holds one term.
m = np.vstack((doc_1, doc_2, doc_3))
# Diagonal matrix holding 1/std of each column; right-multiplying by it
# rescales every column (no mean-centering is applied).
d = np.diag((1. / m.std(axis=0)))
# standardize
m = m.dot(d)

_print_distances("after standardization", m[0], m[1], m[2])

before standardization
distance between d1 and d2: 52.202
distance between d1 and d3: 4.243

after standardization
distance between d1 and d2: 2.901
distance between d1 and d3: 3.009



In [11]:
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfTransformer

In [13]:
# Baseline for comparison: scale each row of the raw counts to unit L2 norm
# (the defaults of normalize, written out explicitly); no IDF weighting.
normalize_weighted_D = normalize(docTermMatrix, norm='l2', axis=1)

# TF-IDF weighting followed by the same L2 row normalisation; IDF smoothing
# and sublinear TF scaling are both switched off.
tfidf = TfidfTransformer(
    norm='l2',
    use_idf=True,
    smooth_idf=False,
    sublinear_tf=False,
)

In [14]:
# Rows of the count matrix after normalize(): each row is scaled to unit
# Euclidean length; no IDF weighting is involved here.
normalize_weighted_D

array([[ 0.83205029,  0.2773501 ,  0.2773501 ,  0.2773501 ,  0.2773501 ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.57735027,  0.57735027,
         0.57735027,  0.        ],
       [ 0.75592895,  0.        ,  0.        ,  0.37796447,  0.37796447,
         0.        ,  0.37796447]])

In [15]:
# Fit the transformer on the raw counts and show the resulting TF-IDF weights
# as a dense matrix.
tfidf.fit_transform(docTermMatrix).todense()

matrix([[ 0.78860962,  0.39251202,  0.39251202,  0.18703408,  0.18703408,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.39515588,  0.39515588,
          0.829279  ,  0.        ],
        [ 0.74318769,  0.        ,  0.        ,  0.26439208,  0.26439208,
          0.        ,  0.55485647]])

In [16]:
# The raw count matrix again, for comparison with the two normalised versions
# shown above.
docTermMatrix

array([[3, 1, 1, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 1, 0],
       [2, 0, 0, 1, 1, 0, 1]])