In [1]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import re
import numpy as np

In [10]:
# Two-sentence toy corpus used by every vectorizer example in this notebook.
docs = [
    "I never drinking beer in the morning",
    "Bavarian beer is the best beer all over the world",
]

In [3]:
# English stop-word list from NLTK; parse_doc drops any token found in it.
STOPWORDS = stopwords.words('english')
# Snowball stemmer configured for English; parse_doc stems each kept token with it.
stemmer = SnowballStemmer("english")
# Raw string literal: '\w' in a normal string is an invalid escape sequence
# (a warning on recent Python versions). The pattern itself is unchanged.
tokenizer = RegexpTokenizer(r'\w+')

In [5]:
# Show how many entries the NLTK English stop-word list contains.
print(len(STOPWORDS))

153


In [7]:
def parse_doc(line):
    """Turn one raw text line into a list of stemmed, stop-word-free tokens.

    The line is lower-cased and split by the module-level ``tokenizer``.
    Empty tokens and tokens present in ``STOPWORDS`` are discarded; every
    remaining token is passed through the module-level ``stemmer``.

    Returns the stemmed tokens as a list, in their original order.
    """
    stems = []
    for token in tokenizer.tokenize(line.lower()):
        # The stop-word check is done on the unstemmed token.
        if not token or token in STOPWORDS:
            continue
        stems.append(stemmer.stem(token))
    return stems

In [11]:
# Run every raw document through parse_doc to get its list of stemmed tokens.
docs_stage1 = [parse_doc(d) for d in docs]

In [12]:
# Inspect the tokenised/stemmed documents (one token list per document).
print(docs_stage1)

[['never', 'drink', 'beer', 'morn'], ['bavarian', 'beer', 'best', 'beer', 'world']]


In [14]:
# Arguments common to all four vectorizers: in-memory strings as input,
# strip_accents='unicode', word-level analysis, and parse_doc as the tokenizer.
_shared_vect_kwargs = dict(
    input='content',
    strip_accents='unicode',
    analyzer='word',
    tokenizer=parse_doc,
)

# vect1: raw term counts over unigrams.
vect1 = CountVectorizer(ngram_range=(1, 1), **_shared_vect_kwargs)

# vect2: raw term counts over bigrams only.
vect2 = CountVectorizer(ngram_range=(2, 2), **_shared_vect_kwargs)

# vect3: unigram tf-idf with idf smoothing off and normalisation disabled.
vect3 = TfidfVectorizer(
    ngram_range=(1, 1),
    smooth_idf=False,
    norm=None,
    **_shared_vect_kwargs
)

# vect4: same as vect3 except that `norm` is left at the library default.
vect4 = TfidfVectorizer(
    ngram_range=(1, 1),
    smooth_idf=False,
    **_shared_vect_kwargs
)

In [15]:
# Learn the unigram vocabulary from docs and build the document-term count matrix.
matrix1 = vect1.fit_transform(docs)

In [16]:
# Print the learned vocabulary, then the sparse matrix as (doc, term_index) count entries.
# NOTE(review): get_feature_names() is no longer available in recent scikit-learn
# releases (get_feature_names_out() replaces it) — confirm the target version.
print(vect1.get_feature_names())
print(matrix1)

['bavarian', 'beer', 'best', 'drink', 'morn', 'never', 'world']
  (0, 4)	1
  (0, 1)	1
  (0, 3)	1
  (0, 5)	1
  (1, 6)	1
  (1, 2)	1
  (1, 0)	1
  (1, 1)	2


In [31]:
# Learn the bigram vocabulary from docs and build the document-bigram count matrix.
matrix2 = vect2.fit_transform(docs)

In [32]:
# Print the learned bigrams, then the sparse matrix as (doc, bigram_index) count entries.
# NOTE(review): get_feature_names() is no longer available in recent scikit-learn
# releases (get_feature_names_out() replaces it) — confirm the target version.
print(vect2.get_feature_names())
print(matrix2)

['bavarian beer', 'beer best', 'beer morn', 'beer world', 'best beer', 'drink beer', 'never drink']
  (0, 2)	1
  (0, 5)	1
  (0, 6)	1
  (1, 3)	1
  (1, 4)	1
  (1, 1)	1
  (1, 0)	1


In [17]:
# Fit the un-normalised, un-smoothed tf-idf vectorizer on docs and transform them.
matrix3 = vect3.fit_transform(docs)

In [23]:
# Print the learned vocabulary, then the sparse tf-idf matrix as (doc, term_index) weight entries.
# NOTE(review): get_feature_names() is no longer available in recent scikit-learn
# releases (get_feature_names_out() replaces it) — confirm the target version.
print(vect3.get_feature_names())
print(matrix3)

['bavarian', 'beer', 'best', 'drink', 'morn', 'never', 'world']
  (0, 5)	1.69314718056
  (0, 3)	1.69314718056
  (0, 1)	1.0
  (0, 4)	1.69314718056
  (1, 1)	2.0
  (1, 0)	1.69314718056
  (1, 2)	1.69314718056
  (1, 6)	1.69314718056


In [24]:
# Transform an unseen phrase with the already-fitted vect3 (no re-fitting);
# the vocabulary stays the one learned from docs.
print(vect3.transform(["never stop"]))

  (0, 5)	1.69314718056


In [58]:
# Hand-check of the tf-idf weight printed for vect3 above: ln(n / d) + 1.
# NOTE(review): n and d look like "number of documents" and "number of
# documents containing the term" — confirm against the intended formula.
n = 2
d = 1
# Use the variables instead of repeating the literals; float() forces true
# division even where `/` on two ints would truncate.
idf_manual = np.log(float(n) / d) + 1
print(idf_manual)

1.69314718056


In [25]:
# Fit vect4 (tf-idf with `norm` left at its default) on docs and transform them.
matrix4 = vect4.fit_transform(docs)

In [26]:
# Show the sparse tf-idf matrix produced by vect4 as (doc, term_index) weight entries.
print(matrix4)

  (0, 5)	0.546454011634
  (0, 3)	0.546454011634
  (0, 1)	0.32274454218
  (0, 4)	0.546454011634
  (1, 1)	0.563430756324
  (1, 0)	0.476985598255
  (1, 2)	0.476985598255
  (1, 6)	0.476985598255


In [27]:
# Vectorise the query with vect4, the vectorizer that produced matrix4, so the
# query is expressed in the same tf-idf space it is later compared against.
# The recorded output for t (0.7071 per term, i.e. a unit-length row) matches
# vect4; vect3 is built with norm=None and would not normalise the row.
t = vect4.transform(["What should I never do in the morning?"])

In [68]:
# Show the sparse query vector as (row, term_index) weight entries.
print(t)

  (0, 5)	0.707106781187
  (0, 4)	0.707106781187


In [20]:
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# Cosine similarity between the query vector t and every row of matrix4;
# flatten() collapses the result to a flat array with one score per document.
print(cosine_similarity(t, matrix4).flatten())

[ 0.77280267  0.        ]
