In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

In [2]:
# Toy corpus: three short banking-related sentences used by every example below.
sentences = [
    'I have a credit card account',
    'My account card, debit card is lost',
    'My credit card stopped working',
]

### CountVectorizer

In [3]:
# Bag-of-words with default settings: learn the vocabulary from the corpus
# and encode each sentence as one row of raw term counts.
vectorizer = CountVectorizer()
countvec = vectorizer.fit_transform(sentences)

In [4]:
# Densify the sparse count matrix for display (one row per sentence, one
# column per vocabulary term). `.toarray()` is the documented method; the
# `.A` shorthand is not available on SciPy's newer sparse array types.
countvec.toarray()

array([[1, 1, 1, 0, 1, 0, 0, 0, 0, 0],
       [1, 2, 0, 1, 0, 1, 1, 1, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 1, 1, 1]], dtype=int64)

In [5]:
# Vocabulary terms in column order. `get_feature_names()` was removed in
# scikit-learn 1.2; `get_feature_names_out()` is its replacement and returns
# an ndarray, so convert to a plain list to keep the familiar list display.
vectorizer.get_feature_names_out().tolist()



['account',
 'card',
 'credit',
 'debit',
 'have',
 'is',
 'lost',
 'my',
 'stopped',
 'working']

In [6]:
# max_features=4 restricts the vocabulary to the four terms with the highest
# total count across the corpus; all other terms are dropped.
vectorizer = CountVectorizer(
    max_features=4,
)
countvec = vectorizer.fit_transform(sentences)

In [7]:
# Show the dense count matrix followed by the column -> term mapping.
# `.toarray()` replaces the `.A` shorthand, and `get_feature_names_out()`
# replaces `get_feature_names()`, which was removed in scikit-learn 1.2.
print(countvec.toarray())
print(vectorizer.get_feature_names_out().tolist())

[[1 1 1 0]
 [1 2 0 1]
 [0 1 1 1]]
['account', 'card', 'credit', 'my']




In [8]:
# Same four-term cap, but with scikit-learn's built-in English stop-word list
# applied, so common words such as "my" cannot take up a vocabulary slot.
vectorizer = CountVectorizer(
    max_features=4,
    stop_words='english',
)
countvec = vectorizer.fit_transform(sentences)

In [9]:
# Show the dense count matrix followed by the column -> term mapping.
# `.toarray()` replaces the `.A` shorthand, and `get_feature_names_out()`
# replaces `get_feature_names()`, which was removed in scikit-learn 1.2.
print(countvec.toarray())
print(vectorizer.get_feature_names_out().tolist())

[[1 1 1 0]
 [1 2 0 1]
 [0 1 1 0]]
['account', 'card', 'credit', 'debit']




In [12]:
# ngram_range=(1, 2) extracts both single words (unigrams) and adjacent word
# pairs (bigrams); max_features=8 then keeps only the eight most frequent.
vectorizer = CountVectorizer(
    max_features=8,
    ngram_range=(1, 2),
)
countvec = vectorizer.fit_transform(sentences)

In [13]:
# Show the dense count matrix followed by the column -> n-gram mapping.
# `.toarray()` replaces the `.A` shorthand, and `get_feature_names_out()`
# replaces `get_feature_names()`, which was removed in scikit-learn 1.2.
print(countvec.toarray())
print(vectorizer.get_feature_names_out().tolist())

[[1 1 1 1 0 0 0 0]
 [1 2 0 0 1 1 0 0]
 [0 1 1 1 1 0 1 1]]
['account', 'card', 'credit', 'credit card', 'my', 'my account', 'my credit', 'stopped']




### TfidfVectorizer

In [14]:
# use_idf=False turns off inverse-document-frequency weighting, leaving plain
# term frequencies. norm='l1' scales each row so its absolute values sum to 1
# (with norm='l2' the squares of each row's values would sum to 1 instead).
vectorizer = TfidfVectorizer(
    use_idf=False,
    norm='l1',
)
tfvec = vectorizer.fit_transform(sentences)

In [15]:
# Show the dense term-frequency matrix followed by the column -> term mapping.
# `.toarray()` replaces the `.A` shorthand, and `get_feature_names_out()`
# replaces `get_feature_names()`, which was removed in scikit-learn 1.2.
print(tfvec.toarray())
print(vectorizer.get_feature_names_out().tolist())

[[0.25       0.25       0.25       0.         0.25       0.
  0.         0.         0.         0.        ]
 [0.14285714 0.28571429 0.         0.14285714 0.         0.14285714
  0.14285714 0.14285714 0.         0.        ]
 [0.         0.2        0.2        0.         0.         0.
  0.         0.2        0.2        0.2       ]]
['account', 'card', 'credit', 'debit', 'have', 'is', 'lost', 'my', 'stopped', 'working']


In [16]:
# Term frequencies only (no idf weighting), with each row scaled to unit
# Euclidean length: the squares of a row's values sum to 1.
vectorizer = TfidfVectorizer(
    use_idf=False,
    norm='l2',
)
tfvec = vectorizer.fit_transform(sentences)

In [17]:
# Show the dense term-frequency matrix followed by the column -> term mapping.
# `.toarray()` replaces the `.A` shorthand, and `get_feature_names_out()`
# replaces `get_feature_names()`, which was removed in scikit-learn 1.2.
print(tfvec.toarray())
print(vectorizer.get_feature_names_out().tolist())

[[0.5        0.5        0.5        0.         0.5        0.
  0.         0.         0.         0.        ]
 [0.33333333 0.66666667 0.         0.33333333 0.         0.33333333
  0.33333333 0.33333333 0.         0.        ]
 [0.         0.4472136  0.4472136  0.         0.         0.
  0.         0.4472136  0.4472136  0.4472136 ]]
['account', 'card', 'credit', 'debit', 'have', 'is', 'lost', 'my', 'stopped', 'working']


In [19]:
# With idf weighting off and norm=None, no row scaling is applied: the result
# is the raw term counts, stored as floats.
vectorizer = TfidfVectorizer(
    use_idf=False,
    norm=None,
)
tfvec = vectorizer.fit_transform(sentences)

In [20]:
# Show the dense term-frequency matrix followed by the column -> term mapping.
# `.toarray()` replaces the `.A` shorthand, and `get_feature_names_out()`
# replaces `get_feature_names()`, which was removed in scikit-learn 1.2.
print(tfvec.toarray())
print(vectorizer.get_feature_names_out().tolist())

[[1. 1. 1. 0. 1. 0. 0. 0. 0. 0.]
 [1. 2. 0. 1. 0. 1. 1. 1. 0. 0.]
 [0. 1. 1. 0. 0. 0. 0. 1. 1. 1.]]
['account', 'card', 'credit', 'debit', 'have', 'is', 'lost', 'my', 'stopped', 'working']


In [21]:
# Full tf-idf with smoothing switched off. Smoothing (the default) adds one to
# every document frequency, as if an extra document contained every term, which
# prevents division by zero. With smooth_idf=False the weight is
# idf(t) = ln(n_docs / df(t)) + 1, so a term present in every document gets 1.0.
vectorizer_idf = TfidfVectorizer(
    smooth_idf=False,
)
tfidfvec = vectorizer_idf.fit_transform(sentences)

In [22]:
# Learned idf weight per vocabulary term, followed by the terms themselves in
# the same column order. `get_feature_names()` was removed in scikit-learn 1.2;
# `get_feature_names_out()` is its replacement (converted to a list for display).
print(vectorizer_idf.idf_)
print(vectorizer_idf.get_feature_names_out().tolist())

[1.40546511 1.         1.40546511 2.09861229 2.09861229 2.09861229
 2.09861229 1.40546511 2.09861229 2.09861229]
['account', 'card', 'credit', 'debit', 'have', 'is', 'lost', 'my', 'stopped', 'working']




In [23]:
# Densify the tf-idf matrix for display (rows are L2-normalised by default).
# `.toarray()` is the documented method; the `.A` shorthand is not available
# on SciPy's newer sparse array types.
tfidfvec.toarray()

array([[0.45951737, 0.3269504 , 0.45951737, 0.        , 0.68614212,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.3055129 , 0.43474989, 0.        , 0.45618573, 0.        ,
        0.45618573, 0.45618573, 0.3055129 , 0.        , 0.        ],
       [0.        , 0.26959162, 0.37890161, 0.        , 0.        ,
        0.        , 0.        , 0.37890161, 0.56576828, 0.56576828]])