In [340]:
import numpy as np
import pandas as pd
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [344]:
# Sample review text: the sentences run together with no space after "." or "?".
sent = "phone is good.battery is long?camera is bright.charger is not free"

In [345]:
# Tokenise the raw sentence. word_tokenize is imported again here although the
# first cell already imports it.
# With no space after the period, tokens such as 'good.battery' are not split.
from nltk.tokenize import word_tokenize
print(word_tokenize(sent))

['phone', 'is', 'good.battery', 'is', 'long', '?', 'camera', 'is', 'bright.charger', 'is', 'not', 'free']


In [346]:
import re

# Insert a space after a period only when the next character is not already
# whitespace (and the period is not the last character of the string).
# "sent" is reassigned from itself, so the lookahead keeps this cell
# idempotent: re-running it does not stack extra spaces, and periods that are
# already followed by a space are left untouched.
# NOTE: a period inside a number (e.g. "3.5") is still split by this pattern.
sent = re.sub(r"\.(?=\S)", ". ", sent)
sent

'phone is good. battery is long?camera is bright. charger is not free'

In [347]:
# Tokenise again now that a space follows each period.
print(word_tokenize(sent))

['phone', 'is', 'good', '.', 'battery', 'is', 'long', '?', 'camera', 'is', 'bright', '.', 'charger', 'is', 'not', 'free']


In [293]:
# Three short toy documents that make up the demo corpus.
doc1, doc2, doc3 = (
    "Text Analytics is boring boring boring!!",
    "Analytics is interesting",
    "We want interesting sports analytics",
)

In [294]:
def clean_txt(sent):
    """Normalise a sentence into a space-joined string of content words.

    The text is lower-cased and tokenised with ``word_tokenize``. A token is
    kept only if it is longer than two characters, is not an English stop
    word, and is not made up solely of punctuation characters.

    Parameters
    ----------
    sent : str
        Raw input text.

    Returns
    -------
    str
        The surviving tokens, in their original order, joined by single
        spaces (an empty string when no token survives).
    """
    tokens = word_tokenize(sent.lower())
    # Sets give constant-time membership tests; a list would be scanned in
    # full for every token.
    stop_updated = set(stopwords.words("english"))
    punct_chars = set(punctuation)
    final_word = [
        term
        for term in tokens
        if len(term) > 2
        and term not in stop_updated
        # Comparing against single punctuation characters can never match a
        # token longer than two characters, so multi-character punctuation
        # tokens such as "..." are rejected by checking every character.
        and not punct_chars.issuperset(term)
    ]
    return " ".join(final_word)

In [295]:
# Clean the first toy document and display the result.
doc1_clean = clean_txt(doc1)
doc1_clean

'text analytics boring boring boring'

In [296]:
# Clean the second toy document and display the result.
doc2_clean = clean_txt(doc2)
doc2_clean

'analytics interesting'

In [297]:
# Clean the third toy document and display the result.
doc3_clean = clean_txt(doc3)
doc3_clean

'want interesting sports analytics'

In [298]:
# Collect the three cleaned documents into a single-column corpus frame.
doc = pd.DataFrame({"text": [doc1_clean, doc2_clean, doc3_clean]})
doc

Unnamed: 0,text
0,text analytics boring boring boring
1,analytics interesting
2,want interesting sports analytics


In [299]:
# CountVectorizer from scikit-learn builds the document-term matrix (DTM)
from sklearn.feature_extraction.text import CountVectorizer

In [300]:
# Instantiate a CountVectorizer restricted to single-word tokens (unigrams).
count_vect = CountVectorizer(ngram_range =(1,1))

In [301]:
# Invoke fit_transform on the CountVectorizer object: it learns the vocabulary
# from the corpus and returns the per-document word counts as a sparse matrix.
X = count_vect.fit_transform(doc['text'])
X

<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [302]:
# Show the sparse document-term counts as a dense NumPy array.
X.toarray()

array([[1, 3, 0, 0, 1, 0],
       [1, 0, 1, 0, 0, 0],
       [1, 0, 1, 1, 0, 1]], dtype=int64)

In [303]:
# Show the same counts as a dense NumPy matrix object.
X.todense()

matrix([[1, 3, 0, 0, 1, 0],
        [1, 0, 1, 0, 0, 0],
        [1, 0, 1, 1, 0, 1]], dtype=int64)

- fit -> extracts the distinct words to form a bag of words (BOW)
- transform -> computes the occurrence count of every word from the BOW in each document

In [304]:
# Obtain the bag of words (the fitted vocabulary, in column order).
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; newer versions expose get_feature_names_out() instead.
# Confirm which scikit-learn version this notebook targets.
bow = count_vect.get_feature_names()
bow

['analytics', 'boring', 'interesting', 'sports', 'text', 'want']

In [305]:
# Show the vocabulary as a dictionary mapping every word to its column
# position in the BOW.
print(count_vect.vocabulary_)

{'text': 4, 'analytics': 0, 'boring': 1, 'interesting': 2, 'want': 5, 'sports': 3}


In [306]:
# Look up the column position of one particular word in the BOW.
count_vect.vocabulary_['boring']

1

In [307]:
# The document-term matrix can be displayed in dense form with either the
# toarray() or the todense() method.
X.toarray()

array([[1, 3, 0, 0, 1, 0],
       [1, 0, 1, 0, 0, 0],
       [1, 0, 1, 1, 0, 1]], dtype=int64)

In [308]:
# todense() variant of the same display.
X.todense()

matrix([[1, 3, 0, 0, 1, 0],
        [1, 0, 1, 0, 0, 0],
        [1, 0, 1, 1, 0, 1]], dtype=int64)

In [309]:
# Wrap the dense counts in a DataFrame whose columns are labelled with the
# vocabulary terms, giving a readable document-term matrix (DTM).
DTM = pd.DataFrame(X.toarray(),columns = count_vect.get_feature_names())
DTM

Unnamed: 0,analytics,boring,interesting,sports,text,want
0,1,3,0,0,1,0
1,1,0,1,0,0,0
2,1,0,1,1,0,1


In [310]:
# Build the same DTM again, this time labelling the columns with the feature
# names already stored in `bow`.
DTM = pd.DataFrame(X.toarray(),columns = bow)
DTM

Unnamed: 0,analytics,boring,interesting,sports,text,want
0,1,3,0,0,1,0
1,1,0,1,0,0,0
2,1,0,1,1,0,1


In [311]:
# The term-document matrix (TDM) is the transpose of the DTM;
# it is used for finding similarity between words.
TDM = DTM.T
TDM

Unnamed: 0,0,1,2
analytics,1,1,1
boring,3,0,0
interesting,0,1,1
sports,0,0,1
text,1,0,0
want,0,0,1


In [312]:
# Two new documents that are passed to the already-fitted vectorizer.
ndoc = [
    "awesome paper is not always interesting",
    "is paper really news",
]

In [313]:
# Vectorise the new documents with the vocabulary fitted earlier (transform
# only, no refit).
count_vect.transform(ndoc)

<2x6 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [314]:
# Labelled DTM for the new documents. The recorded output shows that words
# outside the fitted vocabulary (e.g. "paper") contribute no counts.
DTMnd = pd.DataFrame(count_vect.transform(ndoc).toarray(),columns = count_vect.get_feature_names())
DTMnd

Unnamed: 0,analytics,boring,interesting,sports,text,want
0,0,0,1,0,0,0
1,0,0,0,0,0,0


## ngram analysis
- unigram -> a token consists of exactly one word
- bigram -> a token consists of exactly two words
- trigram -> a token consists of exactly three words

### The next immediate word(s) are grouped with the current word to form a bigram or a tri-gram
- For example: suppose we have a string " how are you doing"
- unigram :{"how", "are", "you", "doing"}
- bigram: {"how are", "are you", "you doing"}
- trigram:{"how are you", "are you doing"}

### Any combination of unigrams, bigrams and trigrams can be obtained using a CountVectorizer

- By default, the vectorizers in sklearn operate on unigrams, i.e. ngram_range is set to (1,1)
- To extract bigrams, set ngram_range = (2,2)
- To extract trigrams, set ngram_range = (3,3)
- To extract both unigrams and bigrams, set ngram_range = (1,2)

In [315]:
# A token is a bigram when both its minimum and maximum length are 2 words.
count_vect_bg = CountVectorizer(ngram_range=(2,2))

In [316]:
# Fit the bigram vectorizer on the cleaned corpus and build its count matrix.
X_bg = count_vect_bg.fit_transform(doc['text'])

In [317]:
# Display the cleaned corpus for reference.
doc['text']

0    text analytics boring boring boring
1                  analytics interesting
2      want interesting sports analytics
Name: text, dtype: object

In [318]:
# List the bigrams learned from the corpus.
print(count_vect_bg.get_feature_names())

['analytics boring', 'analytics interesting', 'boring boring', 'interesting sports', 'sports analytics', 'text analytics', 'want interesting']


In [319]:
# Bigram document-term matrix with one labelled column per bigram.
DTM_bg = pd.DataFrame(X_bg.toarray(),columns=count_vect_bg.get_feature_names())
DTM_bg

Unnamed: 0,analytics boring,analytics interesting,boring boring,interesting sports,sports analytics,text analytics,want interesting
0,1,0,2,0,0,1,0
1,0,1,0,0,0,0,0
2,0,0,0,1,1,0,1


In [320]:
# Column sums: total occurrences of each bigram across all documents.
DTM_bg.sum()

analytics boring         1
analytics interesting    1
boring boring            2
interesting sports       1
sports analytics         1
text analytics           1
want interesting         1
dtype: int64

In [321]:
# Unigram vectorizer with the vocabulary capped at 6 features.
# NOTE(review): the "_ubg" suffix suggests unigrams + bigrams, which would be
# ngram_range=(1,2); as written only unigrams are extracted — confirm intent.
count_vect_ubg = CountVectorizer(ngram_range=(1,1),max_features = 6)

In [322]:
# Fit the capped vectorizer on the cleaned corpus and build its count matrix.
X_ubg = count_vect_ubg.fit_transform(doc['text'])

In [323]:
# List the features retained by the capped vectorizer.
print(count_vect_ubg.get_feature_names())

['analytics', 'boring', 'interesting', 'sports', 'text', 'want']


In [324]:
# Labelled document-term matrix for the capped vectorizer.
DTM_ubg = pd.DataFrame(X_ubg.toarray(),
                      columns=count_vect_ubg.get_feature_names())
DTM_ubg

Unnamed: 0,analytics,boring,interesting,sports,text,want
0,1,3,0,0,1,0
1,1,0,1,0,0,0
2,1,0,1,1,0,1


 ### Setting max features
 
 1. The vectorizers take another argument called max_features, with which you can specify the top n features to be selected
 2. If you perform a column sum on your DTM, you get the total number of occurrences of each word across all the documents

In [325]:
# Keep at most 6 features in the DTM, chosen by their total frequency across
# all the documents in the corpus (the column sum of each word in the DTM).
# NOTE(review): this comment previously said "top 3" while the code passes
# max_features=6 — confirm which value was intended.
count_nfeatures = CountVectorizer(max_features=6,ngram_range=(1,1))

In [326]:
# Fit the feature-capped vectorizer on the corpus, then display the corpus.
X_nfeatures = count_nfeatures.fit_transform(doc['text'])
doc['text']

0    text analytics boring boring boring
1                  analytics interesting
2      want interesting sports analytics
Name: text, dtype: object

In [327]:
# Features retained after applying max_features.
count_nfeatures.get_feature_names()

['analytics', 'boring', 'interesting', 'sports', 'text', 'want']

In [328]:
# Labelled document-term matrix restricted to the retained features.
DTM_nfeatures = pd.DataFrame(X_nfeatures.toarray(),
                            columns=count_nfeatures.get_feature_names())
DTM_nfeatures

Unnamed: 0,analytics,boring,interesting,sports,text,want
0,1,3,0,0,1,0
1,1,0,1,0,0,0
2,1,0,1,1,0,1


In [329]:
# Exercise: build the DTM for all the trigrams using CountVectorizer(ngram_range=(3,3))

### TFIDF

In [330]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [331]:
# TF-IDF vectorizer over single-word tokens (unigrams).
tfidf_vect = TfidfVectorizer(ngram_range=(1,1))

In [332]:
# Fit TF-IDF on the corpus and get the weighted document-term matrix.
# NOTE(review): this rebinds X, which earlier held the CountVectorizer counts.
X = tfidf_vect.fit_transform(doc['text'])
X

<3x6 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [333]:
# Display the cleaned corpus for reference.
doc['text']

0    text analytics boring boring boring
1                  analytics interesting
2      want interesting sports analytics
Name: text, dtype: object

In [334]:
# Labelled TF-IDF matrix: one row per document, one column per term.
tfidf = pd.DataFrame(X.toarray(), columns=tfidf_vect.get_feature_names())
tfidf

Unnamed: 0,analytics,boring,interesting,sports,text,want
0,0.183595,0.932558,0.0,0.0,0.310853,0.0
1,0.613356,0.0,0.789807,0.0,0.0,0.0
2,0.345205,0.0,0.444514,0.584483,0.0,0.584483


In [335]:
# Square every TF-IDF weight element-wise.
tfidf_square = tfidf ** 2
tfidf_square

Unnamed: 0,analytics,boring,interesting,sports,text,want
0,0.033707,0.869664,0.0,0.0,0.096629,0.0
1,0.376205,0.0,0.623795,0.0,0.0,0.0
2,0.119167,0.0,0.197593,0.34162,0.0,0.34162


In [285]:
# Display the corpus again.
# NOTE(review): this cell's execution count (285) is out of sequence with its
# neighbours and its recorded output differs from the corpus shown elsewhere.
# It is likely stale output from an earlier run; re-run top to bottom to confirm.
doc['text']

0    text analytics boring boring boring
1                  analytics interesting
2                  want sports analytics
Name: text, dtype: object

In [286]:
# Sum the squared TF-IDF weights per document (row); the recorded output is
# 1.0 for every row.
# NOTE(review): the execution count (286) is out of sequence with the cell that
# defines tfidf_square (335) — re-run the notebook to refresh this output.
tfidf_square.sum(axis=1)

0    1.0
1    1.0
2    1.0
dtype: float64

### Cosine Similarity

In [336]:
from sklearn.metrics.pairwise import cosine_similarity

In [337]:
# Pairwise cosine similarity between documents (the rows of the count DTM).
cs = cosine_similarity(DTM)
print(cs)

[[1.         0.21320072 0.15075567]
 [0.21320072 1.         0.70710678]
 [0.15075567 0.70710678 1.        ]]


In [338]:
# Transposing the DTM puts terms on the rows, so this is term-to-term similarity.
cs_words = cosine_similarity(DTM.T)

In [339]:
# Label the term-similarity matrix with the terms on both rows and columns.
sim_mat = pd.DataFrame(cs_words,columns=DTM.columns,index=DTM.columns)
sim_mat

Unnamed: 0,analytics,boring,interesting,sports,text,want
analytics,1.0,0.57735,0.816497,0.57735,0.57735,0.57735
boring,0.57735,1.0,0.0,0.0,1.0,0.0
interesting,0.816497,0.0,1.0,0.707107,0.0,0.707107
sports,0.57735,0.0,0.707107,1.0,0.0,1.0
text,0.57735,1.0,0.0,0.0,1.0,0.0
want,0.57735,0.0,0.707107,1.0,0.0,1.0
