In [1]:
import numpy as np
import pandas as pd
from string import punctuation 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Two short sample documents that make up the toy corpus used below.
doc1 = (
    "This is a very good and plain paper. "
    "This is really good and interesting"
)
doc2 = "This paper is very interesting. Awesome"

In [3]:
def clean_txt(sent):
    """Normalise a sentence into a space-separated string of content words.

    The text is lower-cased and tokenised with NLTK's ``word_tokenize``.
    Tokens are dropped when they are English stop words, single punctuation
    characters, or shorter than three characters.

    Parameters
    ----------
    sent : str
        Raw sentence or document text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty string if none).
    """
    tokens = word_tokenize(sent.lower())
    # A set gives constant-time membership checks instead of scanning a list
    # of stop words once per token.
    stop_updated = set(stopwords.words("english")) | set(punctuation)
    final_word = [
        term for term in tokens
        if len(term) > 2 and term not in stop_updated
    ]
    return " ".join(final_word)

In [4]:
# Clean the first sample document and display the result.
doc1_clean = clean_txt(doc1)
doc1_clean

'good plain paper really good interesting'

In [5]:
# Clean the second sample document and display the result.
doc2_clean = clean_txt(doc2)
doc2_clean

'paper interesting awesome'

In [6]:
# One row per cleaned document, held in a single "Text" column.
doc = pd.DataFrame({"Text": [doc1_clean, doc2_clean]})
doc

Unnamed: 0,Text
0,good plain paper really good interesting
1,paper interesting awesome


### countvectorizer
- Extracts features from text
- Lets us apply a transformation to the texts so that we arrive at a matrix of numbers

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Unigram bag-of-words vectorizer with all default settings.
count_vect = CountVectorizer()

In [9]:
# fit: learn the vocabulary (the distinct words of the corpus) that forms the bag of words.
count_vect.fit(doc["Text"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [10]:
# The learned vocabulary: each distinct word becomes one feature if an ML model is built on this corpus.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer releases use get_feature_names_out() instead.
count_vect.get_feature_names()

['awesome', 'good', 'interesting', 'paper', 'plain', 'really']

In [11]:
# Mapping of each word to its column index in the bag of words; indices follow alphabetical order of the terms.
print(count_vect.vocabulary_)

{'good': 1, 'plain': 4, 'paper': 3, 'really': 5, 'interesting': 2, 'awesome': 0}


In [12]:
# transform: encode each document as word counts over the learned vocabulary.
x = count_vect.transform(doc["Text"])
x

<2x6 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [13]:
# The sparse document-term matrix can be viewed densely with toarray() or todense().
x.toarray()

array([[0, 2, 1, 1, 1, 1],
       [1, 0, 1, 1, 0, 0]], dtype=int64)

In [14]:
# Same counts as toarray(), returned as a numpy matrix instead of an ndarray.
x.todense()

matrix([[0, 2, 1, 1, 1, 1],
        [1, 0, 1, 1, 0, 0]], dtype=int64)

In [15]:
# DTM = document-term matrix: one row per document, one column per vocabulary word.
DTM = pd.DataFrame(
    x.toarray(),
    columns=count_vect.get_feature_names(),
)
DTM

Unnamed: 0,awesome,good,interesting,paper,plain,really
0,0,2,1,1,1,1
1,1,0,1,1,0,0


In [16]:
# TDM = term-document matrix: the transpose of the DTM (words as rows, documents as columns).
TDM = DTM.transpose()
TDM

Unnamed: 0,0,1
awesome,0,1
good,2,0
interesting,1,1
paper,1,1
plain,1,0
really,1,0


## ngram analysis
- unigram: a token consisting of exactly one word
- bigram: a token consisting of two words
- trigram: a token consisting of three words

### How is it done: the immediately following words are grouped with the current word to form a bigram or trigram
text="how are you doing"
- unigram:{"how","are","you","doing"}
- bigram:{"how are","are you","you doing"}
- trigram:{"how are you","are you doing"}

### Any combination of unigrams,bigrams,trigrams can be obtained using a CountVectorizer
- By default the vectorizers in sklearn extract only unigrams, i.e. ngram_range is set to (1,1) by default, which means that features are exactly one word long
- to extract bigrams: ngram_range=(2,2)
- to extract trigrams: ngram_range=(3,3)
- to extract unigrams & bigrams: ngram_range=(1,2)
- to extract unigrams, bigrams & trigrams: ngram_range=(1,3)

In [17]:
# Vectorizer restricted to bigrams only.
count_vect_bg = CountVectorizer(ngram_range=(2, 2))

In [18]:
# fit_transform does both steps at once: fit learns the distinct bigrams of the
# corpus, transform builds the sparse count matrix over them.
x_bg = count_vect_bg.fit_transform(doc["Text"])

In [19]:
# List the bigram features learned from the corpus.
print(count_vect_bg.get_feature_names())

['good interesting', 'good plain', 'interesting awesome', 'paper interesting', 'paper really', 'plain paper', 'really good']


In [20]:
# Document-term matrix over the bigram features.
DTM_bg = pd.DataFrame(
    x_bg.toarray(),
    columns=count_vect_bg.get_feature_names(),
)

In [21]:
# Display the bigram document-term matrix.
DTM_bg

Unnamed: 0,good interesting,good plain,interesting awesome,paper interesting,paper really,plain paper,really good
0,1,1,0,0,1,1,1
1,0,0,1,1,0,0,0


In [22]:
# Vectorizer that extracts both unigrams and bigrams.
count_vect_ubg = CountVectorizer(ngram_range=(1, 2))

In [23]:
# Learn the unigram + bigram vocabulary and count its occurrences per document.
x_ubg = count_vect_ubg.fit_transform(doc["Text"])

In [24]:
# List the combined unigram and bigram features.
print(count_vect_ubg.get_feature_names())

['awesome', 'good', 'good interesting', 'good plain', 'interesting', 'interesting awesome', 'paper', 'paper interesting', 'paper really', 'plain', 'plain paper', 'really', 'really good']


In [25]:
# Document-term matrix over the unigram + bigram features.
DTM_ubg = pd.DataFrame(
    x_ubg.toarray(),
    columns=count_vect_ubg.get_feature_names(),
)
DTM_ubg

Unnamed: 0,awesome,good,good interesting,good plain,interesting,interesting awesome,paper,paper interesting,paper really,plain,plain paper,really,really good
0,0,2,1,1,1,0,1,0,1,1,1,1,1
1,1,0,0,0,1,1,1,1,0,0,0,0,0


In [26]:
# Keep only the 3 features with the highest term frequency across the corpus.
count_nfeatures = CountVectorizer(max_features=3)

In [27]:
# Learn the capped vocabulary and count its terms per document.
x_nfeatures = count_nfeatures.fit_transform(doc["Text"])

In [28]:
# The 3 features retained by max_features=3.
count_nfeatures.get_feature_names()

['good', 'interesting', 'paper']

In [29]:
# Document-term matrix restricted to the 3 retained features.
DTM_nfeatures = pd.DataFrame(
    x_nfeatures.toarray(),
    columns=count_nfeatures.get_feature_names(),
)
DTM_nfeatures

Unnamed: 0,good,interesting,paper
0,2,1,1
1,0,1,1


In [30]:
# Column-wise sum: the total frequency of each retained feature over all documents.
DTM_nfeatures.sum()

good           2
interesting    2
paper          2
dtype: int64

## TFIDF

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
# TF-IDF vectorizer with default settings, which include IDF smoothing and L2 row normalisation.
tfidf_vect = TfidfVectorizer()

In [33]:
# - transform phase - apply the transformation to the BOW extracted from our corpus to obtain a matrix of numbers
# - Here the transformation that is applied is TF*IDF

In [34]:
# Learn the vocabulary and IDF weights, then compute the TF-IDF matrix.
x = tfidf_vect.fit_transform(doc["Text"])

In [35]:
# Dense, labelled view of the TF-IDF matrix (documents x terms).
tfidf = pd.DataFrame(
    x.toarray(),
    columns=tfidf_vect.get_feature_names(),
)

In [36]:
# Display the TF-IDF matrix produced with the default settings.
tfidf

Unnamed: 0,awesome,good,interesting,paper,plain,really
0,0.0,0.755256,0.268685,0.268685,0.377628,0.377628
1,0.704909,0.0,0.501549,0.501549,0.0,0.0


In [37]:
# TF-IDF vectorizer with neither IDF smoothing nor row normalisation, i.e. raw tf * idf.
# `norm` accepts 'l1', 'l2' or None; False is not a documented value and is
# rejected by recent scikit-learn releases, so None is used to disable it.
tfidf_vect = TfidfVectorizer(smooth_idf=False, norm=None)

In [38]:
# Recompute the TF-IDF matrix with the unsmoothed, unnormalised vectorizer.
x = tfidf_vect.fit_transform(doc["Text"])

In [39]:
# Dense, labelled view of the current TF-IDF matrix.
tfidf = pd.DataFrame(
    x.toarray(),
    columns=tfidf_vect.get_feature_names(),
)
tfidf

Unnamed: 0,awesome,good,interesting,paper,plain,really
0,0.0,3.386294,1.0,1.0,1.693147,1.693147
1,1.693147,0.0,1.0,1.0,0.0,0.0


In [40]:
# TF-IDF vectorizer with IDF smoothing enabled and no row normalisation.
# Smoothing adds 1 to both the numerator and the denominator of the IDF ratio:
#     idf = ln((1 + n_docs) / (1 + doc_freq)) + 1
# as if one extra document containing every term had been seen. This keeps the
# IDF finite for a term whose document frequency is zero (e.g. a feature that
# never occurs in the training corpus), where the unsmoothed ratio would divide
# by zero and give infinity.
# `norm` accepts 'l1', 'l2' or None; None (not False) disables normalisation.
tfidf_vect = TfidfVectorizer(smooth_idf=True, norm=None)

In [41]:
# Recompute the TF-IDF matrix with the smoothed, unnormalised vectorizer.
x = tfidf_vect.fit_transform(doc["Text"])

In [42]:
# Dense, labelled view of the current TF-IDF matrix.
tfidf = pd.DataFrame(
    x.toarray(),
    columns=tfidf_vect.get_feature_names(),
)
tfidf

Unnamed: 0,awesome,good,interesting,paper,plain,really
0,0.0,2.81093,1.0,1.0,1.405465,1.405465
1,1.405465,0.0,1.0,1.0,0.0,0.0


In [43]:
# When the TF-IDF matrix is created with default values, it uses smoothed IDF and L2-normalised rows
# Need for smoothing: to avoid infinite IDF values (division by zero) for features with zero document frequency

## Cosine similarity

In [44]:
from sklearn.metrics.pairwise import cosine_similarity

In [45]:
# Pairwise cosine similarity between the documents (the rows of the DTM).
cs = cosine_similarity(DTM)
cs

array([[1.        , 0.40824829],
       [0.40824829, 1.        ]])

In [46]:
# Transposing the DTM makes each word a row, so this gives word-to-word cosine similarity.
cs = cosine_similarity(DTM.transpose())
cs

array([[1.        , 0.        , 0.70710678, 0.70710678, 0.        ,
        0.        ],
       [0.        , 1.        , 0.70710678, 0.70710678, 1.        ,
        1.        ],
       [0.70710678, 0.70710678, 1.        , 1.        , 0.70710678,
        0.70710678],
       [0.70710678, 0.70710678, 1.        , 1.        , 0.70710678,
        0.70710678],
       [0.        , 1.        , 0.70710678, 0.70710678, 1.        ,
        1.        ],
       [0.        , 1.        , 0.70710678, 0.70710678, 1.        ,
        1.        ]])

In [47]:
# Label both axes of the word-similarity matrix with the vocabulary terms.
similarity_df = pd.DataFrame(
    cs,
    index=DTM.columns,
    columns=DTM.columns,
)
similarity_df

Unnamed: 0,awesome,good,interesting,paper,plain,really
awesome,1.0,0.0,0.707107,0.707107,0.0,0.0
good,0.0,1.0,0.707107,0.707107,1.0,1.0
interesting,0.707107,0.707107,1.0,1.0,0.707107,0.707107
paper,0.707107,0.707107,1.0,1.0,0.707107,0.707107
plain,0.0,1.0,0.707107,0.707107,1.0,1.0
really,0.0,1.0,0.707107,0.707107,1.0,1.0
