In [54]:
import numpy as np
import pandas as pd
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [55]:
# Two toy documents that make up the corpus for the examples below.
doc1 = (
    "This is a very good and plain paper. this is really "
    "good and interesting"
)
doc2 = "This paper is very interesting, awesome"

In [56]:
def clean_txt(sent):
    """Reduce a sentence to a space-separated string of its content words.

    The text is lower-cased and tokenised with ``word_tokenize``. Tokens are
    dropped when they are English stop words, punctuation characters, or
    shorter than three characters.

    Parameters
    ----------
    sent : str
        Raw sentence or document text.

    Returns
    -------
    str
        The remaining tokens, in their original order, joined by single
        spaces.
    """
    tokens = word_tokenize(sent.lower())
    # A set gives constant-time membership tests for the per-token filter.
    stop_updated = set(stopwords.words("english")) | set(punctuation)
    final_word = [term for term in tokens
                  if len(term) > 2 and term not in stop_updated]
    return " ".join(final_word)

In [57]:
doc1_clean = clean_txt(doc1)
doc1_clean

'good plain paper really good interesting'

In [58]:
doc2_clean = clean_txt(doc2)
doc2_clean

'paper interesting awesome'

In [59]:
doc = pd.DataFrame([doc1_clean, doc2_clean], columns=["text"])
doc

Unnamed: 0,text
0,good plain paper really good interesting
1,paper interesting awesome


## Extract the Features from text

### Vectorizer on Text Data:
    - extracts features from text 
    - allows us to apply transformations to the text so that we arrive at a matrix of numbers

### Count Vectorizer:
    - extracts features from text 
    - Applies the transformation to the extracted features - computes the TF of every term in each document
 TF = total number of occurrences of a word (term) in a document

In [60]:
from sklearn.feature_extraction.text import CountVectorizer

In [61]:
count_vect = CountVectorizer()

- fit -> extract the distinct words from the text corpus to form a bag of words

In [62]:
count_vect.fit(doc.text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [63]:
# Obtain the bag of words: the distinct terms learned by fit().
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() - confirm against the installed version.
count_vect.get_feature_names()

['awesome', 'good', 'interesting', 'paper', 'plain', 'really']

In [64]:
# A bag of words is an unordered collection: it keeps no word order from the
# documents. vocabulary_ maps each term to its feature index; here the indices
# follow the alphabetical order of the terms.
print(count_vect.vocabulary_)

{'good': 1, 'plain': 4, 'paper': 3, 'really': 5, 'interesting': 2, 'awesome': 0}


 - transform : Apply the transformation on the BOW extracted from our corpus to obtain a matrix of numbers
 - Different types of transformations that can be applied:
  1. DTM using CountVectorizer
  2. TFIDF matrix using TFIDF vectorizer

In [65]:
X = count_vect.transform(doc.text)
X

<2x6 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [66]:
#Printing the document term matrix can be done using toarray or todense()
#methods
X.toarray()

array([[0, 2, 1, 1, 1, 1],
       [1, 0, 1, 1, 0, 0]], dtype=int64)

 - Every row in the above matrix represents a document/review
 - Every column in the above matrix represents a word across all the documents
 - The values are term frequencies - the frequency of that term in that document -> this is the outcome of the transform step of CountVectorizer

In [67]:
X.todense()

matrix([[0, 2, 1, 1, 1, 1],
        [1, 0, 1, 1, 0, 0]], dtype=int64)

In [105]:
# Build the document-term matrix directly from the fitted count vectorizer.
# `X` is rebound to TF-IDF matrices further down, so reading it here would
# make the result depend on the order in which cells were executed.
DTM = pd.DataFrame(count_vect.transform(doc.text).toarray(),
                   columns=count_vect.get_feature_names())
DTM

Unnamed: 0,awesome,good,interesting,paper,plain,really
0,0.0,2.0,1.0,1.0,1.0,1.0
1,1.0,0.0,1.0,1.0,0.0,0.0


In [69]:
TDM = DTM.T
TDM

Unnamed: 0,0,1
awesome,0,1
good,2,0
interesting,1,1
paper,1,1
plain,1,0
really,1,0


## ngram analysis
- unigram -> a token consisting of exactly one word
- bigram -> a token consisting of exactly two words
- trigram -> a token consisting of exactly three words

### How this is done:  The next immediate word(s) are grouped with the current word to form a bigram or a tri-gram
- For example: " how are you doing"
- unigram :{"how", "are", "you", "doing"}
- bigram: {"how are", "are you", "you doing"}
- trigram:{"how are you", "are you doing"}

### Any combination of unigrams, bigrams and trigrams can be obtained using a Countvectorizer
 - By default the vectorizers in sklearn extract only unigrams, i.e. ngram_range is set to (1,1) by default, which means every feature is exactly one word long
 - to extract bigrams ngram_range is set to (2,2)
 - to extract trigrams ngram_range is set to (3,3)
 - to extract both unigrams and bigrams, ngram_range is set to (1,2)
 - to extract unigrams, bigrams and trigrams together, ngram_range is set to (1,3)

In [70]:
# ngram_range=(2, 2) sets both the minimum and the maximum n-gram length to 2,
# so only bigrams are extracted.
count_vect_bg = CountVectorizer(ngram_range=(2,2))

In [71]:
X_bg = count_vect_bg.fit_transform(doc['text'])
X_bg

<2x7 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [72]:
print(count_vect_bg.get_feature_names())

['good interesting', 'good plain', 'interesting awesome', 'paper interesting', 'paper really', 'plain paper', 'really good']


In [73]:
DTM_bg = pd.DataFrame(X_bg.toarray(),
                      columns=count_vect_bg.get_feature_names())
DTM_bg

Unnamed: 0,good interesting,good plain,interesting awesome,paper interesting,paper really,plain paper,really good
0,1,1,0,0,1,1,1
1,0,0,1,1,0,0,0


In [74]:
count_vect_ubg = CountVectorizer(ngram_range=(1,2))

In [75]:
X_ubg = count_vect_ubg.fit_transform(doc['text'])

In [76]:
print(count_vect_ubg.get_feature_names())

['awesome', 'good', 'good interesting', 'good plain', 'interesting', 'interesting awesome', 'paper', 'paper interesting', 'paper really', 'plain', 'plain paper', 'really', 'really good']


In [77]:
DTM_ubg = pd.DataFrame(X_ubg.toarray(),
                      columns=count_vect_ubg.get_feature_names())
DTM_ubg

Unnamed: 0,awesome,good,good interesting,good plain,interesting,interesting awesome,paper,paper interesting,paper really,plain,plain paper,really,really good
0,0,2,1,1,1,0,1,0,1,1,1,1,1
1,1,0,0,0,1,1,1,1,0,0,0,0,0


In [78]:
# Get the DTM for all the trigrams - CountVectorizer(ngram_range=(3,3))

#Get only top n features in our DTM
Setting max features - the vectorizers take another argument called max_features, in which you can specify the top n features to be selected

In [79]:
# Keep only the top 3 features of the DTM, ranked by their frequency of
# occurrence across all the documents.
count_nfeatures = CountVectorizer(max_features=3)

In [80]:
X_nfeatures = count_nfeatures.fit_transform(doc['text'])

In [81]:
count_nfeatures.get_feature_names()

['good', 'interesting', 'paper']

In [82]:
DTM_nfeatures = pd.DataFrame(X_nfeatures.toarray(),
                            columns=count_nfeatures.get_feature_names())
DTM_nfeatures

Unnamed: 0,good,interesting,paper
0,2,1,1
1,0,1,1


In [83]:
# Summing each column of the DTM gives the total count of that term across
# all the documents.
DTM_nfeatures.sum()

good           2
interesting    2
paper          2
dtype: int64

### Vectorizing using TfIDF vectorizer

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Create a TF-IDF vectorizer with default weighting options, capped at 3500
# features via max_features.
tfidf_vect = TfidfVectorizer(max_features=3500)

 - transform Phase : Apply the transformation on the BOW extracted from our corpus to obtain a matrix of numbers
 - Here the transformation that will be applied is TF*IDF

In [86]:
X = tfidf_vect.fit_transform(doc['text'])

In [87]:
tfidf = pd.DataFrame(X.toarray(), columns=tfidf_vect.get_feature_names())
tfidf

Unnamed: 0,awesome,good,interesting,paper,plain,really
0,0.0,0.755256,0.268685,0.268685,0.377628,0.377628
1,0.704909,0.0,0.501549,0.501549,0.0,0.0


In [97]:
?TfidfVectorizer

In [89]:
# Create a TF-IDF vectorizer with no IDF smoothing and no normalisation.
# `norm` accepts 'l1', 'l2' or None; None is the documented way to switch
# normalisation off (False only worked through truthiness).
tfidf_vect = TfidfVectorizer(smooth_idf=False, norm=None)

In [90]:
X = tfidf_vect.fit_transform(doc['text'])

In [91]:
tfidf = pd.DataFrame(X.toarray(), columns=tfidf_vect.get_feature_names())
tfidf

Unnamed: 0,awesome,good,interesting,paper,plain,really
0,0.0,3.386294,1.0,1.0,1.693147,1.693147
1,1.693147,0.0,1.0,1.0,0.0,0.0


In [92]:
# Create a TF-IDF vectorizer with IDF smoothing and no normalisation.
# Smoothing adds 1 to both the numerator and the denominator of the IDF ratio.
# `norm` accepts 'l1', 'l2' or None; None is the documented way to switch
# normalisation off (False only worked through truthiness).
tfidf_vect = TfidfVectorizer(smooth_idf=True, norm=None)

In [93]:
X = tfidf_vect.fit_transform(doc['text'])

In [94]:
tfidf = pd.DataFrame(X.toarray(), columns=tfidf_vect.get_feature_names())
tfidf

Unnamed: 0,awesome,good,interesting,paper,plain,really
0,0.0,2.81093,1.0,1.0,1.405465,1.405465
1,1.405465,0.0,1.0,1.0,0.0,0.0


In [95]:
# When the TF-IDF matrix is created with default values, it uses smoothed IDF and L2-normalised rows
# Need for smoothing: to avoid infinite values in the IDF for new features
# L2-normalised vectors of TF-IDF values perform better when used in ML algorithms

In [None]:
# If use_idf is set to False with no normalization, the TF-IDF matrix will be the same as the DTM

In [102]:
# With IDF weighting and normalisation both switched off, the vectorizer
# returns plain term counts.
# `norm` accepts 'l1', 'l2' or None; None is the documented way to switch
# normalisation off (False only worked through truthiness).
tfidf_vect = TfidfVectorizer(norm=None, use_idf=False)
X = tfidf_vect.fit_transform(doc['text'])
tfidf = pd.DataFrame(X.toarray(), columns=tfidf_vect.get_feature_names())
tfidf

Unnamed: 0,awesome,good,interesting,paper,plain,really
0,0.0,2.0,1.0,1.0,1.0,1.0
1,1.0,0.0,1.0,1.0,0.0,0.0


### Cosine Similarity

In [104]:
from sklearn.metrics.pairwise import cosine_similarity

In [106]:
#Finding similarity between documents in the corpus
cs = cosine_similarity(DTM)
cs

array([[1.        , 0.40824829],
       [0.40824829, 1.        ]])

In [107]:
# Find the similarity between words across the corpus: transposing the DTM
# makes each row a term, so the rows being compared are terms, not documents.
cs_words = cosine_similarity(DTM.T)
cs_words

array([[1.        , 0.        , 0.70710678, 0.70710678, 0.        ,
        0.        ],
       [0.        , 1.        , 0.70710678, 0.70710678, 1.        ,
        1.        ],
       [0.70710678, 0.70710678, 1.        , 1.        , 0.70710678,
        0.70710678],
       [0.70710678, 0.70710678, 1.        , 1.        , 0.70710678,
        0.70710678],
       [0.        , 1.        , 0.70710678, 0.70710678, 1.        ,
        1.        ],
       [0.        , 1.        , 0.70710678, 0.70710678, 1.        ,
        1.        ]])

In [108]:
sim_df = pd.DataFrame(cs_words,columns=DTM.columns,index=DTM.columns)
sim_df

Unnamed: 0,awesome,good,interesting,paper,plain,really
awesome,1.0,0.0,0.707107,0.707107,0.0,0.0
good,0.0,1.0,0.707107,0.707107,1.0,1.0
interesting,0.707107,0.707107,1.0,1.0,0.707107,0.707107
paper,0.707107,0.707107,1.0,1.0,0.707107,0.707107
plain,0.0,1.0,0.707107,0.707107,1.0,1.0
really,0.0,1.0,0.707107,0.707107,1.0,1.0
