# Vectorization

## Aim: To implement vectorization using TFIDF and Bag of Words.

In [None]:
pip install contractions -q

In [None]:
pip install advertools -q

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import advertools as adv
import re
import contractions
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Resources required by the cells below.
nltk.download('stopwords')
# word_tokenize needs the Punkt sentence models: older NLTK releases read them
# from 'punkt', while NLTK 3.9+ reads them from 'punkt_tab'. Fetch both so the
# notebook runs on either.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Toy corpus: four short reviews that share most of their vocabulary.
txt1 = "Game of thrones is an amazing TV series"
txt2 = "Game of Thrones is the best TV series"
txt3 = "Game of Thrones is so great"
txt4 = "The series Game of Thrones is so boring"

In [None]:
# Collect the raw documents in order; a list literal avoids the manual append
# loop and does not leave a stray loop variable in the notebook namespace.
lst = [txt1, txt2, txt3, txt4]
print(lst)

['Game of thrones is an amazing TV series', 'Game of Thrones is the best TV series', 'Game of Thrones is so great', 'The series Game of Thrones is so boring']


In [None]:
# Fit a word-level TF-IDF model on the raw documents, dropping English stop words.
tf = TfidfVectorizer(analyzer='word', stop_words='english')
tf_wm = tf.fit_transform(lst)
tf_tokens = tf.get_feature_names_out()

# One IDF weight per vocabulary term, listed in feature order.
print('IDF Values ')
for token, idf in zip(tf_tokens, tf.idf_):
    print(token, ':', idf)
print()

# Document-term matrix: one row per input text, one column per term.
df = pd.DataFrame(
    data=tf_wm.toarray(),
    index=['txt1', 'txt2', 'txt3', 'txt4'],
    columns=tf_tokens,
)
print('TF-IDF')
display(df)

IDF Values 
amazing : 1.916290731874155
best : 1.916290731874155
boring : 1.916290731874155
game : 1.0
great : 1.916290731874155
series : 1.2231435513142097
thrones : 1.0
tv : 1.5108256237659907

TF-IDF


Unnamed: 0,amazing,best,boring,game,great,series,thrones,tv
txt1,0.623342,0.0,0.0,0.325285,0.0,0.397871,0.325285,0.49145
txt2,0.0,0.623342,0.0,0.325285,0.0,0.397871,0.325285,0.49145
txt3,0.0,0.0,0.0,0.41988,0.804612,0.0,0.41988,0.0
txt4,0.0,0.0,0.715739,0.373502,0.0,0.456847,0.373502,0.0


In [None]:
# Bag of words

In [None]:
class BOW:
  """Bag-of-words preprocessing pipeline for a single piece of text.

  On construction the text is lower-cased and passed through
  ``contractions.fix``. ``preprocess`` then removes quotes/brackets and
  punctuation, tokenizes, drops stop words and counts the remaining tokens.
  """

  def __init__(self,text):
    """Normalise ``text`` and build the stop-word set.

    Args:
      text: Raw input string.
    """
    self.ltext=text.lower()
    # Contractions are fixed first, while apostrophes are still present;
    # remove_quotes_brackets() deletes them afterwards.
    self.text=contractions.fix(self.ltext)
    self.stop_words=set(adv.stopwords['english'])

  def remove_quotes_brackets(self):
    """Delete double quotes, single quotes and round brackets from ``self.text``.

    Returns:
      The cleaned string (also stored on ``self.text1``).
    """
    # Raw string: the previous non-raw literal contained the invalid escape
    # sequences \( and \), which newer Python versions warn about.
    self.text1=re.sub(r"[\"'()]", "", self.text)
    return self.text1

  def remove_punctuations(self,text):
    """Replace every character that is neither a word character nor whitespace with a space.

    Args:
      text: String to clean.

    Returns:
      The cleaned string.
    """
    return re.sub(r"[^\w\s]", " ",text)

  def tokenize(self,text):
    """Split ``text`` into tokens with ``word_tokenize``.

    Args:
      text: String to tokenize.

    Returns:
      The token list (also stored on ``self.t_text``).
    """
    self.t_text=word_tokenize(text)
    return self.t_text

  def stop_word(self,text):
    """Drop every token that is in ``self.stop_words``.

    Args:
      text: Iterable of tokens.

    Returns:
      List of the remaining tokens in their original order (also stored on
      ``self.sw``).
    """
    self.sw=[tok for tok in text if tok not in self.stop_words]
    return self.sw

  def count(self,lst):
    """Count how often each token occurs.

    Args:
      lst: Iterable of hashable tokens.

    Returns:
      Dict mapping token -> number of occurrences, keyed in first-seen order.
    """
    d={}
    for tok in lst:
      # Single pass; avoids calling lst.count() once per element.
      d[tok]=d.get(tok,0)+1
    return d

  def preprocess(self):
    """Run the full pipeline on the text given at construction.

    Returns:
      Tuple ``(t_text, swtext, ctext, word)``: all tokens, tokens left after
      stop-word removal, the count dict of those tokens, and those tokens
      joined into a single space-separated string.
    """
    q_text=self.remove_quotes_brackets()
    p_text=self.remove_punctuations(q_text)
    t_text=self.tokenize(p_text)
    swtext=self.stop_word(t_text)
    word=' '.join(swtext)
    ctext=self.count(swtext)
    return t_text,swtext,ctext,word

In [None]:
# Run the BOW pipeline on every sentence and show each intermediate stage.
for sentence in (txt1, txt2, txt3, txt4):
  tokens, kept_tokens, counts, joined = BOW(sentence).preprocess()
  print()
  print('\nTokenized text', tokens)
  print('\nAfter stop word removal', kept_tokens)
  print('\nAfter stop words removal', joined)
  print('\nCount of words\n', counts)



Tokenized text ['game', 'of', 'thrones', 'is', 'an', 'amazing', 'tv', 'series']

After stop word removal ['game', 'thrones', 'amazing', 'tv', 'series']

After stop words removal game thrones amazing tv series

Count of words
 {'game': 1, 'thrones': 1, 'amazing': 1, 'tv': 1, 'series': 1}


Tokenized text ['game', 'of', 'thrones', 'is', 'the', 'best', 'tv', 'series']

After stop word removal ['game', 'thrones', 'best', 'tv', 'series']

After stop words removal game thrones best tv series

Count of words
 {'game': 1, 'thrones': 1, 'best': 1, 'tv': 1, 'series': 1}


Tokenized text ['game', 'of', 'thrones', 'is', 'so', 'great']

After stop word removal ['game', 'thrones', 'great']

After stop words removal game thrones great

Count of words
 {'game': 1, 'thrones': 1, 'great': 1}


Tokenized text ['the', 'series', 'game', 'of', 'thrones', 'is', 'so', 'boring']

After stop word removal ['series', 'game', 'thrones', 'boring']

After stop words removal series game thrones boring

Count of wor

In [None]:
# Build one BOW pipeline per document, then keep every stage of each result
# under its own numbered name for the cells below.
t1, t2, t3, t4 = (BOW(text) for text in (txt1, txt2, txt3, txt4))
tkn1, stp1, cnt1, wrd1 = t1.preprocess()
tkn2, stp2, cnt2, wrd2 = t2.preprocess()
tkn3, stp3, cnt3, wrd3 = t3.preprocess()
tkn4, stp4, cnt4, wrd4 = t4.preprocess()

In [None]:
# Cleaned, stop-word-free documents in the same order as the raw texts.
lst = [wrd1, wrd2, wrd3, wrd4]
print(lst)

['game thrones amazing tv series', 'game thrones best tv series', 'game thrones great', 'series game thrones boring']


In [None]:
# Learn the vocabulary and build the document-term count matrix in one step.
vect = CountVectorizer()
v = vect.fit_transform(lst)
vd = vect.vocabulary_

In [None]:
# vd is vect.vocabulary_: each token mapped to its column index in the count
# matrix. The dict's key order is not the column order.
print("Vocabulary: ", vd)

Vocabulary:  {'game': 3, 'thrones': 6, 'amazing': 0, 'tv': 7, 'series': 5, 'best': 1, 'great': 4, 'boring': 2}


In [None]:
v1=v.toarray()
# Label columns in feature-index order. vocabulary_.keys() is in insertion
# order, which does not match the matrix columns and mislabelled every column.
c=list(vect.get_feature_names_out())
df=pd.DataFrame(v1,columns=c)
display(df)

Unnamed: 0,game,thrones,amazing,tv,series,best,great,boring
0,1,0,0,1,0,1,1,1
1,0,1,0,1,0,1,1,1
2,0,0,0,1,1,0,1,0
3,0,0,1,1,0,1,1,0


## Conclusion: TF-IDF is better compared to Bag of Words, as BOW focuses mainly on raw word counts rather than on the importance of a word in the text.