# Text Vectorization

## 1. One Hot Encoding

In [1]:
# Build a word -> 1-based integer id mapping, numbering words in order of
# first appearance across the corpus.
corpus = ['dog eats meat', 'man eats meat']
vocab = {}
count = 0
for doc in corpus:
    for word in doc.split():
        if word in vocab:
            continue  # already numbered on an earlier occurrence
        count += 1
        vocab[word] = count
print(vocab)

{'dog': 1, 'eats': 2, 'meat': 3, 'man': 4}


In [2]:
def one_hot(doc, vocabulary=None):
    """Return one one-hot vector per whitespace-separated word of ``doc``.

    Args:
        doc: Sentence to encode; it is tokenised with ``str.split()``.
        vocabulary: Mapping of word -> 1-based index. When omitted, the
            notebook-level ``vocab`` mapping is used.

    Returns:
        A list with one row per word. Each row has ``len(vocabulary)``
        entries and a single 1 at position ``index - 1``; a word missing
        from the vocabulary yields an all-zero row.
    """
    if vocabulary is None:
        # Fall back to the mapping built earlier in the notebook.
        vocabulary = vocab
    size = len(vocabulary)
    vectors = []
    for word in doc.split():
        row = [0] * size
        index = vocabulary.get(word)
        if index is not None:
            row[index - 1] = 1  # ids are 1-based, list positions 0-based
        vectors.append(row)
    return vectors

In [3]:
# Encode a sentence word by word: the result has one row per word, each
# with a single 1 at that word's vocabulary position.
one_hot("dog eats meat")

[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0]]

#### Scikit learn

In [4]:
# Three toy documents used for the scikit-learn encoders below.
doc1, doc2, doc3 = (
    "dog bites meat",
    'man eats meat',
    'dog bites man',
)

In [5]:
# Tokenise every document on whitespace, then flatten the token lists into
# one sequence (document order preserved) for the label encoder.
corpus = [text.split() for text in (doc1, doc2, doc3)]
my_overall_data = [token for tokens in corpus for token in tokens]

print(f"My overall data: {my_overall_data}")

#implement Label Encoder
from sklearn.preprocessing import LabelEncoder

My overall data: ['dog', 'bites', 'meat', 'man', 'eats', 'meat', 'dog', 'bites', 'man']


In [6]:
# Map every token to an integer id. The ids follow the sorted order of the
# distinct tokens (bites=0, dog=1, eats=2, man=3, meat=4), not the order in
# which the tokens first appear.
le = LabelEncoder()
integer_data = le.fit_transform(my_overall_data)
print(f"Integer Values are: {integer_data}")

Integer Values are: [1 0 4 3 2 4 1 0 3]


In [8]:
from sklearn.preprocessing import OneHotEncoder
# NOTE: corpus is a list of three 3-token rows, so the encoder treats each
# token *position* as its own categorical feature. Each position has two
# distinct values here, giving 2 + 2 + 2 = 6 columns and one row per
# document (not one row per word as in the hand-written one_hot above).
one_hot_encoder = OneHotEncoder()
one_hot_encoder.fit_transform(corpus).toarray()

array([[1., 0., 1., 0., 0., 1.],
       [0., 1., 0., 1., 0., 1.],
       [1., 0., 1., 0., 1., 0.]])

In [9]:
# Encode a new 3-token sentence with the already fitted encoder. Every token
# was seen at its position during fitting; with the default
# handle_unknown='error', a token unseen at its position would raise.
one_hot_encoder.transform(["dog eats meat".split()]).toarray()

array([[1., 0., 0., 1., 0., 1.]])

## 2. BOW Techniques

In [1]:
import numpy as np
import pandas as pd
import collections

In [2]:
# Raw example reviews for the bag-of-words section (punctuation included on
# purpose; it is stripped in the next cell).
doc1, doc2, doc3 = (
    "Harry Potter is an amazing movie!!",
    "Harry Potter is the best movie.",
    "Harry Potter is so great..",
)

In [4]:
import re

# Lower-case each document, replace every non-alphanumeric character with a
# space, and split on whitespace so doc1..doc3 become token lists.
doc1, doc2, doc3 = (
    re.sub("[^a-zA-Z0-9]", " ", text.lower()).split()
    for text in (doc1, doc2, doc3)
)

In [7]:
# Vocabulary: the distinct tokens across all three documents.
all_words = {*doc1, *doc2, *doc3}
all_words

{'amazing',
 'an',
 'best',
 'great',
 'harry',
 'is',
 'movie',
 'potter',
 'so',
 'the'}

In [10]:
def BOWRep(all_words,doc):
    """Return the bag-of-words count vector of ``doc`` over ``all_words``.

    Args:
        all_words: Iterable of vocabulary words; its iteration order
            defines the key order of the result.
        doc: Sequence of tokens making up one document.

    Returns:
        A dict with exactly one entry per vocabulary word, mapping it to
        the number of times it occurs in ``doc`` (0 when absent). Tokens
        that are not in the vocabulary are ignored, so every document gets
        a vector with the same keys.
    """
    # Count all tokens in a single pass instead of calling doc.count() once
    # per token, which is quadratic in the document length.
    counts = collections.Counter(doc)
    return {word: counts[word] for word in all_words}

In [12]:
# Bag-of-words vector for each cleaned document over the shared vocabulary.
bow1, bow2, bow3 = (
    BOWRep(all_words, tokens) for tokens in (doc1, doc2, doc3)
)
bow1

{'harry': 1,
 'best': 0,
 'is': 1,
 'movie': 1,
 'an': 1,
 'potter': 1,
 'the': 0,
 'amazing': 1,
 'great': 0,
 'so': 0}

In [14]:
# Stack the three count dicts into a table: one row per document, one
# column per vocabulary word.
df = pd.DataFrame([bow1,bow2,bow3])
df

Unnamed: 0,harry,best,is,movie,an,potter,the,amazing,great,so
0,1,0,1,1,1,1,0,1,0,0
1,1,1,1,1,0,1,1,0,0,0
2,1,0,1,0,0,1,0,0,1,1


### Using Scikit-Learn

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

doc1, doc2, doc3 = (
    "Harry Potter is an amazing movie!!",
    "Harry Potter is the best movie.",
    "Harry Potter is so great..",
)

# binary=True records only whether a term is present (1) or not (0), which
# is mainly used for tasks such as sentiment analysis; with binary=False the
# vectorizer returns the term counts instead.
cv = CountVectorizer(binary=True)
cv_out = cv.fit_transform([doc1, doc2, doc3])
pd.DataFrame(cv_out.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,amazing,an,best,great,harry,is,movie,potter,so,the
0,1,1,0,0,1,1,1,1,0,0
1,0,0,1,0,1,1,1,1,0,1
2,0,0,0,1,1,1,0,1,1,0


In [24]:
doc1 = "Harry Potter is an amazing movie!!"
doc2 = "Harry Potter is the best movie."
doc3 = "Harry Potter is so great.."

# ngram_range=(1, 3) adds bigram and trigram features next to the single
# words; binary=True still marks presence/absence rather than counts.
cv = CountVectorizer(ngram_range=(1, 3), binary=True)
cv_out = cv.fit_transform([doc1, doc2, doc3])
pd.DataFrame(cv_out.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,amazing,amazing movie,an,an amazing,an amazing movie,best,best movie,great,harry,harry potter,...,potter,potter is,potter is an,potter is so,potter is the,so,so great,the,the best,the best movie
0,1,1,1,1,1,0,0,0,1,1,...,1,1,1,0,0,0,0,0,0,0
1,0,0,0,0,0,1,1,0,1,1,...,1,1,0,0,1,0,0,1,1,1
2,0,0,0,0,0,0,0,1,1,1,...,1,1,0,1,0,1,1,0,0,0


## 3. TF-IDF

Importance - a word scores high when it is repeated often in one document but does not appear in the other documents <br/>
Stop words - low score, because they appear in almost every document <br/>
Unique words repeated within the same sentence - high score <br/>

In [25]:
import math
import sklearn

In [33]:
# Tokenise both example sentences on single spaces (case is kept as-is).
first_sent = "Data Science is an Amazing career in the current world".split(" ")
second_sent = "Deep Learning is a subset of machine Learning".split(" ")

# Vocabulary: every distinct token that occurs in either sentence.
vocab = set(first_sent) | set(second_sent)
vocab

{'Amazing',
 'Data',
 'Deep',
 'Learning',
 'Science',
 'a',
 'an',
 'career',
 'current',
 'in',
 'is',
 'machine',
 'of',
 'subset',
 'the',
 'world'}

In [34]:
# One zero-initialised counter per sentence, keyed by every vocabulary word.
wordDict1 = {term: 0 for term in vocab}
wordDict2 = {term: 0 for term in vocab}


In [35]:
# Tally how often each word occurs in its own sentence.
for tokens, counts in ((first_sent, wordDict1), (second_sent, wordDict2)):
    for word in tokens:
        counts[word] += 1

In [36]:
# Raw term counts of the first sentence over the shared vocabulary; words
# that occur only in the second sentence stay at 0.
wordDict1

{'Learning': 0,
 'a': 0,
 'Science': 1,
 'Deep': 0,
 'is': 1,
 'subset': 0,
 'machine': 0,
 'an': 1,
 'the': 1,
 'world': 1,
 'career': 1,
 'in': 1,
 'Amazing': 1,
 'of': 0,
 'current': 1,
 'Data': 1}

In [37]:
# Term-count table: one row per sentence, one column per vocabulary word.
df = pd.DataFrame([wordDict1,wordDict2])
df

Unnamed: 0,Learning,a,Science,Deep,is,subset,machine,an,the,world,career,in,Amazing,of,current,Data
0,0,0,1,0,1,0,0,1,1,1,1,1,1,0,1,1
1,2,1,0,1,1,1,1,0,0,0,0,0,0,1,0,0


In [38]:
def calculateTF(wordDict,doc):
    """Return the term frequency of every word in ``wordDict``.

    Args:
        wordDict: Mapping of word -> raw occurrence count in the document.
        doc: The tokenised document; only its length is used, as the
            normaliser.

    Returns:
        A dict mapping each word to ``count / len(doc)``. For an empty
        document every term frequency is 0.0 rather than raising
        ``ZeroDivisionError``.
    """
    sen_len = len(doc)
    if sen_len == 0:
        # No tokens at all: define every term frequency as zero.
        return dict.fromkeys(wordDict, 0.0)
    return {word: count / sen_len for word, count in wordDict.items()}

In [41]:
# Term frequencies for both sentences, then one table row per sentence.
tf1, tf2 = (
    calculateTF(word_counts, sentence)
    for word_counts, sentence in ((wordDict1, first_sent), (wordDict2, second_sent))
)

tf = pd.DataFrame([tf1, tf2])
tf

Unnamed: 0,Learning,a,Science,Deep,is,subset,machine,an,the,world,career,in,Amazing,of,current,Data
0,0.0,0.0,0.1,0.0,0.1,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.0,0.1,0.1
1,0.25,0.125,0.0,0.125,0.125,0.125,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0


In [71]:
# def calculateIDF(wordDict,N)
# TODO: calculateIDF is not implemented; only the stub signature above exists.
# Boolean mask over the rows of the TF table: True for each document in
# which 'is' has a non-zero term frequency. Presumably a first step towards
# counting document frequency for IDF -- confirm when implementing.
a = tf['is'] > 0
print(a.tolist())

[True, True]
