### Vectorisation techniques

##### Bag Of Words

In [1]:
# Step 1: Tokenization - each sentence is represented as a list of its constituent words. This step is done for all input sentences.

from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: three short sentences that the vectorisers in the following
# cells are fitted on.
sents = [
    "coronavirus is a highly infectious disease",
    "coronavirus affects older people the most",
    "older people are at high risk due to this disease",
]

In [2]:
# Step 2: Vocabulary creation.
# Fitting learns the vocabulary from the corpus; transforming counts how often
# each vocabulary term occurs in each sentence. Both happen in one call.
cv = CountVectorizer()
X = cv.fit_transform(sents)

# Dense view of the count matrix: one row per sentence, one column per term.
print(X.toarray())

# `vocabulary_` maps term -> column index; iterating a dict yields its keys,
# so this lists the learned terms in alphabetical order.
sorted(cv.vocabulary_)


[[0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 0 0 1 1 1 0 1 0 0]
 [0 1 1 0 1 1 1 0 0 0 0 1 1 1 0 1 1]]


['affects',
 'are',
 'at',
 'coronavirus',
 'disease',
 'due',
 'high',
 'highly',
 'infectious',
 'is',
 'most',
 'older',
 'people',
 'risk',
 'the',
 'this',
 'to']

In [3]:
# Step 3: Sparse matrix creation.
# Printing X directly (rather than X.toarray()) lists only the non-zero
# entries, each shown as "(row, column)  count".

print(X)


  (0, 3)	1
  (0, 9)	1
  (0, 7)	1
  (0, 8)	1
  (0, 4)	1
  (1, 3)	1
  (1, 0)	1
  (1, 11)	1
  (1, 12)	1
  (1, 14)	1
  (1, 10)	1
  (2, 4)	1
  (2, 11)	1
  (2, 12)	1
  (2, 1)	1
  (2, 2)	1
  (2, 6)	1
  (2, 13)	1
  (2, 5)	1
  (2, 16)	1
  (2, 15)	1


##### TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Fit TF-IDF on the corpus; `transformed` holds one row of term weights per
# sentence.
tfidf = TfidfVectorizer()
transformed = tfidf.fit_transform(sents)

# `get_feature_names()` was removed in scikit-learn 1.2. Prefer its
# replacement and fall back only on old installs that do not have it yet.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Weights for the first sentence only: transpose its single row into a column
# so each vocabulary term gets its own DataFrame row. `.toarray()` yields a
# plain ndarray instead of the legacy `np.matrix` returned by `.todense()`.
df = pd.DataFrame(
    transformed[0].T.toarray(),
    index=feature_names,
    columns=["TF-IDF"],
)

# Highest-weighted terms first.
df = df.sort_values('TF-IDF', ascending=False)
print(df)

               TF-IDF
infectious   0.490479
highly       0.490479
is           0.490479
coronavirus  0.373022
disease      0.373022
older        0.000000
this         0.000000
the          0.000000
risk         0.000000
people       0.000000
affects      0.000000
most         0.000000
are          0.000000
high         0.000000
due          0.000000
at           0.000000
to           0.000000




In [5]:
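# From the output above, the words "infectious", "highly" and "is" tie for the highest TF-IDF score in the first sentence, so they are its most important words.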

##### GloVe

In [6]:
import numpy as np

# Build a {token: vector} lookup table from the pre-trained GloVe text file.
# The file is opened in binary mode, so every token (dict key) is a `bytes`
# object: look words up as b'word', not 'word'.
embeddings_dict = {}
with open('./glove.6B.50d.txt', 'rb') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        # First whitespace-separated field is the token; the remaining fields
        # are the components of its embedding vector.
        embeddings_dict[fields[0]] = np.asarray(fields[1:], "float32")

In [7]:
# Show the stored vector for the word "test". The key is a bytes literal
# because the GloVe file was read in binary mode.
embeddings_dict[b'test']

array([ 0.13175 , -0.25517 , -0.067915,  0.26193 , -0.26155 ,  0.23569 ,
        0.13077 , -0.011801,  1.7659  ,  0.20781 ,  0.26198 , -0.16428 ,
       -0.84642 ,  0.020094,  0.070176,  0.39778 ,  0.15278 , -0.20213 ,
       -1.6184  , -0.54327 , -0.17856 ,  0.53894 ,  0.49868 , -0.10171 ,
        0.66265 , -1.7051  ,  0.057193, -0.32405 , -0.66835 ,  0.26654 ,
        2.842   ,  0.26844 , -0.59537 , -0.5004  ,  1.5199  ,  0.039641,
        1.6659  ,  0.99758 , -0.5597  , -0.70493 , -0.0309  , -0.28302 ,
       -0.13564 ,  0.6429  ,  0.41491 ,  1.2362  ,  0.76587 ,  0.97798 ,
        0.58507 , -0.30176 ], dtype=float32)

In [8]:
from scipy import spatial

In [9]:
def find_closest_embeddings(embedding, embeddings=None, top_n=None):
    """Rank vocabulary words by Euclidean distance to ``embedding``.

    Parameters
    ----------
    embedding : array-like
        Query vector; must have the same length as the stored vectors.
    embeddings : mapping, optional
        Word -> vector table to search. Defaults to the notebook-level
        ``embeddings_dict``.
    top_n : int, optional
        Non-negative number of nearest words to return. When given, only the
        ``top_n`` closest words are selected, which avoids sorting the whole
        vocabulary. ``None`` (the default) returns every word, nearest first.

    Returns
    -------
    list
        Keys of the table ordered from nearest to farthest.
    """
    import heapq  # local import keeps this cell self-contained

    table = embeddings_dict if embeddings is None else embeddings

    def distance_to_query(word):
        return spatial.distance.euclidean(table[word], embedding)

    if top_n is None:
        # Original behaviour: full ranking of the vocabulary.
        return sorted(table, key=distance_to_query)

    # Equivalent to sorted(...)[:top_n] but without a full sort.
    return heapq.nsmallest(top_n, table, key=distance_to_query)

# Ten words whose vectors lie closest to that of "health"; the query word
# itself comes first because its distance to itself is zero.
find_closest_embeddings(embeddings_dict[b'health'])[:10]

[b'health',
 b'care',
 b'medical',
 b'welfare',
 b'prevention',
 b'education',
 b'public',
 b'poor',
 b'healthcare',
 b'needs']

In [10]:
# Tokenise each sentence on whitespace. This cell rebinds `sents`, so the
# isinstance guard leaves already-tokenised entries untouched; without it,
# re-running the cell would fail because lists have no `.split()`.
sents = [sent.split() if isinstance(sent, str) else sent for sent in sents]
sents

[['coronavirus', 'is', 'a', 'highly', 'infectious', 'disease'],
 ['coronavirus', 'affects', 'older', 'people', 'the', 'most'],
 ['older',
  'people',
  'are',
  'at',
  'high',
  'risk',
  'due',
  'to',
  'this',
  'disease']]