In [44]:
# Small example corpus: six short English sentences.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'If he cares about caring, then he should care about caring about caring.',
    'If he began to care, then he should begin to care about caring about caring.',
    '123 the world is large 32.34',
    'He stripped the striped paint by stripping the first coat of paint.',
]

In [45]:
# Normalise the corpus: delete every period and lower-case each document.
# Only "." is removed, so commas stay attached to words (e.g. "caring,")
# and the decimal number "32.34" collapses to "3234".
processed_docs = [text.replace(".", "").lower() for text in corpus]
processed_docs

['this is the first document',
 'this document is the second document',
 'if he cares about caring, then he should care about caring about caring',
 'if he began to care, then he should begin to care about caring about caring',
 '123 the world is large 3234',
 'he stripped the striped paint by stripping the first coat of paint']

### One hot encoding

In [46]:
# Build a word -> id mapping over the whitespace tokens of the corpus.
# Ids start at 1 and are handed out in order of first appearance.
vocab = {}
for doc in processed_docs:
    for word in doc.split():
        # setdefault leaves known words untouched and gives a new word the next free id
        vocab.setdefault(word, len(vocab) + 1)
count = len(vocab)  # number of distinct words seen
print(vocab)

{'this': 1, 'is': 2, 'the': 3, 'first': 4, 'document': 5, 'second': 6, 'if': 7, 'he': 8, 'cares': 9, 'about': 10, 'caring,': 11, 'then': 12, 'should': 13, 'care': 14, 'caring': 15, 'began': 16, 'to': 17, 'care,': 18, 'begin': 19, '123': 20, 'world': 21, 'large': 22, '3234': 23, 'stripped': 24, 'striped': 25, 'paint': 26, 'by': 27, 'stripping': 28, 'coat': 29, 'of': 30}


In [47]:
def get_onehot_vector(somestring, vocabulary=None):
    """One-hot encode every whitespace-separated token of a string.

    Parameters
    ----------
    somestring : str
        Text to encode; it is split on whitespace.
    vocabulary : dict, optional
        Mapping from word to a 1-based index in ``1..len(vocabulary)``.
        When omitted, the notebook-level ``vocab`` mapping is used.

    Returns
    -------
    list of list of int
        One vector per token, each of length ``len(vocabulary)``. A known
        word has a single 1 at position ``index - 1``; a word missing from
        the vocabulary yields an all-zero vector.
    """
    if vocabulary is None:
        vocabulary = vocab  # fall back to the mapping built earlier in the notebook
    size = len(vocabulary)  # every vector has the same length, so compute it once
    onehot_encoded = []
    for word in somestring.split():
        row = [0] * size
        index = vocabulary.get(word)
        if index is not None:
            # ids start at 1 while list positions start at 0
            row[index - 1] = 1
        onehot_encoded.append(row)
    return onehot_encoded

In [48]:
# One-hot encode each token of the second preprocessed document
# ('this document is the second document'); one vector is shown per token.
get_onehot_vector(processed_docs[1])

[[1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]]

- Using sklearn

In [49]:
# Four three-word toy sentences for the scikit-learn encoders.
S1, S2, S3, S4 = (
    'dog bites man',
    'man bites dog',
    'dog eats meat',
    'man eats food',
)

In [50]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Tokenise the four sentences: `data` holds one token list per sentence,
# `values` is the same tokens flattened into a single list in sentence order.
data = [sentence.split() for sentence in (S1, S2, S3, S4)]
values = [token for tokens in data for token in tokens]
print("The data: ", values)

# Label encoding: every distinct word is mapped to an integer id.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print("Label Encoded:", integer_encoded)

# One-hot encoding: the encoder is fitted on `data` (one row per sentence),
# so each token position is encoded as its own categorical feature.
onehot_encoder = OneHotEncoder()
onehot_encoded = onehot_encoder.fit_transform(data).toarray()
print("Onehot Encoded Matrix:\n", onehot_encoded)

The data:  ['dog', 'bites', 'man', 'man', 'bites', 'dog', 'dog', 'eats', 'meat', 'man', 'eats', 'food']
Label Encoded: [1 0 4 4 0 1 1 2 5 4 2 3]
Onehot Encoded Matrix:
 [[1. 0. 1. 0. 0. 0. 1. 0.]
 [0. 1. 1. 0. 1. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0. 1. 0. 0.]]


- Drawbacks of one-hot encoding:
  1. Sparse representation
  2. No fixed-length representation
  3. No notion of (dis)similarity between words
  4. Out-of-vocabulary problem


### Bag of words
- represent the text under consideration as a bag (collection) of words
while ignoring the order and context

In [51]:
# Display the preprocessed corpus again for reference.
processed_docs

['this is the first document',
 'this document is the second document',
 'if he cares about caring, then he should care about caring about caring',
 'if he began to care, then he should begin to care about caring about caring',
 '123 the world is large 3234',
 'he stripped the striped paint by stripping the first coat of paint']

In [52]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit a bag-of-words model on the preprocessed corpus.
count_vect = CountVectorizer()
bow_rep = count_vect.fit_transform(processed_docs)

# Term -> column-index mapping learned during fitting.
print("Our vocabulary: ", count_vect.vocabulary_)

# Dense count vectors for the first two documents.
print("BoW representation for 'this is the first document': ", bow_rep[0].toarray())
print(
    "BoW representation for 'this document is the second document:",
    bow_rep[1].toarray(),
)

Our vocabulary:  {'this': 25, 'is': 14, 'the': 23, 'first': 11, 'document': 10, 'second': 18, 'if': 13, 'he': 12, 'cares': 7, 'about': 2, 'caring': 8, 'then': 24, 'should': 19, 'care': 6, 'began': 3, 'to': 26, 'begin': 4, '123': 0, 'world': 27, 'large': 15, '3234': 1, 'stripped': 21, 'striped': 20, 'paint': 17, 'by': 5, 'stripping': 22, 'coat': 9, 'of': 16}
BoW representation for 'this is the first document':  [[0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0]]
BoW representation for 'this document is the second document: [[0 0 0 0 0 0 0 0 0 0 2 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0]]


In [53]:
# Binary bag of words: with binary=True a term is marked 1 if it is present,
# no matter how many times it occurs.
count_vect = CountVectorizer(binary=True)
bow_rep_bin = count_vect.fit_transform(processed_docs)

# Encode an unseen sentence using the vocabulary learned from the corpus.
temp = count_vect.transform(["this  and this  are friends"])
print(
    "Bow representation for 'this  and this  are friends':",
    temp.toarray(),
)

Bow representation for 'this  and this  are friends': [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]]


### Bag of N-Grams
- The bag-of-n-grams (BoN) approach breaks text into chunks of n contiguous words (or tokens).

In [54]:
# Bag of n-grams: ngram_range=(1, 3) keeps unigrams, bigrams and trigrams.
count_vect = CountVectorizer(ngram_range=(1, 3))
bow_rep = count_vect.fit_transform(processed_docs)

# n-gram -> column-index mapping learned during fitting.
print("Our vocabulary: ", count_vect.vocabulary_)

Our vocabulary:  {'this': 96, 'is': 54, 'the': 83, 'first': 36, 'document': 33, 'this is': 99, 'is the': 57, 'the first': 84, 'first document': 39, 'this is the': 100, 'is the first': 58, 'the first document': 86, 'second': 67, 'this document': 97, 'document is': 34, 'the second': 87, 'second document': 68, 'this document is': 98, 'document is the': 35, 'is the second': 59, 'the second document': 88, 'if': 50, 'he': 40, 'cares': 22, 'about': 4, 'caring': 25, 'then': 93, 'should': 69, 'care': 17, 'if he': 51, 'he cares': 43, 'cares about': 23, 'about caring': 5, 'caring then': 28, 'then he': 94, 'he should': 45, 'should care': 72, 'care about': 18, 'caring about': 26, 'if he cares': 53, 'he cares about': 44, 'cares about caring': 24, 'about caring then': 7, 'caring then he': 29, 'then he should': 95, 'he should care': 47, 'should care about': 73, 'care about caring': 19, 'about caring about': 6, 'caring about caring': 27, 'began': 8, 'to': 101, 'begin': 11, 'he began': 41, 'began to': 9

In [55]:
# Display the preprocessed corpus again for reference.
processed_docs

['this is the first document',
 'this document is the second document',
 'if he cares about caring, then he should care about caring about caring',
 'if he began to care, then he should begin to care about caring about caring',
 '123 the world is large 3234',
 'he stripped the striped paint by stripping the first coat of paint']

In [56]:
# Encode one sentence with the fitted n-gram vectorizer and print its dense vector.
temp = count_vect.transform(
    ["this is the first document"]
)
print(
    "this is the first document",
    temp.toarray(),
)

this is the first document [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
  1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0]]


### TF-IDF

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [60]:
# Fit a TF-IDF model on the preprocessed corpus.
tfidf = TfidfVectorizer()
tf_idf_docs = tfidf.fit_transform(processed_docs)

# Print the learned IDF weights, then the feature names they belong to.
print(tfidf.idf_)
print(tfidf.get_feature_names_out())

[2.25276297 2.25276297 1.84729786 2.25276297 2.25276297 2.25276297
 1.84729786 2.25276297 1.84729786 2.25276297 1.84729786 1.84729786
 1.55961579 1.84729786 1.55961579 2.25276297 2.25276297 2.25276297
 2.25276297 1.84729786 2.25276297 2.25276297 2.25276297 1.33647224
 1.84729786 1.84729786 2.25276297 2.25276297]
['123' '3234' 'about' 'began' 'begin' 'by' 'care' 'cares' 'caring' 'coat'
 'document' 'first' 'he' 'if' 'is' 'large' 'of' 'paint' 'second' 'should'
 'striped' 'stripped' 'stripping' 'the' 'then' 'this' 'to' 'world']


In [65]:
# Vectorise two unseen sentences with the already-fitted TF-IDF model.
# The printed array has one row per input sentence, although the label
# names only the first sentence.
temp = tfidf.transform(
    [
        "paint coat is striped",
        'world is a paint',
    ]
)
print("Tfidf representation for 'paint coat is striped':\n", temp.toarray())

Tfidf representation for 'paint coat is striped':
 [[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.53611046 0.         0.
  0.         0.         0.37115593 0.         0.         0.53611046
  0.         0.         0.53611046 0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.4396812  0.         0.         0.63509072
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.63509072]]


## Distributed Representations

- Distributional similarity
- Distributional hypothesis
- Distributional representation

### Word Embeddings

In [66]:
from gensim.models import Word2Vec, KeyedVectors

In [68]:
import gensim.downloader as api
# Load the 'word2vec-google-news-300' vectors through gensim's downloader API.
# NOTE(review): presumably this downloads the model on first use — confirm
# before running in an offline or memory-constrained environment.
wv = api.load('word2vec-google-news-300')



In [71]:
# Look up the embedding stored for the word 'king' and print it.
vector_king = wv['king']
print(vector_king)

[ 1.25976562e-01  2.97851562e-02  8.60595703e-03  1.39648438e-01
 -2.56347656e-02 -3.61328125e-02  1.11816406e-01 -1.98242188e-01
  5.12695312e-02  3.63281250e-01 -2.42187500e-01 -3.02734375e-01
 -1.77734375e-01 -2.49023438e-02 -1.67968750e-01 -1.69921875e-01
  3.46679688e-02  5.21850586e-03  4.63867188e-02  1.28906250e-01
  1.36718750e-01  1.12792969e-01  5.95703125e-02  1.36718750e-01
  1.01074219e-01 -1.76757812e-01 -2.51953125e-01  5.98144531e-02
  3.41796875e-01 -3.11279297e-02  1.04492188e-01  6.17675781e-02
  1.24511719e-01  4.00390625e-01 -3.22265625e-01  8.39843750e-02
  3.90625000e-02  5.85937500e-03  7.03125000e-02  1.72851562e-01
  1.38671875e-01 -2.31445312e-01  2.83203125e-01  1.42578125e-01
  3.41796875e-01 -2.39257812e-02 -1.09863281e-01  3.32031250e-02
 -5.46875000e-02  1.53198242e-02 -1.62109375e-01  1.58203125e-01
 -2.59765625e-01  2.01416016e-02 -1.63085938e-01  1.35803223e-03
 -1.44531250e-01 -5.68847656e-02  4.29687500e-02 -2.46582031e-02
  1.85546875e-01  4.47265

In [79]:
# Print the five entries most similar to 'queen' as (word, score) pairs.
print(wv.most_similar('queen', topn=5))

[('queens', 0.739944338798523), ('princess', 0.7070532441139221), ('king', 0.6510956883430481), ('monarch', 0.6383602023124695), ('very_pampered_McElhatton', 0.6357026696205139)]
