<a href="https://colab.research.google.com/github/mralamdari/NLP-Text-Processing/blob/main/NLP_Text_Representation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy
import sklearn
import numpy as np
import matplotlib.pyplot as plt

In [2]:
documents = ["Dog bites man.", "Man bites dog.", "Dog eats meat.", "Man eats food."]
documents = [sentence.lower().replace('.', '') for sentence in documents]
documents

['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']

###One-Hot Encoding

In [3]:
#Vocabulary
all_words = []
for sentence in documents:
  all_words.extend(sentence.split())

unique_words = set(all_words)
vocab = dict(zip(unique_words, range(len(unique_words))))
vocab

{'bites': 0, 'dog': 2, 'eats': 5, 'food': 1, 'man': 3, 'meat': 4}

In [4]:
#onehot creator
def onehot_creator(sentence):
  sentence = sentence.lower().split()
  onehot_encoded = np.zeros((len(sentence), len(vocab)))
  for i, word in enumerate(sentence):
    word_id = vocab.get(word)
    if word_id != None:
      onehot_encoded[i][word_id] = 1
  return onehot_encoded

In [5]:
onehot_creator("Dog bites man")

array([[0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.]])

In [6]:
test_sample = "Dog bites man and then eates meat"
onehot_creator(test_sample)

array([[0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.]])

One Hot Encoding with Scikit-learn

In [7]:
from sklearn import preprocessing

In [8]:
label_encoder = preprocessing.LabelEncoder()
onehot_encoder = preprocessing.OneHotEncoder()

In [9]:
label_encoder.fit_transform(all_words)

array([1, 0, 4, 4, 0, 1, 1, 2, 5, 4, 2, 3])

In [10]:
onehot_encoder.fit_transform(np.array(all_words).reshape(-1, 1)).toarray()

array([[0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.]])

###Bag of Words (BoW)

In [11]:
from sklearn import feature_extraction

In [12]:
count_vect = feature_extraction.text.CountVectorizer()

In [13]:
bow_representation = count_vect.fit_transform(documents)

In [14]:
count_vect.vocabulary_

{'bites': 0, 'dog': 1, 'eats': 2, 'food': 3, 'man': 4, 'meat': 5}

In [15]:
for rep in bow_representation:
  print(rep.toarray())

[[1 1 0 0 1 0]]
[[1 1 0 0 1 0]]
[[0 1 1 0 0 1]]
[[0 0 1 1 1 0]]


In [16]:
count_vect.transform([test_sample]).toarray()

array([[1, 1, 0, 0, 1, 1]])

BoW with Binary vectors

In [17]:
binary_count_vector = feature_extraction.text.CountVectorizer(binary=True)
binary_count_vector.fit(documents)

CountVectorizer(binary=True)

In [18]:
binary_count_vector.transform([test_sample]).toarray()

array([[1, 1, 0, 0, 1, 1]])

###Bag of N-Grams (BoN)

In [19]:
count_vect = feature_extraction.text.CountVectorizer(ngram_range=(1, 3))

In [20]:
bon_representation = count_vect.fit_transform(documents)

In [21]:
count_vect.vocabulary_

{'bites': 0,
 'bites dog': 1,
 'bites man': 2,
 'dog': 3,
 'dog bites': 4,
 'dog bites man': 5,
 'dog eats': 6,
 'dog eats meat': 7,
 'eats': 8,
 'eats food': 9,
 'eats meat': 10,
 'food': 11,
 'man': 12,
 'man bites': 13,
 'man bites dog': 14,
 'man eats': 15,
 'man eats food': 16,
 'meat': 17}

In [22]:
for rep in bow_representation:
  print(rep.toarray())

[[1 1 0 0 1 0]]
[[1 1 0 0 1 0]]
[[0 1 1 0 0 1]]
[[0 0 1 1 1 0]]


In [23]:
count_vect.transform([test_sample]).toarray()

array([[1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]])

###TF-IDF

In [24]:
tfidf = feature_extraction.text.TfidfVectorizer()

In [25]:
bow_rep_tfidf = tfidf.fit_transform(documents)

In [26]:
bow_rep_tfidf.toarray()

array([[0.65782931, 0.53256952, 0.        , 0.        , 0.53256952,
        0.        ],
       [0.65782931, 0.53256952, 0.        , 0.        , 0.53256952,
        0.        ],
       [0.        , 0.44809973, 0.55349232, 0.        , 0.        ,
        0.70203482],
       [0.        , 0.        , 0.55349232, 0.70203482, 0.44809973,
        0.        ]])

In [27]:
#Idf for Vocabulary Words
tfidf.idf_

array([1.51082562, 1.22314355, 1.51082562, 1.91629073, 1.22314355,
       1.91629073])

In [28]:
#Vocabulary Words
tfidf.get_feature_names()



['bites', 'dog', 'eats', 'food', 'man', 'meat']

In [29]:
tfidf.transform([test_sample]).toarray()

array([[0.5051001 , 0.40892206, 0.        , 0.        , 0.40892206,
        0.64065543]])

###Word2Vec

####Embeddings using Spacy

In [None]:
!python -m spacy download en_core_web_md

In [31]:
import spacy

In [32]:
nlp = spacy.load('en_core_web_md')

In [33]:
doc = nlp('i love pizza')

In [34]:
doc.vector

array([ 1.11230902e-01,  2.41379991e-01, -1.61966667e-01, -3.14129978e-01,
        1.05768003e-01,  3.91543359e-01,  3.54509979e-01, -1.27896652e-01,
        1.70420006e-01,  1.69123328e+00, -6.31393373e-01,  1.69920668e-01,
        7.96566680e-02, -1.29133360e-02, -1.50936663e-01,  1.39993383e-02,
       -1.84084341e-01,  1.19740999e+00, -2.22010016e-01,  2.34284341e-01,
        2.44166657e-01, -3.25873703e-01,  2.53593326e-01, -1.86454996e-01,
        1.27480000e-01,  4.88123260e-02, -3.57987314e-01, -2.18866989e-01,
        2.98794001e-01, -2.83073336e-02, -1.92849990e-02,  2.78633356e-01,
       -1.75589994e-01,  1.30146667e-01,  9.73366722e-02,  1.38193667e-01,
       -2.25796700e-02, -1.12435341e-01, -3.15516651e-01,  2.65180707e-01,
       -1.24165334e-01,  1.74489990e-01,  1.08916335e-01,  9.63599980e-02,
        2.16483325e-02,  5.32136679e-01, -2.61241674e-01,  2.68553346e-01,
        6.79000234e-03, -9.15016606e-02, -2.10650012e-01, -4.78706658e-02,
        1.10848665e-01, -

In [35]:
doc[0].vector

array([ 1.8733e-01,  4.0595e-01, -5.1174e-01, -5.5482e-01,  3.9716e-02,
        1.2887e-01,  4.5137e-01, -5.9149e-01,  1.5591e-01,  1.5137e+00,
       -8.7020e-01,  5.0672e-02,  1.5211e-01, -1.9183e-01,  1.1181e-01,
        1.2131e-01, -2.7212e-01,  1.6203e+00, -2.4884e-01,  1.4060e-01,
        3.3099e-01, -1.8061e-02,  1.5244e-01, -2.6943e-01, -2.7833e-01,
       -5.2123e-02, -4.8149e-01, -5.1839e-01,  8.6262e-02,  3.0818e-02,
       -2.1253e-01, -1.1378e-01, -2.2384e-01,  1.8262e-01, -3.4541e-01,
        8.2611e-02,  1.0024e-01, -7.9550e-02, -8.1721e-01,  6.5621e-03,
        8.0134e-02, -3.9976e-01, -6.3131e-02,  3.2260e-01, -3.1625e-02,
        4.3056e-01, -2.7270e-01, -7.6020e-02,  1.0293e-01, -8.8653e-02,
       -2.9087e-01, -4.7214e-02,  4.6036e-02, -1.7788e-02,  6.4990e-02,
        8.8451e-02, -3.1574e-01, -5.8522e-01,  2.2295e-01, -5.2785e-02,
       -5.5981e-01, -3.9580e-01, -7.9849e-02, -1.0933e-02, -4.1722e-02,
       -5.5576e-01,  8.8707e-02,  1.3710e-01, -2.9873e-03, -2.62

####Embeddings using Gensim

In [36]:
import gensim
corpus = [['dog','bites','man'], ["man", "bites" ,"dog"],["dog","eats","meat"],["man", "eats","food"]]

#####Continuous Bag of Words (CBOW)


In [37]:
model_cbow = gensim.models.Word2Vec(corpus, min_count=1, sg=0) 

In [38]:
words = list(model_cbow.wv.vocab)
print(words)

['dog', 'bites', 'man', 'eats', 'meat', 'food']


In [39]:
model_cbow['meat']

  """Entry point for launching an IPython kernel.


array([-0.00348223,  0.00274166,  0.00286569, -0.00024953, -0.0027612 ,
        0.00016903,  0.0036551 ,  0.00111108,  0.00233274,  0.00089957,
        0.00081475,  0.00447297,  0.00022637, -0.0015832 , -0.00177419,
        0.00248535, -0.0040132 , -0.00499565,  0.00205056,  0.00194684,
        0.0001828 , -0.00046876, -0.00216897, -0.00419755,  0.00100626,
       -0.00095996, -0.00153622, -0.00092589, -0.00281605,  0.00158246,
        0.00190902,  0.00040479, -0.00016452,  0.00042426,  0.00166272,
       -0.00488523,  0.00156547,  0.0004393 , -0.00257476, -0.00397822,
       -0.00226262,  0.0019594 ,  0.00471237, -0.00407758, -0.00204773,
       -0.00217647, -0.00128273,  0.00415247, -0.00298349,  0.00143813,
       -0.00385219, -0.00218028,  0.00466753,  0.00132019, -0.00132702,
        0.00292814,  0.00131277,  0.00174935,  0.00332778, -0.00029809,
       -0.00308627, -0.00291407, -0.00210095,  0.00454491, -0.0048104 ,
       -0.00186057, -0.00131231, -0.00338939,  0.00372805, -0.00

In [40]:
model_cbow.similarity('bites', 'eats')

  """Entry point for launching an IPython kernel.


0.018700752

In [41]:
model_cbow.similarity('dog', 'eats')

  """Entry point for launching an IPython kernel.


-0.18652296

In [42]:
model_cbow.similarity('meat', 'eats')

  """Entry point for launching an IPython kernel.


0.012212791

In [43]:
model_cbow.save('cbow_model.bin')

In [44]:
model_cbow = gensim.models.Word2Vec.load('cbow_model.bin')

#####SkipGram

In [45]:
model_skipgram = gensim.models.Word2Vec(corpus, min_count=1, sg=1)

In [46]:
words = list(model_skipgram.wv.vocab)
print(words)

['dog', 'bites', 'man', 'eats', 'meat', 'food']


In [47]:
model_skipgram['man']

  """Entry point for launching an IPython kernel.


array([-2.0112637e-03,  4.6612024e-03,  2.3878811e-05,  1.3731554e-03,
       -2.0678404e-03, -3.6847231e-03, -2.1829552e-03, -6.3940813e-04,
        2.7256953e-03,  3.1432479e-03,  3.4235683e-03, -4.8793722e-03,
       -3.3713137e-03,  1.4868978e-03,  3.5669198e-03,  2.3784512e-03,
        1.4670979e-04,  9.1529329e-04,  1.0346138e-03, -1.2632761e-03,
        3.8981244e-03, -1.8848220e-03, -3.3825007e-03,  4.1506458e-03,
        4.9111829e-04, -3.4495068e-03, -3.8652134e-03,  2.8149204e-03,
       -4.9288641e-03,  3.9023457e-03,  1.3927063e-03,  3.6816704e-03,
       -3.8061489e-03, -1.9930471e-03, -1.0662479e-03,  6.0274446e-04,
        7.6384575e-04,  6.1616849e-04, -6.1438896e-04,  3.0194691e-03,
        4.1772745e-04, -2.8730193e-03,  4.1851439e-03,  4.2853854e-03,
       -2.3214680e-03, -1.0146014e-03, -3.7481105e-03,  2.9782655e-03,
       -2.2786264e-03, -2.0994677e-03, -4.1740588e-03, -2.8265764e-03,
        4.0045881e-04, -3.0477790e-03, -5.4128142e-04, -1.0910128e-03,
      

In [48]:
model_skipgram.similarity('bites', 'eats')

  """Entry point for launching an IPython kernel.


0.018704321

In [49]:
model_skipgram.similarity('dog', 'eats')

  """Entry point for launching an IPython kernel.


-0.18650742

In [50]:
model_skipgram.similarity('meat', 'eats')

  """Entry point for launching an IPython kernel.


0.01214213

In [51]:
model_skipgram.save('skipgram_model.bin')

In [52]:
model_skipgram = gensim.models.Word2Vec.load('skipgram_model.bin')

###Document Vectors