<a href="https://colab.research.google.com/github/mralamdari/NLP-Text-Processing/blob/main/NLP_Text_Representation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy
import sklearn
import numpy as np
import matplotlib.pyplot as plt

In [2]:
documents = ["Dog bites man.", "Man bites dog.", "Dog eats meat.", "Man eats food."]
documents = [sentence.lower().replace('.', '') for sentence in documents]
documents

['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']

###One-Hot Encoding

In [9]:
#Vocabulary
all_words = []
for sentence in documents:
  all_words.extend(sentence.split())

unique_words = set(all_words)
vocab = dict(zip(unique_words, range(len(unique_words))))
vocab

{'bites': 0, 'dog': 1, 'eats': 4, 'food': 5, 'man': 3, 'meat': 2}

In [38]:
#onehot creator
def onehot_creator(sentence):
  sentence = sentence.lower().split()
  onehot_encoded = np.zeros((len(sentence), len(vocab)))
  for i, word in enumerate(sentence):
    word_id = vocab.get(word)
    if word_id != None:
      onehot_encoded[i][word_id] = 1
  return onehot_encoded

In [40]:
onehot_creator("Dog bites man")

array([[0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.]])

In [66]:
test_sample = "Dog bites man and then eates meat"
onehot_creator(test_sample)

array([[0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

One Hot Encoding with Scikit-learn

In [42]:
from sklearn import preprocessing

In [43]:
label_encoder = preprocessing.LabelEncoder()
onehot_encoder = preprocessing.OneHotEncoder()

In [44]:
label_encoder.fit_transform(all_words)

array([1, 0, 4, 4, 0, 1, 1, 2, 5, 4, 2, 3])

In [50]:
onehot_encoder.fit_transform(np.array(all_words).reshape(-1, 1)).toarray()

array([[0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.]])

###Bag of Words (BoW)

In [51]:
from sklearn import feature_extraction

In [52]:
count_vect = feature_extraction.text.CountVectorizer()

In [54]:
bow_representation = count_vect.fit_transform(documents)

In [56]:
count_vect.vocabulary_

{'bites': 0, 'dog': 1, 'eats': 2, 'food': 3, 'man': 4, 'meat': 5}

In [57]:
for rep in bow_representation:
  print(rep.toarray())

[[1 1 0 0 1 0]]
[[1 1 0 0 1 0]]
[[0 1 1 0 0 1]]
[[0 0 1 1 1 0]]


In [62]:
count_vect.transform([test_sample]).toarray()

array([[1, 1, 0, 0, 1, 1]])

BoW with Binary vectors

In [63]:
binary_count_vector = feature_extraction.text.CountVectorizer(binary=True)
binary_count_vector.fit(documents)

CountVectorizer(binary=True)

In [84]:
binary_count_vector.transform([test_sample]).toarray()

array([[1, 1, 0, 0, 1, 1]])

###Bag of N-Grams (BoN)

In [78]:
count_vect = feature_extraction.text.CountVectorizer(ngram_range=(1, 3))

In [79]:
bon_representation = count_vect.fit_transform(documents)

In [80]:
count_vect.vocabulary_

{'bites': 0,
 'bites dog': 1,
 'bites man': 2,
 'dog': 3,
 'dog bites': 4,
 'dog bites man': 5,
 'dog eats': 6,
 'dog eats meat': 7,
 'eats': 8,
 'eats food': 9,
 'eats meat': 10,
 'food': 11,
 'man': 12,
 'man bites': 13,
 'man bites dog': 14,
 'man eats': 15,
 'man eats food': 16,
 'meat': 17}

In [81]:
for rep in bow_representation:
  print(rep.toarray())

[[1 1 0 0 1 0]]
[[1 1 0 0 1 0]]
[[0 1 1 0 0 1]]
[[0 0 1 1 1 0]]


In [86]:
count_vect.transform([test_sample]).toarray()

array([[1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]])

###TF-IDF

In [87]:
tfidf = feature_extraction.text.TfidfVectorizer()

In [88]:
bow_rep_tfidf = tfidf.fit_transform(documents)

In [90]:
bow_rep_tfidf.toarray()

array([[0.65782931, 0.53256952, 0.        , 0.        , 0.53256952,
        0.        ],
       [0.65782931, 0.53256952, 0.        , 0.        , 0.53256952,
        0.        ],
       [0.        , 0.44809973, 0.55349232, 0.        , 0.        ,
        0.70203482],
       [0.        , 0.        , 0.55349232, 0.70203482, 0.44809973,
        0.        ]])

In [92]:
#Idf for Vocabulary Words
tfidf.idf_

array([1.51082562, 1.22314355, 1.51082562, 1.91629073, 1.22314355,
       1.91629073])

In [94]:
#Vocabulary Words
tfidf.get_feature_names()



['bites', 'dog', 'eats', 'food', 'man', 'meat']

In [97]:
tfidf.transform([test_sample]).toarray()

array([[0.5051001 , 0.40892206, 0.        , 0.        , 0.40892206,
        0.64065543]])

###Word2Vec

####Embeddings using Spacy

In [None]:
!python -m spacy download en_core_web_md

In [4]:
import spacy

In [8]:
nlp = spacy.load('en_core_web_md')

In [9]:
doc = nlp('i love pizza')

In [11]:
doc.vector

array([ 1.11230902e-01,  2.41379991e-01, -1.61966667e-01, -3.14129978e-01,
        1.05768003e-01,  3.91543359e-01,  3.54509979e-01, -1.27896652e-01,
        1.70420006e-01,  1.69123328e+00, -6.31393373e-01,  1.69920668e-01,
        7.96566680e-02, -1.29133360e-02, -1.50936663e-01,  1.39993383e-02,
       -1.84084341e-01,  1.19740999e+00, -2.22010016e-01,  2.34284341e-01,
        2.44166657e-01, -3.25873703e-01,  2.53593326e-01, -1.86454996e-01,
        1.27480000e-01,  4.88123260e-02, -3.57987314e-01, -2.18866989e-01,
        2.98794001e-01, -2.83073336e-02, -1.92849990e-02,  2.78633356e-01,
       -1.75589994e-01,  1.30146667e-01,  9.73366722e-02,  1.38193667e-01,
       -2.25796700e-02, -1.12435341e-01, -3.15516651e-01,  2.65180707e-01,
       -1.24165334e-01,  1.74489990e-01,  1.08916335e-01,  9.63599980e-02,
        2.16483325e-02,  5.32136679e-01, -2.61241674e-01,  2.68553346e-01,
        6.79000234e-03, -9.15016606e-02, -2.10650012e-01, -4.78706658e-02,
        1.10848665e-01, -

In [12]:
doc[0].vector

array([ 1.8733e-01,  4.0595e-01, -5.1174e-01, -5.5482e-01,  3.9716e-02,
        1.2887e-01,  4.5137e-01, -5.9149e-01,  1.5591e-01,  1.5137e+00,
       -8.7020e-01,  5.0672e-02,  1.5211e-01, -1.9183e-01,  1.1181e-01,
        1.2131e-01, -2.7212e-01,  1.6203e+00, -2.4884e-01,  1.4060e-01,
        3.3099e-01, -1.8061e-02,  1.5244e-01, -2.6943e-01, -2.7833e-01,
       -5.2123e-02, -4.8149e-01, -5.1839e-01,  8.6262e-02,  3.0818e-02,
       -2.1253e-01, -1.1378e-01, -2.2384e-01,  1.8262e-01, -3.4541e-01,
        8.2611e-02,  1.0024e-01, -7.9550e-02, -8.1721e-01,  6.5621e-03,
        8.0134e-02, -3.9976e-01, -6.3131e-02,  3.2260e-01, -3.1625e-02,
        4.3056e-01, -2.7270e-01, -7.6020e-02,  1.0293e-01, -8.8653e-02,
       -2.9087e-01, -4.7214e-02,  4.6036e-02, -1.7788e-02,  6.4990e-02,
        8.8451e-02, -3.1574e-01, -5.8522e-01,  2.2295e-01, -5.2785e-02,
       -5.5981e-01, -3.9580e-01, -7.9849e-02, -1.0933e-02, -4.1722e-02,
       -5.5576e-01,  8.8707e-02,  1.3710e-01, -2.9873e-03, -2.62

####Embeddings using Gensim

In [17]:
import gensim
corpus = [['dog','bites','man'], ["man", "bites" ,"dog"],["dog","eats","meat"],["man", "eats","food"]]

#####Continuous Bag of Words (CBOW)


In [19]:
model_cbow = gensim.models.Word2Vec(corpus, min_count=1, sg=0) 

In [22]:
words = list(model_cbow.wv.vocab)
print(words)

['dog', 'bites', 'man', 'eats', 'meat', 'food']


In [23]:
model_cbow['meat']

  """Entry point for launching an IPython kernel.


array([ 1.86382572e-03,  4.39878786e-03,  3.91667662e-03,  2.64534191e-03,
        1.39515847e-04, -2.49983254e-03,  2.38882843e-03, -1.69968116e-03,
        4.29632561e-03,  3.49699054e-04,  5.70764823e-04,  4.02356964e-03,
       -1.76552846e-03, -3.02880304e-03, -2.74629914e-03, -4.28141514e-03,
        1.33252272e-03, -3.10373516e-03,  1.10409586e-04,  5.98974148e-05,
       -2.84599466e-03, -2.70853122e-03, -4.16035764e-03, -1.03914899e-04,
       -1.86258834e-03,  3.50842049e-04, -3.41786398e-03, -8.81396525e-04,
        4.09322605e-03, -2.34383671e-03,  3.26915015e-03, -5.96057624e-04,
        4.13260376e-03, -3.64906224e-03,  4.76058852e-03, -2.61705997e-03,
        2.90847500e-03,  4.65514185e-03, -2.32121581e-03, -3.33201978e-03,
       -3.26441205e-03, -4.82923258e-03,  2.82629230e-03,  1.63173338e-03,
        2.22920161e-03, -1.38756237e-03,  2.21916381e-03, -2.05490948e-03,
        1.66887674e-03, -1.93758542e-03,  4.11832612e-03, -1.64936425e-03,
       -7.36534363e-04,  

In [24]:
model_cbow.similarity('bites', 'eats')

  """Entry point for launching an IPython kernel.


0.059683584

In [28]:
model_cbow.similarity('dog', 'eats')

  """Entry point for launching an IPython kernel.


-0.018639755

In [29]:
model_cbow.similarity('meat', 'eats')

  """Entry point for launching an IPython kernel.


0.0014443402

In [30]:
model_cbow.save('cbow_model.bin')

In [32]:
model_cbow = gensim.models.Word2Vec.load('cbow_model.bin')

#####SkipGram

In [33]:
model_skipgram = gensim.models.Word2Vec(corpus, min_count=1, sg=1)

In [34]:
words = list(model_skipgram.wv.vocab)
print(words)

['dog', 'bites', 'man', 'eats', 'meat', 'food']


In [35]:
model_skipgram['man']

  """Entry point for launching an IPython kernel.


array([ 2.70033441e-03,  5.27583470e-04, -4.30648820e-03, -1.87176338e-03,
       -2.85190740e-03,  8.82315624e-04, -9.35057818e-04,  2.34860973e-03,
       -3.11193289e-03,  2.71500554e-03,  2.00413563e-03,  3.42800026e-03,
        3.81431496e-03, -1.50338141e-03,  4.52308916e-03,  2.76574004e-03,
        2.84280744e-03, -4.34568094e-04,  3.64157860e-03,  3.24759458e-04,
        4.38285898e-03,  3.73717630e-03, -5.96458209e-04, -3.59512120e-03,
       -4.75550396e-03, -4.01656143e-03, -3.80055862e-03, -3.69138061e-03,
        2.55903648e-03,  2.36403127e-03, -1.27182808e-03, -4.41914512e-04,
        3.69670708e-03,  3.69484373e-03, -2.50596879e-03, -1.21253124e-03,
        1.58703560e-03, -3.38559010e-04, -1.33460097e-03,  1.15172705e-04,
       -2.64002127e-03, -1.48099149e-03, -2.89875106e-03,  3.01210280e-03,
        2.95349932e-03,  1.84020796e-03,  3.93898401e-04, -4.83518001e-03,
       -3.58779845e-03, -1.47141621e-03,  2.94377888e-03,  3.88921821e-03,
       -3.27451108e-03,  

In [36]:
model_skipgram.similarity('bites', 'eats')

  """Entry point for launching an IPython kernel.


0.059682388

In [37]:
model_skipgram.similarity('dog', 'eats')

  """Entry point for launching an IPython kernel.


-0.018645156

In [38]:
model_skipgram.similarity('meat', 'eats')

  """Entry point for launching an IPython kernel.


0.001372223