<a href="https://colab.research.google.com/github/mralamdari/NLP-Text-Processing/blob/main/NLP_Text_Representation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy
import sklearn
import numpy as np
import matplotlib.pyplot as plt

In [2]:
documents = ["Dog bites man.", "Man bites dog.", "Dog eats meat.", "Man eats food."]
documents = [sentence.lower().replace('.', '') for sentence in documents]
documents

['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']

###One-Hot Encoding

In [9]:
#Vocabulary
all_words = []
for sentence in documents:
  all_words.extend(sentence.split())

unique_words = set(all_words)
vocab = dict(zip(unique_words, range(len(unique_words))))
vocab

{'bites': 0, 'dog': 1, 'eats': 4, 'food': 5, 'man': 3, 'meat': 2}

In [38]:
#onehot creator
def onehot_creator(sentence):
  sentence = sentence.lower().split()
  onehot_encoded = np.zeros((len(sentence), len(vocab)))
  for i, word in enumerate(sentence):
    word_id = vocab.get(word)
    if word_id != None:
      onehot_encoded[i][word_id] = 1
  return onehot_encoded

In [40]:
onehot_creator("Dog bites man")

array([[0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.]])

In [66]:
test_sample = "Dog bites man and then eates meat"
onehot_creator(test_sample)

array([[0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

One Hot Encoding with Scikit-learn

In [42]:
from sklearn import preprocessing

In [43]:
label_encoder = preprocessing.LabelEncoder()
onehot_encoder = preprocessing.OneHotEncoder()

In [44]:
label_encoder.fit_transform(all_words)

array([1, 0, 4, 4, 0, 1, 1, 2, 5, 4, 2, 3])

In [50]:
onehot_encoder.fit_transform(np.array(all_words).reshape(-1, 1)).toarray()

array([[0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.]])

###Bag of Words (BoW)

In [51]:
from sklearn import feature_extraction

In [52]:
count_vect = feature_extraction.text.CountVectorizer()

In [54]:
bow_representation = count_vect.fit_transform(documents)

In [56]:
count_vect.vocabulary_

{'bites': 0, 'dog': 1, 'eats': 2, 'food': 3, 'man': 4, 'meat': 5}

In [57]:
for rep in bow_representation:
  print(rep.toarray())

[[1 1 0 0 1 0]]
[[1 1 0 0 1 0]]
[[0 1 1 0 0 1]]
[[0 0 1 1 1 0]]


In [62]:
count_vect.transform([test_sample]).toarray()

array([[1, 1, 0, 0, 1, 1]])

BoW with Binary vectors

In [63]:
binary_count_vector = feature_extraction.text.CountVectorizer(binary=True)
binary_count_vector.fit(documents)

CountVectorizer(binary=True)

In [84]:
binary_count_vector.transform([test_sample]).toarray()

array([[1, 1, 0, 0, 1, 1]])

###Bag of N-Grams (BoN)

In [78]:
count_vect = feature_extraction.text.CountVectorizer(ngram_range=(1, 3))

In [79]:
bon_representation = count_vect.fit_transform(documents)

In [80]:
count_vect.vocabulary_

{'bites': 0,
 'bites dog': 1,
 'bites man': 2,
 'dog': 3,
 'dog bites': 4,
 'dog bites man': 5,
 'dog eats': 6,
 'dog eats meat': 7,
 'eats': 8,
 'eats food': 9,
 'eats meat': 10,
 'food': 11,
 'man': 12,
 'man bites': 13,
 'man bites dog': 14,
 'man eats': 15,
 'man eats food': 16,
 'meat': 17}

In [81]:
for rep in bow_representation:
  print(rep.toarray())

[[1 1 0 0 1 0]]
[[1 1 0 0 1 0]]
[[0 1 1 0 0 1]]
[[0 0 1 1 1 0]]


In [86]:
count_vect.transform([test_sample]).toarray()

array([[1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]])

###TF-IDF

In [87]:
tfidf = feature_extraction.text.TfidfVectorizer()

In [88]:
bow_rep_tfidf = tfidf.fit_transform(documents)

In [90]:
bow_rep_tfidf.toarray()

array([[0.65782931, 0.53256952, 0.        , 0.        , 0.53256952,
        0.        ],
       [0.65782931, 0.53256952, 0.        , 0.        , 0.53256952,
        0.        ],
       [0.        , 0.44809973, 0.55349232, 0.        , 0.        ,
        0.70203482],
       [0.        , 0.        , 0.55349232, 0.70203482, 0.44809973,
        0.        ]])

In [92]:
#Idf for Vocabulary Words
tfidf.idf_

array([1.51082562, 1.22314355, 1.51082562, 1.91629073, 1.22314355,
       1.91629073])

In [94]:
#Vocabulary Words
tfidf.get_feature_names()



['bites', 'dog', 'eats', 'food', 'man', 'meat']

In [97]:
tfidf.transform([test_sample]).toarray()

array([[0.5051001 , 0.40892206, 0.        , 0.        , 0.40892206,
        0.64065543]])

###Word2Vec

In [3]:
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4 MB)
[K     |████████████████████████████████| 96.4 MB 1.2 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [4]:
import spacy

In [8]:
nlp = spacy.load('en_core_web_md')

In [9]:
doc = nlp('i love pizza')

In [11]:
doc.vector

array([ 1.11230902e-01,  2.41379991e-01, -1.61966667e-01, -3.14129978e-01,
        1.05768003e-01,  3.91543359e-01,  3.54509979e-01, -1.27896652e-01,
        1.70420006e-01,  1.69123328e+00, -6.31393373e-01,  1.69920668e-01,
        7.96566680e-02, -1.29133360e-02, -1.50936663e-01,  1.39993383e-02,
       -1.84084341e-01,  1.19740999e+00, -2.22010016e-01,  2.34284341e-01,
        2.44166657e-01, -3.25873703e-01,  2.53593326e-01, -1.86454996e-01,
        1.27480000e-01,  4.88123260e-02, -3.57987314e-01, -2.18866989e-01,
        2.98794001e-01, -2.83073336e-02, -1.92849990e-02,  2.78633356e-01,
       -1.75589994e-01,  1.30146667e-01,  9.73366722e-02,  1.38193667e-01,
       -2.25796700e-02, -1.12435341e-01, -3.15516651e-01,  2.65180707e-01,
       -1.24165334e-01,  1.74489990e-01,  1.08916335e-01,  9.63599980e-02,
        2.16483325e-02,  5.32136679e-01, -2.61241674e-01,  2.68553346e-01,
        6.79000234e-03, -9.15016606e-02, -2.10650012e-01, -4.78706658e-02,
        1.10848665e-01, -