In [36]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\palla\AppData\Roaming\nltk_data...


True

In [2]:
doc = ["How is your week going It is going good I am learning NLP on a Tuesday evening which is a part of Deep learning"]

In [3]:
tok = Tokenizer() # Assigns a unique token value to each unique word in the sentence

In [4]:
tok.fit_on_texts(doc)

In [5]:
tok.word_index

{'is': 1,
 'going': 2,
 'learning': 3,
 'a': 4,
 'how': 5,
 'your': 6,
 'week': 7,
 'it': 8,
 'good': 9,
 'i': 10,
 'am': 11,
 'nlp': 12,
 'on': 13,
 'tuesday': 14,
 'evening': 15,
 'which': 16,
 'part': 17,
 'of': 18,
 'deep': 19}

In [6]:
tok = Tokenizer(oov_token="<UNK>") # Assigning a unique token to each unique word and assigning UNK to words which are not in training ; UNK - Unknown,oov - out of vocabulary

In [7]:
tok.fit_on_texts(doc)

In [8]:
tok.word_index

{'<UNK>': 1,
 'is': 2,
 'going': 3,
 'learning': 4,
 'a': 5,
 'how': 6,
 'your': 7,
 'week': 8,
 'it': 9,
 'good': 10,
 'i': 11,
 'am': 12,
 'nlp': 13,
 'on': 14,
 'tuesday': 15,
 'evening': 16,
 'which': 17,
 'part': 18,
 'of': 19,
 'deep': 20}

In [9]:
sequences = tok.texts_to_sequences(doc)

In [10]:
sequences

[[6,
  2,
  7,
  8,
  3,
  9,
  2,
  3,
  10,
  11,
  12,
  4,
  13,
  14,
  5,
  15,
  16,
  17,
  2,
  5,
  18,
  19,
  20,
  4]]

In [11]:
test = ["I am learning NLP","It is a great day","We are doing data cleaning today","Deep learning is interesting","Today is tuesday","It is third day of the week" ]

In [14]:
df = pd.DataFrame({"raw_text" : test})

In [15]:
df

Unnamed: 0,raw_text
0,I am learning NLP
1,It is a great day
2,We are doing data cleaning today
3,Deep learning is interesting
4,Today is tuesday
5,It is third day of the week


In [18]:
df["sequences"] = df["raw_text"].apply(lambda x: tok.texts_to_sequences([x]))

In [19]:
df

Unnamed: 0,raw_text,sequences
0,I am learning NLP,"[[11, 12, 4, 13]]"
1,It is a great day,"[[9, 2, 5, 1, 1]]"
2,We are doing data cleaning today,"[[1, 1, 1, 1, 1, 1]]"
3,Deep learning is interesting,"[[20, 4, 2, 1]]"
4,Today is tuesday,"[[1, 2, 15]]"
5,It is third day of the week,"[[9, 2, 1, 1, 19, 1, 8]]"


In [21]:
# Padding 
# 1. It is done to make all sentences of same length
# 2. It basically checks the length of the sentence, if the length is greater than the max_len it truncates the sentence and if the length < max_len it padds the sentence.
# 3. There are 2 types of truncating and padding namely, prepadding, post padding and pretruncating and posttruncating

In [22]:
df["prepadding"] = df["sequences"].apply(lambda x : pad_sequences(x, maxlen=5, padding="pre", truncating="pre"))

In [23]:
df

Unnamed: 0,raw_text,sequences,prepadding
0,I am learning NLP,"[[11, 12, 4, 13]]","[[0, 11, 12, 4, 13]]"
1,It is a great day,"[[9, 2, 5, 1, 1]]","[[9, 2, 5, 1, 1]]"
2,We are doing data cleaning today,"[[1, 1, 1, 1, 1, 1]]","[[1, 1, 1, 1, 1]]"
3,Deep learning is interesting,"[[20, 4, 2, 1]]","[[0, 20, 4, 2, 1]]"
4,Today is tuesday,"[[1, 2, 15]]","[[0, 0, 1, 2, 15]]"
5,It is third day of the week,"[[9, 2, 1, 1, 19, 1, 8]]","[[1, 1, 19, 1, 8]]"


In [24]:
df["postpadding"] = df["sequences"].apply(lambda x : pad_sequences(x, maxlen=5, padding="post", truncating="post"))

In [25]:
df

Unnamed: 0,raw_text,sequences,prepadding,postpadding
0,I am learning NLP,"[[11, 12, 4, 13]]","[[0, 11, 12, 4, 13]]","[[11, 12, 4, 13, 0]]"
1,It is a great day,"[[9, 2, 5, 1, 1]]","[[9, 2, 5, 1, 1]]","[[9, 2, 5, 1, 1]]"
2,We are doing data cleaning today,"[[1, 1, 1, 1, 1, 1]]","[[1, 1, 1, 1, 1]]","[[1, 1, 1, 1, 1]]"
3,Deep learning is interesting,"[[20, 4, 2, 1]]","[[0, 20, 4, 2, 1]]","[[20, 4, 2, 1, 0]]"
4,Today is tuesday,"[[1, 2, 15]]","[[0, 0, 1, 2, 15]]","[[1, 2, 15, 0, 0]]"
5,It is third day of the week,"[[9, 2, 1, 1, 19, 1, 8]]","[[1, 1, 19, 1, 8]]","[[9, 2, 1, 1, 19]]"


In [26]:
# Stemming and Lemmatizing - It is done to truncate unnecessary suffixes and prefixes

In [28]:
por = PorterStemmer()

In [31]:
df["stemmed_data"] = df["raw_text"].apply(lambda x : por.stem(x))

In [32]:
df

Unnamed: 0,raw_text,sequences,prepadding,postpadding,stemmed_data
0,I am learning NLP,"[[11, 12, 4, 13]]","[[0, 11, 12, 4, 13]]","[[11, 12, 4, 13, 0]]",i am learning nlp
1,It is a great day,"[[9, 2, 5, 1, 1]]","[[9, 2, 5, 1, 1]]","[[9, 2, 5, 1, 1]]",it is a great day
2,We are doing data cleaning today,"[[1, 1, 1, 1, 1, 1]]","[[1, 1, 1, 1, 1]]","[[1, 1, 1, 1, 1]]",we are doing data cleaning today
3,Deep learning is interesting,"[[20, 4, 2, 1]]","[[0, 20, 4, 2, 1]]","[[20, 4, 2, 1, 0]]",deep learning is interest
4,Today is tuesday,"[[1, 2, 15]]","[[0, 0, 1, 2, 15]]","[[1, 2, 15, 0, 0]]",today is tuesday
5,It is third day of the week,"[[9, 2, 1, 1, 19, 1, 8]]","[[1, 1, 19, 1, 8]]","[[9, 2, 1, 1, 19]]",it is third day of the week


In [34]:
lemma = WordNetLemmatizer()

In [37]:
df["lemmed_data"] = df["raw_text"].apply(lambda x : lemma.lemmatize(x))

In [38]:
df

Unnamed: 0,raw_text,sequences,prepadding,postpadding,stemmed_data,lemmed_data
0,I am learning NLP,"[[11, 12, 4, 13]]","[[0, 11, 12, 4, 13]]","[[11, 12, 4, 13, 0]]",i am learning nlp,I am learning NLP
1,It is a great day,"[[9, 2, 5, 1, 1]]","[[9, 2, 5, 1, 1]]","[[9, 2, 5, 1, 1]]",it is a great day,It is a great day
2,We are doing data cleaning today,"[[1, 1, 1, 1, 1, 1]]","[[1, 1, 1, 1, 1]]","[[1, 1, 1, 1, 1]]",we are doing data cleaning today,We are doing data cleaning today
3,Deep learning is interesting,"[[20, 4, 2, 1]]","[[0, 20, 4, 2, 1]]","[[20, 4, 2, 1, 0]]",deep learning is interest,Deep learning is interesting
4,Today is tuesday,"[[1, 2, 15]]","[[0, 0, 1, 2, 15]]","[[1, 2, 15, 0, 0]]",today is tuesday,Today is tuesday
5,It is third day of the week,"[[9, 2, 1, 1, 19, 1, 8]]","[[1, 1, 19, 1, 8]]","[[9, 2, 1, 1, 19]]",it is third day of the week,It is third day of the week
