# Feature Engineering on Text Data

In [1]:
# # Import necessary dependencies and settings

import pandas as pd
import numpy as np
import re
import nltk # natural language toolkit

In [2]:
# Build a tiny sample corpus (a collection of text documents) together with
# a topic label for each document. Every later cell in this notebook derives
# its features from these six sentences.

labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']

# Keep the documents in a numpy array so they can be indexed positionally
# and processed element-wise later on.
corpus = np.array([
    'The sky is blue and beautiful.',
    'Love this blue and beautiful sky!',
    'The quick brown fox jumps over the lazy dog.',
    'The brown fox is quick and the blue dog is lazy!',
    'The sky is very blue and the sky is very beautiful today',
    'The dog is lazy but the brown fox is quick!',
])

print("Corpus:\n", corpus)
print("\nType of Corpus:", type(corpus))

# One row per document; the column order is fixed at construction time.
corpus_df = pd.DataFrame(
    {'Document': corpus, 'Category': labels},
    columns=['Document', 'Category'],
)

corpus_df

Corpus:
 ['The sky is blue and beautiful.' 'Love this blue and beautiful sky!'
 'The quick brown fox jumps over the lazy dog.'
 'The brown fox is quick and the blue dog is lazy!'
 'The sky is very blue and the sky is very beautiful today'
 'The dog is lazy but the brown fox is quick!']

Type of Corpus: <class 'numpy.ndarray'>


Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,The brown fox is quick and the blue dog is lazy!,animals
4,The sky is very blue and the sky is very beaut...,weather
5,The dog is lazy but the brown fox is quick!,animals


# Text Pre-Processing

>>Stemming is the process of reducing inflected words to their word stem (base form). A stemming algorithm reduces the word "saying" to the root word "say", whereas "presumably" becomes "presume". As you can see, this may not always be 100% correct.

>>Lemmatization is closely related to stemming, but it is the algorithmic process of determining the lemma of a word based on its intended meaning. For example, in English, the verb "to walk" may appear as "walk", "walked", "walks", or "walking". The base form "walk", which one might look up in a dictionary, is called the lemma of the word. spaCy doesn't have a built-in stemmer, as lemmatization is considered more correct and productive.

In [3]:
# Set-up for text normalization: the documents will be lowercased, stripped
# of special characters, tokenized, and have their stop words removed.
# This cell prepares the two shared resources that normalize_document uses.
import nltk
# Fetch the NLTK stop-word corpus (a no-op if it is already up to date).
nltk.download('stopwords')

# Tokenizer used to split a cleaned document into tokens.
wpt = nltk.WordPunctTokenizer()
# List of English stop words that will be filtered out of every document.
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Normalize a single text document.

    The document is cleaned in the following order:

    1. every character that is not an ASCII letter, a digit or whitespace
       is removed;
    2. the text is lowercased and leading/trailing whitespace is trimmed;
    3. the text is tokenized with the module-level tokenizer ``wpt``;
    4. tokens found in the module-level ``stop_words`` list are dropped;
    5. the remaining tokens are joined back with single spaces.

    Parameters
    ----------
    doc : str
        Raw text of one document.

    Returns
    -------
    str
        The normalized document (an empty string if nothing survives).
    """
    # Remove ALL special characters.
    # '^' inside the brackets negates the class, so anything other than
    # a-z, A-Z, 0-9 and whitespace is replaced with '' (i.e. deleted).
    # NOTE: the fourth positional parameter of re.sub is `count`, not
    # `flags`. Passing re.I there (its integer value is 2) silently limited
    # the substitution to the first two matches. The class already lists
    # both letter cases, so no flag is required at all.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc)

    doc = doc.lower()  # everything to lowercase
    doc = doc.strip()  # drop any leading or trailing whitespace

    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)

# np.vectorize(...) wraps normalize_document so that it can be called on a
# whole numpy array: the function is applied to every element in turn,
# which saves us from writing the loop ourselves.
normalize_corpus = np.vectorize(normalize_document) 

# Apply the normalization to every document of the corpus.
norm_corpus = normalize_corpus(corpus)

norm_corpus

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog', 'brown fox quick blue dog lazy',
       'sky blue sky beautiful today', 'dog lazy brown fox quick'],
      dtype='<U30')

In [4]:
# Inspect the English stop-word list that normalize_document filters against.
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Bag of Words Model

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# min_df and max_df are given as floats, i.e. as proportions of documents:
#   min_df=0.0 -> drop terms that occur in fewer than 0% of the documents
#   max_df=1.0 -> drop terms that occur in more than 100% of the documents
# Neither condition can ever be true, so nothing is ignored and every term
# of the corpus ends up in the vocabulary.
cv = CountVectorizer(max_df=1., min_df=0.)

# Learn the vocabulary and count the terms in one step, then turn the
# resulting sparse matrix into a dense array (one row per document).
cv_matrix = cv.fit_transform(norm_corpus).toarray()

cv_matrix

array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1],
       [0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0]], dtype=int64)

In [6]:
# The output represents a numeric term-frequency based feature
# vector for each document. To understand it better, we can represent
# it using the feature names and view it as a dataframe.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. The fallback keeps the cell
# working on older installations that only have the old method.
if hasattr(cv, 'get_feature_names_out'):
    vocab = cv.get_feature_names_out()
else:
    vocab = cv.get_feature_names()
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,beautiful,blue,brown,dog,fox,jumps,lazy,love,quick,sky,today
0,1,1,0,0,0,0,0,0,0,1,0
1,1,1,0,0,0,0,0,1,0,1,0
2,0,0,1,1,1,1,1,0,1,0,0
3,0,1,1,1,1,0,1,0,1,0,0
4,1,1,0,0,0,0,0,0,0,2,1
5,0,0,1,1,1,0,1,0,1,0,0


# Bag of N-Grams Model

In [7]:
# ngram_range=(2, 2) -> only look at pairs of consecutive words (bi-grams).
# For example, ngram_range=(1, 2) would count single words and bi-grams
# together; try it yourself and compare the result.
# The lower bound must not exceed the upper bound, so ranges such as
# (2, 1) or (3, 1) won't work.
bv = CountVectorizer(ngram_range=(2,2))

bv_matrix = bv.fit_transform(norm_corpus)
bv_matrix = bv_matrix.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. The fallback keeps the cell
# working on older installations that only have the old method.
if hasattr(bv, 'get_feature_names_out'):
    vocab = bv.get_feature_names_out()
else:
    vocab = bv.get_feature_names()

pd.DataFrame(bv_matrix, columns=vocab)

Unnamed: 0,beautiful sky,beautiful today,blue beautiful,blue dog,blue sky,brown fox,dog lazy,fox jumps,fox quick,jumps lazy,lazy brown,lazy dog,love blue,quick blue,quick brown,sky beautiful,sky blue
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,0
3,0,0,0,1,0,1,1,0,1,0,0,0,0,1,0,0,0
4,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1
5,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0


# TF-IDF Model

![tf-idf image](datasets_n_images/images/tf_idf_formulae_image.png 'tf-idf image')

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=0.0 / max_df=1.0 keep every term (nothing is filtered by document
# frequency); use_idf=True enables the inverse-document-frequency weighting.
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. The fallback keeps the cell
# working on older installations that only have the old method.
if hasattr(tv, 'get_feature_names_out'):
    vocab = tv.get_feature_names_out()
else:
    vocab = tv.get_feature_names()

# Round to two decimals purely for a more readable display.
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

Unnamed: 0,beautiful,blue,brown,dog,fox,jumps,lazy,love,quick,sky,today
0,0.6,0.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0
1,0.46,0.39,0.0,0.0,0.0,0.0,0.0,0.66,0.0,0.46,0.0
2,0.0,0.0,0.38,0.38,0.38,0.54,0.38,0.0,0.38,0.0,0.0
3,0.0,0.36,0.42,0.42,0.42,0.0,0.42,0.0,0.42,0.0,0.0
4,0.36,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.72,0.52
5,0.0,0.0,0.45,0.45,0.45,0.0,0.45,0.0,0.45,0.0,0.0


# Document Similarity

![cosine similarity](datasets_n_images/images/cosine_similarity_image.png 'cosine_similarity_image')

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF vectors of the documents:
# entry (i, j) compares document i with document j, so the diagonal is 1.
similarity_matrix = cosine_similarity(tv_matrix)
# Wrap the matrix in a DataFrame; rows and columns are the document indices.
similarity_df = pd.DataFrame(similarity_matrix)
print(similarity_df)

          0         1         2         3         4         5
0  1.000000  0.753128  0.000000  0.185447  0.807539  0.000000
1  0.753128  1.000000  0.000000  0.139665  0.608181  0.000000
2  0.000000  0.000000  1.000000  0.784362  0.000000  0.839987
3  0.185447  0.139665  0.784362  1.000000  0.109653  0.933779
4  0.807539  0.608181  0.000000  0.109653  1.000000  0.000000
5  0.000000  0.000000  0.839987  0.933779  0.000000  1.000000


In [10]:
# ## Clustering documents using similarity features

from sklearn.cluster import KMeans

# Group the documents into 2 clusters, using each document's row of
# similarity scores as its feature vector.
# k-means starts from randomly chosen centroids, so random_state is fixed to
# make the cluster ids reproducible on every run. n_init is set explicitly
# because its default value differs between scikit-learn releases.
km = KMeans(n_clusters=2, n_init=10, random_state=42)

# Only the fitted model is needed here; fit_transform() would additionally
# compute a distance matrix that this cell never uses.
km.fit(similarity_df)

# labels_ is the fitted attribute holding the cluster id of every document.
cluster_labels = pd.DataFrame(km.labels_, columns=['ClusterLabel'])

# Show the cluster ids next to the original documents and their categories.
print(pd.concat([corpus_df, cluster_labels], axis=1))

                                            Document Category  ClusterLabel
0                     The sky is blue and beautiful.  weather             1
1                  Love this blue and beautiful sky!  weather             1
2       The quick brown fox jumps over the lazy dog.  animals             0
3   The brown fox is quick and the blue dog is lazy!  animals             0
4  The sky is very blue and the sky is very beaut...  weather             1
5        The dog is lazy but the brown fox is quick!  animals             0
