# Text Encoding 
![transform](TextEncoding.png)

This jupyter notebook explains how text encoding can be done before feeding text data to train machine learning model.


### 1.> Lets load required libraries.

In [88]:
import pandas as pd
import math

### 2.> Define corpus.

In [89]:
# Defining corpus => two seperate documents
# Following is list of string, having only one element in list.
document1 = ["The sky is blue. The sun is bright today."]
document2 = ["The sun in the sky is bright. We can see the shining sun, the brighter sun."]

print(document1)
print(document2)

['The sky is blue. The sun is bright today.']
['The sun in the sky is bright. We can see the shining sun, the brighter sun.']


### 3.> Stemming : Words with the same root, but written grammatically differently, can be clubbed together.

In [90]:
# In document2, we have word 'brighter' which is having same root/stem to word 'bright'.
# Replace 'brighter' with 'bright'
document2[0] = document2[0].replace("brighter", "bright")

print(document2)


['The sun in the sky is bright. We can see the shining sun, the bright sun.']


### 4.> Text cleaning and tokenization.

In [91]:

# Following is list of string, having each unique words from each document.
all_tokens_document1 = sum([doc.lower().replace(',', '').replace('.', '').split() for doc in document1], [])
all_tokens_document2 = sum([doc.lower().replace(',', '').replace('.', '').split() for doc in document2], [])

print(all_tokens_document1)
print(all_tokens_document2)

['the', 'sky', 'is', 'blue', 'the', 'sun', 'is', 'bright', 'today']
['the', 'sun', 'in', 'the', 'sky', 'is', 'bright', 'we', 'can', 'see', 'the', 'shining', 'sun', 'the', 'bright', 'sun']


### 5.> Stop word removal

In [92]:
# Define stopwords as list of words
stopwords = ['a', 'the', 'i', 'me',  'is', 'to', 'then', 'what', 'are', 'for', 'my', 'as', 'can', 'and', 'in', 'of', 'am', 'it']

# Subtract stopwords from each tokens.
unique_token_document1 = set(all_tokens_document1) - set(stopwords)
unique_token_document2 = set(all_tokens_document2) - set(stopwords)


print(unique_token_document1)
print(unique_token_document2)

{'blue', 'sun', 'bright', 'sky', 'today'}
{'see', 'we', 'sun', 'bright', 'sky', 'shining'}


In [93]:
# Find unique set of tokens from entire corpus.
unique_tokens_corpus = set(unique_token_document1).union(set(unique_token_document2))

print(unique_tokens_corpus)

{'see', 'we', 'sun', 'blue', 'bright', 'sky', 'today', 'shining'}


### 6.> Making dictionary of unique words and word count appearing for each unique word in each document

In [94]:
# Create dictionary from unique tokens, and assign each key's value
dictionary_of_doc1 = dict.fromkeys(unique_tokens_corpus, 0)     # dictionary_of_doc1 contents all tokens from corpus
print(dictionary_of_doc1)
print(type(dictionary_of_doc1))

{'see': 0, 'we': 0, 'sun': 0, 'blue': 0, 'bright': 0, 'sky': 0, 'today': 0, 'shining': 0}
<class 'dict'>


In [95]:
# Count frequency of each word from first document, and update frequency to dictionary_of_doc1
for token in all_tokens_document1:
    if token in dictionary_of_doc1:
        dictionary_of_doc1[token] += 1

print(unique_token_document1)
print(document1)
print(dictionary_of_doc1)

{'blue', 'sun', 'bright', 'sky', 'today'}
['The sky is blue. The sun is bright today.']
{'see': 0, 'we': 0, 'sun': 1, 'blue': 1, 'bright': 1, 'sky': 1, 'today': 1, 'shining': 0}


In [96]:
# Simillarly, count frequency of each word from second document, and update frequency to dictionary_of_doc2
dictionary_of_doc2 = dict.fromkeys(unique_tokens_corpus, 0)     # dictionary_of_doc2 contents all tokens from corpus
for token in all_tokens_document2:
    if token in dictionary_of_doc2:
        dictionary_of_doc2[token] += 1

print(unique_token_document2)
print(document2)
print(dictionary_of_doc2)

{'see', 'we', 'sun', 'bright', 'sky', 'shining'}
['The sun in the sky is bright. We can see the shining sun, the bright sun.']
{'see': 1, 'we': 1, 'sun': 3, 'blue': 0, 'bright': 2, 'sky': 1, 'today': 0, 'shining': 1}


### 7.> Calculate term frequency : TF

In [97]:
def calculate_term_frequency(doc_dictionary, lenght_of_doc_tokens):
    tf = dict()
    for key, value in doc_dictionary.items():
        #print(f'key : {key} => value : {value}')           # Enable if want to see how internal function is working.
        tf[key] = value / lenght_of_doc_tokens
    return tf    

token_count = float(len(unique_token_document1))            # unique_token_document1 contents only unique tokens from document 1
print("Token count for document 1: " + str(token_count) + "\n")
print("dictionary_of_doc1 => " + str(dictionary_of_doc1) + "\n")
term_frequency_document1 = calculate_term_frequency(doc_dictionary=dictionary_of_doc1, lenght_of_doc_tokens=token_count)
print(term_frequency_document1)



Token count for document 1: 5.0

dictionary_of_doc1 => {'see': 0, 'we': 0, 'sun': 1, 'blue': 1, 'bright': 1, 'sky': 1, 'today': 1, 'shining': 0}

{'see': 0.0, 'we': 0.0, 'sun': 0.2, 'blue': 0.2, 'bright': 0.2, 'sky': 0.2, 'today': 0.2, 'shining': 0.0}


In [98]:
token_count = float(len(unique_token_document2))            # unique_token_document2 contents only unique tokens from document 2
print("Token count for document 2: " + str(token_count) + "\n")
print("dictionary_of_doc2 => " + str(dictionary_of_doc2) + "\n")
term_frequency_document2 = calculate_term_frequency(doc_dictionary=dictionary_of_doc2, lenght_of_doc_tokens=token_count)
print(term_frequency_document2)



Token count for document 2: 6.0

dictionary_of_doc2 => {'see': 1, 'we': 1, 'sun': 3, 'blue': 0, 'bright': 2, 'sky': 1, 'today': 0, 'shining': 1}

{'see': 0.16666666666666666, 'we': 0.16666666666666666, 'sun': 0.5, 'blue': 0.0, 'bright': 0.3333333333333333, 'sky': 0.16666666666666666, 'today': 0.0, 'shining': 0.16666666666666666}


### 8.> Calculate inverse document frequency : IDF

In [99]:
def calculate_idf(*all_document_dictionary):
    idf = dict()
    number_of_dictionary = len(all_document_dictionary)

    # Iterate over either of the document dictionary.
    # Every document dictionary is having same set of keys but different values and we are only interested in keys,
    #  to find its occurance if available in all documents.
    # Keys in each dictionaries are : ['see', 'we', 'sun', 'blue', 'bright', 'sky', 'today', 'shining']
    # We just want to know if key's value is > 0 in each dictonary to know count of documents key belongs to.
    for key in all_document_dictionary[0].keys(): # taking first occurance of document dictionary with index zero (0)
        df = 0
        # Calculate document frequency for 'key'
        for doc_dict in all_document_dictionary:            # Check availability of key in each document.
            if key in doc_dict and doc_dict[key] > 0:       # If key exists in dictionary, and its value is > 0 then increase the document frequency.
                df += 1

        # Calculate IDF for 'key'
        idf_of_term = math.log(number_of_dictionary / (1 + df)) + 1
        idf[key] = idf_of_term

    return idf




inverse_document_frequency_document = calculate_idf(dictionary_of_doc1, dictionary_of_doc2)

print("\n\n\n\n=> " + str(inverse_document_frequency_document))






=> {'see': 1.0, 'we': 1.0, 'sun': 0.5945348918918356, 'blue': 1.0, 'bright': 0.5945348918918356, 'sky': 0.5945348918918356, 'today': 1.0, 'shining': 1.0}


### 9.> Calculate TF-IDF = TF * IDF

In [100]:
# Function to multiply both TF and IDF and store its value into dictionary and return.
def calculate_tfidf(tf, idf):
    tfidf = dict()
    for token, count in tf.items():
        tfidf[token] = count * idf[token]
    return tfidf

In [101]:
print(unique_token_document1)
print(unique_token_document2)

{'blue', 'sun', 'bright', 'sky', 'today'}
{'see', 'we', 'sun', 'bright', 'sky', 'shining'}


In [102]:
# Calculate TF-IDF for each unique tokens for both documents
tfidf1 = calculate_tfidf(term_frequency_document1, inverse_document_frequency_document)
tfidf2 = calculate_tfidf(term_frequency_document2, inverse_document_frequency_document)

# Create a dataframe for all the calculated values
tfidf_df = pd.DataFrame([tfidf1, tfidf2])
tfidf_df


Unnamed: 0,see,we,sun,blue,bright,sky,today,shining
0,0.0,0.0,0.118907,0.2,0.118907,0.118907,0.2,0.0
1,0.166667,0.166667,0.297267,0.0,0.198178,0.099089,0.0,0.166667
