# <font color='red'>description_preprocessing.ipynb</font>

<br><b>Filename: description_preprocessing.ipynb</b> ---> <font color='purple'>defines the implementation pipeline for preprocessing the description for each record in the provided dataset. The adopted preprocessing sub-approaches are: stop-word removal, lemmatization (text normalization), custom word set removal etc.</font>
<hr/>
This notebook defines the following functions (listed in the same order in which they are defined in the cells below):
<ol>
    <li><b>remove_stopwords(text): </b> Given input text, remove all the stopwords. </li>
    <li><b>perform_lemmatization(text):</b> Given the input text, perform text normalization (lemmatization)</li>
    <li><b>custom_word_removal( text, brand ):</b> Given the input text, remove a set of custom words ( manually created frequently occurring words in this dataset ) and the brand name from the description.</li>
    <li><b>preprocess_description( df ):</b>The driver function for description text preprocessing pipeline.</li>
</ol>

<img src='images/text_preprocess.png'>

### CELL #1: importing required modules

In [9]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the WordNet corpus (network access on first run); WordNetLemmatizer relies on it.
nltk.download('wordnet')
# NOTE(review): stopwords.words() needs the NLTK 'stopwords' corpus, which is not
# downloaded in this cell — presumably it is already installed locally; confirm.
# Stored as a set; remove_stopwords() tests every word for membership in it.
stop_words = set(stopwords.words('english'))
# Single module-level lemmatizer instance used by perform_lemmatization().
lemmatizer = WordNetLemmatizer()

### CELL #2: defining remove_stopwords(text)
<br>Function description in the top cell
<br>This function removes the stop words from the input text and returns the updated state.


In [14]:
def remove_stopwords(text):
    """Return *text* with every word found in ``stop_words`` dropped.

    The input is split on whitespace and the comparison is exact
    (case-sensitive) against the module-level ``stop_words`` set. Each
    surviving word is followed by a single space, so a non-empty result
    always ends with a trailing space; an input with no surviving words
    yields the empty string.
    """
    survivors = [token for token in text.split() if token not in stop_words]
    return "".join(token + " " for token in survivors)

### CELL #3: defining perform_lemmatization(text)
<br>Function description in the top cell
<br>This function normalizes the input text ( lemmatization ) and returns the updated state.

In [1]:
def perform_lemmatization(text):
    """Return *text* with every whitespace-separated word lemmatized.

    Each word is passed to the module-level ``lemmatizer`` with its
    default arguments. The lemmas are emitted in the original order, each
    followed by a single space, so a non-empty result ends with a trailing
    space and an empty/blank input yields the empty string.
    """
    lemmas = map(lemmatizer.lemmatize, text.split())
    return "".join(lemma + " " for lemma in lemmas)

### CELL #4: defining custom_word_removal( text,brand )
<br>Function description in the top cell
<br>This function does the following sequence of operations:
<ol>
    <li>Define a set of surplus words including the brand name.</li>
    <li>Remove every token that contains a digit (e.g. "500ml", "x1", "2020")</li>
    <li>Remove all words with length <=2 </li>
    <li>Remove all purely numeric strings</li>
    <li>Remove duplicate words</li>
    <li>Return the updated description text.</li>
</ol>

In [2]:
def custom_word_removal(text, brand):
    """Strip dataset-specific noise words from an already-cleaned description.

    Args:
        text: Description text; tokens are taken to be whitespace-separated.
        brand: Brand name for the record. A token is dropped only when it
            equals ``brand`` exactly, so a multi-word brand never matches a
            single token here.

    Returns:
        The surviving tokens in their original order, each followed by one
        space (a non-empty result therefore ends with a trailing space);
        the empty string when nothing survives.

    A token is discarded when any of the following holds:
        * it contains an ASCII digit (removed up front by the regex),
        * it equals ``brand`` or is one of the surplus words below,
        * it is at most two characters long,
        * it is purely numeric according to ``str.isnumeric``,
        * an identical token has already been kept (de-duplication).
    """
    # STEP-1: manually curated words that occur very frequently in this dataset.
    surplus_words = {
        'free', 'cost', 'delivery', 'india', 'offer', 'customer', 'price',
        'sale', 'warranty', 'guarantee', 'satisfaction', 'replacement',
        'shipping', 'cash', 'code', 'buy', 'flipkartcom', 'now', 'hurry',
        'genuine', 'product', 'day', 'key', 'specification',
    }

    # STEP-2: erase every word-character run that contains an ASCII digit.
    tokens = re.sub(r'\w*[0-9]\w*', "", text).split()

    # STEPS 3, 4 AND 5: filter tokens. `seen` mirrors `kept` as a set so the
    # duplicate check is O(1) instead of re-splitting the output string for
    # every token.
    kept = []
    seen = set()
    for token in tokens:
        if (
            token == brand
            or token in surplus_words
            or len(token) <= 2
            or token.isnumeric()
            or token in seen
        ):
            continue
        seen.add(token)
        kept.append(token)

    # STEP-6: same "word + space" layout the rest of the pipeline produces.
    return "".join(token + " " for token in kept)

### CELL #5: defining preprocess_description( df )
<br>The driver function for description preprocessing
<br>Function description in the top cell
<br>This function does the following sequence of operations:
<ol>
    <li>Replace multiple spaces with single whitespace.</li>
    <li>Convert entire text to lowercase.</li>
    <li>Use remove_stopwords() to remove the stop words from the description</li>
    <li>Use perform_lemmatization() to normalize the description.</li>
    <li>Remove the custom words from description using custom_word_removal()</li>
    <li>Record each intermediate stage of function call preprocessing as a separate column in the original dataset.</li>
    <li>Remove all records with empty description field.</li>
    <li>Return the updated dataset</li>
</ol>

In [23]:
def preprocess_description(df):
    """Run the full description-preprocessing pipeline over a dataset.

    Args:
        df: DataFrame with a ``description`` and a ``brand`` column. It is
            modified in place: both columns are overwritten with their
            normalized values and the three intermediate columns listed
            below are added. Rows are processed positionally, so any index
            is accepted.

    Returns:
        A new DataFrame (index reset to 0..n-1) containing only the rows
        whose fully processed description is non-empty, with the columns:
            * ``description``         - lowercased, alphanumerics and spaces
                                        only, brand name removed
            * ``brand``               - lowercased; ``'brand'`` when the
                                        original value is not a string
            * ``ws_description``      - after stop-word removal
            * ``lem_ws_description``  - after lemmatization
            * ``custom``              - after custom word removal
    """
    print("---------------------- PREPROCESSING STARTS HERE")
    descriptions = []
    brands = []
    ws_description = []
    lem_ws_description = []
    custom = []

    for raw_description, raw_brand in zip(df['description'], df['brand']):
        # A missing description (e.g. NaN) is treated as empty text; the row
        # is then discarded by STEP-7 instead of crashing the regex below.
        description = raw_description if isinstance(raw_description, str) else ""

        description = re.sub(r'\s+', ' ', description)  # ------------------ STEP-1
        description = description.lower()  # ------------------------------- STEP-2

        # STEP-2 for the brand name; anything that is not a string is
        # replaced with the placeholder 'brand'.
        brand = raw_brand.lower() if isinstance(raw_brand, str) else 'brand'

        # Keep only alphanumeric characters and spaces.
        description = "".join(
            ch for ch in description if ch.isalnum() or ch == ' '
        )

        # Remove the brand name only where it starts a word: a plain
        # substring replace would also cut it out of the middle of other
        # words, and an empty brand would strip every space.
        if brand:
            description = re.sub(
                r'(?<!\S)' + re.escape(brand) + ' ', '', description
            )

        without_stopwords = remove_stopwords(description)  # --------------- STEP-3
        lemmatized = perform_lemmatization(without_stopwords)  # ----------- STEP-4
        cleaned = custom_word_removal(lemmatized, brand)  # ---------------- STEP-5

        descriptions.append(description)
        brands.append(brand)
        ws_description.append(without_stopwords)
        lem_ws_description.append(lemmatized)
        custom.append(cleaned)

    print("------ PREPROCESSING DONE....")

    # STEP-6: record every stage as a column. Whole-column assignment is
    # positional, so it does not depend on the index labels of ``df``.
    df['description'] = descriptions
    df['brand'] = brands
    df['ws_description'] = ws_description
    df['lem_ws_description'] = lem_ws_description
    df['custom'] = custom

    # STEP-7: drop records whose processed description is empty.
    keep_row = [bool(text) and text != " " for text in custom]
    df = df.loc[keep_row].reset_index(drop=True)

    return df  # ------------------------------------------------------------ STEP-8