## Text Preprocessing 

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from subprocess import check_output
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import os
import gc
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import re
from nltk.corpus import stopwords
import distance
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup

### Reading the dataset

In [2]:
# Load the training split; the bare name on the last line renders the frame.
df = pd.read_csv("Xtrain.csv")
df

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_wordlen,q2_wordlen,common_words,total_words_union,similarity
0,0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,14,12,10,16,0.154303
1,1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,8,13,4,17,0.154303
2,2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,14,10,3,21,0.308607
3,3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,11,9,0,20,0.462910
4,4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,13,7,2,18,0.617213
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,99995,165922,165923,What are some of the most iconic images of foo...,What are some of the most iconic images of women?,0,10,10,8,12,0.577350
99996,99996,99996,165924,165925,How green is green tea?,"Why isn't ""green"" tea green?",0,5,5,0,10,0.577350
99997,99997,99997,165926,165927,Who would win: Black Panther or Batman?,Who would win in a fight between Black Panther...,1,7,11,5,13,0.866025
99998,99998,99998,165928,165929,"Which school is better, for what, and why: Par...",What are some good design schools/colleges on ...,0,11,10,0,20,0.000000


In [5]:
# Pass every cell of both question columns through str() so the text helpers
# below always receive a string, whatever object type the cell held.
df['question1'] = df['question1'].apply(str)
df['question2'] = df['question2'].apply(str)
# Spot-check a single row.
df.loc[10, 'question1']

'Method to find separation of slits using fresnel biprism?'

## Preprocessing of text

In [53]:
#Text preprocessing template from 
#https://gist.github.com/jiahao87/d57a2535c2ed7315390920ea9296d79f

from bs4 import BeautifulSoup
import spacy
import unidecode
from word2number import w2n
import gensim.downloader as api

import en_core_web_sm
# spaCy pipeline used by text_preprocessing for tokenisation, POS tags,
# stop-word flags and lemmas.
nlp = en_core_web_sm.load()

# Embedding model loaded through gensim's downloader; the commented lines are
# alternative models that can be swapped in.
# NOTE(review): `model` is not referenced by any code visible in this section
# of the notebook (expand_contractions below is purely string/regex based) --
# confirm it is still needed before paying the load cost.
model = api.load("glove-twitter-25")
# model = api.load("glove-twitter-100")
# model = api.load("word2vec-google-news-300")



# Clear the stop-word flag on negations so the stop-word filter in
# text_preprocessing (which checks token.is_stop) keeps them.
deselect_stop_words = ['no', 'not']
for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False


def strip_html_tags(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` and the
    extracted text pieces are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")


def remove_whitespace(text):
    """Trim ``text`` and collapse every run of whitespace into one space."""
    # split() with no arguments already discards leading/trailing whitespace
    # and treats any run of whitespace as a single separator.
    words = text.split()
    return " ".join(words)


def remove_accented_chars(text):
    """Return ``text`` passed through ``unidecode.unidecode``.

    Used to replace accented characters (e.g. the "é" in "café") with the
    library's plain-text transliteration.
    """
    return unidecode.unidecode(text)


def expand_contractions(x):
    """Lower-case ``x`` and normalise contractions, symbols and round numbers.

    Steps, in order:
      1. ``x`` is converted with ``str()`` and lower-cased.
      2. A fixed table of literal substitutions is applied sequentially
         (comma-grouped thousands/millions, typographic apostrophes, English
         contractions, percent and currency signs).  The order matters:
         specific forms such as "won't"/"can't" must run before the generic
         "n't", and "what's"/"it's"/"he's"/"she's" before the generic "'s".
      3. Digit runs that END in six / three zeros are abbreviated to
         ``<digits>m`` / ``<digits>k``.

    Parameters
    ----------
    x : object
        Text to normalise; any object is accepted and stringified.

    Returns
    -------
    str
        The normalised, lower-cased text.
    """
    # Ordered (old, new) pairs -- do not sort or deduplicate.
    replacements = (
        (",000,000", "m"),
        (",000", "k"),
        ("′", "'"),
        ("’", "'"),
        ("won't", "will not"),
        ("cannot", "can not"),
        ("can't", "can not"),
        ("n't", " not"),
        ("what's", "what is"),
        ("it's", "it is"),
        ("'ve", " have"),
        ("i'm", "i am"),
        ("'re", " are"),
        ("he's", "he is"),
        ("she's", "she is"),
        ("'s", " own"),
        ("%", " percent "),
        ("₹", " rupee "),
        ("$", " dollar "),
        ("€", " euro "),
        ("'ll", " will"),
    )

    x = str(x).lower()
    for old, new in replacements:
        x = x.replace(old, new)

    # The negative lookahead stops the zeros from matching in the middle of a
    # longer number: without it "10001" was rewritten to "1k1".
    x = re.sub(r"([0-9]+)000000(?![0-9])", r"\1m", x)
    x = re.sub(r"([0-9]+)000(?![0-9])", r"\1k", x)
    return x
    


def text_preprocessing(text, accented_chars=True, contractions=True, 
                       convert_num=True, extra_whitespace=True, 
                       lemmatization=True, lowercase=True, punctuations=True,
                       remove_html=True, remove_num=True, special_chars=True, 
                       stop_words=True):
    """Clean ``text`` and return the surviving tokens as a list.

    String-level steps run first, in this order: HTML stripping, whitespace
    collapsing, accent removal, contraction expansion, lower-casing.  The
    result is tokenised with the module-level spaCy pipeline ``nlp`` and each
    token is then filtered / rewritten according to the flags.

    Parameters
    ----------
    text : str
        Raw input text.
    accented_chars : bool
        Apply ``remove_accented_chars``.
    contractions : bool
        Apply ``expand_contractions``.  That helper lower-cases its result,
        so ``lowercase=False`` only has an effect when this is also False.
    convert_num : bool
        Replace kept tokens tagged ``NUM`` with ``w2n.word_to_num(token.text)``.
        Only reachable when ``remove_num`` is False, because number tokens
        are otherwise dropped first.
    extra_whitespace : bool
        Apply ``remove_whitespace``.
    lemmatization : bool
        Replace kept tokens with their lemma (unless the lemma is "-PRON-").
    lowercase : bool
        Lower-case the text before tokenising.
    punctuations : bool
        Drop tokens tagged ``PUNCT``.
    remove_html : bool
        Apply ``strip_html_tags``.
    remove_num : bool
        Drop tokens tagged ``NUM`` or whose text ``isnumeric()``.
    special_chars : bool
        Drop tokens tagged ``SYM``.
    stop_words : bool
        Drop stop words, except those tagged ``NUM``.

    Returns
    -------
    list
        Kept tokens in document order.  Entries are strings, except number
        tokens converted by ``convert_num``, which are the numeric values
        returned by ``w2n.word_to_num``.
    """
    if remove_html:
        text = strip_html_tags(text)
    if extra_whitespace:
        text = remove_whitespace(text)
    if accented_chars:
        text = remove_accented_chars(text)
    if contractions:
        text = expand_contractions(text)
    if lowercase:
        text = text.lower()

    doc = nlp(text)  # tokenise text

    clean_text = []

    for token in doc:
        pos = token.pos_

        # --- filters: the first matching rule drops the token ---
        if stop_words and token.is_stop and pos != 'NUM':
            continue
        if punctuations and pos == 'PUNCT':
            continue
        if special_chars and pos == 'SYM':
            continue
        if remove_num and (pos == 'NUM' or token.text.isnumeric()):
            continue

        # --- rewrite the kept token ---
        edit = token.text
        if convert_num and pos == 'NUM':
            try:
                edit = w2n.word_to_num(token.text)
            except ValueError:
                # spaCy tags more things as NUM than word2number can parse;
                # keep the original token text instead of aborting the call.
                edit = token.text
        elif lemmatization and token.lemma_ != "-PRON-":
            edit = token.lemma_

        if edit != "":
            clean_text.append(edit)
    return clean_text

In [65]:
# Copy both question columns into plain Python lists, in row order.
# Series.tolist() is positional, so this does not depend on the index being
# 0..n-1 and avoids one label lookup per row.
q1 = df["question1"].tolist()
q2 = df["question2"].tolist()

In [66]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

# text_preprocessing returns a list of tokens for each question; the
# detokenizer joins them back into one processed string.
detokenizer = TreebankWordDetokenizer()

# Slice assignment updates the existing lists in place.
q1[:] = [detokenizer.detokenize(text_preprocessing(question)) for question in q1]
print(q1[0])

q2[:] = [detokenizer.detokenize(text_preprocessing(question)) for question in q2]
print(q2[0])

step step guide invest share market india
step step guide invest share market


In [69]:
# Wrap the processed question1 strings in a one-column DataFrame.
data = pd.DataFrame({"q1processed": q1})
data

Unnamed: 0,q1processed
0,step step guide invest share market india
1,story kohinoor koh noor diamond
2,increase speed internet connection vpn
3,mentally lonely solve
4,dissolve water quikly sugar salt methane carbo...
...,...
99995,iconic image football
99996,green green tea
99997,win black panther batman
99998,school well parson risd


In [70]:
# Wrap the processed question2 strings in a one-column DataFrame.
data1 = pd.DataFrame({"q2processed": q2})
data1

Unnamed: 0,q2processed
0,step step guide invest share market
1,happen indian government steal kohinoor koh no...
2,internet speed increase hack dns
3,find remainder math]23^{24}[/math divide
4,fish survive salt water
...,...
99995,iconic image woman
99996,not green tea green
99997,win fight black panther batman
99998,good design school college par risd


In [72]:
# Overwrite the raw question columns with their processed text.
df["question1"] = data["q1processed"]
df["question2"] = data1["q2processed"]


In [73]:
# Render the frame to check that both question columns now hold processed text.
df

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_wordlen,q2_wordlen,common_words,total_words_union,similarity
0,0,0,1,2,step step guide invest share market india,step step guide invest share market,0,14,12,10,16,0.154303
1,1,1,3,4,story kohinoor koh noor diamond,happen indian government steal kohinoor koh no...,0,8,13,4,17,0.154303
2,2,2,5,6,increase speed internet connection vpn,internet speed increase hack dns,0,14,10,3,21,0.308607
3,3,3,7,8,mentally lonely solve,find remainder math]23^{24}[/math divide,0,11,9,0,20,0.462910
4,4,4,9,10,dissolve water quikly sugar salt methane carbo...,fish survive salt water,0,13,7,2,18,0.617213
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,99995,165922,165923,iconic image football,iconic image woman,0,10,10,8,12,0.577350
99996,99996,99996,165924,165925,green green tea,not green tea green,0,5,5,0,10,0.577350
99997,99997,99997,165926,165927,win black panther batman,win fight black panther batman,1,7,11,5,13,0.866025
99998,99998,99998,165928,165929,school well parson risd,good design school college par risd,0,11,10,0,20,0.000000


In [None]:
# Persist the processed frame.  index=False keeps the row index out of the
# file; writing it is what yields the extra "Unnamed: 0" / "Unnamed: 0.1"
# columns that show up when such a CSV is read back.
# NOTE(review): "xtrain.csv" differs from the input "Xtrain.csv" only by case,
# so on a case-insensitive filesystem this overwrites the source file --
# confirm that is intended.
df.to_csv("xtrain.csv", index=False)