## Text Cleaning

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm
import string
from nltk.stem import WordNetLemmatizer 

# extract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools
import spacy

In [2]:
# Load the raw question-pair data.
df = pd.read_csv("train.csv")

# Make sure both question columns hold plain Python strings.
# read_csv parses empty fields as NaN by default; fill them with "" *before*
# casting, otherwise str(nan) turns a missing question into the text "nan".
df['question1'] = df['question1'].fillna("").apply(str)
df['question2'] = df['question2'].fillna("").apply(str)

In [3]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0


### 4.2 Text Preprocessing
- Preprocessing:
    - Removing HTML tags
    - Removing punctuation
    - Performing lemmatization
    - Removing stopwords
    - Expanding contractions, etc.

In [4]:
def text_preproces(x):
    """Lowercase a question and normalise symbols, numbers and contractions.

    Steps, in order:
      1. Cast to ``str`` and lowercase.
      2. Spell out ``%`` and currency symbols.
      3. Abbreviate comma-grouped thousands/millions (``5,000`` -> ``5k``).
      4. Map typographic apostrophes to ``'`` and expand common contractions
         (``won't`` -> ``will not``).
      5. Abbreviate un-grouped trailing zeros (``1000000`` -> ``1m``,
         ``10000`` -> ``10k``).

    Parameters
    ----------
    x : object
        Raw question text; any value is accepted and passed through ``str``.

    Returns
    -------
    str
        The normalised, lowercased text.
    """
    # Applied sequentially, so order matters: ",000,000" must precede ",000",
    # and the specific contractions ("won't", "can't", ...) must precede the
    # generic "n't" rule.
    replacements = (
        ('%', ' percent'), ('₹', ' rupee'), ('$', ' dollar'), ('€', ' euro'),
        # Only comma-grouped zeros are handled here; replacing a bare "000"
        # would mangle numbers such as "10000" before the regexes below run.
        (',000,000', 'm'), (',000', 'k'),
        ('′', "'"), ("’", "'"),
        ("won't", "will not"), ("can't", 'can not'), ("shouldn't", "should not"),
        ("what's", 'what is'), ("that's", 'that is'), ("he's", "he is"),
        ("she's", "she is"), ("it's", "it is"), ("'ve", " have"), ("'re", " are"),
        ("'ll", " will"), ("i'm", "i am"), ("n't", " not"),
    )

    x = str(x).lower()
    for old, new in replacements:
        x = x.replace(old, new)

    # Un-grouped numbers: millions first so "1000000" becomes "1m", not "1000k".
    x = re.sub(r'([0-9]+)000000', r'\1m', x)
    x = re.sub(r'([0-9]+)000', r'\1k', x)

    return x

In [5]:
def extract_features(df):
    """Clean both question columns of ``df`` and return the same frame.

    Missing values in ``question1`` / ``question2`` are replaced with "" and
    every entry is passed through ``text_preproces``. The columns are
    overwritten on the frame that was passed in, a progress line is printed,
    and that frame is returned.
    """
    for question_col in ('question1', 'question2'):
        cleaned = df[question_col].fillna("").apply(text_preproces)
        df[question_col] = cleaned

    print("token features...")
    return df

In [6]:
# Run text_preproces over both question columns (prints a progress line).
df = extract_features(df)

token features...


### Removal of URLs

In [7]:
def remove_urls(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` run removed.

    A URL is taken to extend up to the next whitespace character; the
    surrounding whitespace itself is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Strip URLs from both question columns.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].apply(remove_urls)

### Removal of HTML tags

In [8]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` tag-like span removed.

    The match is non-greedy, so each tag is removed individually and the text
    between tags is kept. A ``<`` with no closing ``>`` on the same line is
    left as is.
    """
    return re.sub('<.*?>', r'', text)

# Strip HTML tags from both question columns.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].apply(remove_html)

### 4.2.4 Tokenization of Data
- findall

findall() is probably the single most powerful function in the re module. Whereas re.search() finds only the first match for a pattern, findall() finds *all* the matches and returns them as a list of strings, with each string representing one match.

#### Suppose we have a text with many email addresses
  text = 'purple alice@google.com, blah monkey bob@abc.com blah dishwasher'

#### Here re.findall() returns a list of all the found email strings
  emails = re.findall(r'[\w\.-]+@[\w\.-]+', text) ## ['alice@google.com', 'bob@abc.com']
  for email in emails:
    # do something with each found email string
    print(email)

In [9]:
def word_tokenize(txt):
    """Split ``txt`` into a list of word tokens.

    A token is a maximal run of word characters (letters, digits, underscore)
    and apostrophes, so contractions such as ``how's`` stay in one piece while
    other punctuation and whitespace act as separators.

    Parameters
    ----------
    txt : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance; empty if ``txt`` has no word characters.
    """
    # Raw string: "\w" in a normal string literal is an invalid escape sequence.
    return re.findall(r"[\w']+", txt)
# Replace each question string with its list of tokens.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].apply(word_tokenize)

### Lemmatization of Data

In [10]:
# Single shared lemmatizer instance, used by lemmatzation().
# NOTE(review): presumably needs the NLTK WordNet corpus to be available — confirm.
lemmatizer = WordNetLemmatizer()

In [11]:
def lemmatzation(lst):
    """Return a new list holding the lemmatized form of each token in ``lst``.

    Every token is passed through the module-level ``lemmatizer``; order is
    preserved and the input list is not modified.
    """
    return [lemmatizer.lemmatize(token) for token in lst]
# Lemmatize the token lists of both question columns.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].apply(lemmatzation)

In [20]:
# Spot-check the processed tokens of question2 for the row labelled 2.
df.loc[2, 'question2']

['how',
 'can',
 'internet',
 'speed',
 'be',
 'increased',
 'by',
 'hacking',
 'through',
 'dns']

In [21]:
# Spot-check the processed tokens of question1 for the row labelled 2.
df.loc[2, 'question1']

['how',
 'can',
 'i',
 'increase',
 'the',
 'speed',
 'of',
 'my',
 'internet',
 'connection',
 'while',
 'using',
 'a',
 'vpn']

In [27]:
# Persist the cleaned frame. `index` must be the boolean False: the string
# 'False' is truthy, so pandas still wrote the row index as an extra column.
# NOTE(review): question1/question2 hold token lists at this point, so the CSV
# stores their string representation — confirm the reader parses them back.
df.to_csv('df_text_preprocessing.csv', index=False)