In [10]:
from keras.preprocessing import text
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [11]:
def load_quora_data(file_name, top=0):
    """Load question pairs and their duplicate labels from a tab-separated file.

    Rows containing a missing value in any column are discarded before
    anything else happens, and the number of remaining rows is printed.

    Args:
        file_name: Path of the tab-separated file; it must provide the
            columns ``question1``, ``question2`` and ``is_duplicate``.
        top: When this is an int strictly between 0 and the number of
            remaining rows, only the first ``top`` rows are kept. Any other
            value (including the default 0) keeps every row.

    Returns:
        A tuple ``(text1, text2, is_duplicate)`` of three equally long lists
        holding the values of the corresponding columns.
    """
    frame = pd.read_csv(file_name, sep='\t').dropna()

    n_len = frame.shape[0]
    print("n_len:", n_len)

    # Only truncate for a meaningful integer limit smaller than the data set.
    wants_subset = isinstance(top, int) and 0 < top < n_len
    if wants_subset:
        print("Get top {} rows of the quora.".format(top))
        frame = frame[:top]

    wanted_columns = ('question1', 'question2', 'is_duplicate')
    text1, text2, is_duplicate = (list(frame[name]) for name in wanted_columns)
    return text1, text2, is_duplicate

In [12]:
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    """Normalise a piece of English text into a cleaned, lower-cased string.

    The text is lower-cased, optionally stripped of NLTK English stop words,
    passed through an ordered list of regex substitutions (contraction
    expansion, punctuation spacing, a few abbreviation fixes) and optionally
    stemmed with the Snowball stemmer.

    Args:
        text: The string to clean.
        remove_stopwords: If true, drop NLTK English stop words before the
            regex clean-up runs.
        stem_words: If true, replace every whitespace-separated token of the
            cleaned text by its Snowball stem.

    Returns:
        The cleaned text as a single string (not a list). Runs of whitespace
        created by the substitutions are collapsed to one space, but a
        leading or trailing space is not stripped.
    """
    # Convert words to lower case and split them
    words = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    text = " ".join(words)

    # Ordered clean-up rules; order matters (e.g. "what's" must be expanded
    # before the generic "'s" rule, "can't" before the generic "n't" rule).
    substitutions = (
        # Keep only letters, digits and the punctuation handled below. The
        # '-' is escaped so that "+-=" is not read as a character range.
        (r"[^A-Za-z0-9^,!.\/'+\-=:]", " "),
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        # "50k" -> "50000"; the word boundary keeps units such as "5kg" intact.
        (r"(\d+)k\b", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # "1990s" -> "1990". A literal "0" is required here: "\0" would be
        # the regex escape for a NUL character and never match.
        (r"0s\b", "0"),
        # Keep the surrounding spaces so neighbouring words are not glued on.
        (r" 9 11 ", " 911 "),
        # Word boundaries stop these from firing inside longer words
        # (e.g. "raj kumar" must not become "rajkumar").
        (r"\be - mail\b", "email"),
        (r"\bj k\b", "jk"),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Optionally, shorten words to their stems
    if stem_words:
        stemmer = SnowballStemmer('english')
        text = " ".join(stemmer.stem(word) for word in text.split())

    return text

In [13]:
# Tab-separated input file that is read with load_quora_data().
QUORA_RAW_FILE = 'email_recommendation/data/quora/quora_duplicate_questions.tsv'
# Tab-separated output file that receives the cleaned question pairs.
QUORA_STANDARD_FILE = 'email_recommendation/data/quora/quora_duplicate_questions_standardized.tsv'

In [14]:
# Load every row of the raw file (no `top` limit) as three parallel lists.
text1, text2, is_duplicate = load_quora_data(QUORA_RAW_FILE)

n_len: 404287


In [15]:
# Preview the first ten raw question1 strings.
text1[:10]

['What is the step by step guide to invest in share market in india?',
 'What is the story of Kohinoor (Koh-i-Noor) Diamond?',
 'How can I increase the speed of my internet connection while using a VPN?',
 'Why am I mentally very lonely? How can I solve it?',
 'Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?',
 'Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?',
 'Should I buy tiago?',
 'How can I be a good geologist?',
 'When do you use シ instead of し?',
 'Motorola (company): Can I hack my Charter Motorolla DCX3400?']

In [16]:
# Spot-check the cleaner on a single sample question (default options).
text_to_wordlist("What's the step by step guide to invest in share market in india?")

'what is the step by step guide to invest in share market in india '

In [17]:
# Write the standardized data set: a header line followed by one
# tab-separated line per question pair. Each question is first tokenised with
# keras' text_to_word_sequence, re-joined with single spaces, and then cleaned
# with text_to_wordlist; the duplicate label is written unchanged.
n_len = len(text1)
# The context manager guarantees the file is closed even if a row fails.
with open(QUORA_STANDARD_FILE, "w", encoding="utf-8") as f:
    f.write("\t".join(("question1", "question2", "is_duplicate")) + "\n")
    for q1, q2, d in zip(text1, text2, is_duplicate):
        s1 = text_to_wordlist(' '.join(text.text_to_word_sequence(q1)))
        s2 = text_to_wordlist(' '.join(text.text_to_word_sequence(q2)))
        f.write("{}\t{}\t{}\n".format(s1, s2, d))