In [1]:
from keras.preprocessing import text
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def load_enron_data(file_name, top=0):
    """Load Enron request/response pairs from a ``:##:``-delimited file.

    Rows containing any missing value are dropped, and every remaining row
    is labelled as a positive pair (label ``1``).

    Args:
        file_name: Path of the delimited file; it must provide ``request``
            and ``response`` columns.
        top: When an ``int`` strictly between 0 and the number of usable
            rows, only the first ``top`` rows are returned. Any other value
            (including the default ``0``) returns every row.

    Returns:
        A ``(text1, text2, is_pair)`` tuple of equal-length lists: the
        request texts, the response texts and the constant label ``1``.
    """
    # A separator longer than one character is only supported by the
    # python parsing engine.
    df = pd.read_csv(file_name, sep=':##:', engine='python')
    df = df.dropna()

    n_len = df.shape[0]
    print("n_len:", n_len)

    if isinstance(top, int) and (top > 0) and (top < n_len):
        print("Get top {} rows of the Enron message pairs.".format(top))
        df = df[:top]

    text1 = df['request'].tolist()
    text2 = df['response'].tolist()
    # Build the labels directly instead of assigning a new column into a
    # slice of the frame, which is what triggers SettingWithCopyWarning.
    is_pair = [1] * len(text1)

    return text1, text2, is_pair

In [3]:
# Both Enron files live in the same directory: the raw message pairs and
# the preprocessed copy of them.
ENRON_DIR = "./pre/enron_email"
ENRON_RAW = "{}/message_pairs.csv".format(ENRON_DIR)
ENRON_STANDARDIZED = "{}/message_pairs_preprocessed.csv".format(ENRON_DIR)

In [4]:
# Read the raw pairs file for inspection; the multi-character ':##:'
# separator is why the python parsing engine is requested.
df = pd.read_csv(ENRON_RAW, sep=":##:", engine='python')

In [5]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,request,response
0,John Lavorato-M Mike Grigsby-D Keith Holst-D ...,Program Importance: High Hi Phillip. We app...
1,Program Importance: High Hi Phillip. We app...,Program We have not received your completed ...
2,John Lavorato-M Mike Grigsby-D Keith Holst-D ...,Hi Phillip. We appreciate your prompt attenti...
3,"> > > > George, > > Can you please call m...",I'll get back to them on this. I know we have...
4,I am planning to build a house in the Texas hi...,Re. Your living.com inquiry Thank you for you...


In [6]:
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    """Normalise a piece of text into a cleaned, space-separated string.

    The text is lower-cased, optionally stripped of English stop words,
    passed through a fixed sequence of regex substitutions (contraction
    expansion, punctuation spacing/removal, a few abbreviation fixes) and
    optionally stemmed.

    Args:
        text: The input string.
        remove_stopwords: If true, drop NLTK English stop words before the
            regex clean-up.
        stem_words: If true, replace every remaining word by its Snowball
            (English) stem.

    Returns:
        The cleaned text as a single string. Despite the function name this
        is not a list; words are separated by single spaces, and a trailing
        space may remain unless ``stem_words`` is set.
    """
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        # `text` is a list of tokens at this point, so iterate it directly
        # (calling .strip() on the list raised AttributeError).
        text = [w for w in text if w not in stops]

    text = " ".join(text)

    # Clean the text
    # Inside the character class `+-=` is parsed as a range from '+' to '=',
    # so characters between them (e.g. ':', ';', '<') are kept as well.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    # Expand a trailing "k" after digits, e.g. "5k" -> "5000"
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # NOTE(review): `\0` is an octal escape (NUL) in Python's `re`, so this
    # matches NUL followed by "s" rather than the text "0s" - presumably
    # "0s" was intended. Left unchanged to keep the cleaned output stable.
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\t{1,}", " ", text)
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\"", " ", text)

    # Optionally, shorten words to their stems
    if stem_words:
        stemmer = SnowballStemmer('english')
        text = " ".join(stemmer.stem(word.strip()) for word in text.split())

    # Return the cleaned text as one string
    return text

In [7]:
# Load every pair from the raw file (no `top` limit) as three parallel
# lists: requests, responses and labels.
text1, text2, is_pair = load_enron_data(ENRON_RAW)

n_len: 63438


In [8]:
# Peek at the first ten raw request texts.
text1[:10]

['John Lavorato-M  Mike Grigsby-D Keith Holst-D Frank Ermis-D Steve South-D Janie Tholt-D  Scott Neal-P Hunter Shively-P Tom Martin-P John Arnold-P',
 "Program Importance: High   Hi Phillip.  We appreciate your prompt attention and completing the Team Selection information.  Ideally, we needed to receive your team of raters on the Team Selection form we sent you.  The information needed is then easily transferred into the database directly from that Excel spreadsheet.  If you do not have the ability to complete that form, inserting what you listed below, we still require additional information.  We need each person's email address.  Without the email address, we cannot email them their internet link and ID to provide feedback for you, nor can we send them an automatic reminder via email.  It would also be good to have each person's phone number, in the event we need to reach them.  So, we do need to receive that complete TS Excel spreadsheet, or if you need to instead, provide the need

In [9]:
# Sanity-check the cleaner on a short sample sentence.
text_to_wordlist("What's \t the step by step guide to invest in share market in india?")

'what is the step by step guide to invest in share market in india '

In [10]:
# Clean every request/response pair and write the result to
# ENRON_STANDARDIZED: a header line, then one pair per line, all fields
# joined by the ':##:' separator.
sep = ":##:"
n_len = len(text1)
# The context manager closes the file even if cleaning a row raises, and
# the explicit UTF-8 encoding matches what pd.read_csv decodes by default.
with open(ENRON_STANDARDIZED, "w", encoding="utf-8") as f:
    f.write("{}{}{}{}{}".format("request", sep, "response", sep, "is_pair"))
    for raw_request, raw_response, d in zip(text1, text2, is_pair):
        # Tokenise with Keras first, then apply the regex-based cleaner to
        # the re-joined tokens.
        s1 = text_to_wordlist(' '.join(text.text_to_word_sequence(raw_request)))
        s2 = text_to_wordlist(' '.join(text.text_to_word_sequence(raw_response)))
        # Each row starts with a newline, so the file has no trailing newline.
        f.write("\n{}{}{}{}{}".format(s1, sep, s2, sep, d))

In [11]:
# Read the preprocessed file back with the same separator; note that this
# rebinds `df`, which previously held the raw frame.
df = pd.read_csv(ENRON_STANDARDIZED, sep=sep, engine='python')

In [12]:
# Display the first 100 rows of the preprocessed frame.
df.head(100)

Unnamed: 0,request,response,is_pair
0,john lavorato m mike grigsby d keith holst d f...,program importance high hi phillip we apprecia...,1
1,program importance high hi phillip we apprecia...,program we have not received your completed te...,1
2,john lavorato m mike grigsby d keith holst d f...,hi phillip we appreciate your prompt attention...,1
3,george can you please call my credit desk at 7...,i will get back to them on this i know we have...,1
4,i am planning to build a house in the texas hi...,re your living com inquiry thank you for your ...,1
5,the topic will the the western natural gas mar...,ina can you please forward the presentation to...,1
6,larrry i realize you are disappointed about th...,jacques here is larry lewter response to my re...,1
7,importance high i 01 ve been asked to provide ...,john did you put frank hayden up to this if th...,1
8,lee my fax number is 713 646 2391 please fax m...,jeff here is the application from spb i guess ...,1
9,jeff here is the application from spb i guess ...,here is what you need to bring updated rent ro...,1
