# Training set preprocessing
Removes HTML formatting, punctuation, and numerals from the 'Summary' and 'Text' columns, and converts the text to lowercase.

In [1]:
import pandas as pd
import re

In [2]:
# Ordered text-cleaning steps; each one takes a string and returns a string.
# The patterns are raw strings so that \w, \s and \d reach the regex engine
# untouched instead of being parsed as (invalid) Python string escapes.
pipeline = [
    # Replace anything that looks like an HTML tag with a single space so
    # the words on either side of the tag do not get glued together.
    lambda s: re.sub(r'<[^>]*>', ' ', s),
    # Drop every character that is neither a word character nor whitespace
    # (underscores count as word characters and are therefore kept).
    lambda s: re.sub(r'[^\w\s]', '', s),
    # Drop digits.
    lambda s: re.sub(r'\d', '', s),
    # Normalise case.
    lambda s: s.lower(),
]

In [3]:
def preprocess(s, pipeline = pipeline):
    """Run ``s`` through every step of ``pipeline`` in order.

    Each step is called with the output of the previous one; the output of
    the last step is returned. With an empty pipeline ``s`` is returned
    unchanged.
    """
    cleaned = s
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

In [4]:
def preprocess_df(df):
    """Clean the 'Summary' and 'Text' columns of ``df`` in place.

    Missing values anywhere in the frame are first replaced by empty
    strings, then every value of the two text columns is passed through
    ``preprocess``. The frame is modified in place and nothing is returned.

    The columns are transformed with ``Series.map`` rather than a per-row
    ``set_value`` loop: ``set_value`` no longer exists in current pandas,
    ``xrange`` exists only on Python 2, and addressing rows by the
    positions 0..n-1 only works when the index happens to be the default
    integer range.
    """
    # Empty strings let every cell go through the string-only pipeline.
    df.fillna('', inplace = True)

    for column in ('Summary', 'Text'):
        df[column] = df[column].map(preprocess)

In [5]:
# Read only the columns used by the preprocessing step from the raw CSV.
fields = ['Id', 'Prediction', 'Summary', 'Text']
df_train = pd.read_csv('../data/train.csv', usecols=fields)

# Preview the first five rows (head() defaults to 5) before cleaning.
print(df_train.head())

       Id  Prediction                        Summary  \
0  492346           5             Just what I needed   
1  343811           5         Most excellent oatmeal   
2  347257           4                    Quality Tea   
3  225529           5  Great Coffee at a Great price   
4  485488           4       Popchips Jalapeno flavor   

                                                Text  
0  I got them in a very timely manner and they're...  
1  McCann's Steel Cut Oatmeal is the perfect brea...  
2  I would recommend it. Quality pearls and a few...  
3  I have to admit, I first purchased these Hazel...  
4  I bought a case of jalapeno chips as I love th...  


In [6]:
# Clean the text columns in place, then preview the first five rows
# (head() defaults to 5) to check the result.
preprocess_df(df_train)
print(df_train.head())

       Id  Prediction                        Summary  \
0  492346           5             just what i needed   
1  343811           5         most excellent oatmeal   
2  347257           4                    quality tea   
3  225529           5  great coffee at a great price   
4  485488           4       popchips jalapeno flavor   

                                                Text  
0  i got them in a very timely manner and theyre ...  
1  mccanns steel cut oatmeal is the perfect break...  
2  i would recommend it quality pearls and a few ...  
3  i have to admit i first purchased these hazeln...  
4  i bought a case of jalapeno chips as i love th...  


In [7]:
# Write the cleaned frame to disk; index = False leaves the row index out of
# the CSV.
df_train.to_csv("../data/preprocessed_train.csv", index = False)