In [1]:
%pylab inline



Populating the interactive namespace from numpy and matplotlib


In [22]:
from nltk.corpus import stopwords
import cPickle as pickle
import string
import nltk
from nltk.stem.porter import PorterStemmer

In [4]:
## Get the stopwords for preprocessing

# English stopword list provided by the NLTK stopwords corpus.
NLTK_STOPWORDS = set(stopwords.words('english'))

# Additional stopwords read from a local text file, one entry per line.
# The context manager guarantees the file handle is closed after reading.
with open('more_stopwords.txt', 'r') as stopword_file:
    MORE_STOPWORDS = set(line.strip() for line in stopword_file)

In [5]:
## NLTK pipeline

def lowercase(s):
    """Return a lower-cased copy of the string ``s``."""
    lowered = s.lower()
    return lowered

def tokenize(s):
    """Split the string ``s`` into tokens using ``nltk.word_tokenize``."""
    return nltk.word_tokenize(s)

def remove_punctuation(s):
    """Return ``s`` with every character of ``string.punctuation`` removed.

    Works for Python 2 byte strings (the original behaviour) as well as for
    ``unicode`` objects on Python 2 and ``str`` on Python 3, whose
    ``translate`` method accepts only a single mapping argument.

    :param s: the text to clean.
    :returns: a string of the same type as ``s`` without ASCII punctuation.
    """
    try:
        # Fast path: Python 2 byte strings accept (table, deletechars).
        return s.translate(None, string.punctuation)
    except TypeError:
        # Text strings take one mapping argument; mapping a code point to
        # None deletes that character.
        return s.translate(dict.fromkeys(map(ord, string.punctuation)))

def remove_numbers(s):
    """Return ``s`` with every ASCII digit (``string.digits``) removed.

    Works for Python 2 byte strings (the original behaviour) as well as for
    ``unicode`` objects on Python 2 and ``str`` on Python 3, whose
    ``translate`` method accepts only a single mapping argument.

    :param s: the text to clean.
    :returns: a string of the same type as ``s`` without the digits 0-9.
    """
    try:
        # Fast path: Python 2 byte strings accept (table, deletechars).
        return s.translate(None, string.digits)
    except TypeError:
        # Text strings take one mapping argument; mapping a code point to
        # None deletes that character.
        return s.translate(dict.fromkeys(map(ord, string.digits)))
 
def remove_stopwords(token_list):
    """Keep only the tokens that appear in neither stopword set.

    A token is dropped when it is a member of ``NLTK_STOPWORDS`` or of
    ``MORE_STOPWORDS``; the surviving tokens keep their original order.
    """
    def is_kept(token):
        # Same check order as before: NLTK list first, then the extra list.
        return token not in NLTK_STOPWORDS and token not in MORE_STOPWORDS

    return filter(is_kept, token_list)

def stemming_token_list(token_list):
    """Stem every token with a Porter stemmer and return the stems as a list.

    Each token is decoded from UTF-8 before it is handed to the stemmer, so
    the tokens are expected to be objects with a ``decode`` method.
    """
    stemmer = PorterStemmer()
    stems = []
    for raw_token in token_list:
        decoded_token = raw_token.decode('utf-8')
        stems.append(stemmer.stem(decoded_token))
    return stems

def restring_tokens(token_list):
    """Join the tokens back into one string, separated by single spaces."""
    separator = ' '
    return separator.join(token_list)

In [6]:
# Clean all the reviews: lowercase, strip punctuation marks and digits, remove stop words, and stem
def process_reviews(data_set):
    """Run every review in ``data_set`` through the cleaning pipeline.

    Per review the steps are: lowercase, strip punctuation, strip digits,
    tokenize, remove stopwords, stem, and re-join the stems with spaces.

    :param data_set: iterable of review texts.
    :returns: list of cleaned review strings, in input order.

    A review whose tokens cannot be joined back into a string is skipped
    with a warning instead of being dropped silently. A skipped review makes
    the result shorter than the input, so callers that pair the result with
    a parallel list of labels should compare lengths.
    """
    import warnings  # local import keeps this cell self-contained

    clean_data_set = []
    for text in data_set:
        # String-level cleaning.
        text = remove_numbers(remove_punctuation(lowercase(text)))

        # Token-level cleaning.
        token_list = tokenize(text)
        token_list = remove_stopwords(token_list)
        token_list = stemming_token_list(token_list)

        try:
            clean_data_set.append(restring_tokens(token_list))
        except (TypeError, UnicodeError) as err:
            # Only failures of the join itself are tolerated; anything else
            # (e.g. KeyboardInterrupt) now propagates.
            warnings.warn("Skipping review that could not be re-joined: %r" % (err,))
    return clean_data_set

In [11]:
## Load the previously saved pickle file
# NOTE(security): unpickling can execute arbitrary code, so only load files
# that were produced by a trusted source.
with open("resto_review.p", "rb") as review_file:
    review_data = pickle.load(review_file)

In [13]:
# Group all reviews per star rating and extract text out of them
starsGroup = review_data.groupby('stars_review')

# One 'text' column per rating, 1.0 through 5.0, unpacked in ascending order.
(text_star_1,
 text_star_2,
 text_star_3,
 text_star_4,
 text_star_5) = (starsGroup.get_group(float(stars))['text'] for stars in range(1, 6))

In [12]:
# Optional: sample a fixed number of reviews per star rating to shrink the dataset and keep memory and CPU usage manageable
#sampling = 5000 # No of rows to be sampled

#text_star_1 = text_star_1.sample(sampling)
#text_star_2 = text_star_2.sample(sampling)
#text_star_3 = text_star_3.sample(sampling)
#text_star_4 = text_star_4.sample(sampling)
#text_star_5 = text_star_5.sample(sampling)

In [14]:
# Add all the corresponding original labels to reviews
# Each label list repeats the star rating once per review in that group.
(label_star_1,
 label_star_2,
 label_star_3,
 label_star_4,
 label_star_5) = (
    [float(stars)] * len(texts)
    for stars, texts in enumerate(
        (text_star_1, text_star_2, text_star_3, text_star_4, text_star_5), 1)
)

In [15]:
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import train_test_split

# Create test and training dataset. 30% of each star rating is held out for
# testing (a 70-30 split); other ratios such as 66-33 work too.
TEST_FRACTION = 0.30
# Fixed seed so that re-running the notebook reproduces the same split.
SPLIT_SEED = 42

# Each call returns, in order: train texts, test texts, train labels, test labels.
train_stars_1, test_stars_1, train_labels_stars_1, all_1stars_labels_test = train_test_split(
    text_star_1, label_star_1, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
train_stars_2, test_stars_2, train_labels_stars_2, all_2stars_labels_test = train_test_split(
    text_star_2, label_star_2, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
train_stars_3, test_stars_3, train_labels_stars_3, all_3stars_labels_test = train_test_split(
    text_star_3, label_star_3, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
train_stars_4, test_stars_4, train_labels_stars_4, all_4stars_labels_test = train_test_split(
    text_star_4, label_star_4, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
train_stars_5, test_stars_5, train_labels_stars_5, all_5stars_labels_test = train_test_split(
    text_star_5, label_star_5, test_size=TEST_FRACTION, random_state=SPLIT_SEED)

In [16]:
# Sanity check: number of labels in the 1-star training split.
len(train_labels_stars_1)

58739

In [23]:
## Cleaning all the reviews and building corpus out of them
# The training splits are processed in the order 5, 4, 3, 2, 1 stars.
(corpus_5stars,
 corpus_4stars,
 corpus_3stars,
 corpus_2stars,
 corpus_1stars) = (
    process_reviews(train_split)
    for train_split in (train_stars_5, train_stars_4, train_stars_3,
                        train_stars_2, train_stars_1)
)

In [24]:
print "Number of 5-star reviews after processing: ", len(corpus_5stars)
print "Number of 4-star reviews after processing: ", len(corpus_4stars)
print "Number of 3-star reviews after processing: ", len(corpus_3stars)
print "Number of 2-star reviews after processing: ", len(corpus_2stars)
print "Number of 1-star reviews after processing: ", len(corpus_1stars)

Number of 5-star reviews after processing:  284902
Number of 4-star reviews after processing:  174334
Number of 3-star reviews after processing:  72107
Number of 2-star reviews after processing:  47154
Number of 1-star reviews after processing:  58739


In [25]:
# Creating combined dataset for training, containing representation of all the 5 star ratings possible
# The corpora are chained in the order 5, 4, 3, 2, 1 stars; each step appends
# the next corpus to the result of the previous step.
# NOTE(review): any label vector paired with all_text_train must follow the
# same 5 -> 1 ordering -- confirm where the labels are concatenated.
# NOTE(review): np.append converts the Python lists to NumPy arrays and copies
# on every call; for long review strings this is presumably memory-hungry --
# confirm before running on the full dataset.
all_5_4_train = np.append(corpus_5stars, corpus_4stars)
all_5_4_3_train = np.append(all_5_4_train, corpus_3stars)
all_5_4_3_2_train = np.append(all_5_4_3_train, corpus_2stars)
all_text_train = np.append(all_5_4_3_2_train, corpus_1stars)


In [26]:
# Persist the combined training corpus; the context manager closes (and
# thereby flushes) the file even if pickling fails.
with open("all_text_train.p", "wb") as out_file:
    pickle.dump(all_text_train, out_file)

In [27]:
# Persist each per-rating training corpus to its own pickle file; the context
# manager closes every file handle even if pickling fails.
for out_path, corpus_split in (
    ("corpus_5stars_train.p", corpus_5stars),
    ("corpus_4stars_train.p", corpus_4stars),
    ("corpus_3stars_train.p", corpus_3stars),
    ("corpus_2stars_train.p", corpus_2stars),
    ("corpus_1stars_train.p", corpus_1stars),
):
    with open(out_path, "wb") as out_file:
        pickle.dump(corpus_split, out_file)

In [30]:
# Clean the held-out test splits, processed in the order 5, 4, 3, 2, 1 stars.
(corpus_5stars_test,
 corpus_4stars_test,
 corpus_3stars_test,
 corpus_2stars_test,
 corpus_1stars_test) = (
    process_reviews(test_split)
    for test_split in (test_stars_5, test_stars_4, test_stars_3,
                       test_stars_2, test_stars_1)
)

In [31]:
# Build the combined test corpus by chaining the cleaned test corpora in the
# order 5, 4, 3, 2, 1 stars; each step appends the next corpus to the result
# of the previous step.
# NOTE(review): any label vector paired with all_text_test must follow the
# same 5 -> 1 ordering -- confirm where the labels are concatenated.
all_5_4_test = np.append(corpus_5stars_test, corpus_4stars_test)
all_5_4_3_test = np.append(all_5_4_test, corpus_3stars_test)
all_5_4_3_2_test = np.append(all_5_4_3_test, corpus_2stars_test)
all_text_test = np.append(all_5_4_3_2_test, corpus_1stars_test)

In [32]:
# Persist the combined test corpus and each per-rating test corpus; the
# context manager closes every file handle even if pickling fails.
for out_path, corpus_split in (
    ("all_text_test.p", all_text_test),
    ("corpus_5stars_test.p", corpus_5stars_test),
    ("corpus_4stars_test.p", corpus_4stars_test),
    ("corpus_3stars_test.p", corpus_3stars_test),
    ("corpus_2stars_test.p", corpus_2stars_test),
    ("corpus_1stars_test.p", corpus_1stars_test),
):
    with open(out_path, "wb") as out_file:
        pickle.dump(corpus_split, out_file)


In [37]:
# Write the combined training corpus once more under its "final" file name;
# the context manager closes the file even if pickling fails.
with open("all_text_train_final.p", "wb") as out_file:
    pickle.dump(all_text_train, out_file)