In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [28]:
from nltk.corpus import stopwords
import cPickle as pickle
import string
import nltk
from nltk.stem.porter import PorterStemmer
import pandas as pd

In [3]:
## Get the stopwords for preprocessing

# English stopword list provided by the NLTK stopwords corpus.
NLTK_STOPWORDS = set(stopwords.words('english'))
# Additional stopwords read from a local text file, one entry per line
# (surrounding whitespace stripped). The `with` block closes the file
# handle as soon as the set has been built instead of leaving it open.
with open('more_stopwords.txt', 'r') as stopword_file:
    MORE_STOPWORDS = set(line.strip() for line in stopword_file)

In [4]:
## NLTK pipeline

def lowercase(s):
    """Return a copy of the string ``s`` converted to lower case."""
    lowered = s.lower()
    return lowered

def tokenize(s):
    """Tokenize the string ``s`` with ``nltk.word_tokenize`` and return the result."""
    return nltk.word_tokenize(s)

def remove_punctuation(s):
    """Return ``s`` with every character listed in ``string.punctuation`` removed.

    The previous ``s.translate(None, string.punctuation)`` form is only valid
    for Python 2 byte strings; it raises ``TypeError`` for unicode text.
    Filtering character by character gives the same result for byte strings
    and also works for text strings.

    Parameters
    ----------
    s : str
        The string to clean.

    Returns
    -------
    str
        ``s`` without ASCII punctuation characters, same string type as ``s``.
    """
    punctuation = frozenset(string.punctuation)
    # ``s[:0]`` is an empty string of the same type as ``s``, so joining on it
    # keeps the input's string type.
    return s[:0].join(ch for ch in s if ch not in punctuation)

def remove_numbers(s):
    """Return ``s`` with every character listed in ``string.digits`` removed.

    The previous ``s.translate(None, string.digits)`` form is only valid for
    Python 2 byte strings; it raises ``TypeError`` for unicode text. Filtering
    character by character gives the same result for byte strings and also
    works for text strings.

    Parameters
    ----------
    s : str
        The string to clean.

    Returns
    -------
    str
        ``s`` without the characters ``0``-``9``, same string type as ``s``.
    """
    digits = frozenset(string.digits)
    # ``s[:0]`` is an empty string of the same type as ``s``, so joining on it
    # keeps the input's string type.
    return s[:0].join(ch for ch in s if ch not in digits)
 
def remove_stopwords(token_list):
    """Return the tokens that appear in neither stopword set, in original order.

    A single list comprehension replaces the two nested ``filter`` calls: the
    tokens are traversed once, and the result is always a concrete list
    (``filter`` returns a lazy iterator on Python 3 and mirrors the input's
    sequence type on Python 2).

    Parameters
    ----------
    token_list : iterable of str
        Tokens to filter.

    Returns
    -------
    list
        Tokens not contained in ``NLTK_STOPWORDS`` or ``MORE_STOPWORDS``.
    """
    return [
        token
        for token in token_list
        if token not in NLTK_STOPWORDS and token not in MORE_STOPWORDS
    ]

def stemming_token_list(token_list):
    """Stem every token with NLTK's ``PorterStemmer`` and return the stems as a list.

    Byte-string tokens are decoded as UTF-8 before stemming, as before. Tokens
    that are already text are passed through unchanged: calling ``.decode`` on
    them fails (implicit ASCII encode on Python 2 for non-ASCII characters, no
    such method on Python 3).

    Parameters
    ----------
    token_list : iterable of str
        Tokens to stem.

    Returns
    -------
    list
        One stemmed token per input token, in the same order.
    """
    stemmer = PorterStemmer()
    stems = []
    for tok in token_list:
        # ``bytes`` is an alias of ``str`` on Python 2, so this matches the
        # byte-string tokens the original code expected.
        if isinstance(tok, bytes):
            tok = tok.decode('utf-8')
        stems.append(stemmer.stem(tok))
    return stems

def restring_tokens(token_list):
    """Join the tokens back into one string, separated by single spaces."""
    separator = ' '
    return separator.join(token_list)

In [5]:
# Clean all the reviews: lowercase, strip punctuation marks and digits, drop stop words, and stem the remaining tokens
def clean_reviews(data_set):
    """Run every review in ``data_set`` through the text-cleaning pipeline.

    Each review is lower-cased, stripped of punctuation and digits, tokenized,
    filtered against the stopword sets, stemmed, and finally re-joined into a
    single space-separated string.

    Parameters
    ----------
    data_set : iterable of str
        Raw review texts.

    Returns
    -------
    list
        Cleaned review strings in input order. A review whose tokens cannot be
        re-joined is skipped, so the result can be shorter than ``data_set``;
        callers pairing the output with labels should check the lengths.
    """
    import warnings  # local import keeps this cell independent of the import cell

    clean_data_set = []
    for text in data_set:
        text = lowercase(text)
        text = remove_punctuation(text)
        text = remove_numbers(text)

        token_list = tokenize(text)
        token_list = remove_stopwords(token_list)
        token_list = stemming_token_list(token_list)

        try:
            clean_data_set.append(restring_tokens(token_list))
        except Exception as exc:
            # Still best-effort (the review is skipped), but no longer silent,
            # and KeyboardInterrupt / SystemExit are no longer swallowed as
            # they were by the bare ``except``.
            warnings.warn("Skipping a review that could not be re-joined: %r" % (exc,))
    return clean_data_set

In [6]:
## Load the previously saved pickle file
# NOTE(review): unpickling can execute arbitrary code, so "resto_review.p"
# must come from a trusted source.
# The `with` block closes the file handle once loading has finished.
with open("resto_review.p", "rb") as review_file:
    review_data = pickle.load(review_file)

In [7]:
## Sampling
# Work on a random subset of the reviews.
SAMPLE_SIZE = 5000   # maximum number of rows to keep
SAMPLE_SEED = 42     # fixed seed so a re-run draws the same subset
# `min` avoids the ValueError raised when asking for more rows than exist.
review_data = review_data.sample(
    min(SAMPLE_SIZE, len(review_data)), random_state=SAMPLE_SEED
)

In [8]:
# Split the reviews by star rating and keep only the 'text' column of each group
starsGroup = review_data.groupby('stars_review')

(text_star_1, text_star_2, text_star_3, text_star_4, text_star_5) = (
    starsGroup.get_group(rating)['text'] for rating in (1.0, 2.0, 3.0, 4.0, 5.0)
)

In [15]:
# Optional: to keep the workload manageable on a laptop, downsample each star group by uncommenting the lines below
#sampling = 5000 # No of rows to be sampled

#text_star_1 = text_star_1.sample(sampling)
#text_star_2 = text_star_2.sample(sampling)
#text_star_3 = text_star_3.sample(sampling)
#text_star_4 = text_star_4.sample(sampling)
#text_star_5 = text_star_5.sample(sampling)

In [9]:
# Build one label list per star rating: the rating value repeated once per review
(label_star_1, label_star_2, label_star_3, label_star_4, label_star_5) = (
    [rating] * len(texts)
    for rating, texts in zip(
        (1.0, 2.0, 3.0, 4.0, 5.0),
        (text_star_1, text_star_2, text_star_3, text_star_4, text_star_5),
    )
)

In [21]:
# scikit-learn moved train_test_split to sklearn.model_selection and later
# removed sklearn.cross_validation; try the current location first and fall
# back to the old module for old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Create test and training datasets, split separately for each star rating.
# We use a 70-30 split here; change TEST_SIZE for e.g. a 66-33 split.
TEST_SIZE = 0.30
SPLIT_SEED = 42  # fixed seed so the split is reproducible across runs
train_stars_1, test_stars_1, train_labels_stars_1, test_labels_stars_1 = train_test_split(
    text_star_1, label_star_1, test_size=TEST_SIZE, random_state=SPLIT_SEED)
train_stars_2, test_stars_2, train_labels_stars_2, test_labels_stars_2 = train_test_split(
    text_star_2, label_star_2, test_size=TEST_SIZE, random_state=SPLIT_SEED)
train_stars_3, test_stars_3, train_labels_stars_3, test_labels_stars_3 = train_test_split(
    text_star_3, label_star_3, test_size=TEST_SIZE, random_state=SPLIT_SEED)
train_stars_4, test_stars_4, train_labels_stars_4, test_labels_stars_4 = train_test_split(
    text_star_4, label_star_4, test_size=TEST_SIZE, random_state=SPLIT_SEED)
train_stars_5, test_stars_5, train_labels_stars_5, test_labels_stars_5 = train_test_split(
    text_star_5, label_star_5, test_size=TEST_SIZE, random_state=SPLIT_SEED)

In [19]:
# Display the training labels produced for the 1-star split
train_labels_stars_1

[1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0

In [11]:
## Run the cleaning pipeline over each training split (5 stars first, 1 star last)
(corpus_5stars_train,
 corpus_4stars_train,
 corpus_3stars_train,
 corpus_2stars_train,
 corpus_1stars_train) = map(
    clean_reviews,
    (train_stars_5, train_stars_4, train_stars_3, train_stars_2, train_stars_1)
)

In [20]:
print "Number of 5-star reviews after processing: ", len(corpus_5stars_train)
print "Number of 4-star reviews after processing: ", len(corpus_4stars_train)
print "Number of 3-star reviews after processing: ", len(corpus_3stars_train)
print "Number of 2-star reviews after processing: ", len(corpus_2stars_train)
print "Number of 1-star reviews after processing: ", len(corpus_1stars_train)

Number of 5-star reviews after processing:  1542
Number of 4-star reviews after processing:  998
Number of 3-star reviews after processing:  389
Number of 2-star reviews after processing:  262
Number of 1-star reviews after processing:  308


In [12]:
# Build the combined training corpus by stacking the per-rating corpora
# in 5 -> 1 star order; each step extends the previous array by one rating.
all_5_4_train = np.concatenate((corpus_5stars_train, corpus_4stars_train))
all_5_4_3_train = np.concatenate((all_5_4_train, corpus_3stars_train))
all_5_4_3_2_train = np.concatenate((all_5_4_3_train, corpus_2stars_train))
all_text_train = np.concatenate((all_5_4_3_2_train, corpus_1stars_train))

In [31]:
# Concatenate the label lists in the same 5 -> 1 star order used for the text corpora.
# NOTE(review): clean_reviews can skip a review it fails to re-join, in which
# case a text corpus would be shorter than its label list -- verify that the
# lengths match before training on these files.
all_stars_train = (train_labels_stars_5 + train_labels_stars_4 + train_labels_stars_3
                   + train_labels_stars_2 + train_labels_stars_1)
all_stars_test = (test_labels_stars_5 + test_labels_stars_4 + test_labels_stars_3
                  + test_labels_stars_2 + test_labels_stars_1)

# Persist column 0 of a DataFrame built from each label list; the `with`
# blocks close every file once it has been written.
with open("all_stars_train.p", "wb") as train_labels_file:
    pickle.dump(pd.DataFrame(all_stars_train)[0], train_labels_file)
with open("all_stars_test.p", "wb") as test_labels_file:
    pickle.dump(pd.DataFrame(all_stars_test)[0], test_labels_file)

In [13]:
# Persist the combined training corpus and each per-rating training corpus.
# Each file is opened in a `with` block so it is closed right after writing
# instead of relying on garbage collection of an anonymous file object.
train_artifacts = (
    (all_text_train, "all_text_train.p"),
    (corpus_5stars_train, "corpus_5stars_train.p"),
    (corpus_4stars_train, "corpus_4stars_train.p"),
    (corpus_3stars_train, "corpus_3stars_train.p"),
    (corpus_2stars_train, "corpus_2stars_train.p"),
    (corpus_1stars_train, "corpus_1stars_train.p"),
)
for artifact, filename in train_artifacts:
    with open(filename, "wb") as out_file:
        pickle.dump(artifact, out_file)

In [14]:
# Run the cleaning pipeline over each test split (5 stars first, 1 star last)
(corpus_5stars_test,
 corpus_4stars_test,
 corpus_3stars_test,
 corpus_2stars_test,
 corpus_1stars_test) = map(
    clean_reviews,
    (test_stars_5, test_stars_4, test_stars_3, test_stars_2, test_stars_1)
)

In [15]:
# Build the combined test corpus by stacking the per-rating corpora
# in 5 -> 1 star order; each step extends the previous array by one rating.
all_5_4_test = np.concatenate((corpus_5stars_test, corpus_4stars_test))
all_5_4_3_test = np.concatenate((all_5_4_test, corpus_3stars_test))
all_5_4_3_2_test = np.concatenate((all_5_4_3_test, corpus_2stars_test))
all_text_test = np.concatenate((all_5_4_3_2_test, corpus_1stars_test))

In [16]:
# Persist the combined test corpus and each per-rating test corpus.
# Each file is opened in a `with` block so it is closed right after writing
# instead of relying on garbage collection of an anonymous file object.
test_artifacts = (
    (all_text_test, "all_text_test.p"),
    (corpus_5stars_test, "corpus_5stars_test.p"),
    (corpus_4stars_test, "corpus_4stars_test.p"),
    (corpus_3stars_test, "corpus_3stars_test.p"),
    (corpus_2stars_test, "corpus_2stars_test.p"),
    (corpus_1stars_test, "corpus_1stars_test.p"),
)
for artifact, filename in test_artifacts:
    with open(filename, "wb") as out_file:
        pickle.dump(artifact, out_file)

In [17]:
# Persist the groupby object built from the sampled reviews; the `with`
# block closes the file once the dump has finished.
with open("starsGroup.p", "wb") as groups_file:
    pickle.dump(starsGroup, groups_file)