In [None]:
import cPickle as pickle
import re
import string
from collections import Counter

import nltk #to preprocess and tokenize text data
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Fetch only the NLTK resources this notebook needs. Calling nltk.download()
# without arguments opens the interactive downloader, which stalls an
# unattended "Restart & Run All".
nltk.download('stopwords')  # corpus read by stopwords.words('english')
nltk.download('punkt')      # tokenizer data for nltk.word_tokenize

In [None]:
def _load_word_set(path):
    """Return the set of whitespace-stripped lines of the text file at `path`."""
    # The context manager closes the file handle as soon as the lines are read.
    with open(path, 'r') as word_file:
        return set(line.strip() for line in word_file)


# Word lists: each line of a file becomes one member of the matching set.
POSITIVE_WORDS = _load_word_set('positive-words.txt')
NEGATIVE_WORDS = _load_word_set('negative-words.txt')
NLTK_STOPWORDS = set(stopwords.words('english'))
MORE_STOPWORDS = _load_word_set('more_stopwords.txt')

In [None]:
def lcase_punct_tokenize(review):
    """Lower-case a review, strip punctuation, tokenize it and drop stopwords.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    list
        Tokens produced by ``nltk.word_tokenize`` that are not members of
        ``NLTK_STOPWORDS``, in their original order.
    """
    lowered = review.lower()
    # Delete every character of string.punctuation. A regex substitution is
    # used because it behaves the same for byte and unicode strings, unlike
    # the two-argument form of str.translate.
    cleaned = re.sub('[%s]' % re.escape(string.punctuation), '', lowered)
    # Tokenize the cleaned text, not the raw input, so that the punctuation
    # removal above actually takes effect.
    tokens = nltk.word_tokenize(cleaned)
    return [token for token in tokens if token not in NLTK_STOPWORDS]

def concat_preprocess_tokenize(review):
    """Merge all review texts of a group into one document and tokenize it.

    Parameters
    ----------
    review : object with a ``text`` attribute
        Iterable of review strings is expected under ``review.text``; the
        notebook passes the per-group frames of a ``groupby(...).apply`` call.

    Returns
    -------
    The token list returned by ``lcase_punct_tokenize`` for the merged text.
    """
    # Join with a space: concatenating without a separator glues the last
    # word of one review to the first word of the next one, which produces
    # tokens that never occurred in any review.
    combined_text = " ".join(review.text)
    return lcase_punct_tokenize(combined_text)

In [None]:
# Load the pickled review data.
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# load a pickle file that comes from a trusted source.
with open("resto_review_data.p", "rb") as pickle_file:
    resto_review = pickle.load(pickle_file)

In [None]:
# Show which columns the loaded review data provides.
available_columns = list(resto_review.columns.values)
print(available_columns)

In [None]:
# Columns kept for the n-gram analysis below.
features = [
    'business_id',
    'name',
    'stars_review',
    'text',
]
# Restrict the review data to the selected columns.
df_features = resto_review[features]

In [None]:
# Second name for df_features (plain assignment, no copy is made).
df_find_ngram_counts_by_star_categories = df_features
# Group the rows by their 'stars_review' value and reduce every group to a
# single token list via concat_preprocess_tokenize.
reviews_by_stars = df_find_ngram_counts_by_star_categories.groupby('stars_review')
stars_vs_category_texts = reviews_by_stars.apply(concat_preprocess_tokenize)

In [None]:
def _ngram_counts_frame(tokens, ngram_fn):
    """Count the n-grams of `tokens` and return them as a two-column DataFrame.

    `ngram_fn` is an n-gram generator such as nltk.bigrams or nltk.trigrams.
    Every n-gram is stored as its tokens joined by single spaces, and rows run
    from the most to the least frequent n-gram. The columns keep pandas'
    default labels 0 (phrase) and 1 (count); the R cell of this notebook reads
    them back from the CSV header as X0 and X1.
    """
    phrases = [" ".join(gram) for gram in ngram_fn(tokens)]
    return pd.DataFrame(Counter(phrases).most_common())


# Write one bigram and one trigram frequency table per star rating.
# All ratings 1-5 are exported because the R word-cloud cell reads
# '<stars>_bigrams_star.csv' and '<stars>_trigrams_star.csv' for stars in 1:5.
for stars in range(1, 6):
    star_tokens = stars_vs_category_texts[stars]

    bigrams_df = _ngram_counts_frame(star_tokens, nltk.bigrams)
    bigrams_df.to_csv(str(stars) + '_bigrams_star.csv', index=False)

    trigrams_df = _ngram_counts_frame(star_tokens, nltk.trigrams)
    trigrams_df.to_csv(str(stars) + '_trigrams_star.csv', index=False)

In [None]:
%%R
# Draw one word cloud per star rating from the exported n-gram count tables
# and save it as '<stars>_star_wordcloud.png'.
library(tm)
library(wordcloud)
library(RColorBrewer)

# One ColorBrewer palette name per star rating, indexed by the rating (1..5).
palettes <- c("Reds", "Oranges", "Blues", "Purples", "Greens")
# Number of top bigrams and of top trigrams that enter each cloud.
top.n <- 100

for (stars in 1:5) {
    stars.bigrams.path <- paste(stars, '_bigrams_star.csv', sep='')
    stars.trigrams.path <- paste(stars, '_trigrams_star.csv', sep='')
    stars.bigrams.df <- read.csv(stars.bigrams.path)
    stars.trigrams.df <- read.csv(stars.trigrams.path)
    # head() returns at most top.n rows, whereas indexing with 1:100 pads a
    # shorter table with all-NA rows.
    stars.wordcloud.df <- rbind(head(stars.bigrams.df, top.n),
                                head(stars.trigrams.df, top.n))
    # Sort by count (X1) descending, then by phrase (X0) ascending.
    stars.wordcloud.df <- stars.wordcloud.df[with(stars.wordcloud.df, order(-X1, X0)),]

    pal <- brewer.pal(9, palettes[stars])
    pal <- pal[-(1:3)]  # drop the first three of the nine palette colours
    png(paste(stars, '_star_wordcloud.png', sep=''), width=960, height=960)
    # Close the PNG device even when wordcloud() fails, so that no graphics
    # device is left open.
    tryCatch(
        wordcloud(stars.wordcloud.df$X0, stars.wordcloud.df$X1, max.words=200, colors=pal),
        finally = dev.off()
    )
}

In [None]:
# Image is imported from IPython.display, the public display API module.
from IPython.display import Image
# Show the word cloud image written for stars = 1 by the R cell.
Image(filename='1_star_wordcloud.png')

In [None]:
# Show the word cloud image written for stars = 2 by the R cell.
Image(filename='2_star_wordcloud.png')

In [None]:
# Show the word cloud image written for stars = 3 by the R cell.
Image(filename='3_star_wordcloud.png')

In [None]:
# Show the word cloud image written for stars = 4 by the R cell.
Image(filename='4_star_wordcloud.png')

In [None]:
# Show the word cloud image written for stars = 5 by the R cell.
Image(filename='5_star_wordcloud.png')