In [1]:
# import dependencies
from google.cloud import bigquery
from google.cloud import storage
import pickle
import pandas as pd
import numpy as np
import nltk, re, time, gensim
from nltk.corpus import stopwords
from gensim import models, corpora, similarities
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from nltk import FreqDist
from nltk.stem.porter import PorterStemmer
from scipy.stats import entropy
import matplotlib.pyplot as plt

# Fetch the NLTK resources this notebook relies on (POS tagger model,
# stop-word lists, and the punkt tokenizer), in the same order as before.
for resource in ('averaged_perceptron_tagger', 'stopwords', 'punkt'):
    nltk.download(resource)
print('Downloads Complete')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloads Complete


In [7]:
def initial_clean(text):
    """Strip URL-like chunks, mentions and non-letters from `text`, then tokenize.

    Steps, in order:
      1. Replace whitespace-delimited chunks that contain ``http``/``https``,
         ``www`` or ``@`` with a single space.
      2. Turn every run of whitespace (newlines, tabs, ...) into one space, so
         that words on adjacent lines are not glued together by step 3.
      3. Delete every character that is not an ASCII letter or a space.
      4. Lower-case the result.
      5. Split it with ``nltk.word_tokenize``.

    Args:
        text: the raw string to clean.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize``.
    """
    # Raw string: "\S" and "\@" are invalid escapes in a normal string literal.
    text = re.sub(r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)", " ", text)
    # Without this, the next substitution removes "\n"/"\t" outright and
    # "good\nfood" would become the single token "goodfood".
    text = re.sub(r"\s+", " ", text)
    text = re.sub("[^a-zA-Z ]", "", text)
    text = text.lower()  # lower case the text
    return nltk.word_tokenize(text)

stop_words = stopwords.words('english')
def remove_stop_words(text):
    """Return the tokens of `text` that are not in `stop_words`.

    Args:
        text: an iterable of string tokens.

    Returns:
        A new list with the stop words removed, original order preserved.
    """
    # Membership tests against the list are a linear scan per token; a set is
    # constant time. It is built per call (not once at import) so that any
    # later edits to the module-level `stop_words` list are still honoured.
    blocked = set(stop_words)
    return [word for word in text if word not in blocked]


def pos(word):
    """Return the part-of-speech tag that ``nltk.pos_tag`` assigns to `word`.

    The word is tagged on its own, as a one-element sequence.
    """
    tagged = nltk.pos_tag([word])
    _, tag = tagged[0]
    return tag

# Part-of-speech tags whose words are kept by select_informative_pos.
informative_pos = (
    'JJ', 'VB', 'NN', 'RBS', 'VBP', 'IN', 'RBR', 'JJR',
    'JJS', 'PDT', 'RP', 'UH', 'FW', 'NNS', 'VBN', 'VBG',
)
def select_informative_pos(text):
    """Keep only the tokens whose POS tag is listed in `informative_pos`.

    Args:
        text: a list of string tokens, tagged together with ``nltk.pos_tag``.

    Returns:
        A list of the retained tokens in their original order.
    """
    kept = []
    for token, tag in nltk.pos_tag(text):
        if tag in informative_pos:
            kept.append(token)
    return kept

stemmer = PorterStemmer()
def stem_words(text):
    """Stem every token with the Porter stemmer and drop one-letter results.

    Args:
        text: an iterable of string tokens.

    Returns:
        A new list of stems longer than one character, in input order.
    """
    stemmed = []
    for word in text:
        try:
            stem = stemmer.stem(word)
        except IndexError:
            # The stemmer raised on at least one input (the word "oed").
            # Handle it per word: previously a single failure made the whole
            # document come back unstemmed and unfiltered.
            stem = word
        if len(stem) > 1:  # make sure we have no 1 letter words
            stemmed.append(stem)
    return stemmed

def apply_all(text):
    """Run the full cleaning pipeline on one raw string.

    Order: initial_clean -> remove_stop_words -> select_informative_pos
    -> stem_words. Returns whatever stem_words returns.
    """
    tokens = initial_clean(text)
    tokens = remove_stop_words(tokens)
    tokens = select_informative_pos(tokens)
    return stem_words(tokens)
  

In [13]:
# Base names (without the ".csv" extension) of the review files to process.
csv_names = [
    "pittsburgh_reviews",
    "mesa_reviews",
    "charlotte_reviews",
]
# Number of most frequent tokens kept by preprocess().
k = 5000

In [10]:
def preprocess(name, top_k=None):
    """Load a reviews CSV, merge the reviews of each business and tokenize them.

    The file must contain the columns ``business_id``, ``name``, ``latitude``,
    ``longitude``, ``address``, ``stars``, ``is_open`` and ``text``. All review
    texts sharing the same values in the first seven columns are joined with a
    space into one document, which is then cleaned with ``apply_all``. Finally
    only the most frequent tokens across all documents are kept.

    Args:
        name: path of the CSV file, passed to ``pd.read_csv``.
        top_k: how many of the most frequent tokens to keep. Defaults to the
            module-level ``k`` when omitted.

    Returns:
        A DataFrame with one row per business, holding the grouping columns,
        the joined ``text`` and a ``tokenized`` column (list of tokens).
    """
    vocab_size = k if top_k is None else top_k

    df = pd.read_csv(name)
    # Discard missing / non-string reviews *before* joining: ' '.join raises
    # TypeError on the first NaN it meets, so filtering afterwards is too late.
    df = df[df['text'].map(type) == str]
    # NOTE(review): groupby drops rows whose key columns are NaN by default,
    # so businesses lacking e.g. an address would vanish here — confirm intended.
    df = df.groupby(['business_id','name','latitude','longitude','address','stars','is_open'])['text'].apply(' '.join).reset_index()

    # clean and tokenize the joined review text into a new column "tokenized"
    t1 = time.time()
    df['tokenized'] = df['text'].apply(apply_all)
    t2 = time.time()
    print("Time to clean and tokenize", len(df), "businesses' reviews:", (t2-t1)/60, "min")

    # use nltk fdist to get a frequency distribution of all words
    fdist = FreqDist(word for tokens in df['tokenized'] for word in tokens)

    # only keep words in the top k words; a comprehension (rather than
    # zip(*...)) also works when no tokens survived cleaning
    top_k_words = {word for word, _ in fdist.most_common(vocab_size)}

    def keep_top_k_words(text):
        return [word for word in text if word in top_k_words]

    df['tokenized'] = df['tokenized'].apply(keep_top_k_words)
    return df

In [14]:
# Preprocess every city's review file and write the result back to disk.
# NOTE(review): the output path is the same as the input path, so the raw
# reviews are overwritten and re-running this cell reads already-processed
# data. The path is left unchanged here in case later cells depend on it —
# consider writing to a separate file instead.
for name in csv_names:
    path = "%s.csv" % name
    df = preprocess(path)
    # to_csv returns None when given a path, so its result is not kept.
    df.to_csv(path)


Time to clean and tokenize 442 businesses' reviews: 12.430062051614126 min
Time to clean and tokenize 284 businesses' reviews: 5.712130137284597 min
Time to clean and tokenize 580 businesses' reviews: 15.177245795726776 min
