# Format the text data for processing

In [4]:
import pandas as pd
import spacy

In [90]:
def save_data_to_file(data):
    print("Given data with head:")
    print(data.head())
    should_save = input("Do you wish to save it? (y/n): ")
    if should_save == "y":
        filename = input("Specify the filename to save to: ")
        data.to_csv(filename, index=False)
        print("Saved data!")

## Load the data

In [15]:
data = pd.read_csv('data/reviews_Bacchanal_Buffet.csv', parse_dates=['date'], lineterminator='\n')
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10129 entries, 0 to 10128
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   user_id      10129 non-null  object        
 1   business_id  10129 non-null  object        
 2   stars        10129 non-null  int64         
 3   useful       10129 non-null  int64         
 4   text         10129 non-null  object        
 5   date         10129 non-null  datetime64[ns]
 6   text_length  10129 non-null  int64         
 7   nbr_words    10129 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(3)
memory usage: 633.2+ KB


Unnamed: 0,user_id,business_id,stars,useful,text,date,text_length,nbr_words
0,6PgdGb3HrZdsfl2GiULo8w,RESDUcs7fIiihp38-d6_6g,5,0,After getting food poisoning at the Palms hote...,2012-12-04 03:10:18,937,176
1,IS9yw8P2uAPBX6FNLLX4KA,RESDUcs7fIiihp38-d6_6g,4,39,"""A feast worthy of Gods""\n\nBaccarnal Buffet i...",2014-01-17 00:50:50,4975,940
2,uZdFsE_aHbFBChgN6Xa8tw,RESDUcs7fIiihp38-d6_6g,4,1,The crab legs are better than the ones at Wick...,2015-06-08 18:03:09,671,131
3,8ZWJNAEWsymXDzKx3B0tTQ,RESDUcs7fIiihp38-d6_6g,1,0,Not worth it! Too salty food and expensive! Th...,2016-12-19 16:15:29,92,18
4,E0sm4Ve7ifanFYeQMcV8Eg,RESDUcs7fIiihp38-d6_6g,5,0,I would give this infinite stars if I could. M...,2015-07-28 07:13:17,333,63


## Format the data

In [None]:
spacy_parser = spacy.load('en')

In [69]:
def get_lemmatized_text(text, parser, include_stopwords=True, include_non_alpha=True):
    def should_include_token(token):
        if token.lemma_.strip():
            if not token.is_alpha and not include_non_alpha and not token.lemma_.strip()=="$":
                return False
            if token.is_stop and not include_stopwords:
                return False
            return True
        return False

    doc = parser(text)
    lemmatized_text = " ".join([token.lemma_.strip() for token in doc if should_include_token(token)])
    return lemmatized_text

In [None]:
def get_text(text, parser, include_stopwords=True):
    doc = parser(text)
    if include_stopwords:
        lemmatized_text = " ".join([token.lemma_.strip() for token in doc if token.lemma_.strip()])
    else:
        lemmatized_text= " ".join([token.lemma_.strip() for token in doc if (token.lemma_.strip() and not token.is_stop)])
    return lemmatized_text

In [37]:
data.iloc[1].text

'"A feast worthy of Gods"\n\nBaccarnal Buffet in Caesar Palace is consider the best buffet in the Las Vegas by most food critics   It rated highest in trip advisor and yelp for a buffet restaurant in Vegas.  I was only going to eat Buffet once in this trip, so I decided must try the best there is available.  My review is based on late breakfast and lunch during a non-holiday weekday.  Finding this place is confusing and take more than five minutes from the main entrance to get here, this is common in most buffet in the strip.  \n\nAdmission Price\n*Breakfast (Monday - Friday, 7 a.m. - 11 a.m.): $25.99\n*Brunch (Saturday - Sunday, 8 a.m. - 3 p.m.): $40.99\n* Lunch (Monday - Friday, 11 a.m. - 3 p.m.): $32.99\n* Dinner (Daily, 3 p.m. - 10 p.m.): $45.99 on weekdays, $50.99 on weekends \n* Holiday expect to paid $10 more.\n* Total Reward Card Holder gets $1 off.\n* Price not included the 8.1% sales tax.\n\nAtmosphere\nThe interior is similar to those of mid-level restaurant, they claim it c

In [66]:
get_lemmatized_text(data.iloc[1].text, spacy_parser)

'" a feast worthy of god " Baccarnal Buffet in Caesar Palace be consider the good buffet in the Las Vegas by most food critic -PRON- rate highest in trip advisor and yelp for a buffet restaurant in Vegas . -PRON- be only go to eat Buffet once in this trip , so -PRON- decide must try the good there be available . -PRON- review be base on late breakfast and lunch during a non - holiday weekday . find this place be confusing and take more than five minute from the main entrance to get here , this be common in most buffet in the strip . Admission Price * Breakfast ( Monday - Friday , 7 a.m. - 11 a.m. ) : $ 25.99 * Brunch ( Saturday - Sunday , 8 a.m. - 3 p.m. ) : $ 40.99 * Lunch ( Monday - Friday , 11 a.m. - 3 p.m. ) : $ 32.99 * Dinner ( Daily , 3 p.m. - 10 p.m. ) : $ 45.99 on weekday , $ 50.99 on weekend * holiday expect to pay $ 10 more . * Total Reward Card Holder get $ 1 off . * price not include the 8.1 % sale tax . Atmosphere the interior be similar to those of mid - level restaurant 

In [67]:
get_lemmatized_text(data.iloc[2].text, spacy_parser, include_stopwords=False)

'crab leg well one Wicked Spoon huge prawn . sheer selection insane . save room pasta , pizza , cheese sushi oink crab leg , prawn dessert . tonkatsu raman ok piece pork . $ 105 people Sunday brunch bottomless mimosas bloody mary . bloody mary disgust . come 2p kiosk get 3p dinner price . dessert pretty good . shave ice , crepe , black sesame green mint tea cream . taffy , gummy bear mini dessert .'

In [73]:
get_lemmatized_text(data.iloc[1].text, spacy_parser, include_stopwords=False, include_non_alpha=False)

'feast worthy god Baccarnal Buffet Caesar Palace consider good buffet Las Vegas food critic rate highest trip advisor yelp buffet restaurant Vegas go eat Buffet trip decide try good available review base late breakfast lunch non holiday weekday find place confusing minute main entrance common buffet strip Admission Price Breakfast Monday Friday $ Brunch Saturday Sunday $ Lunch Monday Friday $ Dinner Daily $ weekday $ weekend holiday expect pay $ Total Reward Card Holder get $ price include sale tax Atmosphere interior similar mid level restaurant claim cost million feel like include equipment kitchen tool cost lot seating count mean dinner weekend quick dessert section separate counter rest food Customer Service get assign individual server helpful quickly kind forget table weekday lunch busy table Imagine dinner weekend service worst nice thing enforce hour eat time limit normally busy time peak hour need estimate wait time kiosk machine beverage choose variety juice coffee yelper rig

In [74]:
def get_lemmatized_text_data(data, text_col, parser, include_stopwords=True, include_non_alpha=True):
    lemmatized_text = data[text_col].apply(lambda x: get_lemmatized_text(x, parser, include_stopwords, include_non_alpha))
    return lemmatized_text

In [75]:
data['lemmatized_text'] = get_lemmatized_text_data(data, "text", spacy_parser, False, False)
data.head()

Unnamed: 0,user_id,business_id,stars,useful,text,date,text_length,nbr_words,lemmatized_text
0,6PgdGb3HrZdsfl2GiULo8w,RESDUcs7fIiihp38-d6_6g,5,0,After getting food poisoning at the Palms hote...,2012-12-04 03:10:18,937,176,get food poisoning Palms hotel scared eat buff...
1,IS9yw8P2uAPBX6FNLLX4KA,RESDUcs7fIiihp38-d6_6g,4,39,"""A feast worthy of Gods""\n\nBaccarnal Buffet i...",2014-01-17 00:50:50,4975,940,feast worthy god Baccarnal Buffet Caesar Palac...
2,uZdFsE_aHbFBChgN6Xa8tw,RESDUcs7fIiihp38-d6_6g,4,1,The crab legs are better than the ones at Wick...,2015-06-08 18:03:09,671,131,crab leg well one Wicked Spoon huge prawn shee...
3,8ZWJNAEWsymXDzKx3B0tTQ,RESDUcs7fIiihp38-d6_6g,1,0,Not worth it! Too salty food and expensive! Th...,2016-12-19 16:15:29,92,18,worth salty food expensive furst kast visit bu...
4,E0sm4Ve7ifanFYeQMcV8Eg,RESDUcs7fIiihp38-d6_6g,5,0,I would give this infinite stars if I could. M...,2015-07-28 07:13:17,333,63,infinite star family diamond card Caesars Pala...


In [79]:
print("ORIGINAL TEXT: ")
print(data.iloc[2].text)
print("")
print("PROCESSED TEXT:")
print(data.iloc[2].lemmatized_text)

ORIGINAL TEXT: 
The crab legs are better than the ones at Wicked Spoon and they have huge prawns here.  The sheer selection was insane.  I didn't get to save any room for pasta, pizza, cheese or sushi because I was oinking out on the crab legs, prawns and dessert.  The tonkatsu ramen is ok and had a piece of pork in it.

It's $105 for two people for Sunday brunch with bottomless mimosas and bloody marys.  The bloody marys were disgusting.  We came around 2p at the kiosk and got in before 3p when they up the dinner prices.  

The dessert was pretty good.  They have shaved ice, crepes, black sesame and green mint tea cream.  They also have taffy, gummy bears and the mini desserts.

PROCESSED TEXT:
crab leg well one Wicked Spoon huge prawn sheer selection insane save room pasta pizza cheese sushi oink crab leg prawn dessert tonkatsu raman ok piece pork $ people Sunday brunch bottomless mimosas bloody mary bloody mary disgust come kiosk get dinner price dessert pretty good shave ice crepe 

In [91]:
save_data_to_file(data)

Given data with head:
                  user_id             business_id  stars  useful  \
0  6PgdGb3HrZdsfl2GiULo8w  RESDUcs7fIiihp38-d6_6g      5       0   
1  IS9yw8P2uAPBX6FNLLX4KA  RESDUcs7fIiihp38-d6_6g      4      39   
2  uZdFsE_aHbFBChgN6Xa8tw  RESDUcs7fIiihp38-d6_6g      4       1   
3  8ZWJNAEWsymXDzKx3B0tTQ  RESDUcs7fIiihp38-d6_6g      1       0   
4  E0sm4Ve7ifanFYeQMcV8Eg  RESDUcs7fIiihp38-d6_6g      5       0   

                                                text                date  \
0  After getting food poisoning at the Palms hote... 2012-12-04 03:10:18   
1  "A feast worthy of Gods"\n\nBaccarnal Buffet i... 2014-01-17 00:50:50   
2  The crab legs are better than the ones at Wick... 2015-06-08 18:03:09   
3  Not worth it! Too salty food and expensive! Th... 2016-12-19 16:15:29   
4  I would give this infinite stars if I could. M... 2015-07-28 07:13:17   

   text_length  nbr_words                                    lemmatized_text  
0          937        176  get fo

## Create a VOCABULARY

In [1]:
import pandas as pd
import help_functions

In [2]:
def load_data_from_file(filename=None):
    should_load = input("Do you wish to load data from a file? (y/n): ")
    if should_load == "y":
        if not filename:
            filename = input("Specify the filename to load from: ")
        data = pd.read_csv(filename, parse_dates=['date'], lineterminator='\n')
        return data

In [3]:
data = load_data_from_file("saved_data/lemmatized_reviews_Bacchanal_Buffet.csv")
data = data.dropna(axis=0)
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10123 entries, 0 to 10128
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   user_id          10123 non-null  object        
 1   business_id      10123 non-null  object        
 2   stars            10123 non-null  int64         
 3   useful           10123 non-null  int64         
 4   text             10123 non-null  object        
 5   date             10123 non-null  datetime64[ns]
 6   text_length      10123 non-null  int64         
 7   nbr_words        10123 non-null  int64         
 8   lemmatized_text  10123 non-null  object        
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 790.9+ KB


Unnamed: 0,user_id,business_id,stars,useful,text,date,text_length,nbr_words,lemmatized_text
0,6PgdGb3HrZdsfl2GiULo8w,RESDUcs7fIiihp38-d6_6g,5,0,After getting food poisoning at the Palms hote...,2012-12-04 03:10:18,937,176,get food poisoning Palms hotel scared eat buff...
1,IS9yw8P2uAPBX6FNLLX4KA,RESDUcs7fIiihp38-d6_6g,4,39,"""A feast worthy of Gods""\n\nBaccarnal Buffet i...",2014-01-17 00:50:50,4975,940,feast worthy god Baccarnal Buffet Caesar Palac...
2,uZdFsE_aHbFBChgN6Xa8tw,RESDUcs7fIiihp38-d6_6g,4,1,The crab legs are better than the ones at Wick...,2015-06-08 18:03:09,671,131,crab leg well one Wicked Spoon huge prawn shee...
3,8ZWJNAEWsymXDzKx3B0tTQ,RESDUcs7fIiihp38-d6_6g,1,0,Not worth it! Too salty food and expensive! Th...,2016-12-19 16:15:29,92,18,worth salty food expensive furst kast visit bu...
4,E0sm4Ve7ifanFYeQMcV8Eg,RESDUcs7fIiihp38-d6_6g,5,0,I would give this infinite stars if I could. M...,2015-07-28 07:13:17,333,63,infinite star family diamond card Caesars Pala...


In [4]:
voc = help_functions.Vocabulary(min_word_freq=100, lower=True)
voc.build(data.lemmatized_text.apply(lambda x: x.split(" ")))
len(voc)

860

In [5]:
voc.stoi

{'buffet': 0,
 'food': 1,
 'good': 2,
 'wait': 3,
 'line': 4,
 'time': 5,
 'crab': 6,
 'dessert': 7,
 '$': 8,
 'like': 9,
 'vegas': 10,
 'come': 11,
 'try': 12,
 'eat': 13,
 'go': 14,
 'seafood': 15,
 'worth': 16,
 'place': 17,
 'great': 18,
 'leg': 19,
 'station': 20,
 'price': 21,
 'get': 22,
 'selection': 23,
 'bacchanal': 24,
 'dinner': 25,
 'pay': 26,
 'think': 27,
 'hour': 28,
 'quality': 29,
 'service': 30,
 'long': 31,
 'meat': 32,
 'fresh': 33,
 'well': 34,
 'section': 35,
 'thing': 36,
 'definitely': 37,
 'taste': 38,
 'rib': 39,
 'variety': 40,
 'want': 41,
 'plate': 42,
 'love': 43,
 'lot': 44,
 'pretty': 45,
 'oyster': 46,
 'amazing': 47,
 'table': 48,
 'people': 49,
 'sushi': 50,
 'experience': 51,
 'dish': 52,
 'asian': 53,
 'look': 54,
 'little': 55,
 'item': 56,
 'lunch': 57,
 'prime': 58,
 'person': 59,
 'delicious': 60,
 'favorite': 61,
 'drink': 62,
 'minute': 63,
 'seat': 64,
 'shrimp': 65,
 'option': 66,
 'brunch': 67,
 'different': 68,
 'bad': 69,
 'star': 70,
 '

## Create a term-document matrix

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
data.iloc[1].lemmatized_text

'feast worthy god Baccarnal Buffet Caesar Palace consider good buffet Las Vegas food critic rate highest trip advisor yelp buffet restaurant Vegas go eat Buffet trip decide try good available review base late breakfast lunch non holiday weekday find place confusing minute main entrance common buffet strip Admission Price Breakfast Monday Friday $ Brunch Saturday Sunday $ Lunch Monday Friday $ Dinner Daily $ weekday $ weekend holiday expect pay $ Total Reward Card Holder get $ price include sale tax Atmosphere interior similar mid level restaurant claim cost million feel like include equipment kitchen tool cost lot seating count mean dinner weekend quick dessert section separate counter rest food Customer Service get assign individual server helpful quickly kind forget table weekday lunch busy table Imagine dinner weekend service worst nice thing enforce hour eat time limit normally busy time peak hour need estimate wait time kiosk machine beverage choose variety juice coffee yelper rig

In [8]:
vectorizer = CountVectorizer(vocabulary=voc.stoi)
X = vectorizer.fit_transform(data.lemmatized_text.to_list())

# Create DataFrame
df_tdm = pd.DataFrame(X.toarray().transpose(), index = vectorizer.get_feature_names()).T

df_tdm.head()

Unnamed: 0,buffet,food,good,wait,line,time,crab,dessert,$,like,...,awful,layout,kitchen,continue,baby,b,serving,overly,music,island
0,1,1,1,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,6,8,14,1,0,3,3,4,0,6,...,0,0,1,0,0,0,0,0,0,0
2,0,0,1,0,0,0,2,3,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,2,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
df_tdm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10123 entries, 0 to 10122
Columns: 860 entries, buffet to island
dtypes: int64(860)
memory usage: 66.4 MB


## SAVE THE TERM DOCUMENT MATRIX

In [9]:
should_save_data = input("Do you wish to save the term document matrix data? (y/n): ")
if should_save_data == "y":
    df_tdm.to_csv('saved_data/X_lemmatized.csv', index=False)
    data['useful'].to_csv('saved_data/Y_lemmatized.csv', index=False)
    print("Data saved!")

Data saved!
