In [106]:
import numpy as np
import pandas as pd
import csv

import nltk
from nltk.corpus import stopwords
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

# read csv into a dataframe
df_idf = pd.read_csv("DataSources/small_name_review_green.csv")
 
# print schema
print(df_idf.dtypes)
print(df_idf.shape)

name       object
text       object
rating    float64
dtype: object
(84, 3)


In [107]:
df_idf

Unnamed: 0,name,text,rating
0,ARTISAN,"[[""Met with Sara today for a small coverup. Sh...",1.0
1,THE LOBSTER TRAP,"[[""I have been coming here for years and have ...",1.0
2,THE BREADFRUIT & RUM BAR,[['Save your money and time. It took over an ...,1.0
3,SEVICHE,[['ATMOSPHERE: 5/5\nThe first word that came t...,1.0
4,SEAFOOD SHACK,"[[""Happy Hour Daily 3-6pm\n\nHappy hour specia...",1.0
5,SAM'S CAFE,"[[""Great ambience and absolutely adorable bar ...",1.0
6,RED LOBSTER,"[[""We took our grandmother here for a going aw...",1.0
7,PARK PLACE RESTAURANT,"[[""Just what you'd expect from a Greek diner. ...",1.0
8,URBAN CRAVE,[['As far as airport food goes this is good. ...,1.0
9,MONTEREY BAY FISH GROTTO,[['We eat out 4-5 nights a week so are familia...,1.0


In [108]:
green = df_idf.copy()
nongreen = df_idf.copy()

In [114]:
green = green[green['rating'].notnull()]
nongreen = nongreen[nongreen['rating'].isnull()]

In [115]:
green.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28 entries, 0 to 27
Data columns (total 3 columns):
name      28 non-null object
text      28 non-null object
rating    28 non-null float64
dtypes: float64(1), object(2)
memory usage: 896.0+ bytes


In [116]:
nongreen.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56 entries, 28 to 83
Data columns (total 3 columns):
name      56 non-null object
text      56 non-null object
rating    0 non-null float64
dtypes: float64(1), object(2)
memory usage: 1.8+ KB


In [117]:
green = green.drop('rating', axis=1)
nongreen = nongreen.drop('rating', axis=1)

In [118]:
from nltk.stem import WordNetLemmatizer 
import re
def pre_process(text):
    # lowercase
    text=text.lower()
    #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    return text

In [119]:
green['text'] = green['text'].apply(lambda x:pre_process(x))
nongreen['text'] = nongreen['text'].apply(lambda x:pre_process(x))

In [120]:
green.head()

Unnamed: 0,name,text
0,ARTISAN,met with sara today for a small coverup she w...
1,THE LOBSTER TRAP,i have been coming here for years and have to...
2,THE BREADFRUIT & RUM BAR,save your money and time it took over an hour...
3,SEVICHE,atmosphere nthe first word that came to mind ...
4,SEAFOOD SHACK,happy hour daily pm n nhappy hour specials ar...


In [121]:
green['text'][2]



In [160]:
def remove_stopwords(mytext):
    #stop_words = set(stopwords.words('english')) 
    
    stop_words = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", 
                  "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", 
                  "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", 
                  "their", "theirs", "themselves", "what", "which", "who", "whom", "this", 
                  "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", 
                  "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", 
                  "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", 
                  "of", "at", "by", "for", "with", "about", "against", "between", "into", 
                  "through", "during", "before", "after", "above", "below", "to", "from", 
                  "up", "down", "in", "out", "on", "off", "over", "under", "again", "further",
                  "then", "once", "here", "there", "when", "where", "why", "how", "all", 
                  "any", "both", "each", "few", "more", "most", "other", "some", "such", 
                  "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", 
                  "s", "t", "can", "will", "just", "don", "should", "now", "also"]
    
    word_tokens = word_tokenize(mytext) 

    filtered_sentence = [w for w in word_tokens if not w in stop_words] 

    filtered_sentence = [] 

    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w) 
    
    return filtered_sentence

In [161]:
import nltk
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [' '.join(lemmatizer.lemmatize(w,'v') for w in w_tokenizer.tokenize(text))]

In [162]:
#clean will have no stopwords, mydfidf will
resultgreen = green.copy()
resultnongreen = nongreen.copy()

In [163]:
resultgreen['text'] = resultgreen['text'].apply(lambda x:remove_stopwords(x))
resultnongreen['text'] = resultnongreen['text'].apply(lambda x:remove_stopwords(x))

In [164]:
resultgreen['text'] = resultgreen['text'].str.join(' ')
resultnongreen['text'] = resultnongreen['text'].str.join(' ')

In [165]:
resultgreen['text_lemmatized'] = resultgreen.text.apply(lemmatize_text)
resultnongreen['text_lemmatized'] = resultnongreen.text.apply(lemmatize_text)

In [166]:
resultgreen.head()

Unnamed: 0,name,text,text_lemmatized
0,ARTISAN,met sara today small coverup great work really...,[meet sara today small coverup great work real...
1,THE LOBSTER TRAP,coming years say best spot lobster north great...,[come years say best spot lobster north great ...
2,THE BREADFRUIT & RUM BAR,save money time took hour get lunch served har...,[save money time take hour get lunch serve har...
3,SEVICHE,atmosphere nthe first word came mind walked pl...,[atmosphere nthe first word come mind walk pla...
4,SEAFOOD SHACK,happy hour daily pm n nhappy hour specials dri...,[happy hour daily pm n nhappy hour specials dr...


In [167]:
resultnongreen.head()

Unnamed: 0,name,text,text_lemmatized
28,#1 FRIED RICE,really good food big portions decent price eno...,[really good food big portion decent price eno...
29,#1 HAWAIIAN BARBECUE,ordered fried mahimahi unfortunately made anot...,[order fry mahimahi unfortunately make another...
30,#1 PHO,fantastic pho vegetable pho vegetable broth ev...,[fantastic pho vegetable pho vegetable broth e...
31,#1 SUSHI,love place amazing cheap sushi thats fresh del...,[love place amaze cheap sushi thats fresh deli...
32,#1BROTHERS PIZZA,good thing place price value good get pay pizz...,[good thing place price value good get pay piz...


In [168]:
resultgreen['text_lemmatized'][2]



In [169]:
from sklearn.feature_extraction.text import TfidfVectorizer
greenvectorizer = TfidfVectorizer(max_features=100, smooth_idf=True, )#tokenizer=tokens ,use_idf=True, smooth_idf=True, sublinear_tf=False)
nongreenvectorizer = TfidfVectorizer(max_features=100, smooth_idf=True, )#tokenizer=tokens ,use_idf=True, smooth_idf=True, sublinear_tf=False)



In [170]:
greenmatrix = greenvectorizer.fit_transform(resultgreen['text'])
nongreenmatrix = nongreenvectorizer.fit_transform(resultnongreen['text'])

In [171]:
print(greenvectorizer.get_feature_names())

['always', 'another', 'around', 'asked', 'back', 'bad', 'bar', 'best', 'better', 'came', 'chicken', 'come', 'could', 'customer', 'day', 'didn', 'drink', 'drinks', 'drive', 'eat', 'even', 'ever', 'every', 'experience', 'fast', 'first', 'food', 'fresh', 'friendly', 'fries', 'get', 'give', 'go', 'going', 'good', 'got', 'great', 'know', 'last', 'like', 'little', 'lobster', 'location', 'long', 'love', 'made', 'make', 'manager', 'mcdonald', 'mcdonalds', 'meal', 'menu', 'minutes', 'much', 'never', 'ni', 'nice', 'night', 'nthe', 'one', 'order', 'ordered', 'people', 'place', 'pretty', 're', 'really', 'restaurant', 'right', 'said', 'sauce', 'say', 'server', 'service', 'shrimp', 'something', 'staff', 'still', 'sure', 'table', 'take', 'think', 'thru', 'time', 'times', 'told', 'took', 'try', 'two', 'us', 've', 'wait', 'want', 'way', 'well', 'went', 'wings', 'worst', 'would', 'wrong']


In [172]:
print(nongreenvectorizer.get_feature_names())

['always', 'amazing', 'area', 'around', 'back', 'bar', 'beef', 'beer', 'best', 'better', 'bit', 'came', 'cheese', 'chicken', 'chocolate', 'come', 'could', 'definitely', 'delicious', 'dessert', 'didn', 'dinner', 'drink', 'drinks', 'eat', 'even', 'everything', 'experience', 'first', 'food', 'fresh', 'friendly', 'get', 'give', 'go', 'going', 'good', 'got', 'great', 'happy', 'hot', 'hour', 'know', 'like', 'little', 'love', 'lunch', 'made', 'main', 'make', 'meal', 'menu', 'much', 'never', 'ni', 'nice', 'night', 'nthe', 'one', 'order', 'ordered', 'people', 'pizza', 'place', 'pretty', 'price', 're', 'really', 'recommend', 'restaurant', 'rice', 'right', 'said', 'salad', 'sauce', 'say', 'see', 'server', 'service', 'side', 'small', 'staff', 'still', 'sure', 'table', 'take', 'think', 'time', 'top', 'try', 'two', 'us', 've', 'view', 'wait', 'want', 'way', 'well', 'went', 'would']


In [173]:
print(greenmatrix.shape)

(28, 100)


In [174]:
print(nongreenmatrix.shape)

(56, 100)
