In [67]:
!pip install spacy



In [68]:
!pip install nltk



In [69]:
# Importing all the required libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from collections import Counter

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Loading the dataset into dataframe
df = pd.read_csv('/content/restaurant_reviews_az.csv')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [70]:
# Printing the first 5 rows of the dataframe and its summary info
print(df.head())
print(df.info())

                review_id                 user_id             business_id  \
0  IVS7do_HBzroiCiymNdxDg  fdFgZQQYQJeEAshH4lxSfQ  sGy67CpJctjeCWClWqonjA   
1  QP2pSzSqpJTMWOCuUuyXkQ  JBLWSXBTKFvJYYiM-FnCOQ  3w7NRntdQ9h0KwDsksIt5Q   
2  oK0cGYStgDOusZKz9B1qug  2_9fKnXChUjC5xArfF8BLg  OMnPtRGmbY8qH_wIILfYKA   
3  E_ABvFCNVLbfOgRg3Pv1KQ  9MExTQ76GSKhxSWnTS901g  V9XlikTxq0My4gE8LULsjw   
4  Rd222CrrnXkXukR2iWj69g  LPxuausjvDN88uPr-Q4cQA  CA5BOxKRDPGJgdUQ8OUOpw   

   stars  useful  funny  cool  \
0      3       1      1     0   
1      5       1      1     1   
2      5       1      0     0   
3      5       0      0     0   
4      4       1      0     0   

                                                text                 date  
0  OK, the hype about having Hatch chili in your ...  2020-01-27 22:59:06  
1  Pandemic pit stop to have an ice cream.... onl...  2020-04-19 05:33:16  
2  I was lucky enough to go to the soft opening a...  2020-02-29 19:43:44  
3  I've gone to claim Jumpers all 

In [71]:
# Selecting 1 star reviews and 5 star reviews
one_star_reviews = df[df['stars'] == 1]['text'].values
five_star_reviews = df[df['stars'] == 5]['text'].values

In [72]:
# Applying text processing techniques to return list of lemmatized words without stopwords
def text_processing(review):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    word_tokens = word_tokenize(review)
    filtered_review = [lemmatizer.lemmatize(word) for word in word_tokens if word not in stop_words]
    return filtered_review

# Applying text processing on 1 star reviews and 5 star reviews
one_star_reviews_processed = [text_processing(review) for review in one_star_reviews]
five_star_reviews_processed = [text_processing(review) for review in five_star_reviews]

In [73]:
from nltk.tag import pos_tag

# Function to get nouns
def get_nouns(tagged_words):
    return [word for word, pos in tagged_words if pos.startswith('N')]

# Retrieving nouns for 1 star reviews and 5 star reviews
one_star_nouns = [get_nouns(pos_tag(review)) for review in one_star_reviews_processed]
five_star_nouns = [get_nouns(pos_tag(review)) for review in five_star_reviews_processed]

# Converting nested lists into one dimensional lists
one_star_nouns_flat = [word for sublist in one_star_nouns for word in sublist]
five_star_nouns_flat = [word for sublist in five_star_nouns for word in sublist]

# Geting the top 20 frequently used nouns
top_20_one_star_nouns = Counter(one_star_nouns_flat).most_common(20)
top_20_five_star_nouns = Counter(five_star_nouns_flat).most_common(20)

print(top_20_one_star_nouns)
print(top_20_five_star_nouns)

[('food', 6134), ('order', 5318), ('time', 4294), ('place', 3229), ('service', 2838), ('minute', 2262), ('customer', 2101), ('restaurant', 1869), ('manager', 1509), ('people', 1392), ('hour', 1222), ('location', 1167), ('experience', 1111), ('way', 1064), ('staff', 988), ('employee', 980), ('pizza', 930), ('meal', 861), ('chicken', 853), ('table', 852)]
[('food', 14814), ('place', 10598), ('time', 7013), ('service', 6585), ('Tucson', 4894), ('restaurant', 4141), ('order', 3933), ('staff', 3707), ('menu', 2672), ('Great', 2456), ('flavor', 2373), ('pizza', 2220), ('experience', 2211), ('meal', 2029), ('spot', 1932), ('chicken', 1931), ('sauce', 1912), ('taco', 1849), ('day', 1792), ('everything', 1775)]


In [74]:
# Function to get adjectives
def get_adjectives(tagged_words):
    return [word for word, pos in tagged_words if pos.startswith('J')]

# Getting adjectives for 1 star reviews and 5 star reviews
one_star_adjectives = [get_adjectives(pos_tag(review)) for review in one_star_reviews_processed]
five_star_adjectives = [get_adjectives(pos_tag(review)) for review in five_star_reviews_processed]

# Converting nested lists into one dimensional lists
one_star_adjectives_flat = [word for sublist in one_star_adjectives for word in sublist]
five_star_adjectives_flat = [word for sublist in five_star_adjectives for word in sublist]

# Getting the top 20 frequently used adjectives
top_20_one_star_adjectives = Counter(one_star_adjectives_flat).most_common(20)
top_20_five_star_adjectives = Counter(five_star_adjectives_flat).most_common(20)

print(top_20_one_star_adjectives)
print(top_20_five_star_adjectives)

[('u', 2252), ('good', 1659), ('bad', 1135), ('last', 838), ('table', 833), ('great', 672), ('first', 659), ('wrong', 629), ('sure', 576), ('worst', 551), ('new', 550), ('many', 530), ('much', 508), ('terrible', 504), ('small', 498), ('next', 474), ('old', 470), ('horrible', 464), ('disappointed', 463), ('little', 444)]
[('great', 9366), ('good', 8623), ('delicious', 6354), ('best', 3851), ('fresh', 2840), ('nice', 2420), ('favorite', 2240), ('u', 2190), ('friendly', 1943), ('little', 1854), ('hot', 1572), ('first', 1572), ('new', 1515), ('happy', 1393), ('many', 1374), ('sure', 1330), ('next', 1236), ('special', 1196), ('perfect', 1150), ('super', 1142)]


In [75]:
# Function to get verbs
def get_verbs(tagged_words):
    return [word for word, pos in tagged_words if pos.startswith('V')]

# Getting verbs for 1 star reviews and 5 star reviews
one_star_verbs = [get_verbs(pos_tag(review)) for review in one_star_reviews_processed]
five_star_verbs = [get_verbs(pos_tag(review)) for review in five_star_reviews_processed]

# Converting nested lists into one dimensional lists
one_star_verbs_flat = [word for sublist in one_star_verbs for word in sublist]
five_star_verbs_flat = [word for sublist in five_star_verbs for word in sublist]

# Getting the top 20 frequently used verbs
top_20_one_star_verbs = Counter(one_star_verbs_flat).most_common(20)
top_20_five_star_verbs = Counter(five_star_verbs_flat).most_common(20)

print(top_20_one_star_verbs)
print(top_20_five_star_verbs)

[('get', 2422), ('go', 2415), ('ordered', 2390), ('said', 2362), ('got', 2201), ('asked', 1899), ('told', 1712), ('came', 1580), ('went', 1454), ("'ve", 1425), ("'m", 1335), ('going', 1302), ('know', 1192), ('say', 1141), ('come', 1137), ('took', 1031), ('make', 985), ('take', 980), ('waiting', 913), ('made', 902)]
[("'ve", 4495), ('go', 4082), ('ordered', 3476), ('got', 3448), ('get', 3128), ('love', 2870), ("'m", 2581), ('made', 2555), ('recommend', 2404), ("'s", 2367), ('come', 2299), ('amazing', 2242), ('make', 1986), ("'re", 1984), ('came', 1958), ('tried', 1601), ('try', 1566), ('went', 1499), ('take', 1469), ('going', 1419)]


In [76]:
from nltk import ne_chunk

# Function to get named entities
def get_named_entities(words):
    tagged_words = pos_tag(words)
    named_entities = ne_chunk(tagged_words)
    return [chunk.label() for chunk in named_entities if hasattr(chunk, 'label')]

# Get named entities for 1 star reviews and 5 star reviews
one_star_entities = [get_named_entities(review) for review in one_star_reviews_processed]
five_star_entities = [get_named_entities(review) for review in five_star_reviews_processed]

# Converting nested lists into one dimensional lists
one_star_entities_flat = [entity for sublist in one_star_entities for entity in sublist]
five_star_entities_flat = [entity for sublist in five_star_entities for entity in sublist]

# Getting the top 20 frequently used named entities
top_20_one_star_entities = Counter(one_star_entities_flat).most_common(20)
top_20_five_star_entities = Counter(five_star_entities_flat).most_common(20)

print(top_20_one_star_entities)
print(top_20_five_star_entities)

[('PERSON', 9737), ('ORGANIZATION', 4082), ('GPE', 3300), ('GSP', 23), ('LOCATION', 17), ('FACILITY', 13)]
[('PERSON', 34484), ('GPE', 15141), ('ORGANIZATION', 7855), ('LOCATION', 173), ('GSP', 107), ('FACILITY', 40)]
