# INFO 498 Final Project

### Train data statistics

In [7]:
# Load the training split from a path relative to the notebook's working
# directory and preview its first rows.
import pandas as pd

df_train = pd.read_csv('data/yelpnlg-train.csv')

# head() with no argument shows the first five rows
display(df_train.head())

Unnamed: 0,id,ref,mr,sentiment,length,first_person,exclamation
0,0,best corn beef and pastrami combo sandwich and...,food||corn_beef||amazing||mention_1 food||past...,positive,len_medium,not_first_person,has_exclamation
1,1,"""he said it did not taste good , and it appear...",food||tomato||canned||mention_1 food||onion||n...,negative,len_long,not_first_person,no_exclamation
2,2,i usually get the dal gal bee chicken dish tha...,food||dal_gal_bee_chicken_dish||no_adj||mentio...,neutral,len_medium,first_person,no_exclamation
3,3,"""the pho was very flavourful , good firm rice ...",food||pho||flavourful||mention_1 food||firm_ri...,positive,len_long,not_first_person,no_exclamation
4,4,"""the spices they use to fry their chicken are ...",food||spices||no_adj||mention_1 food||fry||no_...,positive,len_long,not_first_person,no_exclamation


#### Columns Described

* id - A sequential identifier of the instance in that split.
* ref - The reference text (review sentence) for that instance.
* mr - The meaning representation (MR) for that instance. Each MR is a set of space-separated tuples, and each tuple
is divided with "||" separators. Each tuple contains the following information (in this order):
    * attribute - one of: {"restaurant", "cuisine", "food", "service", "staff", "ambiance", "price"}
    * value - any value (from attribute lexicons)
    * adjective - any adjective (from sentence dependency parse), else "no_adj" if none is available/retrievable in the parse
    * mention - mention_N (N indicates which mention is being referenced, i.e. 1 for first mention, 2 for second mention, etc.)
* sentiment - "positive" (4-5 stars), "neutral" (3 stars), "negative" (1-2 stars)
* length - "len_short" (4-10 tokens), "len_medium" (10-20 tokens), "len_long" (20-30 tokens)
* first_person - "first_person" (includes a first person pronoun: {"i", "my", "me", "our", "we", "us"}), "not_first_person" (does not include any first person pronouns)
* exclamation - "has_exclamation" (includes an exclamation mark), "no_exclamation" (does not include an exclamation mark)

In [None]:
# number of rows in the training split
total_reviews = df_train.shape[0]

# Tally each sentiment label. Labels that never occur stay at 0, and values
# outside these three labels are ignored.
sentiment_counts = df_train['sentiment'].value_counts()
sentiments = {
    label: int(sentiment_counts.get(label, 0))
    for label in ('positive', 'negative', 'neutral')
}

# Tally reviews per length bucket. The report keys drop the "len_" prefix
# used by the raw column values.
length_counts = df_train['length'].value_counts()
bucket_labels = {'short': 'len_short', 'medium': 'len_medium', 'long': 'len_long'}
total_lengths = {
    bucket: int(length_counts.get(raw_label, 0))
    for bucket, raw_label in bucket_labels.items()
}

# Longest and shortest review, measured in whitespace-separated tokens.
# len(review) on the raw string counts characters, which does not match the
# "by tokens" figures this cell reports.
# NOTE(review): assumes `ref` is already tokenized with spaces, as the
# previewed rows suggest (punctuation is space-separated) — confirm.
# Missing refs are skipped instead of raising on len(NaN).
token_counts = df_train['ref'].dropna().astype(str).str.split().str.len()

# keep the original sentinel values when there is nothing to measure
shortest = int(token_counts.min()) if not token_counts.empty else float('inf')
longest = int(token_counts.max()) if not token_counts.empty else 0

# Tally first-person vs. non-first-person reviews; any other value in the
# column is ignored.
pov_counts = df_train['first_person'].value_counts()
pov_totals = {
    'first person': int(pov_counts.get('first_person', 0)),
    'not first person': int(pov_counts.get('not_first_person', 0)),
}

# Tally reviews with and without an exclamation mark; any other value in the
# column is ignored.
exclamation_counts = df_train['exclamation'].value_counts()
exclamation_totals = {
    'exclamation': int(exclamation_counts.get('has_exclamation', 0)),
    'no exclamation': int(exclamation_counts.get('no_exclamation', 0)),
}

# Summary report: one group per statistic, with a blank line between groups
# (end='\n\n' emits the separator that a bare print() would).
print('total reviews:', total_reviews, end='\n\n')

print('total sentiments:', sentiments, end='\n\n')

# length buckets plus the extremes
print('total lengths:', total_lengths)
print('shortest review by tokens:', shortest)
print('longest review by tokens:', longest, end='\n\n')

print("total pov's:", pov_totals, end='\n\n')

# last group: no trailing blank line
print('total exclamations:', exclamation_totals)

## Yelp Business and Reviews
### Cleaning and combining datasets

In [8]:
# load business and review datasets
# NOTE(review): the review table's shape shown further down is about 5.26M rows,
# so expect this read to be slow and memory-heavy.
df_yelp_business = pd.read_csv('data/yelp_business.csv')
df_yelp_review = pd.read_csv('data/yelp_review.csv')

In [9]:
# Keep only businesses whose category string mentions "restaurant"
# (case-insensitive). Rows with missing categories count as non-matches and
# are dropped.
has_restaurant_category = df_yelp_business['categories'].str.contains(
    'restaurant', case=False, na=False
)
df_yelp_business = df_yelp_business[has_restaurant_category]

In [10]:
# preview the first three businesses kept by the restaurant filter
df_yelp_business.head(3)

Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",,"""581 Howe Ave""",Cuyahoga Falls,OH,44221,41.119535,-81.47569,3.5,116,1,American (New);Nightlife;Bars;Sandwiches;Ameri...
5,o9eMRCWt5PkpLDE0gOPtcQ,"""Messina""",,"""Richterstr. 11""",Stuttgart,BW,70567,48.7272,9.14795,4.0,5,1,Italian;Restaurants
10,XOSRcvtaKc_Q5H1SAzN20A,"""East Coast Coffee""",,"""737 West Pike St""",Houston,PA,15342,40.241548,-80.212815,4.5,3,0,Breakfast & Brunch;Gluten-Free;Coffee & Tea;Fo...


In [11]:
# (rows, columns) of the filtered business table
df_yelp_business.shape

(54630, 13)

In [12]:
# preview the first three rows of the review table
df_yelp_review.head(3)

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0


In [13]:
# (rows, columns) of the review table
df_yelp_review.shape

(5261668, 9)

In [14]:
# Attach business attributes to each review by business_id. The inner join
# keeps only reviews whose business is present in the (restaurant-filtered)
# business table. Both inputs carry a `stars` column, which shows up in the
# result as stars_x (review side) and stars_y (business side).
df_combined = df_yelp_review.merge(df_yelp_business, how='inner', on='business_id')
df_combined.head(3)

Unnamed: 0,review_id,user_id,business_id,stars_x,date,text,useful,funny,cool,name,...,address,city,state,postal_code,latitude,longitude,stars_y,review_count,is_open,categories
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0,"""Wilensky's""",...,"""34 Avenue Fairmount Ouest""",Montréal,QC,H2T 2M1,45.523333,-73.594859,4.0,84,1,Diners;Food;Restaurants;Delis
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0,"""Tuck Shop""",...,"""4662 Rue Notre-Dame O""",Montréal,QC,H4C 1S7,45.472902,-73.588321,4.5,50,1,Restaurants;Canadian (New);Italian
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0,"""Lester's Deli""",...,"""1057 Avenue Bernard""",Outremont,QC,H2V 1V1,45.522144,-73.607076,4.0,70,1,Specialty Food;Food;Sandwiches;Restaurants;Bur...


In [15]:
# (rows, columns) of the merged review + business table
df_combined.shape

(3221555, 21)

In [16]:
# columns treated as unnecessary for the rest of the notebook
columns_to_remove = [
    'user_id',
    'useful',
    'funny',
    'cool',
    'address',
    'is_open',
    'neighborhood',
]

# drop them along the column axis; df_combined itself is left untouched
df_cleaned = df_combined.drop(columns_to_remove, axis=1)
df_cleaned.head(3)

Unnamed: 0,review_id,business_id,stars_x,date,text,name,city,state,postal_code,latitude,longitude,stars_y,review_count,categories
0,vkVSCC7xljjrAI4UGfnKEQ,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,"""Wilensky's""",Montréal,QC,H2T 2M1,45.523333,-73.594859,4.0,84,Diners;Food;Restaurants;Delis
1,n6QzIUObkYshz4dz2QRJTw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,"""Tuck Shop""",Montréal,QC,H4C 1S7,45.472902,-73.588321,4.5,50,Restaurants;Canadian (New);Italian
2,MV3CcKScW05u5LVfF6ok0g,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,"""Lester's Deli""",Outremont,QC,H2V 1V1,45.522144,-73.607076,4.0,70,Specialty Food;Food;Sandwiches;Restaurants;Bur...


In [62]:
# randomly sample data to reduce size
# random_state is fixed so the same rows are drawn on every run

sample_fraction = 0.01
df_sampled = df_cleaned.sample(frac=sample_fraction, random_state=1)

# should be over 30,000 samples
df_sampled.shape

(32216, 14)

In [64]:
# map star ratings to sentiment labels
def map_stars_to_sentiment(stars, threshold=3):
    """Convert a star rating into a binary sentiment label.

    Args:
        stars: Numeric star rating of a single review.
        threshold: Lowest rating that counts as positive. Defaults to 3, so
            3-star reviews are labelled positive.
            NOTE(review): the column description earlier in this notebook
            calls 3 stars "neutral" — confirm that folding them into the
            positive class is intended.

    Returns:
        1 if ``stars >= threshold``, 0 if ``stars < threshold``, and None when
        neither comparison holds (e.g. a NaN rating).
    """
    if stars >= threshold:
        return 1
    if stars < threshold:
        return 0
    # NaN compares False both ways; return the missing label explicitly
    # instead of falling off the end of the function.
    return None

# Label every sampled review from stars_x, the review-side star column
# produced by the merge.
true_labels = df_sampled['stars_x'].map(map_stars_to_sentiment)
df_sampled['true_sentiment'] = true_labels

# preview the first five labelled rows
df_sampled.head()

Unnamed: 0,review_id,business_id,stars_x,date,text,name,city,state,postal_code,latitude,longitude,stars_y,review_count,categories,true_sentiment
1947193,UdbSqUb_Eadz6bsH3V8-Ig,LFs5jyYdXlzi0SpAYi1eSA,5,2016-01-12,Came her for my birthday and everything was am...,"""Maggiano's Little Italy""",Las Vegas,NV,89109,36.127783,-115.168968,4.0,1250,Nightlife;Bars;Italian;Restaurants,1
1348323,e8rGhwPielvSOTWEs1idsw,uoEV1AxIm3_XFxiScYWrbA,1,2015-01-31,I will refuse to come to this location again b...,"""Paradise Bakery & Cafe""",Glendale,AZ,85308,33.642095,-112.22568,3.5,12,Cafes;Restaurants;Bakeries;Food,0
2390913,5CsGTuMbnVvFTY55QhZa0A,vF58mwG-9Bx67S6hxKZ68A,3,2010-01-20,I have never actually had a good time at this ...,"""Frank & Tony's""",Willoughby,OH,44094,41.640863,-81.40773,2.5,19,Nightlife;Bars;Pizza;Restaurants,1
1151566,TBdeYFPPqVw19hRjxOQXuA,igHYkXZMLAc9UdV5VnR_AA,5,2016-05-05,We had dinner here this week after being entic...,"""Echo & Rig""",Las Vegas,NV,89145,36.166576,-115.286232,4.5,1924,Food;Steakhouses;Meat Shops;Butcher;Specialty ...,1
2361052,091Ga8_V_ngvhl8Z89wleg,EdIFp1tdPi1267hm3da6Nw,4,2017-11-09,Got the prime rib here on my last vegas trip i...,"""Ellis Island BBQ""",Las Vegas,NV,89109,36.113207,-115.163313,4.0,97,Restaurants;American (Traditional);Barbeque,1


# Text Classification with LLM prompting

### OpenAI API

In [1]:
import os
from openai import OpenAI

_openai_client = None  # shared client, created on first use


def _get_openai_client():
    """Return the shared OpenAI client, constructing it on the first call."""
    global _openai_client
    if _openai_client is None:
        _openai_client = OpenAI()
    return _openai_client


def complete(prompt, model="gpt-4o-mini"):
    """Send a single-turn user prompt to the chat completions API.

    Args:
        prompt: Text sent as the only ``user`` message.
        model: Name of the chat model to query. Defaults to the model that
            was previously hard-coded here.

    Returns:
        The ``content`` of the first choice's message.
    """
    # Reuse one client across calls; this function is invoked once per
    # review, so building a fresh client every time is wasted setup.
    completion = _get_openai_client().chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return completion.choices[0].message.content

In [35]:
# Zero-shot prompt template: instructions only, with a "{}" placeholder for
# the review text.
# NOTE(review): no cell shown in this notebook references ZERO_PROMPT —
# confirm whether a zero-shot run is still planned.
ZERO_PROMPT = """
Please classify the following review as positive or negative.
Output 1 for positive and 0 for negative.
Format the output as Label: 0 or 1.

Review: "{}"
"""

# Few-shot prompt template: the same instructions, followed by one positive
# and one negative worked example, then a "{}" placeholder for the review text.
# NOTE(review): the first example is wrapped in curly quotes while the other
# reviews use straight quotes — confirm whether that is intentional.
FEW_PROMPT = """
Please classify the following review as positive or negative.
Output 1 for positive and 0 for negative.
Format the output as Label: 0 or 1.

For example:
Review: “I had a great experience at this restaurant! The food was delicious, and the service was excellent. I will be back soon!” 
Label: 1

Review: "I had high hopes for this place, but everything was a letdown. 
The food was greasy and lacked flavor, the portions were small for the price, 
and the staff seemed annoyed to even be there. I had to ask twice for water,
and it never came. Definitely won't be returning."
Label: 0

Review: "{}"
"""


In [None]:
# Query the model once per sampled review with the few-shot prompt (one API
# call per row). Responses are keyed by the review's position in df_sampled.
outputs_few = {}

for position, review_text in enumerate(df_sampled['text']):
    outputs_few[position] = complete(FEW_PROMPT.format(review_text))

In [None]:
from sklearn.metrics import classification_report

def _parse_label(response):
    """Extract the 0/1 label from a model response such as "Label: 1".

    Args:
        response: Raw text returned by the model.

    Returns:
        The label as an int (0 or 1).

    Raises:
        ValueError: if the response contains no recognisable 0/1 label.
    """
    import re  # local import so this cell does not depend on another cell's imports

    if isinstance(response, str):
        # str.strip("Label: ") removes a *set of characters* from both ends,
        # not a prefix, and int() then fails on any surrounding text. Match
        # the requested "Label: <0|1>" format explicitly instead.
        match = re.search(r'label\s*:\s*([01])\b', response, flags=re.IGNORECASE)
        if match is not None:
            return int(match.group(1))
        # also accept a bare "0" / "1" answer
        if response.strip() in ('0', '1'):
            return int(response.strip())
    raise ValueError('could not find a 0/1 label in response: {!r}'.format(response))


# one prediction per response, in the order the responses were stored
predictions = [_parse_label(response) for response in outputs_few.values()]

In [None]:
# The ground-truth labels are stored in the 'true_sentiment' column (singular);
# indexing with 'true_sentiments' raises a KeyError.
print(classification_report(df_sampled['true_sentiment'], predictions))