# INFO 498 Final Project

### Train data statistics

In [25]:
# load dataset
import pandas as pd

df_train = pd.read_csv('data/yelpnlg-train.csv')

display(df_train.head())

Unnamed: 0,id,ref,mr,sentiment,length,first_person,exclamation
0,0,best corn beef and pastrami combo sandwich and...,food||corn_beef||amazing||mention_1 food||past...,positive,len_medium,not_first_person,has_exclamation
1,1,"""he said it did not taste good , and it appear...",food||tomato||canned||mention_1 food||onion||n...,negative,len_long,not_first_person,no_exclamation
2,2,i usually get the dal gal bee chicken dish tha...,food||dal_gal_bee_chicken_dish||no_adj||mentio...,neutral,len_medium,first_person,no_exclamation
3,3,"""the pho was very flavourful , good firm rice ...",food||pho||flavourful||mention_1 food||firm_ri...,positive,len_long,not_first_person,no_exclamation
4,4,"""the spices they use to fry their chicken are ...",food||spices||no_adj||mention_1 food||fry||no_...,positive,len_long,not_first_person,no_exclamation


#### Columns Described

* id - A sequential identifier of the instance in that split.
* ref - The reference text (review sentence) for that instance.
* mr - The meaning representation (mr) for that instance. Each MR is a set of space-separated tuples, and each is tuple
is divided with "||" separators. Each tuple contains the following information (in this order):
    * attribute - one of: {"restaurant", "cuisine", "food", "service", "staff", "ambiance", "price"}
    * value - any value (from attribute lexicons)
    * adjective - any adjective (from sentence dependency parse), else "no_adj" if none is available/retrievable in the parse
    * mention - mention_N (N indicates which mention is being referenced, i.e. 1 for first mention, 2 for second mention, etc.)
* sentiment - "positive" (4-5 stars), "neutral" (3 stars), "negative" (1-2 stars)
* length - "len_short" (4-10 tokens), "len_medium" (10-20 tokens), "len_long" (20-30 tokens)
* first_person - "first_person" (includes a first person pronoun: {"i", "my", "me", "our", "we", "us"}), "not_first_person" (does not include any first person pronouns)
* exclamation - "has_exclamation" (include an exclamation mark), "no_exclamation" (does not include an exclamation mark)

In [26]:
# total reviews
total_reviews = len(df_train)

# total amount of each sentiment
sentiments = {'positive': 0, 'negative': 0, 'neutral': 0}

for sentiment in df_train['sentiment']:
    # check for positive
    if sentiment == 'positive':
        sentiments[sentiment] += 1
    # check for negative
    elif sentiment == 'negative':
        sentiments[sentiment] += 1
    # check for neutral
    elif sentiment == 'neutral':
        sentiments[sentiment] += 1

# total amount of each length of review
total_lengths = {'short': 0, 'medium': 0, 'long': 0}

for length in df_train['length']:
    # check for short
    if length == 'len_short':
        total_lengths['short'] += 1
    # check for medium
    if length == 'len_medium':
        total_lengths['medium'] += 1
    # check for long
    if length == 'len_long':
        total_lengths['long'] += 1

# longest and shortest review
shortest = float('inf')
longest = 0

for review in df_train['ref']:
    review_len = len(review)
    # find shortest
    if review_len < shortest:
        shortest = review_len
    # find longest
    if review_len > longest:
        longest = review_len

# first person/non first person totals
pov_totals = {'first person': 0, 'not first person': 0}

for pov in df_train['first_person']:
    # check for first
    if pov == 'first_person':
        pov_totals['first person'] += 1
    # check for non first
    if pov == 'not_first_person':
        pov_totals['not first person'] += 1

# exclamation vs no exclamation
exclamation_totals = {'exclamation': 0, 'no exclamation': 0}

for exclamation in df_train['exclamation']:
    # check for exclamation
    if exclamation == 'has_exclamation':
        exclamation_totals['exclamation'] += 1
    # check for no exclamation
    if exclamation == 'no_exclamation':
        exclamation_totals['no exclamation'] += 1

# print review total
print('total reviews:', total_reviews)
print()

# print sentiment totals
print('total sentiments:', sentiments)
print()

# print review lengths
print('total lengths:', total_lengths)
print('shortest review by tokens:', shortest)
print('longest review by tokens:', longest)
print()

# print pov totals
print("total pov's:", pov_totals)
print()

# print exclamation totals
print('total exclamations:', exclamation_totals)

total reviews: 235426

total sentiments: {'positive': 146760, 'negative': 48372, 'neutral': 40294}

total lengths: {'short': 22054, 'medium': 116972, 'long': 96400}
shortest review by tokens: 15
longest review by tokens: 245

total pov's: {'first person': 118552, 'not first person': 116874}

total exclamations: {'exclamation': 19244, 'no exclamation': 216181}
