# Predicting Yelp Star Ratings
In this little exercise, I have a look at the distribution of Yelp ratings (1 to 5 stars) and their correlations with business and user attributes. Finally, I attempt to use some ML algorithms to predict a rating from business / user attributes and basic properties of the review text.

### Import statements

In [1]:
%matplotlib inline

import os, sys
import numpy as np
import pandas as pd
import random

import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
#matplotlib.style.use('fivethirtyeight')

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def format_column_names(dataFrame):
    """Normalise the column names of `dataFrame` in place.

    Every run of characters that is neither a word character nor a dot is
    collapsed into a single underscore, and the result is lower-cased
    (e.g. 'Attributes.Good For Kids' -> 'attributes.good_for_kids').

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame whose `columns` attribute is overwritten.

    Returns
    -------
    None
    """
    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # match, which would silently leave the names unchanged. The raw string
    # avoids the invalid '\w' escape warning.
    cleaned = dataFrame.columns.str.replace(r'[^\w.]+', '_', regex=True)
    dataFrame.columns = cleaned.str.lower()

# Allow long sequences (e.g. column indexes) to be displayed without truncation
pd.options.display.max_seq_items = 500

### User Settings

In [3]:
# Fraction (random sample) of the review dataset that is to be parsed
# (values larger than 0.1 will cause memory overflows when performing in-memory merges)
REVIEW_FRAC = 0.1

# The minimum number of user votes for a review to be considered relevant
MIN_VOTES = 5

### Verify the source directory

In [None]:
# Locate the directory holding the input csv files (business.csv, user.csv, review.csv).
# The default is ./yelp-workspace; if it is incomplete the user is asked once for another path.
required_files = ['business.csv', 'user.csv', 'review.csv']


def _count_required_files(directory):
    """Return how many of `required_files` exist as regular files in `directory`."""
    return sum(os.path.isfile(os.path.join(directory, f)) for f in required_files)


source_dir = os.path.join(os.getcwd(), 'yelp-workspace')
nfiles_found = _count_required_files(source_dir)

if nfiles_found < len(required_files):
    source_dir = os.path.expanduser(input('Specify CSV source directory: ').strip())
    # Re-check the directory that was just entered; without this the stale
    # count from the default location would always trigger the exit below.
    nfiles_found = _count_required_files(source_dir)

if nfiles_found < len(required_files):
    print('Source files not found.')
    sys.exit(1)

print('Source directory: {0}'.format(source_dir))

Source directory: /home/marco/workspace-python/YelpExercise/yelp-workspace


## Data Wrangling

### Load and format business data

In [None]:
# Read the business table (indexed by business_id) and normalise its column names
business_csv = os.path.join(source_dir, 'business.csv')
businesses = pd.read_csv(business_csv, index_col='business_id', parse_dates=True, low_memory=False)
format_column_names(businesses)

# Every column whose normalised name begins with 'attributes.'
attr_cols = [name for name in businesses.columns if name.startswith('attributes.')]

# Encode the attribute values numerically:
#   listed "positive" values ->  1
#   listed "negative" values -> -1
#   anything still non-numeric afterwards, or missing -> 0
positive_values = [True, 'yes', 'full_bar', 'free', 'yes_free', 'quiet', 'yes_corkage', 'beer_and_wine']
negative_values = [False, 'no', 'none', 'very_loud']

businesses[attr_cols] = businesses[attr_cols].replace(to_replace=positive_values, value=1)
businesses[attr_cols] = businesses[attr_cols].replace(to_replace=negative_values, value=-1)

attrs_numeric = businesses[attr_cols].apply(pd.to_numeric, errors='coerce')
businesses[attr_cols] = attrs_numeric.fillna(0)

#businesses.columns
#businesses.info()

In [None]:
# Plot the business mean ratings
#star_counts = businesses.stars.value_counts(sort=False, normalize=True).sort_index()
#star_counts.plot(kind="bar", title="Business Mean Ratings", rot='0').set_xlabel('Rating')

### Load and format user data

In [None]:
# Read the user table (indexed by user_id) and normalise its column names
user_csv = os.path.join(source_dir, 'user.csv')
users = pd.read_csv(user_csv, index_col='user_id', parse_dates=True)
format_column_names(users)

# Collapse the per-type 'compliments.*' columns into one total per user
compl_cols = [name for name in users.columns if name.startswith('compliments.')]
users['compliments'] = users[compl_cols].sum(axis=1)

# Collapse the per-type 'votes.*' columns into one total per user
vote_cols = [name for name in users.columns if name.startswith('votes.')]
users['votes'] = users[vote_cols].sum(axis=1)

#users.columns
#users.info()

### Load and format reviews data

In [None]:
review_file = os.path.join(source_dir, 'review.csv')

# Dedicated, seeded generator so that the same random subset of reviews is
# drawn every time this cell is run.
sample_rng = random.Random(42)


def _skip_review_row(row_idx):
    """Decide whether `read_csv` should skip row `row_idx`.

    Row 0 (the header) is always kept; every other row is kept with
    probability REVIEW_FRAC. Sampling row by row needs neither the total
    number of lines in the file nor a multi-million-element skip list.
    """
    return row_idx > 0 and sample_rng.random() >= REVIEW_FRAC


# Only load a random fraction of the reviews dataset, specified by REVIEW_FRAC
reviews = pd.read_csv(
    review_file,
    parse_dates=True,
    index_col='review_id',
    skiprows=_skip_review_row
)
format_column_names(reviews)

# Basic text properties; the .str accessors propagate NaN for missing texts
# instead of raising like a plain len() would.
reviews['text_length'] = reviews['text'].str.len()
reviews['text_wc'] = reviews['text'].str.split().str.len()

# Total number of votes a review received, summed over all 'votes.*' columns
vote_cols = [col for col in list(reviews) if col.startswith('votes.')]
reviews['votes'] = reviews[vote_cols].sum(axis=1)

# Year in which the review was written
times = pd.DatetimeIndex(reviews.date)
reviews['year'] = times.year

reviews.info()

### Merge reviews, users and business tables (left joins)

In [None]:
# Step 1: attach the business columns to every review (left join on business_id).
rb = reviews.merge(
    businesses,
    how='left',
    left_on='business_id',
    right_index=True,
    suffixes=('@reviews', '@businesses'),
)

# Step 2: attach the user columns (left join on user_id).
# NOTE(review): the left frame here is reviews+businesses, so a business-only
# column that clashes with a user column would presumably also end up with the
# '@reviews' suffix -- confirm before relying on such names.
rbu = rb.merge(
    users,
    how='left',
    left_on='user_id',
    right_index=True,
    suffixes=('@reviews', '@users'),
)

# The individual tables are dropped once merged; re-running this cell
# therefore requires re-running the loading cells first.
del businesses, users, reviews

rbu.info()

# Exploratory Data Analysis

### Plot number of votes per review

In [None]:
# Share of reviews per vote count, for reviews with up to 20 votes.
# `.ix` was removed in pandas 1.0; sorting the index first makes the
# label-based `.loc[:20]` slice well defined and orders the bars by vote count.
votes_share = rbu['votes@reviews'].value_counts(normalize=True).sort_index()
votes_share.loc[:20].plot.bar(rot=90, title='Distribution of votes per review')

### Investigate the distribution of ratings

In [None]:
def _rating_shares(min_votes=None):
    """Relative frequency of each review rating, ordered by rating.

    With `min_votes` given, only reviews having at least that many votes
    are counted; otherwise all reviews are used.
    """
    stars = rbu['stars@reviews']
    if min_votes is not None:
        stars = stars.loc[rbu['votes@reviews'] >= min_votes]
    return stars.value_counts(normalize=True).sort_index()


star_counts = _rating_shares()
star_counts_min1 = _rating_shares(1)
star_counts_min5 = _rating_shares(5)

# One column per vote threshold, one row per rating
star_counts_comb = pd.concat(
    {
        'all': star_counts,
        'minimum of 1 vote': star_counts_min1,
        'minimum of 5 votes': star_counts_min5,
    },
    axis=1,
)

ratings_ax = star_counts_comb.plot.bar(title="Distribution of ratings", stacked=False, rot=0)
ratings_ax.set_xlabel('Rating')

In [None]:
# Rating shares per year, restricted to reviews with at least MIN_VOTES votes.
# After unstack/transpose: rows are ratings, columns are years.
voted_reviews = rbu.loc[rbu['votes@reviews'] >= MIN_VOTES]
star_counts_per_year = (
    voted_reviews
    .groupby(['year'])['stars@reviews']
    .value_counts(normalize=True)
    .unstack()
    .transpose()
)

# Plot only the five most recent years
latest_years = star_counts_per_year.columns[-5:]
per_year_title = "Distribution of ratings per year (min. of {0} votes)".format(MIN_VOTES)
per_year_ax = star_counts_per_year[latest_years].plot.bar(title=per_year_title, stacked=False, rot=0)
per_year_ax.set_xlabel('Rating')

### Investigate correlations with the ratings column

In [None]:
# Business attributes (given without their common 'attributes.' prefix)
# whose correlation with the review rating is inspected.
attribute_names = [
    'accepts_credit_cards', 'alcohol', 'by_appointment_only', 'caters',
    'coat_check', 'corkage', 'delivery', 'dogs_allowed', 'drive_thru',
    'good_for_dancing', 'good_for_groups',
    'good_for.breakfast', 'good_for.brunch', 'good_for.dessert',
    'good_for.dinner', 'good_for.latenight', 'good_for.lunch',
    'good_for_kids', 'happy_hour', 'has_tv', 'noise_level',
    'open_24_hours', 'order_at_counter', 'outdoor_seating', 'price_range',
    'smoking', 'take_out', 'takes_reservations', 'waiter_service',
    'wheelchair_accessible', 'wi_fi',
]
cols = ['attributes.' + name for name in attribute_names]
# Two user-level columns are included in the same chart
cols += ['review_count@users', 'compliments']

# Correlate each selected column with the review rating, ascending
review_stars = rbu['stars@reviews']
correls = rbu[cols].corrwith(review_stars, drop=True).sort_values()
correls.plot.bar(figsize=(10,5), title='Correlation of business attributes (#1) with avg. rating')

In [None]:
# Select the ambience / music / parking attribute columns by name fragment
name_fragments = ('.ambience', '.music', '.parking')
cols = [name for name in rbu.columns if any(fragment in name for fragment in name_fragments)]

# Correlate them with the business-level rating, ascending
business_stars = rbu['stars@businesses']
correls = rbu[cols].corrwith(business_stars, drop=True).sort_values()
correls.plot.bar(figsize=(10,5), title='Correlation of business attributes (#2) with avg. rating')

In [None]:
# User-level columns plus the review's vote total, correlated with the review rating
cols = [
    'compliments',
    'votes@reviews',
    'review_count@users',
    'fans',
]
user_frame = rbu[cols]
correls = user_frame.corrwith(rbu['stars@reviews'], drop=True)
correls.plot.bar(title='Correlation of user attributes & review votes with ratings')

In [None]:
# rbu.loc[rbu['votes'] >= 5].groupby(['stars_review'])['text_length'].mean().plot.bar(title="Mean review length (characters) vs. rating (min. of 5 votes)", stacked=False, rot=0).set_xlabel('Rating')

### Investigate the relation between review word count and rating

In [None]:
# Mean review word count per rating, restricted to reviews with at least
# MIN_VOTES votes. The title is built from MIN_VOTES so that it cannot drift
# away from the filter actually applied.
mean_wc_per_rating = rbu.loc[rbu['votes@reviews'] >= MIN_VOTES].groupby(['stars@reviews'])['text_wc'].mean()
mean_wc_per_rating.plot.bar(title="Mean review word counts vs. rating (min. of {0} votes)".format(MIN_VOTES), stacked=False, rot=0).set_xlabel('Rating')

In [None]:
# Display the column index of the merged reviews / businesses / users table
rbu.columns

# Predictive Data Analysis

## Split the available data into training and test set

## Vectorize the review texts

In [None]:
# Hold out 20% of the merged rows for testing. A fixed random_state makes the
# split (and everything fitted on it) reproducible across kernel restarts.
train, test = train_test_split(rbu, test_size=0.2, random_state=42)

In [None]:
# Learn the vocabulary from the training review texts only
training_texts = train['text']
vect = CountVectorizer()
vect.fit(training_texts)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()

# Show only a sample: the full vocabulary is far too long to display
feature_names[:50]

In [None]:
# Document-term matrix of the training reviews. The text column must be passed
# explicitly: iterating over a DataFrame yields its column names, so
# transform(train) would vectorize the column labels, not the reviews.
train_dtm = vect.transform(train['text'])
train_dtm

In [None]:
# Document-term matrix of the test reviews, using the vocabulary learned on the
# training set. The text column must be passed explicitly: iterating over a
# DataFrame yields its column names rather than the review texts.
test_dtm = vect.transform(test['text'])
test_dtm