# Yelp Analysis
### 11/10/2022
In this notebook I'll join the tables in the Yelp academic dataset. \
The dataset has 5 tables, each shipped as a JSON file; here we import the business, review and user tables.\
[Documentation for Dataset](https://www.yelp.com/dataset/documentation/main)


## Extract

In [3]:
# importing dependencies
import pandas as pd
import numpy as np
import json
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [4]:
# importing business
business_df = pd.read_csv("Data/yelp_business_restaurants.csv")

# remove columns
# - 'stars': the average restaurant rating; we only care for the stars in
#   review_df (map review text to rating)
# - 'Unnamed: 0': presumably a leftover index column from an earlier CSV export
business_df = business_df.drop(columns=['stars', 'Unnamed: 0'])

# get only businesses who are restaurants
business_df = business_df[business_df['is_restaurant'] == True]

# rename 'name' -> 'business_name' and 'review_count' -> 'business_review_count'.
# An exact-label rename is used on purpose: a substring replace on the column
# labels would also rewrite any other column whose label merely contains
# 'name' or 'review_count'.
business_df = business_df.rename(columns={
    'name': 'business_name',
    'review_count': 'business_review_count',
})

# snapshot
business_df.head()

FileNotFoundError: ignored

In [None]:
# importing review
# The file is parsed one line at a time (one JSON object per line).
# `with` closes the handle even if a line fails to parse, and the parsed
# records are not kept alive in a separate module-level list afterwards.
with open("Data/yelp_academic_dataset_review.json", 'r', encoding='utf8') as data_file:
    review_df = pd.DataFrame([json.loads(line) for line in data_file])

# snapshot
review_df.head()

In [None]:
# importing user
# The file is parsed one line at a time (one JSON object per line).
# `with` closes the handle even if a line fails to parse, and the parsed
# records are not kept alive in a separate module-level list afterwards.
with open("Data/yelp_academic_dataset_user.json", 'r', encoding='utf8') as data_file:
    user_df = pd.DataFrame([json.loads(line) for line in data_file])

# snapshot
user_df.head()


## Transform

1. Filter for businesses that are food or restaurant related (use `yelp_categories` dataset)
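2. Filter for users who have more than 1 and fewer than 92 reviews.
3. Filter for businesses who have between 11 and 290 reviews.
4. Filter for businesses in CA.
5. Merge reviews with the filtered users and businesses, so only reviews of the remaining CA businesses are kept.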


In [None]:
# histogram of the number of reviews per user, in bins of width 5
sns.histplot(data = user_df['review_count'], binwidth = 5)

In [None]:
# 5% and 95% quantiles of the per-user review count
user_df['review_count'].quantile(q = [0.05, 0.95])

In [None]:
# 1. Keep users who have more than 1 review AND fewer than 92 reviews
#    (thresholds presumably taken from the 5% / 95% quantiles computed above).
#    Both conditions must hold, hence `&`: with `|` every row satisfies at
#    least one of the two comparisons and nothing would be filtered out.
user_df = user_df[(user_df['review_count'] > 1) &
                  (user_df['review_count'] < 92)]
print(len(user_df))

In [None]:
# 2. Outliers in review count for businesses:
#    5% and 95% quantiles of the per-business review count
business_df['business_review_count'].quantile(q = [0.05, 0.95])

In [None]:
# 2. Keep businesses with at least 11 and at most 290 reviews.
#    `between` is inclusive on both ends and requires BOTH bounds to hold;
#    combining the two comparisons with `|` would keep every row.
business_df = business_df[business_df['business_review_count'].between(11, 290)]

print(len(business_df))

In [None]:
# figure size used by the plot below: width=10, height=6
sns.set(rc={"figure.figsize":(10, 6)})

# number of rows (businesses) per state, smallest first
print(business_df.groupby(['state'])['business_review_count'].size().sort_values())

# count plot of businesses in different states
sns.histplot(data = business_df['state'])

In [None]:
# 3. Keep only the businesses whose state is CA
business_df = business_df.loc[business_df['state'].isin(['CA'])]

# check: the remaining distinct state values
print(business_df['state'].unique())

# snapshot
business_df.head()

In [None]:
# 4. Inner-join reviews to users on user_id
#    (reviews whose user was filtered out above are dropped)
master_df = pd.merge(review_df, user_df, how='inner', on="user_id")
master_df.head()

In [None]:
# 4. Inner-join the review/user table to businesses on business_id
#    (reviews of businesses that were filtered out above are dropped)
master_df = pd.merge(master_df, business_df, how='inner', on="business_id")
master_df

In [None]:
# summary stats
# Each per-user / per-business aggregate is computed once and reused for the
# mean, median, variance and IQR lines, instead of being rebuilt per print.
def _iqr(values):
    """Interquartile range (75th minus 25th percentile) of `values`."""
    q3, q1 = np.percentile(values, [75, 25])
    return q3 - q1

reviews_per_user = master_df['user_id'].value_counts()
mean_stars_per_user = master_df.groupby("user_id")['stars'].mean()
reviews_per_business = master_df['business_id'].value_counts()
mean_stars_per_business = master_df.groupby("business_id")['stars'].mean()

print("After data processing ...")
print(f"# of Unique Reviews: {len(master_df['review_id'].unique()):,}")
print(f"# of Unique Users: {len(master_df['user_id'].unique()):,}")
print(f"# of Unique Businesses: {len(master_df['business_id'].unique()):,}")
print("===" * 10)
print()

print("Regarding users ...")
print(f"Mean review count per user: {round(reviews_per_user.mean(), 4)}")
print(f"Median review count per user: {round(reviews_per_user.median(), 4)}")
print(f"Variance of review counts per user: {round(reviews_per_user.var(), 4)}")
print(f"IQR of review counts per user: {round(_iqr(reviews_per_user), 4)}")
print("---")
print()

print("Regarding stars per user ...")
print(f"Mean stars per user: {round(mean_stars_per_user.mean(), 4)}")
print(f"Median stars per user: {round(mean_stars_per_user.median(), 4)}")
print(f"Variance of stars per user: {round(mean_stars_per_user.var(), 4)}")
print(f"IQR of stars per user: {round(_iqr(mean_stars_per_user), 4)}")
print("===" * 10)
print()

print("Regarding businesses ...")
print(f"Mean review count per business: {round(reviews_per_business.mean(), 4)}")
print(f"Median review count per business: {round(reviews_per_business.median(), 4)}")
print(f"Variance of review counts per business: {round(reviews_per_business.var(), 4)}")
print(f"IQR of review counts per business: {round(_iqr(reviews_per_business), 4)}")
print("---")
print()

print("Regarding stars per business ...")
print(f"Mean stars per business: {round(mean_stars_per_business.mean(), 4)}")
print(f"Median stars per business: {round(mean_stars_per_business.median(), 4)}")
print(f"Variance of stars per business: {round(mean_stars_per_business.var(), 4)}")
print(f"IQR of mean stars per business: {round(_iqr(mean_stars_per_business), 4)}")

In [None]:
# output final dataset
# index=False: the row index carries no information here, and writing it
# would come back as a stray 'Unnamed: 0' column when the file is re-read
master_df.to_csv("master_df.csv", index=False)

In [None]:
# reload the merged dataset from disk
master_df = pd.read_csv(filepath_or_buffer = "master_df.csv")

# snapshot
master_df

## California Restaurants Analysis

In [None]:
# list the column labels of the master table
master_df.columns

In [None]:
# number of master_df rows per city
# NOTE(review): master_df is built from review rows, so this presumably counts
# reviews per city rather than distinct restaurants — confirm the intent
master_df['city'].value_counts()

In [None]:
# mean of the 'stars' column per city, lowest first
# (the business-level 'stars' column was dropped at load time, so these are
# the stars given in the individual reviews)
master_df.groupby('city')['stars'].mean().sort_values()

In [None]:
# mean business_review_count per city, lowest first
# NOTE(review): averaged over master_df rows, so a business presumably counts
# once per review it received — confirm this weighting is intended
master_df.groupby('city')['business_review_count'].mean().sort_values()

## Cleaning text data

Per line of text:
1. Remove punctuation
2. Tokenize
3. Remove stop words
4. Stem words
5. Lemmatize

In [None]:
# feature selection: keep only the target ('stars') and the review text
master_df = master_df.loc[:, ['stars', 'text']]

In [None]:
# import dependencies to clean text
import string
import re
import nltk
# Module-level helpers shared by the text-cleaning functions below.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus (and 'wordnet'
# for the lemmatizer) to be available locally — confirm nltk.download was run
stopword = nltk.corpus.stopwords.words('english') # english stopwords
ps = nltk.PorterStemmer() # stem english words
wn = nltk.WordNetLemmatizer() # lemmatize english words

# function to remove punctuation
def remove_punct(text):
    """
    Strip every character listed in string.punctuation from `text`.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

def tokenize(text):
    """
    Tokenizing text.

    Splits `text` on runs of non-word characters and returns the list of
    non-empty tokens. Empty strings, which re.split produces when the text
    starts or ends with a separator, are dropped.
    """
    # raw string: '\W' inside a normal string literal is an invalid escape sequence
    return [token for token in re.split(r'\W+', text) if token]

def _to_tokens(tokenized_text):
    """
    Normalise the input to a list of non-empty tokens.

    Accepts either an iterable of tokens or a whitespace-separated string
    (the form returned by the cleaning functions below), so the functions
    can be chained in any order.
    """
    if isinstance(tokenized_text, str):
        return tokenized_text.split()
    return [word for word in tokenized_text if word]

def remove_stop_words(tokenized_text, stop_words=None):
    """
    Removing stop words.

    tokenized_text: iterable of tokens, or a whitespace-separated string.
    stop_words: optional container of words to drop; defaults to the
                module-level `stopword` list.
    Returns the remaining tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopword
    text = " ".join([word for word in _to_tokens(tokenized_text) if word not in stop_words])
    return text

def text_stemmer(tokenized_text):
    """
    Reduce words to its stemmed form.

    tokenized_text: iterable of tokens, or a whitespace-separated string.
    Returns the stemmed tokens joined by single spaces.
    """
    # A string is split back into words first: iterating over it directly
    # would stem single characters, which leaves the text unchanged.
    text = " ".join([ps.stem(word) for word in _to_tokens(tokenized_text)])
    return text

def text_lemmatize(tokenized_text):
    """
    Reduce words to their root form.

    tokenized_text: iterable of tokens, or a whitespace-separated string.
    Returns the lemmatized tokens joined by single spaces.
    """
    text = " ".join([wn.lemmatize(word) for word in _to_tokens(tokenized_text)])
    return text

def clean_text_col(text_col):
    """
    Apply text cleaning functions to Pandas Series

    Per entry: lower-case, remove punctuation, tokenize, remove stop words,
    stem, lemmatize. Returns a Series of cleaned, space-separated strings.
    """
    # build the lookup set once instead of scanning the stop-word list per token
    stop_words = frozenset(stopword)

    def _clean(text):
        tokens = tokenize(remove_punct(text.lower()))
        without_stop_words = remove_stop_words(tokens, stop_words)
        return text_lemmatize(text_stemmer(without_stop_words))

    return text_col.apply(_clean)

In [None]:
# run the cleaning pipeline over every review text
X = clean_text_col(text_col = master_df['text'])

In [None]:
# put the target (stars) and the cleaned text side by side, column-wise
cleaned_text_df = pd.concat(objs = [master_df[['stars']], X], axis = 'columns')

In [None]:
# write target + cleaned text to disk (the row index is written as well)
pd.concat(objs = [master_df[['stars']], X], axis = 'columns').to_csv(path_or_buf = "cleaned_text.csv")

In [None]:
# import cleaned text (dropping the index column that was written to the CSV)
cleaned_text_df = pd.read_csv("cleaned_text.csv").drop(columns=['Unnamed: 0'])

# text
# A review whose cleaned text is empty is read back from CSV as NaN, which
# the vectorizer cannot process; restore those entries to empty strings.
# Rows are kept (not dropped) so X stays aligned with the stars column.
X = cleaned_text_df['text'].fillna('')

# snapshot
cleaned_text_df

## Vectorize Text Data

Now we encode text as integers to create feature vectors. We will take three approaches to vectorizing the data.

**Bag of Words**: This describes the presence of words within the text data. The algorithm gives a 1 if the word is present in the sentence, and a 0 if absent.

**N-Gram**: N-grams are a combination of adjacent words or letters of length `n`. We will use bigrams.

**TF-IDF**: Term frequency-inverse document frequency weights how often a word appears in a document (term frequency) by how rare that word is across all documents (inverse document frequency), so words that appear in most documents receive a low weight.

Note: `CountVectorizer +  TfidfTransformer  = TfidfVectorizer`

In [None]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer with default settings; fit it on the cleaned text and encode
# the same text in one call
tfidf_vect = TfidfVectorizer()
X_tfidf = tfidf_vect.fit_transform(raw_documents = X)

# shape of the resulting TF-IDF matrix
print(X_tfidf.shape)

## Truncated SVD to visualize text in 2 Dimensions against Target

PCA centers the data first, i.e. PCA(X) is SVD(X - mean(X)), so even if the input is a sparse matrix, the centered matrix is not. As of now, there is no workaround for this in SKLearn, so we'll use Truncated SVD as an alternative. The singular-value decomposition (SVD) factorizes a matrix into its components; keeping only the largest components gives a lower-dimensional representation of the data.

Truncated SVD is a popular method for dimensionality reduction, and it is particularly well suited to sparse data (data with many zero values). This is because the estimator does not center the data before computing the singular value decomposition, so the sparse matrix never has to be converted to a dense one.

Because we are working on a TF-IDF matrix, this is known as Latent Semantic Analysis.

In [None]:
from sklearn.decomposition import TruncatedSVD

# initialize model
# random_state is pinned (same seed as the train/test splits in this
# notebook) so the projection, and the plot built from it, is reproducible
# between runs
svd = TruncatedSVD(n_components=2, random_state=420)

# fit model and project the TF-IDF matrix onto the 2 components
svd_Components = svd.fit_transform(X_tfidf)

# cast as df
svd_df = pd.DataFrame(data = svd_Components,
                      columns = ['SVD1',
                                 'SVD2'])

# concatenate stars (aligned on the row index)
svd_df = pd.concat([svd_df, master_df[['stars']]], axis = 1)

# Total share of variance explained by the selected components
# (the per-component ratios summed).
print(f"Explained variance ratio: {str(round(svd.explained_variance_ratio_.sum(), 10))}")

# snapshot
svd_df

In [None]:
# plot 2D SVD: one point per review, coloured by its star rating
star_palette = sns.color_palette("hls", 5)

plt.figure(figsize = (10, 8))
sns.scatterplot(x = "SVD1",
                y = "SVD2",
                hue = "stars",
                data = svd_df,
                palette = star_palette,
                alpha = 1)

# axis labels and title
plt.xlabel('SVD1', fontsize = 15)
plt.ylabel('SVD2', fontsize = 15)
plt.title('2 Component Truncated SVD for CA Yelp Reviews', fontsize = 18)

## Truncated SVD to visualize text in 3 Dimensions against Target

In [None]:
from sklearn.decomposition import TruncatedSVD

# initialize model
# random_state is pinned (same seed as the train/test splits in this
# notebook) so the projection, and the plot built from it, is reproducible
# between runs
svd = TruncatedSVD(n_components=3, random_state=420)

# fit model and project the TF-IDF matrix onto the 3 components
svd_Components = svd.fit_transform(X_tfidf)

# cast as df
svd_df_3d = pd.DataFrame(data = svd_Components,
                         columns = ['SVD1',
                                    'SVD2',
                                    'SVD3'])

# concatenate stars (aligned on the row index)
svd_df_3d = pd.concat([svd_df_3d, master_df[['stars']]], axis = 1)

# Total share of variance explained by the selected components
# (the per-component ratios summed).
print(f"Explained variance ratio: {str(round(svd.explained_variance_ratio_.sum(), 10))}")

# snapshot
svd_df_3d

In [None]:
import plotly.express as px

# cast the star rating to string
# (presumably so it is treated as a category when colouring — TODO confirm)
svd_df_3d['stars'] = svd_df_3d['stars'].astype(str)

# 3D scatter of the three SVD components, coloured by stars
fig = px.scatter_3d(svd_df_3d,
                    x='SVD3',
                    y='SVD2',
                    z='SVD1',
                    color='stars',
                    opacity = 0.8)

# remove the margins around the figure
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})

fig.show()

## Naive Bayes Stars Classifier

Now we fit a Naive Bayes classifier on the TF-IDF matrix of vectorized words to predict stars.

In [None]:
# importing dependencies
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [None]:
# train test split: hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    master_df['stars'],
    test_size=0.20,
    random_state=420,
)

# train validation split: 25% of the remaining 80%, i.e. 20% of all rows
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=420,
)

In [None]:
# build the Multinomial Naive Bayes classifier, then train it on the training split
clf = MultinomialNB()
clf.fit(X_train, y_train)

# predictions for the rows the model was trained on (in-sample)
in_sample_pred = clf.predict(X_train)

# predictions for the held-out test rows (out-of-sample)
oos_pred = clf.predict(X_test)

In [None]:
# in sample performance metrics
# classification_report lists classes in sorted label order, so the display
# names must be sorted too; unique() returns values in order of first
# appearance, which would attach the wrong star label to a row. Passing
# `labels` explicitly keeps rows and names aligned.
star_labels = np.sort(master_df['stars'].unique())
print("In Sample Classification Report")
print(classification_report(y_train, in_sample_pred,
                            labels=star_labels,
                            target_names=star_labels.astype("str")))

In [None]:
# out of sample performance metrics (held-out test set)
# classification_report lists classes in sorted label order, so the display
# names must be sorted too; unique() returns values in order of first
# appearance, which would attach the wrong star label to a row. Passing
# `labels` explicitly keeps rows and names aligned.
star_labels = np.sort(master_df['stars'].unique())
print("Out of Sample Classification Report")
print(classification_report(y_test, oos_pred,
                            labels=star_labels,
                            target_names=star_labels.astype("str")))