In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

In [None]:
# Load the business table from the input directory into a DataFrame.
business=pd.read_csv("../input/yelp_business.csv")

In [None]:
# Preview the first rows to see which columns are available.
business.head()

### EDA

Let us check the rating distribution

In [None]:
# Count rows per star rating: x holds the distinct ratings and y the matching
# counts, both in value_counts() order (most frequent rating first).
x = business['stars'].value_counts().index
y = business['stars'].value_counts().values

In [None]:
# Bar chart of how many businesses hold each star rating.
# x and y are passed by keyword because recent seaborn releases no longer
# accept them positionally; `data=` is dropped since x and y are already
# plain vectors rather than column names.
plt.figure(figsize=(9,6))
ax = sns.barplot(x=x, y=y, alpha=0.8)
plt.title("Ratings Distribution")
plt.xlabel('Ratings ', fontsize=12)
plt.ylabel('Number of businesses', fontsize=12)

Let us check the review count based on the rating

In [None]:
# barplot() aggregates review_count within each star rating using its default
# estimator (the mean), so the chart shows the average number of reviews per
# rating, not the rating distribution; title and y label say so explicitly.
plt.figure(figsize=(9,6))
ax = sns.barplot(x='stars', y='review_count', data=business, alpha=0.8)
plt.title("Mean Review Count by Rating")
plt.xlabel('Ratings ', fontsize=12)
plt.ylabel('Mean review count', fontsize=12)

In [None]:
# Inspect the raw format of the categories column.
business['categories'].head()

In [None]:
# Flatten every business's category list into one ';'-delimited string.
# The separator must be ';' (the same delimiter the string is split on
# afterwards): joining with a space glued the last category of one business
# to the first category of the next. Missing values are dropped because
# str.join() raises TypeError on NaN.
business_cat = ';'.join(business['categories'].dropna().astype(str))

In [None]:
# Split the combined string on ';' and store the pieces as a one-column
# DataFrame (one row per piece) in the 'category' column.
categry=pd.DataFrame(business_cat.split(';'),columns=['category'])

In [None]:
# Frequency of each distinct value in the 'category' column.
# Note: this rebinds x, which earlier held the star-rating index.
x = categry.category.value_counts()

In [None]:
# Order the counts from largest to smallest and keep the first 20 entries.
x = x.sort_values(ascending=False).iloc[:20]

Types of Business

In [None]:
# Bar chart of the category counts currently held in x.
# x/y are passed by keyword (recent seaborn rejects positional vectors) and
# the tick labels are rotated directly instead of via an unused `locs` value.
plt.figure(figsize=(16,4))
ax = sns.barplot(x=x.index, y=x.values, alpha=0.8)
plt.title("Types of Business")
plt.ylabel('Count', fontsize=12)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Number of rows per city, largest first, limited to the first 25 cities.
x = business['city'].value_counts().sort_values(ascending=False).iloc[:25]

Cities with most business 

In [None]:
# Bar chart of the per-city counts currently held in x.
# x/y are passed by keyword (recent seaborn rejects positional vectors) and
# the tick labels are rotated directly instead of via an unused `locs` value.
plt.figure(figsize=(16,4))
ax = sns.barplot(x=x.index, y=x.values, alpha=0.8)
plt.title("Cities with Most Businesses")
plt.ylabel('Number of businesses', fontsize=12)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Count how many rows share each business name, largest first.
# Note: this tallies rows per name; it does not use the review_count column.
x = business['name'].value_counts().sort_values(ascending = False)

# Keep the 25 most frequent names.
x=x.iloc[0:25]

Most common business names (this counts how many listings share each name, not how many reviews a business has received)

In [None]:
# Bar chart of the per-name row counts currently held in x.
# x/y are passed by keyword (recent seaborn rejects positional vectors) and
# the tick labels are rotated directly instead of via an unused `locs` value.
plt.figure(figsize=(16,4))
ax = sns.barplot(x=x.index, y=x.values, alpha=0.8)
plt.title("Most Common Business Names")
plt.ylabel('Number of listings', fontsize=12)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Only the first 100,000 reviews are analysed, so stop parsing after that many
# rows instead of loading the entire review file into memory and slicing it.
busi_attr = pd.read_csv('../input/yelp_review.csv', nrows=100000)

In [None]:
# Restrict the analysis to the first 100,000 review rows.
busi_attr = busi_attr[:100000]

As the data is huge it is not possible for my system to perform analysis on the entire dataset, but if it is possible with your system try using more reviews. <br> I have selected data from 100,000 reviews.

### EDA on the reviews

In [None]:
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
import nltk
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize

In [None]:
# Fetch the NLTK resources used further down: the 'punkt' tokenizer models for
# word_tokenize() and the 'stopwords' corpus for stopwords.words('english').
# Without the corpus, stopwords.words() raises LookupError on a fresh machine.
nltk.download('punkt')
nltk.download('stopwords')

Let us create a bag of words consisting of all the reviews!

In [None]:
# Lower-case every review and concatenate them into one space-separated string.
a = busi_attr['text'].str.lower().str.cat(sep=' ')

In [None]:
import re

In [None]:
# Replace every run of non-letter characters (digits, punctuation, whitespace)
# with a single space, leaving only ASCII letters separated by spaces.
b = re.sub('[^A-Za-z]+', ' ', a)

In [None]:
# Peek at the first 1,000 characters of the cleaned text.
b[:1000]

In [None]:
# Combined English stop-word list: the entries from the `stop_words` package
# followed by NLTK's list (duplicates between the two are kept).
nltk_words = list(stopwords.words('english'))
stop_words = [*get_stop_words('en'), *nltk_words]

In [None]:
# Split the cleaned text into individual word tokens.
word_tokens = word_tokenize(b)

In [None]:
# Total number of tokens before stop-word removal.
len(word_tokens)

In [None]:
# Drop stop words. Membership is tested against a set: checking every token
# against the stop-word *list* costs a linear scan per token, which is
# needlessly slow over millions of tokens. The resulting list is identical.
stop_word_set = set(stop_words)
filtered_sentence = [w for w in word_tokens if w not in stop_word_set]

In [None]:
# Number of tokens left after stop-word removal.
len(filtered_sentence)

After removing the stop words, we can see reduction of size by 50 percent

In [None]:
# Drop very short tokens: only words longer than two characters are kept,
# so one- and two-letter words are both removed.
without_single_chr = [word for word in filtered_sentence if len(word) > 2]

# Drop purely numeric tokens.
# NOTE(review): the text was already reduced to letters and spaces by the
# earlier re.sub, so this filter probably removes nothing — confirm.
cleaned_data_title = [word for word in without_single_chr if not word.isnumeric()]   

Let us find the most frequently used words in the reviews!

In [None]:
# Tabulate the top_N most frequent cleaned words with their counts.
top_N = 100
word_dist = nltk.FreqDist(cleaned_data_title)
rslt = pd.DataFrame(word_dist.most_common(top_N),
                    columns=['Word', 'Frequency'])

# Plot only the first 7 rows of the table (the 7 most frequent words),
# even though top_N rows were collected.
plt.figure(figsize=(10,10))
sns.set_style("whitegrid")
ax = sns.barplot(x="Word",y="Frequency", data=rslt.head(7))

Let us create a wordcloud 

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
def wc(data, bgcolor, title, figsize=(100, 100)):
    """Draw a word cloud for a collection of text snippets.

    Parameters
    ----------
    data : iterable of str
        Words or whole documents; they are joined with single spaces before
        being passed to WordCloud.
    bgcolor : str
        Background colour of the cloud.
    title : str
        Heading drawn above the cloud.
    figsize : tuple of (width, height), optional
        Figure size in inches. The default keeps the original 100x100 canvas;
        pass something smaller (e.g. ``(20, 10)``) to save memory.
    """
    plt.figure(figsize=figsize)
    # Named `cloud` so the local no longer shadows the function's own name.
    cloud = WordCloud(background_color=bgcolor, max_words=1000, max_font_size=50)
    cloud.generate(' '.join(data))
    plt.imshow(cloud)
    # `title` used to be accepted but never drawn. The font size scales with
    # the figure width so the heading stays legible on a very large canvas.
    plt.title(title, fontsize=2 * figsize[0])
    plt.axis('off')

In [None]:
# Word cloud over the cleaned tokens from all sampled reviews.
wc(cleaned_data_title,'black','Most Used Words')

Let us try to perform analysis on the entire review rather than all the words. For this we make use of the TextBlob

In [None]:
from textblob import TextBlob

# Accumulator for the per-review TextBlob results.
bloblist_desc = []

# Review texts coerced to str before they are handed to TextBlob.
df_review_str = busi_attr['text'].astype(str)

In [None]:
# Score every review with TextBlob, then build the result frame ONCE.
# - The list is reset here so re-running this cell does not append duplicates.
# - The DataFrame used to be rebuilt inside the loop on every iteration,
#   which made the cell quadratic in the number of reviews.
# - 'sentiment' holds TextBlob's polarity score (the name is kept because
#   later cells read it). The third column holds the subjectivity score and
#   is now labelled as such instead of the misleading 'polarity'.
bloblist_desc = []
for row in df_review_str:
    sentiment = TextBlob(row).sentiment
    bloblist_desc.append((row, sentiment.polarity, sentiment.subjectivity))

df_polarity_desc = pd.DataFrame(bloblist_desc, columns = ['Review','sentiment','subjectivity'])

In [None]:
# Preview the scored reviews.
df_polarity_desc.head()

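Based on my analysis, I have chosen the thresholds below (a score above 0 is positive, exactly 0 is neutral, and below 0 is negative); feel free to pick different cut-offs based on your own insights.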

In [None]:
def f(df_polarity_desc):
    """Label a single row by the sign of its 'sentiment' score.

    Returns "Positive Review" when the score is above zero, "Neutral Review"
    when it is exactly zero, and "Negative Review" in every other case.
    """
    score = df_polarity_desc['sentiment']
    if score > 0:
        return "Positive Review"
    if score == 0:
        return "Neutral Review"
    return "Negative Review"

In [None]:
# Apply f row by row (axis=1) to attach a Positive/Neutral/Negative label.
df_polarity_desc['Sentiment_Type'] = df_polarity_desc.apply(f, axis=1)

# Count of reviews in each sentiment class.
plt.figure(figsize=(10,10))
sns.set_style("whitegrid")
ax = sns.countplot(x="Sentiment_Type", data=df_polarity_desc)

In [None]:
# Separate the labelled reviews by class; neutral reviews are left out of both.
positive_reviews = df_polarity_desc.loc[df_polarity_desc['Sentiment_Type'].eq('Positive Review')]
negative_reviews = df_polarity_desc.loc[df_polarity_desc['Sentiment_Type'].eq('Negative Review')]

In [None]:
# Sanity-check a few of the reviews labelled negative.
negative_reviews.head()

Let us look at the wordcloud of the most used words in a positive review

In [None]:
# Word cloud over positive reviews only; the title names the subset so it is
# not confused with the negative-review cloud.
wc(positive_reviews['Review'],'black','Most Used Words in Positive Reviews')

Let us look at the wordcloud of the most used words in a negative review

In [None]:
# Word cloud over negative reviews only; the title names the subset so it is
# not confused with the positive-review cloud.
wc(negative_reviews['Review'],'black','Most Used Words in Negative Reviews')

### Using machine learning to predict whether a review has 1 star rating or 5 star rating

Let us now train a model. To keep the analysis simple, we use only the reviews rated 1 star or 5 stars.

In [None]:
# Discard every review row that has a missing field, then keep only the
# 1-star and 5-star reviews as the two classes to predict.
busi_attr = busi_attr.dropna(axis=0, how='any')
rating_class = busi_attr[busi_attr['stars'].isin([1, 5])]

# Features are the raw review texts; the target is the star rating.
X_review = rating_class['text']
y = rating_class['stars']

In [None]:
import string

# Built once up front. The previous version evaluated
# stopwords.words('english') for every single word of every review, which
# rebuilds the word list on each call and then scans it linearly.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_PUNCTUATION = frozenset(string.punctuation)


def text_process(review):
    """Tokenise one review for the bag-of-words model.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    list of str
        The whitespace-separated words of the review, with ASCII punctuation
        characters removed and English stop words (compared case-insensitively)
        dropped. The original casing of the surviving words is preserved.
    """
    nopunc = ''.join(ch for ch in review if ch not in _PUNCTUATION)
    return [word for word in nopunc.split() if word.lower() not in _ENGLISH_STOPWORDS]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the bag-of-words vocabulary from the review texts, using text_process
# as the analyzer that turns each review into tokens.
# NOTE(review): the vocabulary is fitted on all selected reviews, before the
# train/test split further down — confirm that is acceptable here.
bow_transformer=CountVectorizer(analyzer=text_process).fit(X_review)

In [None]:
# Encode every review as token counts over the learned vocabulary.
# Note: X_review is rebound from the raw texts to the transformed result, so
# re-running the fit cell after this one would no longer see the raw texts.
X_review = bow_transformer.transform(X_review)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the reviews for testing; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X_review, y, test_size=0.3, random_state=101)

In [None]:
# Display the training feature matrix's summary representation.
X_train

Using SVM

In [None]:
from sklearn.svm import SVC
# Support vector classifier with all hyperparameters left at their defaults.
sv_model = SVC()
sv_model.fit(X_train, y_train)
# Predicted star ratings for the held-out reviews.
Y_pred = sv_model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# Evaluate whichever predictions are currently in Y_pred against the test
# labels: confusion matrix, overall accuracy, then the per-class report.
print(confusion_matrix(y_test, Y_pred))
print('\n Accuracy:')
print(accuracy_score(y_test, Y_pred))
print(classification_report(y_test, Y_pred))

Using Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with all hyperparameters left at their defaults.
lg_model = LogisticRegression()
lg_model.fit(X_train, y_train)
# Note: Y_pred is overwritten here, replacing the SVM predictions.
Y_pred = lg_model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# Evaluate whichever predictions are currently in Y_pred against the test
# labels: confusion matrix, overall accuracy, then the per-class report.
print(confusion_matrix(y_test, Y_pred))
print('\n Accuracy:')
print(accuracy_score(y_test, Y_pred))
print(classification_report(y_test, Y_pred))

Using Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings, trained on the count features.
nb = MultinomialNB()
nb.fit(X_train, y_train)
# Predictions are stored in `predict` (not Y_pred, unlike the two models above).
predict=nb.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# Evaluate the Naive Bayes predictions against the test labels:
# confusion matrix, overall accuracy, then the per-class report.
print(confusion_matrix(y_test, predict))
print('\n Accuracy:')
print(accuracy_score(y_test, predict))
print(classification_report(y_test, predict))

We can see we have the best accuracy with Logistic regression. 


In an SVM, given labeled training data (supervised learning), the algorithm outputs an optimal hyperplane that separates the classes and is then used to categorize new examples.

Naive Bayes classifiers are a family of simple "probabilistic classifiers" based on applying Bayes' theorem with strong (naive) independence assumptions between the features.

Naïve Bayes is probabilistic in nature, while the SVM one is geometric.

The features point of view is that Naive Bayes treats them as independent, whereas SVM looks at the interactions.




Naive Bayes classifier (nBc) makes two bold assumptions:
1) The probability of occurrence of any word, given the class label, is independent of the probability of occurrence of any other word, given that label.
2) The probability of occurrence of a word in a document is independent of the location of that word within the document(!).

Logistic regression measures the relationship between a categorical output variable Y and one or more independent variables, which are usually (but not necessarily) continuous, by using probability scores as the predicted values of the dependent variable.

In short Naive Bayes has a higher bias but lower variance compared to logistic regression. If the data set follows the bias then Naive Bayes will be a better classifier. 

