# Important packages

In [1]:
import pandas as pd
import numpy as np
from time import time

import matplotlib.pyplot as plt; plt.rcdefaults()
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn import datasets, metrics, svm, linear_model
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression, LogisticRegressionCV, RidgeCV, LassoCV, PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.extmath import density
import statsmodels.api as sm
import statsmodels.formula.api as smf


import scipy.stats as stats
np.set_printoptions(precision=4)

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tqdm_notebook
import spacy
import en_core_web_sm
import nltk
import nltk.data
from nltk.corpus import stopwords
import textacy

import pickle
import warnings
warnings.simplefilter("ignore")

# Load data

In [2]:
# Background reading: what the "CV" suffix means in sklearn's LogisticRegressionCV
# https://stackoverflow.com/questions/46507606/what-does-the-cv-stand-for-in-sklearn-linear-model-logisticregressioncv

In [3]:
# Load the cleaned restaurant table from a local pickle.
# NOTE(review): the earlier comment said ".sample" was used because the original
# dataset is large and models take a long time to run, but no sampling happens in
# this cell - presumably the pickle was down-sampled upstream; confirm.
# NOTE(review): absolute, user-specific path; only unpickle files you trust.
RESTAURANTS_PICKLE = '/Users/mai/Desktop/yelp_dataset/to_submit/restaurants_clean.pickle'
restaurants = pd.read_pickle(RESTAURANTS_PICKLE)

In [4]:
# Load the review table from a local pickle; later cells read its
# 'prediction_col', 'vader_result' and 'text' columns.
# NOTE(review): absolute, user-specific path; only unpickle files you trust.
REVIEWS_PICKLE = '/Users/mai/Desktop/yelp_dataset/to_submit/review_sentimentanalysis.pickle'
reviews = pd.read_pickle(REVIEWS_PICKLE)

# Create datasets per business status

In [5]:
# Split the reviews on the business-status flag: rows with prediction_col == 1 go
# to the "successful" subset, rows with prediction_col == 0 to the "failed" one.
successful_reviews = reviews.loc[reviews['prediction_col'].eq(1)]
failed_reviews = reviews.loc[reviews['prediction_col'].eq(0)]

In [6]:
# (rows, columns) of each subset, one shape per line.
print(successful_reviews.shape, failed_reviews.shape, sep='\n')

(66514, 16)
(67770, 16)


# Add more words to the stop_word list

In [7]:
# Base stop-word list: NLTK's English stop words, held as a plain mutable list so
# that custom entries can be appended to it.
stop_words_list = list(stopwords.words('english'))

In [8]:
# Extra stop-word entries: URL fragments, Yelp photo ids and filler phrases.
# The entry 'com biz_photos mudalytuatnzm2k9zoh27q select' used to be listed twice;
# the duplicate is dropped.
# NOTE(review): CountVectorizer compares stop words against individual tokens
# before n-grams are assembled, so the multi-word entries below most likely never
# match anything - only single-token entries ('http', 'biz_photos', the ids) take
# effect. Confirm against the vectorizer in use before relying on them.
words_to_remove = [
    'com biz_photos mudalytuatnzm2k9zoh27q select',
    'don know', 'don think', 'don think ll going', 'don want', 'http',
    'biz_photos', 'http www youtube com watch',
    'let just say', 'love love', 'new york', 'www yelp com',
    '6_jqe5olfhoz1t_m96g85g', 'dyuoxkw4dtljsthdxdxslg', 'mudalytuatnzm2k9zoh27q',
    'yelp com biz_photos',
    'yelp com biz_photos dyuoxkw4dtljsthdxdxslg select',
    'yelp com biz_photos mudalytuatnzm2k9zoh27q',
]

In [9]:
# Append the custom entries to the stop-word list. Entries already present are
# skipped, so re-running this cell does not keep growing the list with repeats.
for extra_word in words_to_remove:
    if extra_word not in stop_words_list:
        stop_words_list.append(extra_word)

# Create a function to extract frequently used keywords

In [11]:
def frequency_analyzer(data, start, end, top_n=100):
    """Rank the most frequent n-grams in a collection of documents.

    Parameters
    ----------
    data : iterable of str
        Raw documents (in this notebook, review texts).
    start, end : int
        Smallest and largest n-gram size, passed to ``CountVectorizer`` as
        ``ngram_range=(start, end)``.
    top_n : int, default 100
        Maximum number of ranked rows to return.

    Returns
    -------
    pandas.DataFrame
        At most ``top_n`` rows ordered by descending count, with two columns:
        ``'index'`` (the n-gram text) and ``0`` (its total count over ``data``).

    Notes
    -----
    Stop words come from the module-level ``stop_words_list``.
    """
    cvec = CountVectorizer(stop_words=stop_words_list, ngram_range=(start, end))

    # Learn the vocabulary and build the document-term matrix in a single pass.
    cvec_mat = cvec.fit_transform(data)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement when
    # available and fall back to the old name on older installs.
    if hasattr(cvec, 'get_feature_names_out'):
        words = cvec.get_feature_names_out()
    else:
        words = cvec.get_feature_names()

    # Column sums are the corpus-wide count of each n-gram; flatten the 1 x V
    # result into a plain 1-D array.
    counts = np.asarray(cvec_mat.sum(axis=0)).ravel()

    # One row per n-gram, most frequent first; reset_index() moves the n-gram text
    # out of the index into a column named 'index'.
    rank = (
        pd.DataFrame({0: counts}, index=words)
        .sort_values(by=0, ascending=False)
        .reset_index()
    )

    return rank.iloc[:top_n]

# Successful restaurants with positive sentiment scores

In [12]:
# Reviews of successful restaurants whose vader_result is 'positive', and their
# raw text column.
success_positive_reviews = successful_reviews.loc[
    successful_reviews['vader_result'].eq('positive')
]
Xsp = success_positive_reviews.loc[:, 'text']

In [13]:
# Ranked n-grams for n = 1..5 over Xsp, one table per n-gram size.
Xsp1, Xsp2, Xsp3, Xsp4, Xsp5 = (
    frequency_analyzer(Xsp, n, n) for n in range(1, 6)
)

# Successful restaurants with negative sentiment scores

In [14]:
# Reviews of successful restaurants whose vader_result is 'negative', and their
# raw text column.
successful_negative_reviews = successful_reviews.loc[
    successful_reviews['vader_result'].eq('negative')
]
Xsn = successful_negative_reviews.loc[:, 'text']

In [15]:
# Ranked n-grams for n = 1..5 over Xsn, one table per n-gram size.
Xsn1, Xsn2, Xsn3, Xsn4, Xsn5 = (
    frequency_analyzer(Xsn, n, n) for n in range(1, 6)
)

# Failed restaurants with positive sentiment scores

In [16]:
# Reviews of failed restaurants whose vader_result is 'positive', and their raw
# text column.
failed_positive_reviews = failed_reviews.loc[
    failed_reviews['vader_result'].eq('positive')
]
Xfp = failed_positive_reviews.loc[:, 'text']

In [17]:
# Ranked n-grams for n = 1..5 over Xfp, one table per n-gram size.
# Every table is now built from Xfp: the bigram table used to be computed from Xsn
# (a copy-paste slip), which put another subset's bigrams into the failed/positive
# results. Generating the calls from one expression keeps the input consistent.
Xfp1, Xfp2, Xfp3, Xfp4, Xfp5 = (
    frequency_analyzer(Xfp, n, n) for n in range(1, 6)
)

# Failed restaurants with negative sentiment scores

In [18]:
# Reviews of failed restaurants whose vader_result is 'negative', and their raw
# text column.
failed_negative_reviews = failed_reviews.loc[
    failed_reviews['vader_result'].eq('negative')
]
Xfn = failed_negative_reviews.loc[:, 'text']

In [19]:
# Ranked n-grams for n = 1..8 over Xfn, one table per n-gram size (this group goes
# up to 8-grams, the others stop at 5).
Xfn1, Xfn2, Xfn3, Xfn4, Xfn5, Xfn6, Xfn7, Xfn8 = (
    frequency_analyzer(Xfn, n, n) for n in range(1, 9)
)

# Create a dataframe per review type

## Change column names for all DataFrames

In [20]:
# Every ranking table in one flat list, grouped by review type.
df_list = (
    [Xsp1, Xsp2, Xsp3, Xsp4, Xsp5]                       # successful / positive
    + [Xsn1, Xsn2, Xsn3, Xsn4, Xsn5]                     # successful / negative
    + [Xfp1, Xfp2, Xfp3, Xfp4, Xfp5]                     # failed / positive
    + [Xfn1, Xfn2, Xfn3, Xfn4, Xfn5, Xfn6, Xfn7, Xfn8]   # failed / negative
)

In [21]:
# The same ranking tables, grouped into one list per review type.
successful_positive_df_list = [
    Xsp1, Xsp2, Xsp3, Xsp4, Xsp5,
]
successful_negative_df_list = [
    Xsn1, Xsn2, Xsn3, Xsn4, Xsn5,
]
failed_positive_df_list = [
    Xfp1, Xfp2, Xfp3, Xfp4, Xfp5,
]
failed_negative_df_list = [
    Xfn1, Xfn2, Xfn3, Xfn4,
    Xfn5, Xfn6, Xfn7, Xfn8,
]

In [22]:
# Relabel each ranking table's two columns as 'ngram_<n>' / 'ngram_<n>_count',
# where n is the number of space-separated words in the table's first n-gram.
# The first cell is read by position rather than through the 'index' label: that
# label disappears after the first rename, so a label lookup made this cell fail
# with a KeyError when re-run. Positional access gives the same value on the
# first run and keeps re-runs working.
for frame in df_list:
    first_ngram = frame.iloc[0, 0]
    n_words = len(first_ngram.split(' '))
    frame.columns = ['ngram_' + str(n_words), 'ngram_' + str(n_words) + '_count']

## Concatenate dataframes

In [23]:
# Place each group's ranking tables side by side (axis=1), aligned on row
# position, giving one wide table per review type. The per-group lists hold
# exactly the tables that were previously spelled out inline here.
success_positive_result = pd.concat(successful_positive_df_list, axis=1, sort=False)
success_negative_result = pd.concat(successful_negative_df_list, axis=1, sort=False)
failed_positive_result = pd.concat(failed_positive_df_list, axis=1, sort=False)
failed_negative_result = pd.concat(failed_negative_df_list, axis=1, sort=False)

In [25]:
# Preview the first five rows of the successful/positive n-gram table.
success_positive_result.head(n=5)

Unnamed: 0,ngram_1,ngram_1_count,ngram_2,ngram_2_count,ngram_3,ngram_3_count,ngram_4,ngram_4_count,ngram_5,ngram_5_count
0,food,39917,go back,2495,definitely come back,456,would definitely come back,190,would definitely come back try,24
1,good,36318,first time,2483,food great service,353,great food great service,123,buy one get one free,23
2,place,32120,really good,2474,wait go back,353,would definitely go back,123,definitely come back next time,20
3,great,29004,come back,2321,definitely go back,333,great service great food,89,great food great service great,20
4,service,21975,food good,2140,would go back,321,would definitely recommend place,78,say enough good things place,17


In [None]:
# Write the successful/positive table to a CSV in the current working directory.
# The row index is written too (pandas default), as the first column.
success_positive_result.to_csv(path_or_buf='success_positive_result.csv')

In [26]:
# Preview the first five rows of the failed/negative n-gram table.
failed_negative_result.head(n=5)

Unnamed: 0,ngram_1,ngram_1_count,ngram_2,ngram_2_count,ngram_3,ngram_3_count,ngram_4,ngram_4_count,ngram_5,ngram_5_count,ngram_6,ngram_6_count,ngram_7,ngram_7_count,ngram_8,ngram_8_count
0,food,7212,go back,504,never go back,154,would never go back,16,could give zero stars would,7,wife friends show anymore unless see,3,square sad exclude place list nobody deserves,3,asked complete strangers sitting next bar girl...,3
1,place,4883,customer service,448,never come back,51,really wanted like place,16,never go back recommend anyone,4,heard restaffing def needed except front,3,sure kaya woman surface management eventually ...,3,drinks atmosphere clanky noisy yard house scal...,3
2,service,3838,first time,313,worst service ever,51,worst customer service ever,14,never came back check us,4,noisy yard house scale restaurant time,3,assure go ever since last time week,3,time square sad exclude place list nobody dese...,3
3,one,3261,come back,306,would go back,49,could give stars would,12,deserves come place happy excited,3,could give place negative stars would,3,time heard restaffing def needed except front,3,believe attitude careless carefree behavior ev...,3
4,like,3001,food good,293,poor customer service,44,took 30 minutes get,12,heard restaffing def needed except,3,customers point asked complete strangers sitting,3,sitting next bar girl real even wanna,3,go ever since last time week ago rudely,3


In [258]:
# Write the failed/negative table to a CSV in the current working directory.
# The row index is written too (pandas default), as the first column.
failed_negative_result.to_csv(path_or_buf='failed_negative_result.csv')