In [13]:
from __future__ import print_function
from collections import Counter
from collections import OrderedDict
import os
import numpy as np
import numpy
import pandas as pd
from scipy import optimize
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import calendar
import datetime
import re
from math import log
from scipy import stats
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from array import array
from sklearn import preprocessing
from scipy.stats import boxcox
from scipy.stats import ttest_1samp, wilcoxon, ttest_ind, mannwhitneyu

In [15]:
#load the pickle object for quick access
result = pd.read_pickle("/Users/Matt/Documents/Stevens/BIA660 Web Analytics/yelp_dataset_challenge_academic_dataset/Data/500_Dataset")

In [16]:
def exploreBusiness(businessID):
    #exploratory Analysis for business ID mpDxBBGywUE6GRRKja3sBA
    business1 = result[result["business_id"]==businessID]
    #change date into datetime
    business1['date'] = pd.to_datetime(business1['date'])
    business1['Year-Month'] = business1.date.map(lambda x: x.strftime('%Y-%m'))
    business1['Qdate'] = [date - pd.tseries.offsets.DateOffset(days=1) + pd.tseries.offsets.QuarterEnd() for date in business1.date]
    #review_counts = business1.groupby('Year-Month')['review_id'].count()
    review_counts = business1.groupby('Qdate')['stars_x'].mean() #grab the mean number of reviews
    return review_counts, business1

In [17]:
# runs a 1 sample t test to compare a new month with the previous months
# https://stats.stackexchange.com/questions/198825/compare-previous-month-to-all-other-previous-months-performance
# http://iaingallagher.tumblr.com/post/50980987285/t-tests-in-python
#if stats.shapiro(review_counts)[1] >0.05:
#    normal_data=1
def Ttest_Anomaly_Detection(review_counts):
    anomaly = OrderedDict() #initialize dict
    d = OrderedDict()
    normal_data=0
    if stats.shapiro(review_counts)[1] >0.05:
        normal_data=1
    #one_sample_data = review_counts.values
    for ind, value in review_counts.iteritems():
        d[ind] = value
        #anomaly[ind]=-1
        if len(d) >= 31: #need 30 samples because the data is non-normal
            #t-test to compare the next month/year sample to the mean of all the prior ones
            one_sample_data = d.values()[0:(len(d)-1)]
            one_sample = stats.ttest_1samp(one_sample_data,value) #compares the entire period
            #one_sample = stats.wilcoxon(one_sample_data-value)
            #print(ind,value, one_sample_data)
            #print(len(d))
            if one_sample.pvalue < 0.01: #null hypothesis: new sample = prior mean
                anomaly[ind]=1  #there is a statistically signifant difference, where the new sample != prior mean   
    return anomaly, normal_data

In [18]:
#locate the period where there is a change in the anomaly[ind] values
#returns a dict where an anomaly is coded as a 1
def choosePeriods(anomaly):
    period={}
    keylist = anomaly.keys()
    keylist.sort()
    for key in keylist:
        if anomaly[key] == 1:
            period[key]=1
        else:
            period[key]=0
    return period   

In [19]:
#show a summary of the top words of ALL the anomalies for a given business
def textAnalysis_of_Anomaly(business_df, business_id, period):
    common={}
    final=''
    keylist = period.keys()
    keylist.sort()
    words_test=[]
    for key in keylist:
        if period[key] == 1:
            text=business_df[(business_df['Qdate']==key)].text
            text=text.to_string()
            text = re.sub('\W+', ' ',text)
            default_stopwords = set(nltk.corpus.stopwords.words('english'))
            custom_stopwords = set((u'–', '...'))
            all_stopwords = default_stopwords | custom_stopwords
            words = nltk.word_tokenize(text)

            words = [word for word in words if len(word) > 2] #remove words that are only 1 character
            # Remove numbers
            words = [word for word in words if not word.isnumeric()]

            # Lowercase all words (default_stopwords are lowercase too)
            words = [word.lower() for word in words]

            #remove words that are stopwords
            words = [word for word in words if word not in all_stopwords]
            words_test += words

    #print(words_test)
    # Calculate frequency distribution
    fdist = nltk.FreqDist(words_test)

    for word, frequency in fdist.most_common(10):
        final+=str(word+' ')
    common[business_id] = final

    return business_id, common

In [20]:
#NOT USED - outputs the most common words for each anomaly found for a business
def getCommonWordsForEachAnomaly(business_df, business_id, period):
    common={}
    keylist = period.keys()
    keylist.sort()
    for key in keylist:
        final=''
        if period[key] == 1:
            text=business_df[(business_df['Qdate']==key)].text
            text=text.to_string()
            text = re.sub('\W+', ' ',text)
            default_stopwords = set(nltk.corpus.stopwords.words('english'))
            custom_stopwords = set((u'–', '...'))
            all_stopwords = default_stopwords | custom_stopwords
            words = nltk.word_tokenize(text)
            
            words = [word for word in words if len(word) > 2] #remove words that are only 1 character
            # Remove numbers
            words = [word for word in words if not word.isnumeric()]

            # Lowercase all words (default_stopwords are lowercase too)
            words = [word.lower() for word in words]

            #remove words that are stopwords
            words = [word for word in words if word not in all_stopwords]

            # Calculate frequency distribution
            fdist = nltk.FreqDist(words)
            
            for word, frequency in fdist.most_common(5):
                final+=str(word+' ')
        common[key] = final
    
    return business_id, common

In [21]:
def createDataForPlot(anomaly, review_counts):
    anomaly_list = anomaly.items()
    anomaly_list = pd.DataFrame(anomaly_list)
    anomaly_list.columns = ['Qdate','Anomaly']
    review_df = pd.DataFrame(review_counts)
    review_df = review_df.reset_index()
    review_df.columns = ['Qdate','Mean_Rating']
    merged_plot_df = pd.merge(review_df, anomaly_list, how='left', on=['Qdate'])
    merged_plot_df = merged_plot_df.fillna(value=0)
    return merged_plot_df

In [39]:
def makeFolder(common, merged_plot_df, business_id):
    os.chdir('/Users/Matt/Documents/Stevens/BIA660 Web Analytics/yelp_dataset_challenge_academic_dataset')
    if not os.path.exists('Results'):
        os.makedirs('Results')
    os.chdir('/Users/Matt/Documents/Stevens/BIA660 Web Analytics/yelp_dataset_challenge_academic_dataset/Results')
    working_directory = os.getcwd()
    BusinessID=str(business_id)
    Text = "Business ID: "+BusinessID +"\n"+ "Top 10 common words: " + str(common.values()[0])
    #if not os.path.exists(BusinessID):
    os.makedirs(BusinessID)
    os.chdir(BusinessID)
    

    filename=common.values()[0]+".txt"
    
    file1 = open(filename, "w")
    file1.write(Text)
    file1.close()
    
    
    #plot graph
    colors = {0: 'b', 1: 'r'}
    ax = merged_plot_df.plot(x='Qdate', y='Mean_Rating', kind='bar',figsize=(20,10), color=[colors[i] for i in merged_plot_df['Anomaly']])
    fig = ax.get_figure()
    fig.savefig(BusinessID+'.png')

    os.chdir(working_directory)

    return "success"

In [39]:
for business in result.business_id.unique():
    common={}
    business_id = business
    review_counts, business_df = exploreBusiness(business)
    anomaly, normal_data = Ttest_Anomaly_Detection(review_counts)
    if len(anomaly)> 1 and normal_data == 1:
        period = choosePeriods(anomaly)
        business_id, common=textAnalysis_of_Anomaly(business_df,business_id,period)
        merged_plot_df = createDataForPlot(anomaly, review_counts)
        makeFolder(common, merged_plot_df, business_id)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


KeyboardInterrupt: 

KeyboardInterrupt: 