# In [4]:
#Importing the necessary packages for operations and NLP
import pandas as pd   
import numpy as np
from matplotlib import pyplot as plt
from collections import Counter

from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Module-level Porter stemmer instance, intended for stemming words.
# NOTE(review): in the code shown here the stemming step of review_to_words
# is disabled, so nothing visible uses this object - confirm before removing.
stemmer = PorterStemmer()

import re
import json
#import enchant

# In [5]:
def readData(path="review_partial1.json"):
    """Load reviews from a JSON-lines file (one JSON value per line).

    Args:
        path: File to read. Defaults to "review_partial1.json", the file this
            function has always read, so existing zero-argument calls are
            unaffected.

    Returns:
        list: One decoded JSON value per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    print("Gathering Raw Data")
    with open(path, encoding="utf8") as f:
        # Stream line by line and skip blank lines: an empty file or a stray
        # empty line would otherwise reach json.loads("") and raise.
        reviews = [json.loads(line) for line in f if line.strip()]
    print("Gathered the Raw Data")
    return reviews

def getTextsAndStars(reviews):
    """Split the reviews into a Series of texts and a Series of star ratings.

    Args:
        reviews: Sequence of review dicts, each read for 'text' and 'stars'.

    Returns:
        tuple: (texts, stars) as two pandas Series taken from one DataFrame,
        named 'texts' and 'stars' respectively.
    """
    print("Getting the Raw Text and the rating corresponding to the texts")
    raw_texts = list(map(lambda entry: entry['text'], reviews))
    raw_stars = list(map(lambda entry: entry['stars'], reviews))
    # Route both lists through a DataFrame so callers receive pandas Series
    frame = pd.DataFrame(columns=['texts', 'stars'])
    for column, values in (('texts', raw_texts), ('stars', raw_stars)):
        frame[column] = values
    print("Obtained the raw text and ratings")
    return frame['texts'], frame['stars']

def getReviewsAndStarsForBusinesses(reviews, allTexts, allStars, businessID):
    """Collect the text, rating and date of every review of one business.

    Args:
        reviews: Iterable of review dicts. Each is read for 'business_id';
            matching ones are also read for 'text', 'stars' and 'date'.
        allTexts: Unused; kept so existing callers keep working.
        allStars: Unused; kept so existing callers keep working.
        businessID: Business to select. It is compared as str(businessID)
            against each review's 'business_id'.

    Returns:
        tuple: (texts, stars, dates) - three parallel lists in input order.
        All three are empty when no review matches.
    """
    # Convert once instead of once per review per scan.
    wanted_id = str(businessID)
    # Filter in a single pass so a one-shot iterator is not exhausted before
    # the ratings and dates are collected.
    matching = [review for review in reviews if review['business_id'] == wanted_id]

    # Texts, ratings and dates of the selected business's reviews
    texts = [review['text'] for review in matching]
    stars = [review['stars'] for review in matching]
    dates = [review['date'] for review in matching]
    return texts, stars, dates

#Defining function for converting a raw review to a string of words 

# Stop-word sets already loaded from NLTK, keyed by language name.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a set, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = set(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def review_to_words(raw_review):
    """Convert a raw review into a string of lower-case, non-stop words.

    Args:
        raw_review: Review text, possibly containing HTML markup.

    Returns:
        str: The remaining words joined by single spaces (an empty string
        when no word survives).
    """
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review, "lxml").get_text()
    # 2. Replace every character that is not an ASCII letter with a space
    letters_only = re.sub(r"[^a-zA-Z]", " ", review_text)
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    # 4. Searching a set is much faster than searching a list; the set is
    #    cached so it is not rebuilt for every single review
    stops = _english_stopwords()
    # 5. Remove stop words
    meaningful_words = [w for w in words if w not in stops]
    # 6. (disabled) Remove non-English words; needs enchant.Dict("en_US")
    # meaningful_words_2 = [w for w in meaningful_words if d.check(w)]
    # 7. (disabled) Stemming
    # meaningful_words_3 = [stemmer.stem(w) for w in meaningful_words_2]
    # 8. Join the words back into one string separated by space
    return " ".join(meaningful_words)

#removing stopwords, special characters
def clean_Reviews(texts):
    """Clean every review in *texts* with review_to_words.

    Args:
        texts: Iterable of raw review strings, e.g. a list or a pandas
            Series.

    Returns:
        list: The cleaned review strings, in iteration order.
    """
    print("Preparing to clean reviews")
    # Iterate over the values rather than texts[i]: on a pandas Series [i] is
    # a label lookup, which fails or picks the wrong row once the index is no
    # longer 0..n-1 (for example after filtering).
    clean_reviews = [review_to_words(text) for text in texts]
    print("Cleaned all reviews")
    return clean_reviews


def _mean_rating_by_year(stars, dates):
    """Return (years, means): the sorted unique years and each year's mean rating."""
    # The year is the first four characters of each date's string form.
    years = np.array([str(date)[0:4] for date in dates])
    unique_years = np.unique(years)
    mean_ratings = [np.mean(stars[years == year]) for year in unique_years]
    return unique_years, mean_ratings


def statisticalAnalysis(stars, dates):
    """Print summary statistics of the ratings and plot their distribution.

    Prints the mean, minimum and maximum rating, then shows a histogram, a
    boxplot and a line plot of the mean rating per year.

    Args:
        stars: Iterable of numeric ratings.
        dates: Iterable of review dates, parallel to *stars*. The first four
            characters of str(date) are used as the year.

    Returns:
        None. When *stars* is empty a message is printed and nothing is
        plotted.

    Raises:
        ValueError: If *stars* and *dates* differ in length.
    """
    stars = np.array(stars)
    # Materialize so any iterable (list, pandas Series, ...) is walked by
    # value instead of being indexed by position.
    dates = list(dates)

    # np.min / np.max raise an unhelpful error on an empty array, so stop
    # early when the business has no ratings.
    if stars.size == 0:
        print("No ratings available for your business")
        return
    if len(dates) != len(stars):
        raise ValueError("stars and dates must have the same length")

    print("Average Rating for your business is",np.mean(stars))
    print("The minimum rating obtained for your business is",np.min(stars))
    print("The maximum rating obtained for your business is",np.max(stars))

    plt.hist(stars)
    plt.title('Plot showing count of stars')
    plt.xlabel('Stars')
    plt.ylabel('Count')
    plt.show()

    plt.boxplot(stars)
    plt.title('Boxplot of ratings')
    plt.xlabel('Boxplot of ratings')
    plt.show()

    npyear, meanRatings = _mean_rating_by_year(stars, dates)

    plt.plot(npyear,meanRatings)
    plt.title('Average ratings over the years')
    plt.show()