In [1]:
# https://maartengr.github.io/BERTopic/index.html
from bertopic import BERTopic
from umap import UMAP
import nltk
nltk.download('stopwords')
import re
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
from bubble import BubbleChart
from fpdf import FPDF
from datetime import date
from sklearn.preprocessing import MinMaxScaler

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\panag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Summarize function

In [2]:
def summarize(metadata_path):
    
    # Read metadata
    metadata = pd.read_csv(metadata_path)
    food = metadata['food'][0]
    city = metadata['city'][0]
    rest_name = metadata['rest_name'][0]
    rest_location = metadata['rest_location'][0]
    
    # Open the data as dataframe
    df = pd.read_csv(f'data/yelp_reviews_{food}_{city}.csv', parse_dates = ['date'])
    df['rating_number'] = df['rating'].str.extract('(\d+)').astype(float)
    # Calculate average rating score
    score = round(df['rating_number'].mean(),1)

    
    ########################################## Statistics Plots ##########################################
    ######################################################################################################


    ################## Ratings over time (Plot 1) ##################

    # Dataset analysis
    df1 = df.groupby(pd.Grouper(key='date', freq='6M'))['rating_number'].mean().reset_index()
    df2 = df.groupby(pd.Grouper(key='date', freq='6M'))['rating_number'].count().reset_index()

    #Plot 
    fig, ax = plt.subplots(figsize=(16,7))
    plt.style.use('seaborn-ticks')
    ax.set_facecolor("whitesmoke")

    # Plot trend line
    ax.set_title('Ratings over time (6 Months Average)', fontweight="bold", fontsize = 20)
    ax.plot(df1.date, df1.rating_number, color='black', linewidth = 2.5)
    ax.legend(['Ratings'], loc = 'upper left',  fontsize = 13)
    ax.set_ylabel('Rating', fontweight="bold", fontsize = 13)
    ax.grid(False)
    plt.yticks(fontsize=13, fontweight="bold")
    plt.xticks(fontsize=13, fontweight="bold")

    ax2 = ax.twinx()

    # Color scale setup for bars
    scaler = MinMaxScaler([0.2, 0.8])
    sc = scaler.fit_transform(df2[['rating_number']])
    rgba_colors = np.zeros((len(df2),4))
    rgba_colors[:,0] = 255/255
    rgba_colors[:,1] = 69/255
    rgba_colors[:,2] = 0/255
    rgba_colors[:,-1]=sc.reshape(1,len(df2)).flatten()

    # Plot bar
    ax2.bar(df2.date, df2.rating_number, color = rgba_colors, width = 100, edgecolor='black')
    plt.yticks(fontsize=13, fontweight="bold")
    ax2.legend(['Number of reviews'], loc = 'upper right',fontsize = 13).set_alpha(1)
    ax2.set_ylabel('Number of Reviews', fontweight="bold", fontsize = 13)
    ax2.grid(False)

    plt.tight_layout()
    plt.savefig('./images/chart_1.png', bbox_inches = 'tight', dpi = 100)
    plt.close()
    
    ################## Numbers of ratings (Plot 2) ##################

    # Dataset analysis
    df1 = pd.DataFrame(df.rating_number.value_counts())

    #Plot
    fig, ax = plt.subplots(figsize=(7,5))
    plt.style.use('seaborn-ticks')
    ax.set_facecolor("whitesmoke")

    # Color scale setup for bars
    scaler = MinMaxScaler([0.2, 0.8])
    sc = scaler.fit_transform(df1[['rating_number']])
    rgba_colors = np.zeros((len(df1),4))
    rgba_colors[:,0] = 255/255
    rgba_colors[:,1] = 69/255
    rgba_colors[:,2] = 0/255
    rgba_colors[:,-1]=sc.reshape(1,len(df1)).flatten()

    ax.barh(df1.index, df1.rating_number, color = rgba_colors, edgecolor='black')
    ax.set_title('Number of Ratings', fontweight="bold", fontsize = 15)
    ax.set_ylabel('Rating', fontweight="bold", fontsize = 13)

    plt.tight_layout()
    plt.savefig('./images/chart_2.png', bbox_inches = 'tight', dpi = 100)
    plt.close()
 


    ########################################## NLP Analysis Plots ########################################
    ######################################################################################################

    
    # Cleaning the dataset

    # Remove non-word (special) characters such as punctuation etc
    df['review'] = [re.sub(r'\W', ' ', str(x)) for x in df['review']]
    # Remove numbers
    df['review'] = [re.sub(r'[0-9]', ' ', str(x)) for x in df['review']]
    # Remove all single characters
    df['review'] = [re.sub(r'\s+[a-zA-Z]\s+', ' ', str(x)) for x in df['review']]
    # Substitute multiple spaces with single space
    df['review'] = [re.sub(r'\s+', ' ', str(x), flags=re.I) for x in df['review']]

    # Split in Positive and negative reviews
    df_pos = df[df.rating_number > 3].reset_index()
    df_neg = df[df.rating_number < 3].reset_index()

    # Remove extra words
    restaurant_related_words = ['restaurant','star', rest_name.lower(), 
                                rest_location.lower(), city.lower()]

    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(restaurant_related_words)


    ################## Positive reviews BERTopic modeling ######################

    # Step 1 - Extract embeddings
    embedding_model = SentenceTransformer("all-mpnet-base-v2")

    # Step 2 - Reduce dimensionality
    umap_model = UMAP(n_neighbors = 15, n_components = 3, 
                      min_dist = 0.0, metric = 'cosine', random_state = 34)

    # Step 3 - Cluster reduced embeddings
    hdbscan_model = HDBSCAN(min_cluster_size = 10, metric = 'euclidean', 
                            cluster_selection_method = 'eom', prediction_data = True)

    # Step 4 - Tokenize topics
    vectorizer_model = CountVectorizer(stop_words = stopwords, ngram_range=(1, 3))

    # Step 5 - Create topic representation
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

    # Initiate BERTopic
    topic_model_pos = BERTopic(embedding_model = embedding_model,    # Step 1 - Extract embeddings
                            umap_model = umap_model,              # Step 2 - Reduce dimensionality
                            hdbscan_model = hdbscan_model,        # Step 3 - Cluster reduced embeddings
                            vectorizer_model = vectorizer_model,  # Step 4 - Tokenize topics
                            ctfidf_model = ctfidf_model,          # Step 5 - Extract topic words
                            diversity = 0.7,                      # Step 6 - Diversify topic words 
                            nr_topics = 4,                        # Number of topics
                            top_n_words = 4,                      # Number of words per topic
                            calculate_probabilities = True)

    # Run BERTopic model
    topics_pos = topic_model_pos.fit_transform(df_pos['review'])

    # Data for plotting
    labels_pos = list(topic_model_pos.topic_labels_.values())[1:]
    probability_pos = topic_model_pos.probabilities_[0]
    colors_pos =  ['#42772a', '#49822f', '#528c38','#569d36']

    bubble_chart = BubbleChart(area = probability_pos, bubble_spacing=0.2)
    bubble_chart.collapse()

    fig, ax = plt.subplots(subplot_kw=dict(aspect="equal"))
    fig.set_size_inches(12,3, forward=True)

    ax.set_title('Positive Topic Modeling', fontweight="bold", fontsize = 12)
    bubble_chart.plot(ax, labels_pos, colors_pos)

    ax.axis("off")
    ax.relim()
    ax.autoscale_view()
    plt.tight_layout()
    plt.savefig('./images/pos_chart.png', bbox_inches = 'tight', dpi = 100)
    plt.close()

    ################## Negative reviews BERTopic modeling ######################

    # Step 1 - Extract embeddings
    embedding_model = SentenceTransformer("all-mpnet-base-v2")

    # Step 2 - Reduce dimensionality
    umap_model = UMAP(n_neighbors = 4, n_components = 3,                      # We reduce the number of neighboors
                      min_dist = 0.0, metric = 'cosine', random_state = 34)

    # Step 3 - Cluster reduced embeddings
    hdbscan_model = HDBSCAN(min_cluster_size = 5, metric = 'euclidean',       # We reduce the cluster size
                            cluster_selection_method = 'eom', prediction_data = True)

    # Step 4 - Tokenize topics
    vectorizer_model = CountVectorizer(stop_words = stopwords, ngram_range=(1, 3))

    # Step 5 - Create topic representation
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)


    # Initiate BERTopic
    topic_model_neg = BERTopic(embedding_model = embedding_model,    # Step 1 - Extract embeddings
                            umap_model = umap_model,              # Step 2 - Reduce dimensionality
                            hdbscan_model = hdbscan_model,        # Step 3 - Cluster reduced embeddings
                            vectorizer_model = vectorizer_model,  # Step 4 - Tokenize topics
                            ctfidf_model = ctfidf_model,          # Step 5 - Extract topic words
                            diversity = 0.7,                      # Step 6 - Diversify topic words 
                            nr_topics = 4,                        # Number of topics
                            top_n_words = 4,                      # Number of words per topic
                            calculate_probabilities = True)

    # Run BERTopic model
    topics_neg = topic_model_neg.fit_transform(df_neg['review'])

    # Data for plotting
    labels_neg = list(topic_model_neg.topic_labels_.values())[1:]
    probability_neg = topic_model_neg.probabilities_[0]
    colors_neg =  ['#9d0303', '#b00000', '#ff4200','#e55c00']

    bubble_chart = BubbleChart(area = probability_neg, bubble_spacing=0.2)
    bubble_chart.collapse()

    fig, ax = plt.subplots(subplot_kw=dict(aspect="equal"))
    fig.set_size_inches(12, 3, forward=True)


    ax.set_title('Negative Topic Modeling', fontweight="bold", fontsize = 12)
    bubble_chart.plot(ax, labels_neg, colors_neg)

    ax.axis("off")
    ax.relim()
    ax.autoscale_view()
    plt.tight_layout()
    plt.savefig('./images/neg_chart.png', bbox_inches = 'tight', dpi = 100)
    plt.close()



    ########################################## PDF report creation ##########################################
    #########################################################################################################

    pdf = FPDF('L', 'mm', 'A4')
    pdf.add_page()

    # A4 dimensions
    height = 210
    width = 297

    # Template Creation
    cordin = 1
    pdf.rect(cordin,cordin,width - 2*cordin, height-2*cordin)

    cordin = 2
    pdf.set_line_width(1)
    pdf.set_draw_color(175,6,6)
    pdf.rect(cordin,cordin,width - 2*cordin, height-2*cordin)

    # Title 
    pdf.set_xy(0, 15)
    pdf.set_font('Times', 'B', 18)
    pdf.cell(w = 0, align = 'C', txt = "Summury Review Report")

    # Date 
    pdf.set_xy(260, 11)
    pdf.set_font('Times', 'I', 12)
    pdf.cell(w = 0, align = 'C', txt = "Date: " + str(date.today().strftime("%d/%m/%Y")))

    # Logo
    pdf.image('./images/YelpLogo.jpg', 
            x = 9, y = 4, w = 30, type = 'JPEG')

    # Text Restaurant 
    pdf.set_xy(15, 35)
    pdf.set_font('Times','', 14)
    pdf.cell(w = 23, align = 'C', txt = "Restaurant: ")

    # Text Restaurant Name
    pdf.set_xy(37, 35)
    pdf.set_font('Times','B', 14)
    pdf.cell(w = 55, align = 'L', txt = rest_name)

    # Octagon
    cx = 148
    cy = 37
    a = 20
    b = 8

    pdf.set_line_width(0.5)
    pdf.set_draw_color(175,6,6)
    coords = ((cx-b/2, cy+a/2), (cx-a/2, cy+b/2), (cx-a/2,cy-b/2), (cx-b/2,cy-a/2),
              (cx+b/2,cy-a/2), (cx+a/2,cy-b/2),(cx+a/2, cy+b/2), (cx+b/2,cy+a/2))
    pdf.polygon(coords)

    # Text score 
    pdf.set_xy(141, 33)
    pdf.set_font('Times', 'B', 25)
    pdf.cell(w = 5, align = 'L', txt = str(score))

    # Text Location 
    pdf.set_xy(230, 35)
    pdf.set_font('Times','', 14)
    pdf.cell(w = 20, align = 'L', txt = "Location: ")

    # Text Restaurant Name
    pdf.set_xy(250, 35)
    pdf.set_font('Times','B', 14)
    pdf.cell(w = 0, align = 'L', txt = rest_location)

    # Positive Chart
    pdf.image('./images/pos_chart.png', 
            x = 20, y = 50, w = 100, type = 'PNG')

    # Negative Chart
    pdf.image('./images/neg_chart.png', 
            x = 170, y = 50, w = 100, type = 'PNG')

    # Plot 1
    pdf.image('./images/chart_1.png', 
            x = 10, y = 140, w = 150, type = 'PNG')

    # Plot 2
    pdf.image('./images/chart_2.png', 
            x = 180, y = 140, w = 90, type = 'PNG')

    pdf.set_author('Giannopoulos Panagiotis')
    pdf.output(f'./{rest_name}_{rest_location}_summary_report.pdf', 'F')
    
    print("The summarization report is:  ", f'./{rest_name}_{rest_location}_summary_report.pdf')

## Summarize Report

In [3]:
summarize('data/metadata.csv')

  pdf.image('./images/YelpLogo.jpg',
  pdf.image('./images/pos_chart.png',
  pdf.image('./images/neg_chart.png',
  pdf.image('./images/chart_1.png',
  pdf.image('./images/chart_2.png',
  pdf.output(f'./{rest_name}_{rest_location}_summary_report.pdf', 'F')


The summarization report is:   ./Dishoom_Covent Garden_summary_report.pdf
