# Web Application for Text Summarization & Topic Extraction

In [1]:
import pandas as pd
from jupyter_dash import JupyterDash
import dash
import dash_daq as daq
import dash_table
import dash_bootstrap_components as dbc
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
from dash import callback_context
import plotly.express as px
import plotly.graph_objects as go
from plotly.figure_factory import create_gantt
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# user defined module
import app_functions

## Load Test Data

In [2]:
# test data is retrieved from https://en.wikipedia.org/wiki/Natural_language_processing#Common_NLP_tasks
# load test data for preloading input text into a textbox
# The encoding is given explicitly so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
# NOTE(review): assumes test.txt is stored as UTF-8 - confirm.
with open('test.txt', 'r', encoding='utf-8') as f:
    test_input = f.read()

## Text Summarization Model

In [3]:
import torch 
from transformers import AutoModelForSeq2SeqLM, AutoModelWithLMHead, AutoTokenizer

# Tokenizer and model are loaded once at module level and shared by the
# summarization callback further down.
tokenizer = AutoTokenizer.from_pretrained('t5-base')
# AutoModelWithLMHead is deprecated in transformers; AutoModelForSeq2SeqLM is the
# documented replacement for encoder-decoder checkpoints such as T5.
# (The old name stays imported above so any other cell that uses it keeps working.)
model = AutoModelForSeq2SeqLM.from_pretrained('t5-base', return_dict=True)

def summarize_t5base(text, model, tokenizer):
    '''Summarize `text` with the given model/tokenizer pair.

    The text is prefixed with "summarize: ", encoded (truncated to at most
    1000 tokens), passed to `model.generate`, and the first generated
    sequence is decoded back to a string.

    Note: `tokenizer.decode` is called with its default arguments, so the
    returned string is whatever the tokenizer produces by default; callers
    are responsible for any further clean-up.
    '''
    prompt = "summarize: " + text
    token_ids = tokenizer.encode(prompt, return_tensors='pt', max_length=1000, truncation=True)

    # beam search with 2 beams; summary length bounded to 25..250 tokens
    generated = model.generate(token_ids, max_length=250, min_length=25,
                               length_penalty=5, num_beams=2)

    # only the first returned sequence is used
    return tokenizer.decode(generated[0])

## App Layout

In [4]:
# height shared by the bordered panel Divs below
obj_height = '450px'

# style for section headings (used on the H2 elements of the layout)
sectionHeaderStyle = {
    'fontSize': '30px',
    'color': '#fcf75e',
}

# style for the blue title bar at the top of a panel
divHeaderStyle = {
    'border': '1px solid #b0c4de',
    'backgroundColor': '#1e90ff',
    'textAlign': 'center',
    'color': 'white',
    'fontWeight': 'bold',
    'fontSize': '16pt',
}

# style for the panel Div itself (fixed width, height taken from obj_height)
divStyle = {
    'width': '450px',
    'height': obj_height,
    'border': '1px solid #b0c4de',
    'backgroundColor': '#fff',
    'margin-right': '40px',
}

# style passed as parent_style to the dcc.Loading spinners inside the panels
loading_parent_style = {
    'width': '100%',
    'height': '412px',
    'align': 'center',
    'vertical-align': 'middle',
    'position': 'relative',
}

In [5]:
# instantiate the app
# (the Bootstrap theme stylesheet is loaded for the dbc.* components used below)
app = JupyterDash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# create the layout of the app
# Overall structure: a page title followed by a one-row table with two content cells
# separated by fixed-width spacer cells:
#   * first content cell  - 'Input Text' tab (textarea + submit button) and, below it,
#                           the 'Summarized Text' tab
#   * second content cell - 'Topic Extraction' (topic info/table + metric figure) and
#                           'Word Cloud' (one image for the input, one for the summary)
# The component ids declared here (input_text, btn_submit, output_text, topic_info,
# topic_table, metric_fig, text_wordcloud, output_wordcloud) are the ones referenced
# by the callbacks defined later in the notebook.
app.layout = html.Main([
    html.H1('Topic Extraction & Text Summarization for Computer Science Journals', 
            style={'fontSize': '40px', 'color':'#fff', 'margin-bottom':'30px'}),
    html.Table([
        html.Tr([
            html.Td(html.Div(style={'width':'20px'})), # end html.Td
            html.Td([
                dbc.Row([
                    # NOTE(review): this inline style has the same values as sectionHeaderStyle,
                    # which the other two section headers use
                    html.H2('Text Summarization', style={'fontSize': '30px', 'color':'#fcf75e'}),
                    dbc.Tabs([
                        dbc.Tab([
                            html.Div([
                                # textarea is pre-filled with the test text loaded earlier
                                dcc.Textarea(
                                    id='input_text', value=test_input,
                                    style={'width': '800px', 'height': '400px', 
                                           'backgroundColor':'#f5f5f5', 'overflow':'auto'},
                                ) # end dcc.Textarea
                            ]), # end html.Div
                            html.Div(
                                # the button whose n_clicks triggers the callbacks
                                [html.Button('Generate Summary & Topics', id='btn_submit', n_clicks=0, 
                                             style={'fontSize':'14pt', 'color':'#00008b',
                                                    'backgroundColor':'#89cff0'})],
                            style={'transform': 'translateX(35%)', 'margin':'10px'}), # end html.Div
                        ], label='Input Text'), # end dbc.Tab
                    # NOTE(review): 'fontColor' does not look like a CSS property name -
                    # confirm whether 'color' was intended
                    ], style={'font-weight':'bold', 'fontColor':'blue'}), # end dbc.Tabs
                ]), # end dbc.Row
                dbc.Row([
                    dbc.Tabs([
                        dbc.Tab([
                            # output_text is wrapped in dcc.Loading so a loading indicator
                            # is displayed while its content is being updated
                            dcc.Loading(
                                id="loading-output-text",
                                type="default",
                                children=html.Div(id='output_text', 
                                                 style={'whiteSpace': 'pre-line', 'backgroundColor':'#e7feff', 
                                                        'border': '1px solid green', 'overflow':'auto', 
                                                        'height':'400px', 'width':'800px'}), # end html.Div
                            ),
                        ], label='Summarized Text', style={'backgroundColor':'#e7feff'}), # end dbc.Tab
                    ], style={'font-weight':'bold'}), # end dbc.Tabs
                ]), # end dbc.Row
            ]), # end html.Td
            html.Td(html.Div(style={'width':'80px'})), # end html.Td
            html.Td([
                html.H2('Topic Extraction', style=sectionHeaderStyle),
                html.Table([
                    html.Tr([
                        html.Td([
                            html.Div([
                                # topic_info and topic_table share one loading indicator
                                dcc.Loading(
                                    id='loading-topics',
                                    type='circle',
                                    children=[html.Div(id='topic_info'), # end html.Div
                                              html.Div(id='topic_table', style={'margin':'20px'})],
                                    parent_style={'height':'350px', 'width':'100%',
                                                  'align':'center', 'vertical-align':'middle',
                                                  'position':'relative'}
                                ), # end dcc.Loading
                            ], style={'border': '1px solid #b0c4de',
                                      'backgroundColor':'white', 'margin-right':'10px',
                                      'width':'500px', 'height':'400px', 'overflowY':'auto'}), # end html.Div
                        ]), # end html.Td
                        html.Td([
                            html.Div([
                                # graph that receives the coherence/perplexity figure
                                dcc.Loading(
                                    id='loading-metric-fig',
                                    type='circle',
                                    children=dcc.Graph(id='metric_fig', style={'height':'400px'})
                                ), # end dcc.Loading
                            ], style={'border': '1px solid #b0c4de', 'backgroundColor':'white', 
                                      'width':'430px', 'height':'400px'}), # end html.Div
                        ]), # end html.Td
                    ]), # end html.Tr
                ], style={'margin-bottom':'25px'}), # end html.Table
                html.H2('Word Cloud', style=sectionHeaderStyle),
                html.Table([
                    html.Tr([
                        html.Td([
                            # panel: title bar + image for the input text's word cloud
                            html.Div([
                                html.Div(['Input Text'], style=divHeaderStyle),
                                html.Div([
                                    dcc.Loading(
                                        id="loading-text-wc",
                                        type="circle",
                                        children=html.Img(id='text_wordcloud'),
                                        parent_style=loading_parent_style,
                                    ), # end dcc.Loading
                                ]), # end Div
                            ], style=divStyle), # end html.Div
                        ]), # end html.Td
                        html.Td([
                            # panel: title bar + image for the summarized text's word cloud
                            html.Div([
                                html.Div(['Summarized Text'], style=divHeaderStyle),
                                html.Div([
                                    dcc.Loading(
                                        id="loading-output-wc",
                                        type="circle",
                                        children=html.Img(id='output_wordcloud'),
                                        parent_style=loading_parent_style,
                                    ), # end dcc.Loading
                                ]), # end Div
                            ], style=divStyle), # end html.Div
                        ]), # end html.Td
                    ]), # end html.Tr
                ]), # end html.Table
            ], style={'vertical-align':'top'}), # end html.Td
        ]), # end html.Tr
    ]), # end html.Table
    # bottom spacer
    html.Div(style={'height':'50px'}),
], style={'backgroundColor':'#000039'}) # end html.Main

## Callback Functions

#### Submit Button

In [6]:
def _strip_t5_special_tokens(decoded_text):
    '''Remove a leading '<pad>' and a trailing '</s>' marker when they are present.'''
    if decoded_text.startswith('<pad>'):
        decoded_text = decoded_text[len('<pad>'):]
    if decoded_text.endswith('</s>'):
        decoded_text = decoded_text[:-len('</s>')]
    return decoded_text


@app.callback(
    Output('output_text', 'children'),
    Input('btn_submit', 'n_clicks'),
    State('input_text', 'value'),
    prevent_initial_call=True
)
def update_output(n_clicks, input_text):
    '''Summarize the text in the input textarea when the submit button is clicked.

    Parameters
    ----------
    n_clicks : number of times the submit button has been clicked
    input_text : current value of the `input_text` textarea

    Returns
    -------
    str : the summary with the '<pad>' / '</s>' markers removed, or an empty
          string when the button has not been clicked or there is no text.
    '''
    # nothing to summarize: button not clicked yet, or the textarea is empty/None
    if not n_clicks or not input_text:
        return ''

    # T5 model and tokenizer for text summarization task (module-level objects)
    raw_summary = summarize_t5base(input_text, model, tokenizer)

    # summarize_t5base decodes without skipping special tokens, so the markers
    # are removed here. They are stripped only when actually present: a blind
    # [5:-4] slice would cut real characters whenever the generated text does
    # not end with '</s>' (or does not start with '<pad>').
    return _strip_t5_special_tokens(raw_summary)

#### Extracted Topics

In [7]:
def build_metric_figure(score_metrics, best_k):
    '''Plot coherence and perplexity against the number of topics.

    Parameters
    ----------
    score_metrics : table-like object indexed by the column names
        'Number of Topics', 'Coherence' and 'Perplexity'
    best_k : number of topics to highlight with a dashed vertical line

    Returns
    -------
    The figure with both line plots (perplexity on the secondary y-axis).
    '''
    topic_counts = score_metrics['Number of Topics']

    # single plot area that also offers a secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # (column to plot, secondary_y argument):
    # coherence goes on the default axis, perplexity on the secondary one
    for metric_name, use_secondary in (('Coherence', None), ('Perplexity', True)):
        line = go.Scatter(x=topic_counts, y=score_metrics[metric_name],
                          mode='lines+markers', name=metric_name)
        fig.add_trace(line, secondary_y=use_secondary)

    # mark the selected number of topics
    best_label = '<b>Best k={}</b>'.format(best_k)
    fig.add_vline(x=best_k, line_width=2, line_dash="dash", line_color="green",
                  annotation_text=best_label, annotation_position="bottom right")

    # axis titles
    fig.update_yaxes(title_text="<b>Coherence</b>", secondary_y=False)
    fig.update_yaxes(title_text="<b>Perplexity</b>", secondary_y=True)
    fig.update_xaxes(title_text='<b>Number of Topics (k)</b>')

    # horizontal legend above the plot, fixed figure size and title
    legend_options = dict(orientation="h", yanchor="bottom", y=1.01, xanchor="right", x=0.98)
    fig.update_layout(legend=legend_options, width=430, height=400,
                      title='<b>Model Evaluation</b>')

    return fig

In [8]:
# styles for the topic terms DataTable: header cells ...
terms_style_header = {
    'backgroundColor': "#6495ed",
    'border': '1px solid white',
    'textAlign': 'center',
    'fontWeight': 'bold',
}
# ... and data cells
terms_style_data = {
    'backgroundColor': "#add8e6",
    'border': '1px solid white',
    'height': 'auto',
}


def _topic_info_row(label, value, value_style):
    '''Build one table row: an inline label followed by the highlighted value.'''
    return html.Tr([
        html.Td([
            html.H5(label, style={'display':'inline'}),
            html.H4(value, style=value_style)
        ]),
    ]) # end html.Tr


def _build_topic_info(num_topics, coherence, perplexity):
    '''Build the summary table for the selected model (scores rounded to 4 decimals).'''
    text_style = {'color':'blue', 'display':'inline', 'margin':'20px'}
    return html.Table([
        _topic_info_row('Number of topics: ', num_topics, text_style),
        _topic_info_row('Coherence: ', round(coherence, 4), text_style),
        _topic_info_row('Perplexity: ', round(perplexity, 4), text_style),
    ], style={'margin':'10px'}) # end html.Table


def _build_topic_table(topics):
    '''Render `topics` (used as a DataFrame: .columns / .to_dict) as a DataTable.'''
    return dash_table.DataTable(columns=[{"name": i, "id": i} for i in topics.columns],
                                data=topics.to_dict('records'),
                                style_table={'height': '350px', 'width':'420px'},
                                style_cell={'whiteSpace': 'normal', 'textAlign': 'left',
                                            'height':'auto', 'verticalAlign':'top',
                                            # all three widths are needed
                                            'minWidth': '30px', 'width': '200px', 'maxWidth': '300px',},
                                style_header=terms_style_header,
                                style_data=terms_style_data,
                                style_cell_conditional=[{'if': {'column_id': 'Topic Id'}, 'width': '20px'},],
                               )    # end DataTable


@app.callback(Output('topic_table', 'children'),
              Output('topic_info', 'children'),
              Output('metric_fig', 'figure'),
              Input('btn_submit', 'n_clicks'),
              State('input_text', 'value'),
              prevent_initial_call=True)
def load_metric_figure(nclicks, input_text):
    '''Extract topics from the input text and build the topic panel contents.

    Parameters
    ----------
    nclicks : number of times the submit button has been clicked
    input_text : current value of the `input_text` textarea

    Returns
    -------
    (topic table, topic info table, metric figure); when the button has not
    been clicked or there is no text, a 'No topics found.' heading, an empty
    string and an empty figure dict are returned instead.
    '''
    # nothing to analyse: button not clicked yet, or the textarea is empty/None
    if not nclicks or not input_text:
        return html.H5('No topics found.'), '', {}

    cleaned_text = app_functions.process_text(input_text)

    # extract topics and get scores
    # (score_metrics is filtered with a boolean mask and read by column name below)
    topics, score_metrics = app_functions.extract_topics(cleaned_text)

    # the best model is the first row with the highest coherence score;
    # values are read per column so each keeps its own dtype
    coherence_scores = score_metrics['Coherence']
    best_rows = score_metrics[coherence_scores == coherence_scores.max()]
    best_num_topics = best_rows['Number of Topics'].values[0]
    best_coherence = best_rows['Coherence'].values[0]
    best_perplexity = best_rows['Perplexity'].values[0]

    topic_info = _build_topic_info(best_num_topics, best_coherence, best_perplexity)
    topic_table = _build_topic_table(topics)

    return topic_table, topic_info, build_metric_figure(score_metrics, best_num_topics)

#### Word Clouds

In [9]:
from wordcloud import WordCloud
from io import BytesIO
import base64
import matplotlib.pyplot as plt

def plot_wordcloud(text_in):
    '''Generate a word cloud from `text_in` and return it as an image.

    The cloud is drawn on a white background at 446x412 pixels.
    '''
    cloud = WordCloud(background_color="white", width=446, height=412)
    cloud.generate(text_in)

    # hand the rendered cloud back as an image object
    return cloud.to_image()

In [10]:
def _wordcloud_data_uri(text):
    '''Return a base64 PNG data URI of the word cloud for `text`.

    Returns None when no cloud can be drawn: `text` is not a non-blank string,
    or word-cloud generation rejects it with a ValueError.
    '''
    if not isinstance(text, str) or not text.strip():
        return None

    try:
        image = plot_wordcloud(text)
    except ValueError:
        # NOTE(review): presumably raised when the text has no plottable words - confirm
        return None

    png_buffer = BytesIO()
    image.save(png_buffer, format='PNG')
    encoded_png = base64.b64encode(png_buffer.getvalue()).decode()
    return 'data:image/png;base64,{}'.format(encoded_png)


@app.callback(
    Output('text_wordcloud', 'src'),
    Output('output_wordcloud', 'src'),
    Input('btn_submit', 'n_clicks'),
    Input('input_text', 'value'),
    Input('output_text', 'children'),
    prevent_initial_call=True)
def build_wordclouds(n_clicks, input_text, summarized_text):
    '''Build the word-cloud images for the input text and for the summarized text.

    Parameters
    ----------
    n_clicks : number of times the submit button has been clicked (unused here;
               it is declared as an Input of this callback)
    input_text : current value of the `input_text` textarea
    summarized_text : current children of the `output_text` Div

    Returns
    -------
    (src for text_wordcloud, src for output_wordcloud); each is a PNG data URI,
    or None when the corresponding text is missing or yields no word cloud.
    '''
    # `input_text` is declared as an Input, so this callback can run before any
    # summary exists; each text is therefore handled independently and a missing
    # or empty text gives None instead of raising.
    return _wordcloud_data_uri(input_text), _wordcloud_data_uri(summarized_text)

## Start App

In [11]:
# run Dash app
# The server is started only when this code runs as the main program.
# It is started on port 9000 with debug disabled; the recorded output below shows
# the app being served at http://127.0.0.1:9000/.
if __name__ == '__main__':
    app.run_server(mode='external', port=9000, debug=False)

 * Running on http://127.0.0.1:9000/ (Press CTRL+C to quit)
127.0.0.1 - - [30/Nov/2021 22:11:48] "[37mGET /_alive_237ff964-35c1-4f0c-9ba2-57782f3b2965 HTTP/1.1[0m" 200 -


Dash app running on http://127.0.0.1:9000/


127.0.0.1 - - [30/Nov/2021 22:11:49] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [30/Nov/2021 22:11:50] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [30/Nov/2021 22:11:50] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.0.1 - - [30/Nov/2021 22:11:50] "[37mGET /_dash-component-suites/dash_core_components/async-graph.js HTTP/1.1[0m" 200 -
127.0.0.1 - - [30/Nov/2021 22:11:50] "[37mGET /_dash-component-suites/dash_core_components/async-plotlyjs.js HTTP/1.1[0m" 200 -
127.0.0.1 - - [30/Nov/2021 22:13:37] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [30/Nov/2021 22:13:46] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [30/Nov/2021 22:14:03] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [30/Nov/2021 22:14:03] "[37mGET /_dash-component-suites/dash_table/async-highlight.js HTTP/1.1[0m" 200 -
127.0.0.1 - - [30/Nov/2021 22:14:03] "[37mGET /_dash-component-suites/dash_table/async-table.js HTTP/1.1[0m