In [2]:
!pip install dash
!pip install dash_bootstrap_components
!pip install dash_dangerously_set_inner_html

import dash_dangerously_set_inner_html
import dash
from dash import html, dcc, Input, Output
from dash.dependencies import Input, Output
import pickle
import plotly.graph_objs as go
import pandas as pd
from collections import Counter
import itertools
import ast
# from google.colab import drive
import plotly.express as px
import pandas as pd
from datetime import date, datetime

import urllib
import dash
from dash import html, dcc, Input, Output
import dash_bootstrap_components as dbc
import plotly.graph_objs as go
import pandas as pd
from dash.dependencies import Input, Output
from dash.exceptions import PreventUpdate
import numpy as np
from dash import dash_table, html



# Load data

In [None]:
# # Import datasets if from drive
# drive.mount('/content/gdrive', force_remount=True)
# dir = 'gdrive/MyDrive/CfMM/data/'

# with open(dir+f'df_dummy.pkl', 'rb') as pickle_file:
#   df_dummy = pd.compat.pickle_compat.load(pickle_file)

In [3]:
# Load the dataset from a pickle file located next to the notebook.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
df_dummy = pd.read_pickle("df_dummy.pkl")

In [4]:
# Work on a copy so the frame loaded above is never modified
df_corpus = df_dummy.copy()

# Values that seed the dashboard filter controls:
# overall publication date bounds, sorted publisher names, and the distinct topics
min_date, max_date = df_corpus['date_published'].min(), df_corpus['date_published'].max()
unique_publishers = list(df_corpus['publisher'].unique())
unique_publishers.sort()
unique_topics = (
    df_corpus['topic_list']
    .explode()      # one row per entry of each topic list
    .dropna()
    .unique()
)

# Individual Charts

## Chart 1

In [13]:
# Initialize the Dash application
stylesheets = [dbc.themes.FLATLY] # 'https://codepen.io/chriddyp/pen/bWLwgP.css'
app = dash.Dash(__name__, external_stylesheets=stylesheets)


def _chart1_filter_row(label_text, control, margin_bottom='10px'):
    """Return one flex row holding a bold label followed by a filter control.

    Args:
        label_text: Text shown in the label on the left of the row.
        control: The Dash component (date picker, dropdown, ...) placed after the label.
        margin_bottom: CSS bottom margin of the row.
    """
    return html.Div([
        html.Label([label_text], style={'font-weight': 'bold', 'width': '20%'}),
        control
    ], style={'display': 'flex', 'margin-bottom': margin_bottom, 'align-items': 'center'})


# Define the layout of the application
app.layout = html.Div(children=[
    # All elements for Chart 1
    html.Div([

        # Filter: publication date range (pre-filled with the corpus bounds)
        _chart1_filter_row(
            'Filter Date Published:',
            dcc.DatePickerRange(
                id='chart1-datepickerrange',
                display_format='DD MMM YYYY',
                clearable=True,
                with_portal=True,
                max_date_allowed=datetime.today(),
                start_date=min_date,
                end_date=max_date,
                start_date_placeholder_text='Start date',
                end_date_placeholder_text='End date',
                style={'font-size': '15px'}
            )
        ),

        # Filter: publishers
        _chart1_filter_row(
            'Filter Publishers:',
            dcc.Dropdown(
                id='chart1-publisher-dropdown',
                options=[{'label': publisher, 'value': publisher} for publisher in unique_publishers],
                placeholder='Select Publisher',
                multi=True,
                clearable=True,
                style={'width': '70%'}
            )
        ),

        # Filter: bias ratings (values are the numeric codes stored in `bias_rating`)
        _chart1_filter_row(
            'Filter Bias Ratings:',
            dcc.Dropdown(
                id='chart1-bias-rating-dropdown',
                options=[
                    {'label': 'Inconclusive', 'value': -1},
                    {'label': 'Biased', 'value': 1},
                    {'label': 'Very Biased', 'value': 2},
                    {'label': 'Not Biased', 'value': 0},
                ],
                placeholder='Select Bias Rating',
                multi=True,
                clearable=True,
                style={'width': '70%'}
            )
        ),

        # Filter: topics
        _chart1_filter_row(
            'Filter Topics:',
            dcc.Dropdown(
                id='chart1-topic-dropdown',
                options=[{'label': topic, 'value': topic} for topic in unique_topics],
                placeholder='Select Topic',
                multi=True,
                clearable=True,
                style={'width': '70%'}
            ),
            margin_bottom='30px'
        ),

        # Toggle for color by bias ratings or bias categories
        dcc.RadioItems(
            id='chart1-color-toggle',
            options=[
                {'label': '    Show bias ratings', 'value': 'bias_ratings'},
                {'label': '    Show bias categories (for biased/very biased articles only)', 'value': 'bias_categories'}
            ],
            value='bias_ratings',  # default value on load
            labelStyle={'display': 'inline-block'},
            inputStyle={"margin-left": "10px"},
            style={'margin-bottom': '50px'}
        ),

        # Graph for displaying the top offending publishers
        dcc.Graph(id='offending-publishers-bar-chart', style={'margin-bottom': '50px'}),

        # Table for displaying the top offending publishers
        # ('color' is the CSS text-colour property; 'fontColor' is not a CSS property and was ignored)
        html.Div(id='table1-title', style={'fontSize': 20, 'color': '#2E2C2B', 'margin-bottom': '20px'}),
        html.Div(id='table1'),
        # Both buttons start hidden; the table callback reveals them together with the table
        html.Div([
            dbc.Button('Clear Table', id='clear-button1', style={'display': 'none'}),
            dbc.Button('Export to CSV', id='export-button1', style={'display': 'none'})
        ], style={'display': 'flex', 'margin-top': '10px', 'align-items': 'center'}),

    ],
        style={
            'padding': 10,
            'flex': 1,
            'margin-top': '50px',
            'margin-bottom': '100px',
            'font-family': 'sans-serif'
            }
    )
])

# Callback for Chart 1
@app.callback(
    Output('offending-publishers-bar-chart', 'figure'),
    [
        Input('chart1-datepickerrange', 'start_date'),
        Input('chart1-datepickerrange', 'end_date'),
        Input('chart1-publisher-dropdown', 'value'),
        Input('chart1-bias-rating-dropdown', 'value'),
        Input('chart1-topic-dropdown', 'value'),
        Input('chart1-color-toggle', 'value')
    ]
)
def update_chart1(selected_start_date, selected_end_date, selected_publishers, selected_bias_ratings, selected_topics, color_by):
    """Build the horizontal bar chart of the ten publishers with the most matching articles.

    Args:
        selected_start_date, selected_end_date: Date-picker values; the date filter is
            applied only when both are set.
        selected_publishers: Publishers to keep (empty/None keeps all).
        selected_bias_ratings: `bias_rating` codes to keep (empty/None keeps all).
        selected_topics: Topics to keep; an article matches when its `topic` string
            contains any of them (empty/None keeps all).
        color_by: 'bias_categories' draws grouped bars per bias category for articles
            rated 1 or 2; any other value draws bars stacked by bias rating.

    Returns:
        A figure dict with 'data' (list of `go.Bar`) and 'layout' (`go.Layout`).
    """
    filtered_df = df_corpus.copy()

    # Apply filters for dates, publishers, bias ratings and topics
    if selected_start_date is not None and selected_end_date is not None:
        start_date = pd.to_datetime(str(selected_start_date))
        end_date = pd.to_datetime(str(selected_end_date))
        filtered_df = filtered_df[(filtered_df['date_published'] >= start_date) & (filtered_df['date_published'] <= end_date)]
    if selected_publishers:
        filtered_df = filtered_df[filtered_df['publisher'].isin(selected_publishers)]
    if selected_bias_ratings:
        filtered_df = filtered_df[filtered_df['bias_rating'].isin(selected_bias_ratings)]
    if selected_topics:
        # Literal (non-regex) matching so special characters in a topic name cannot
        # change the pattern; rows with a missing topic simply do not match.
        topic_mask = pd.Series(False, index=filtered_df.index)
        for topic in selected_topics:
            topic_mask |= filtered_df['topic'].str.contains(topic, regex=False, na=False)
        filtered_df = filtered_df[topic_mask]

    # Count the remaining articles per publisher (the rating filter is already applied above)
    publisher_totals = filtered_df.groupby('publisher', observed=True).size()

    # Top 10 by count, reversed so the largest is the last category in the ordering
    top_publishers = publisher_totals.sort_values(ascending=False).head(10).index[::-1]

    # Keep only the top publishers; copy before assigning so the slice is not written through
    filtered_df = filtered_df[filtered_df['publisher'].isin(top_publishers)].copy()
    filtered_df['publisher'] = pd.Categorical(filtered_df['publisher'], ordered=True, categories=top_publishers)
    filtered_df = filtered_df.sort_values('publisher')

    data = []
    legend_added = set()  # trace names that already own a legend entry

    def add_bar(count, publisher, name, color, tooltip_text):
        """Append one single-value bar; only the first bar per name shows in the legend."""
        data.append(go.Bar(
            x=[count],
            y=[publisher],
            name=name,
            orientation='h',
            marker=dict(color=color),
            showlegend=name not in legend_added,
            text=tooltip_text,
            hoverinfo='text',
            textposition='none'
        ))
        legend_added.add(name)

    if color_by == 'bias_categories':
        barmode = 'group'
        categories = ['generalisation', 'prominence', 'negative_behaviour', 'misrepresentation', 'headline_or_imagery']
        category_colors = ['#4185A0', '#AA4D71', '#B85C3B', '#C5BE71', '#7658A0']  # example colors

        # Only biased (1) and very biased (2) articles are shown in this view
        biased_df = filtered_df[filtered_df['bias_rating'] >= 1]

        for publisher in biased_df['publisher'].unique():
            publisher_df = biased_df[biased_df['publisher'] == publisher]
            total_biased_articles = publisher_df.shape[0]

            for category, color in zip(categories, category_colors):
                category_label = category.replace('_', ' ').title().replace('Or', 'or')
                # Number of this publisher's rows flagged with a 1 in the category column
                articles = int((publisher_df[category] == 1).sum())

                tooltip_text = (
                    f"<b>Publisher: </b>{publisher}<br>"
                    f"<b>Bias Category: </b>{category_label}<br>"
                    f"Of the {total_biased_articles} articles with biased/very biased ratings,<br><b>{articles}</b> of them committed <b>{category_label}</b>."
                )
                add_bar(articles, publisher, category_label, color, tooltip_text)

    else:
        barmode = 'stack'
        # Color mapping for bias ratings: code -> (bar colour, legend name)
        color_map = {
            -1: ('#CAC6C2', 'Inconclusive'),
            0: ('#f2eadf', 'Not Biased'), # #FFE5DC
            1: ('#eb8483', 'Biased'),
            2: ('#C22625', 'Very Biased')
        }

        for publisher in filtered_df['publisher'].unique():
            publisher_ratings = filtered_df.loc[filtered_df['publisher'] == publisher, 'bias_rating']
            # Denominator of the percentage: this publisher's rows with a non-null rating
            total_biased_articles = publisher_ratings.count()

            for rating, (color, name) in color_map.items():
                articles = int((publisher_ratings == rating).sum())
                percentage_of_total = (articles / total_biased_articles) * 100 if total_biased_articles > 0 else 0

                tooltip_text = (
                    f"<b>Publisher: </b>{publisher}<br>"
                    f"<b>Bias Rating:</b> {name}<br>"
                    f"<b>Number of Articles:</b> {articles}<br>"
                    f"<b>Percentage of Total:</b> {percentage_of_total:.2f}%"
                )
                add_bar(articles, publisher, name, color, tooltip_text)

    # Both views share the same layout except for how bars of one publisher are combined
    layout = go.Layout(
        title="<b>Who are the top offending publishers year to date?</b>",
        xaxis=dict(title='Number of Articles'),
        yaxis=dict(title='Publisher'),
        hovermode='closest',
        barmode=barmode,
        showlegend=True,
        hoverlabel=dict(
            align='left'
        ),
        template="simple_white",
        plot_bgcolor='white',
        paper_bgcolor='white',
        font_color='#2E2C2B',
        font_size=14,
        margin={'l': 150, 'r': 20, 'b': 40, 't': 40}
    )

    return {'data': data, 'layout': layout}

# Callback for Table 1
@app.callback(
    [
        Output('table1-title', 'children'),
        Output(component_id='table1', component_property='children'),
        Output('clear-button1', 'style'),
        Output('export-button1', 'style'),
        Output('export-button1', 'href')
    ],
    [
        Input('chart1-datepickerrange', 'start_date'),
        Input('chart1-datepickerrange', 'end_date'),
        Input('chart1-publisher-dropdown', 'value'),
        Input('chart1-bias-rating-dropdown', 'value'),
        Input('chart1-topic-dropdown', 'value'),
        Input('chart1-color-toggle', 'value'),
        Input('offending-publishers-bar-chart', 'clickData'),
        Input('clear-button1', 'n_clicks')
    ]
)
def update_table1(selected_start_date, selected_end_date, selected_publishers, selected_bias_ratings, selected_topics, color_by, clickData, n_clicks):
    """Render the article table for the publisher whose bar was clicked.

    The table is built only when the callback was triggered by a click on the bar chart
    and click data is available. Any other trigger (filter change, colour toggle,
    'Clear Table', initial load) empties the table and hides both buttons.

    Args:
        selected_start_date, selected_end_date: Date-picker values; the date filter is
            applied only when both are set.
        selected_publishers, selected_bias_ratings, selected_topics: Same filters as Chart 1.
        color_by: 'bias_ratings' lists all of the publisher's articles; any other value
            lists only articles rated 1 or 2 and adds one Y/N column per bias category.
        clickData: Click payload of the bar chart; the publisher is read from the first
            point's 'label'.
        n_clicks: Click count of the 'Clear Table' button; only used as a trigger.

    Returns:
        Tuple of (title children, table, clear-button style, export-button style,
        CSV data URI used as the export button's href).
    """
    # `import urllib` alone does not guarantee that the `parse` submodule is loaded
    from urllib.parse import quote

    # Output that empties title/table and hides both buttons
    cleared = ([], None, {'display': 'none'}, {'display': 'none'}, '')

    triggered = dash.callback_context.triggered
    if not triggered:
        return cleared

    trigger_id = triggered[0]['prop_id'].split('.')[0]
    # Without click data there is no publisher to show, so always fall back to the
    # cleared state instead of returning None or failing on unbound locals.
    if trigger_id not in ('offending-publishers-bar-chart', 'export-button1') or not clickData:
        return cleared

    filtered_df = df_corpus.copy()
    topics = ''
    date_range_text = 'across all dates'  # used when the date picker has been cleared

    # Apply filters for dates, publishers, bias ratings and topics
    if selected_start_date is not None and selected_end_date is not None:
        start_date = pd.to_datetime(str(selected_start_date))
        end_date = pd.to_datetime(str(selected_end_date))
        filtered_df = filtered_df[(filtered_df['date_published'] >= start_date) & (filtered_df['date_published'] <= end_date)]
        date_range_text = f"published <b>{start_date.strftime('%d %b %Y')}</b> to <b>{end_date.strftime('%d %b %Y')}</b>"
    if selected_publishers:
        filtered_df = filtered_df[filtered_df['publisher'].isin(selected_publishers)]
    if selected_bias_ratings:
        filtered_df = filtered_df[filtered_df['bias_rating'].isin(selected_bias_ratings)]
    if selected_topics:
        # Literal (non-regex) matching; rows with a missing topic simply do not match
        topic_mask = pd.Series(False, index=filtered_df.index)
        for topic in selected_topics:
            topic_mask |= filtered_df['topic'].str.contains(topic, regex=False, na=False)
        filtered_df = filtered_df[topic_mask]
        topics = 'having any of the selected topics'

    publisher = str(clickData['points'][0]['label'])
    filtered_df = filtered_df[filtered_df['publisher'] == publisher]

    show_categories = color_by != 'bias_ratings'
    categories = ['generalisation', 'prominence', 'negative_behaviour', 'misrepresentation', 'headline_or_imagery']
    extra_columns = categories if show_categories else []
    if show_categories:
        filtered_df = filtered_df[filtered_df['bias_rating'] >= 1]
    # Copy before adding columns so the assignments never target a slice of df_corpus
    filtered_df = filtered_df.copy()

    scope = 'biased/very biased' if show_categories else 'all'
    # NOTE(review): publisher, title and URL are inserted into HTML unescaped below -
    # confirm these values can never contain markup or quotes.
    title = dash_dangerously_set_inner_html.DangerouslySetInnerHTML(
        f'Showing {scope} articles from <b>{publisher}</b> {date_range_text} {topics}'
    )

    date_column = 'date_published_label_(yyyy-mm-dd)'

    # Link colour: white on the dark red of "Very Biased" rows, dark grey everywhere else
    filtered_df['color'] = np.where(filtered_df['bias_rating'] == 2, 'white', '#2E2C2B')
    filtered_df['title_label'] = "<a href='" + filtered_df['article_url'] + "' target='_blank' style='color:" + filtered_df['color'] + ";'>" + filtered_df['title'] + "</a>"
    # np.select's implicit default is the integer 0, which cannot share a dtype with
    # the string labels on recent NumPy, so an explicit string default is supplied.
    filtered_df['bias_rating_label'] = np.select(
        [
            filtered_df['bias_rating'] == -1,
            filtered_df['bias_rating'] == 0,
            filtered_df['bias_rating'] == 1,
            filtered_df['bias_rating'] == 2
        ],
        [
            'Inconclusive',
            'Not Biased',
            'Biased',
            'Very Biased'
        ],
        default='Unknown'
    )
    for category in extra_columns:
        filtered_df[category] = np.where(filtered_df[category] == 1, 'Y', 'N')
    filtered_df[date_column] = filtered_df['date_published'].dt.date

    # Save to csv (data URI with a UTF-8 BOM prefix)
    csv_df = filtered_df[['publisher', 'title', 'article_url', date_column, 'topic', 'bias_rating_label'] + extra_columns].copy()
    csv_df.columns = ['Publisher', 'Title', 'Article URL', 'Date Published', 'Topic', 'Bias Rating'] + [c.upper() for c in extra_columns]
    csv_string = "data:text/csv;charset=utf-8,%EF%BB%BF" + quote(csv_df.to_csv(index=False, encoding='utf-8'))

    # Dash table: newest articles first
    rating_columns = ['title_label', date_column, 'topic', 'bias_rating_label']
    table_df = filtered_df.sort_values('date_published', ascending=False)[rating_columns + extra_columns]

    def rating_rule(label, background, text_color):
        """Highlight rows with the given rating label (whole row, or only the rating columns in the category view)."""
        condition = {'filter_query': f'{{bias_rating_label}}="{label}"'}
        if show_categories:
            # Use the real column ids so the title/date cells get the background too
            condition['column_id'] = rating_columns
        return {'if': condition, 'backgroundColor': background, 'color': text_color}

    style_rules = [
        rating_rule('Very Biased', '#C22625', 'white'),
        rating_rule('Biased', '#eb8483', '#2E2C2B')
    ]
    if show_categories:
        # Colour each category cell that is flagged 'Y' (same colours as the chart)
        category_styles = {
            'generalisation': ('#4185A0', 'white'),
            'prominence': ('#AA4D71', 'white'),
            'negative_behaviour': ('#B85C3B', 'white'),
            'misrepresentation': ('#C5BE71', '#2E2C2B'),
            'headline_or_imagery': ('#7658A0', 'white')
        }
        for category, (background, text_color) in category_styles.items():
            style_rules.append({
                'if': {
                    'filter_query': f'{{{category}}}="Y"',
                    'column_id': category
                },
                'backgroundColor': background,
                'color': text_color
            })

    style_cell = {'textAlign': 'left', 'padding': '5px', 'font-family': 'sans-serif', 'whiteSpace': 'normal', 'height': 'auto'}
    if not show_categories:
        # The ratings view uses fixed-width cells
        style_cell.update({'minWidth': '180px', 'maxWidth': '180px', 'width': '180px'})

    table = dash_table.DataTable(
        css=[dict(selector="p", rule="margin: 0; text-align: left")],
        columns=[{'id': x, 'name': x.replace('_', ' ').title(), 'presentation': 'markdown'} if 'title' in x else {'id': x, 'name': x.replace('_', ' ').replace('label', '').title().replace('Or', 'or').replace('Yyyy-Mm-Dd', 'yyyy-mm-dd')} for x in table_df.columns],
        markdown_options={"html": True},
        data=table_df.to_dict('records'),
        sort_action='native',
        filter_action='native',
        filter_options={'case': 'insensitive'},

        page_current=0,
        page_size=10,
        style_table={'margin': 'auto', 'padding': '0 5px', 'overflowX': 'auto', 'overflowY': 'auto'},
        style_header={'textAlign': 'center', 'fontWeight': 'bold'},
        style_data={'minWidth': '120px', 'maxWidth': '120px', 'width': '120px'},
        style_data_conditional=style_rules,
        style_cell=style_cell,
        style_cell_conditional=[
            {
                'if': {'column_id': ['topic', 'title_label']},
                'width': '600px'
            }
        ]
    )

    return [title], table, {'fontSize': 14, 'display': 'block'}, {'fontSize': 14, 'display': 'block', 'margin-left': '10px'}, csv_string


if __name__ == '__main__':
    # Newer Dash releases replace `app.run_server` with `app.run`; prefer `run` when it
    # exists and fall back to `run_server` on installations that only provide the latter.
    launch_server = getattr(app, 'run', None) or app.run_server
    launch_server(debug=True) #host='0.0.0.0'

## Chart 2

## Chart 3

In [8]:
# Display the column index of the corpus frame
df_corpus.columns

Index(['date_published', 'publisher', 'title', 'text', 'article_url', 'topic',
       'topic_list', 'bias_rating', 'generalisation', 'prominence',
       'negative_behaviour', 'misrepresentation', 'headline_or_imagery',
       'location'],
      dtype='object')

In [10]:
# Preview the first rows of the corpus frame
df_corpus.head()

Unnamed: 0,date_published,publisher,title,text,article_url,topic,topic_list,bias_rating,generalisation,prominence,negative_behaviour,misrepresentation,headline_or_imagery,location
0,2019-05-23,www.dailymail.co.uk,Iran says it will not surrender even if it is ...,"LONDON, May 23 (Reuters) - Iranian President H...",https://www.dailymail.co.uk/wires/reuters/arti...,Crimes and Arrests | Politics,"[Crimes and Arrests, Politics]",-1,0,0,1,0,0,"Iran, Islamic Republic of"
1,2019-06-13,www.theguardian.com,MEPs create biggest far-right group in Europea...,"Geert Wilders, left, Matteo Salvini, centre, a...",https://www.theguardian.com/world/2019/jun/13/...,Politics,[Politics],-1,0,0,1,0,0,Italy
2,2019-04-30,www.thejc.com,US army veteran arrested after planning terror...,A US army veteran has been arrested in souther...,https://www.thejc.com/news/us-news/us-army-vet...,Crimes and Arrests | Terrorism and Extremism |...,"[Crimes and Arrests, Terrorism and Extremism, ...",0,0,0,0,0,0,New Caledonia
3,2019-07-06,www.express.co.uk,Johnson calls on immigrants to learn English t...,The Tory leadership favourite made the remarks...,https://www.express.co.uk/news/politics/115003...,Politics | Hate Speech and Discrimination,"[Politics, Hate Speech and Discrimination]",-1,0,0,1,0,0,United Kingdom
4,2019-02-01,www.christiantoday.com,"In packed churches and secret masses, papal vi...",In Dubai's overflowing churches and Riyadh's s...,https://www.christiantoday.com/article/in-pack...,Religion,[Religion],0,0,0,0,0,0,Saudi Arabia


In [14]:
import dash
from dash import dcc, html, dash_table
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output, State
import pandas as pd
import numpy as np
import base64
from io import BytesIO
from datetime import datetime
from wordcloud import WordCloud
import plotly.graph_objects as go
from sklearn.feature_extraction.text import CountVectorizer
import urllib.parse

# Helper: serialise an image object to a base64 string of its PNG bytes
def pil_image_to_base64(img):
    """Save `img` as PNG into an in-memory buffer and return the bytes base64-encoded.

    Args:
        img: Any object exposing `save(file_obj, format=...)`.

    Returns:
        The base64 encoding of the written bytes, decoded to a UTF-8 `str`.
    """
    with BytesIO() as png_stream:
        img.save(png_stream, format="PNG")
        png_bytes = png_stream.getvalue()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode('utf-8')

# Earliest and latest publication dates in the corpus; used as the
# date picker's initial range.
min_date, max_date = (
    df_corpus['date_published'].min(),
    df_corpus['date_published'].max(),
)

# Create the Dash app, styled with the Bootstrap FLATLY theme
stylesheets = [dbc.themes.FLATLY]
app = dash.Dash(name=__name__, external_stylesheets=stylesheets)

# Define the layout of the application.
# Structure, top to bottom: filter controls (date range, publishers, topics,
# bias category, bias rating), n-gram size and text-source selectors, the word
# cloud graph, and finally the word-search box with its result table and the
# (initially hidden) clear/export buttons.
app.layout = html.Div(children=[

    # Date range picker
    html.Div([
        html.Label(['Filter Date Published:'], style={'font-weight': 'bold', 'width': '20%'}),
        dcc.DatePickerRange(
            id='chart5-datepickerrange',
            display_format='DD MMM YYYY',
            clearable=True,
            with_portal=True,
            max_date_allowed=datetime.today(),
            start_date=min_date,
            end_date=max_date,
            start_date_placeholder_text='Start date',
            end_date_placeholder_text='End date',
            style={'font-size': '15px'}
        )
    ], style={'display': 'flex', 'margin-bottom': '10px', 'align-items': 'center'}),

    # Dropdown for publishers
    html.Div([
        html.Label(['Filter Publishers:'], style={'font-weight': 'bold', 'width': '20%'}),
        dcc.Dropdown(
            id='chart5-publisher-dropdown',
            options=[{'label': publisher, 'value': publisher} for publisher in unique_publishers],
            placeholder='Select Publisher',
            multi=True,
            clearable=True,
            style={'width': '70%'}
        )
    ], style={'display': 'flex', 'margin-bottom': '10px', 'align-items': 'center'}),

    # Dropdown for topics
    html.Div([
        html.Label(['Filter Topics:'], style={'font-weight': 'bold', 'width': '20%'}),
        dcc.Dropdown(
            id='chart5-topic-dropdown',
            options=[{'label': topic, 'value': topic} for topic in unique_topics],
            placeholder='Select Topic',
            multi=True,
            clearable=True,
            style={'width': '70%'}
        )
    ], style={'display': 'flex', 'margin-bottom': '30px', 'align-items': 'center'}),

    # Dropdown for bias category (values are column names in df_corpus)
    html.Div([
        html.Label(['Filter Bias Category:'], style={'font-weight': 'bold', 'width': '20%'}),
        dcc.Dropdown(
            id='chart5-bias-category-dropdown',
            options=[
                {'label': 'Generalisation', 'value': 'generalisation'},
                {'label': 'Prominence', 'value': 'prominence'},
                {'label': 'Negative Behaviour', 'value': 'negative_behaviour'},
                {'label': 'Misrepresentation', 'value': 'misrepresentation'},
                {'label': 'Headline or Imagery', 'value': 'headline_or_imagery'},
            ],
            placeholder='Select Bias Category',
            multi=True,
            clearable=True,
            style={'width': '70%'}
        )
    ], style={'display': 'flex', 'margin-bottom': '10px', 'align-items': 'center'}),

    # Dropdown for bias rating.
    # Label/value pairs must agree with the table callback, which renders
    # bias_rating 2 as "Very Biased" and 1 as "Biased"; the two labels were
    # previously swapped here, so the filter selected the wrong rows.
    html.Div([
        html.Label(['Filter Bias Rating:'], style={'font-weight': 'bold', 'width': '20%'}),
        dcc.Dropdown(
            id='chart5-bias-rating-dropdown',
            options=[
                {'label': 'Very Biased', 'value': 2},
                {'label': 'Biased', 'value': 1},
                {'label': 'Not Biased', 'value': 0},
                {'label': 'Inconclusive', 'value': -1},
            ],
            placeholder='Select Bias Rating',
            multi=True,
            clearable=True,
            style={'width': '70%'}
        )
    ], style={'display': 'flex', 'margin-bottom': '10px', 'align-items': 'center'}),

    # Dropdown for n-gram selection
    html.Div([
        html.Label(['Select N-gram:'], style={'font-weight': 'bold', 'width': '20%'}),
        dcc.Dropdown(
            id='chart5-ngram-dropdown',
            options=[
                {'label': 'Unigram (1)', 'value': 1},
                {'label': 'Bigram (2)', 'value': 2},
                {'label': 'Trigram (3)', 'value': 3}
            ],
            value=1,  # default value on load
            clearable=False,
            style={'width': '70%'}
        )
    ], style={'display': 'flex', 'margin-bottom': '10px', 'align-items': 'center'}),

    # Toggle for headline-only or full-text word clouds
    # (values are the df_corpus column the word cloud is built from)
    dcc.RadioItems(
        id='chart5-text-toggle',
        options=[
            {'label': 'Headline-only', 'value': 'title'},
            {'label': 'Full-text', 'value': 'text'}
        ],
        value='title',  # default value on load
        labelStyle={'display': 'inline-block'},
        inputStyle={"margin-left": "10px"},
        style={'margin-bottom': '50px'}
    ),

    # Graph for displaying the word cloud
    dcc.Graph(id='wordcloud-container', style={'height': '80vh'}),

    # Word search input and button
    html.Div([
        html.Label(['Word Search:'], style={'font-weight': 'bold', 'width': '20%'}),
        dcc.Input(id='word-search', type='text', style={'width': '60%'}),
        dbc.Button('Search', id='search-button', n_clicks=0, style={'margin-left': '10px'})
    ], style={'display': 'flex', 'margin-bottom': '10px', 'align-items': 'center'}),

    # Table for displaying the result for word search
    html.Div(id='table5-title', style={'fontSize': 20, 'color': '#2E2C2B', 'margin-bottom': '20px'}),
    html.Div(id='table5'),
    # Both buttons start hidden; the table callback reveals them with results
    html.Div([
        dbc.Button('Clear Table', id='clear-button5', style={'display': 'none'}),
        dbc.Button('Export to CSV', id='export-button5', style={'display': 'none'})
    ], style={'display': 'flex', 'margin-top': '10px', 'align-items': 'center'}),
], style={
    'padding': 10,
    'flex': 1,
    'margin-top': '50px',
    'margin-bottom': '100px',
    'font-family': 'sans-serif'
})

def _empty_wordcloud_figure(message):
    """Return a blank, axis-free figure showing ``message`` in its centre."""
    fig = go.Figure()
    fig.update_layout(
        xaxis=dict(showgrid=False, zeroline=False, visible=False),
        yaxis=dict(showgrid=False, zeroline=False, visible=False),
        margin=dict(l=10, r=10, t=10, b=10),
        plot_bgcolor='white',
        annotations=[dict(
            text=message,
            xref='paper', yref='paper',
            x=0.5, y=0.5,
            showarrow=False,
            font=dict(size=16)
        )]
    )
    return fig


# Callback for Chart 5
@app.callback(
    Output('wordcloud-container', 'figure'),
    [
        Input('chart5-datepickerrange', 'start_date'),
        Input('chart5-datepickerrange', 'end_date'),
        Input('chart5-publisher-dropdown', 'value'),
        Input('chart5-topic-dropdown', 'value'),
        Input('chart5-bias-category-dropdown', 'value'),
        Input('chart5-bias-rating-dropdown', 'value'),
        Input('chart5-text-toggle', 'value'),
        Input('chart5-ngram-dropdown', 'value')
    ]
)
def update_chart5(selected_start_date, selected_end_date, selected_publishers, selected_topics, selected_bias_categories, selected_bias_ratings, text_by, ngram_value):
    """Build the interactive n-gram word cloud for the current filters.

    Parameters
    ----------
    selected_start_date, selected_end_date : str or None
        Date bounds from the date picker; applied only when both are set.
    selected_publishers, selected_topics : list or None
        Publishers / topics to keep (an article matches if it has any of the
        selected topics).
    selected_bias_categories : list or None
        Bias-category column names; an article is kept when the sum of those
        columns is positive.
    selected_bias_ratings : list or None
        ``bias_rating`` values to keep.
    text_by : str
        Column of ``df_corpus`` to build the cloud from (``'title'`` or ``'text'``).
    ngram_value : int
        Size of the n-grams to count (1, 2 or 3).

    Returns
    -------
    plotly.graph_objects.Figure
        A text-mode scatter laid out by ``WordCloud``, with the five most
        frequent n-grams highlighted; a placeholder figure when the filters
        leave nothing to count.
    """
    filtered_df = df_corpus.copy()

    # Apply filters for dates, publishers, topics, bias ratings and categories
    if selected_start_date and selected_end_date:
        start_date = pd.to_datetime(selected_start_date)
        end_date = pd.to_datetime(selected_end_date)
        filtered_df = filtered_df[(filtered_df['date_published'] >= start_date) & (filtered_df['date_published'] <= end_date)]
    if selected_publishers:
        filtered_df = filtered_df[filtered_df['publisher'].isin(selected_publishers)]
    if selected_topics:
        # Escape topic names so regex metacharacters are matched literally, and
        # treat missing topics as "no match" instead of producing a NaN mask.
        topic_pattern = '|'.join(re.escape(topic) for topic in selected_topics)
        filtered_df = filtered_df[filtered_df['topic'].str.contains(topic_pattern, regex=True, na=False)]
    if selected_bias_ratings:
        filtered_df = filtered_df[filtered_df['bias_rating'].isin(selected_bias_ratings)]
    if selected_bias_categories:
        filtered_df = filtered_df[filtered_df[selected_bias_categories].sum(axis=1) > 0]

    # One document per article, so that n-grams never straddle two articles
    # (joining everything into one string creates spurious bigrams/trigrams).
    documents = filtered_df[text_by].dropna().astype(str).tolist()
    if not documents:
        return _empty_wordcloud_figure('No articles match the selected filters')

    vectorizer = CountVectorizer(ngram_range=(ngram_value, ngram_value))
    try:
        ngram_matrix = vectorizer.fit_transform(documents)
    except ValueError:
        # CountVectorizer raises when no token survives tokenisation
        # (e.g. the documents are too short for the requested n-gram size).
        return _empty_wordcloud_figure('Not enough text to build a word cloud')

    # Corpus-wide count of every n-gram (column sums of the document-term matrix)
    ngram_totals = np.asarray(ngram_matrix.sum(axis=0)).ravel()
    word_counts = {
        name: int(count)
        for name, count in zip(vectorizer.get_feature_names_out(), ngram_totals)
    }
    total_words = sum(word_counts.values())

    wc = WordCloud(background_color='white',
                   max_words=100,
                   width=1600,
                   height=1200,
                   scale=1.5,
                   margin=100,
                   max_font_size=100,
                   # Plotly draws every label horizontally, so do not let the
                   # layout reserve vertical slots for rotated words.
                   prefer_horizontal=1.0
                  ).generate_from_frequencies(word_counts)

    # Ascending by frequency so the most frequent words are drawn last (on top)
    placed_words = sorted(wc.layout_, key=lambda entry: entry[0][1])

    # Extract positions and other data for Scatter plot
    words = []
    x = []
    y = []
    sizes = []
    hover_texts = []

    for (word, _freq), font_size, position, _orientation, _color in placed_words:
        raw_count = word_counts[word]
        percentage = (raw_count / total_words) * 100
        words.append(word)
        # WordCloud stores positions as (row, column): the second component is
        # the horizontal coordinate and the first the vertical one.
        x.append(position[1])
        y.append(position[0])
        sizes.append(font_size)
        hover_texts.append(f"<b>Word: </b>{word}<br>"
                           f"<b>Count: </b>{raw_count}<br>"
                           f"<b>Percentage: </b>{percentage:.2f}%")

    # Highlight the (up to) five most frequent words; everything else is grey
    custom_colors = [
        '#EA8C55', #top 5
        '#C75146',
        '#AD2E24',
        '#81171B',
        '#540804', #top 1
    ]
    colors = ['#D3D3D3'] * len(words)
    n_highlighted = min(len(custom_colors), len(words))
    # Align the palette from its dark end so the most frequent word always gets
    # the "top 1" colour, even when fewer than five words are available.
    for offset in range(1, n_highlighted + 1):
        colors[-offset] = custom_colors[-offset]

    # Create the Plotly figure with Scatter plot
    fig = go.Figure()

    # Add words as Scatter plot points
    fig.add_trace(go.Scatter(
        x=x,
        y=y,
        mode='text',
        text=words,
        textfont=dict(size=sizes, color=colors),
        hovertext=hover_texts,
        hoverinfo='text'
    ))

    # Update the layout to remove axes and make the word cloud bigger
    fig.update_layout(
        xaxis=dict(showgrid=False, zeroline=False, visible=False),
        yaxis=dict(showgrid=False, zeroline=False, visible=False),
        margin=dict(l=10, r=10, t=10, b=10),
        plot_bgcolor='white'
    )

    # Reverse the y-axis to match the word cloud orientation (rows grow downwards)
    fig.update_yaxes(autorange="reversed")

    return fig

# Callback for Table 5
@app.callback(
    [
        Output('table5-title', 'children'),
        Output('table5', 'children'),
        Output('clear-button5', 'style'),
        Output('export-button5', 'style'),
        Output('export-button5', 'href')
    ],
    [
        Input('search-button', 'n_clicks')
    ],
    [
        State('chart5-datepickerrange', 'start_date'),
        State('chart5-datepickerrange', 'end_date'),
        State('chart5-publisher-dropdown', 'value'),
        State('chart5-topic-dropdown', 'value'),
        State('chart5-bias-category-dropdown', 'value'),
        State('chart5-bias-rating-dropdown', 'value'),
        State('word-search', 'value')
    ]
)
def update_table5(n_clicks, selected_start_date, selected_end_date, selected_publishers, selected_topics, selected_bias_categories, selected_bias_ratings, search_word):
    """Build the article table for a word search under the current filters.

    Runs only when the Search button fired the callback; the filter controls
    are read as ``State`` so changing them alone does not refresh the table.

    Parameters
    ----------
    n_clicks : int
        Click count of the Search button (only used as the trigger).
    selected_start_date, selected_end_date : str or None
        Date bounds; applied only when both are set.
    selected_publishers, selected_topics, selected_bias_ratings : list or None
        Values to keep for the respective columns.
    selected_bias_categories : list or None
        Bias-category column names; kept when their row sum is positive.
    search_word : str or None
        Text searched literally and case-insensitively in title and body.

    Returns
    -------
    tuple
        ``(title children, DataTable or None, clear-button style,
        export-button style, CSV data-URI href)``. When the Search button did
        not trigger the call, the table is empty and both buttons stay hidden.
    """
    hidden_state = ([], None, {'display': 'none'}, {'display': 'none'}, '')

    triggered = dash.callback_context.triggered
    if not triggered:
        return hidden_state

    trigger_id = triggered[0]['prop_id'].split('.')[0]
    if trigger_id != 'search-button':
        return hidden_state

    topics = ''
    filtered_df = df_corpus.copy()

    # Apply filters for dates, publishers, topics, bias ratings and categories
    if selected_start_date and selected_end_date:
        start_date = pd.to_datetime(str(selected_start_date))
        end_date = pd.to_datetime(str(selected_end_date))
        filtered_df = filtered_df[(filtered_df['date_published'] >= start_date) & (filtered_df['date_published'] <= end_date)]
    if selected_publishers:
        filtered_df = filtered_df[filtered_df['publisher'].isin(selected_publishers)]
    if selected_topics:
        # Escape topic names so regex metacharacters are matched literally, and
        # treat missing topics as "no match" instead of producing a NaN mask.
        topic_pattern = '|'.join(re.escape(topic) for topic in selected_topics)
        filtered_df = filtered_df[filtered_df['topic'].str.contains(topic_pattern, regex=True, na=False)]
        topics = 'having any of the selected topics'
    if selected_bias_ratings:
        filtered_df = filtered_df[filtered_df['bias_rating'].isin(selected_bias_ratings)]
    if selected_bias_categories:
        filtered_df = filtered_df[filtered_df[selected_bias_categories].sum(axis=1) > 0]

    if search_word:
        # regex=False: the search box holds free text, so characters such as
        # "(" or "+" must not be interpreted as (possibly invalid) regex syntax.
        in_title = filtered_df['title'].str.contains(search_word, case=False, na=False, regex=False)
        in_text = filtered_df['text'].str.contains(search_word, case=False, na=False, regex=False)
        filtered_df = filtered_df[in_title | in_text]

    title = f'Showing all articles matching the filters {topics}'

    # Rating code -> display label. Codes outside this mapping are shown as
    # 'Unknown' (np.select without a string default fails on recent NumPy).
    rating_labels = {-1: 'Inconclusive', 0: 'Not Biased', 1: 'Biased', 2: 'Very Biased'}

    # Titles and URLs come from article data and are rendered as raw HTML in
    # the table, so escape them before building the anchor tag.
    safe_urls = filtered_df['article_url'].fillna('').astype(str).map(html_escape)
    safe_titles = filtered_df['title'].fillna('').astype(str).map(html_escape)

    # assign() returns a new frame, avoiding writes into a filtered slice
    labelled_df = filtered_df.assign(
        # White link text on the dark "Very Biased" row background
        color=np.where(filtered_df['bias_rating'] == 2, 'white', '#2E2C2B'),
        bias_rating_label=filtered_df['bias_rating'].map(rating_labels).fillna('Unknown'),
        **{'date_published_label_(yyyy-mm-dd)': filtered_df['date_published'].dt.date}
    )
    labelled_df = labelled_df.assign(
        title_label="<a href='" + safe_urls + "' target='_blank' style='color:" + labelled_df['color'] + ";'>" + safe_titles + "</a>"
    )

    # Save to csv (data URI with a UTF-8 BOM so spreadsheet tools detect the encoding)
    csv_columns = {
        'publisher': 'Publisher',
        'title': 'Title',
        'article_url': 'Article URL',
        'date_published_label_(yyyy-mm-dd)': 'Date Published',
        'topic': 'Topic',
        'bias_rating_label': 'Bias Rating'
    }
    csv_df = labelled_df[list(csv_columns)].rename(columns=csv_columns)
    csv_string = "data:text/csv;charset=utf-8,%EF%BB%BF" + urllib.parse.quote(csv_df.to_csv(index=False, encoding='utf-8'))

    # Dash
    table_df = labelled_df.sort_values('date_published', ascending=False)[['title_label', 'date_published_label_(yyyy-mm-dd)', 'topic', 'bias_rating_label']]
    table = dash_table.DataTable(
        css=[dict(selector="p", rule="margin: 0; text-align: left")],
        columns=[{'id': x, 'name': x.replace('_', ' ').title(), 'presentation': 'markdown'} if 'title' in x else {'id': x, 'name': x.replace('_', ' ').replace('label', '').title().replace('Or', 'or').replace('Yyyy-Mm-Dd', 'yyyy-mm-dd')} for x in table_df.columns],
        markdown_options={"html": True},
        data=table_df.to_dict('records'),
        sort_action='native',
        filter_action='native',
        filter_options={'case': 'insensitive'},

        page_current=0,
        page_size=10,
        style_table={'margin': 'auto', 'padding': '0 5px', 'overflowX': 'auto', 'overflowY': 'auto'},
        style_header={'textAlign': 'center', 'fontWeight': 'bold'},
        style_data={'minWidth': '120px', 'maxWidth': '120px', 'width': '120px'},
        style_data_conditional=[
            {
                'if': {
                    'filter_query': '{bias_rating_label}="Very Biased"'
                },
                'backgroundColor': '#C22625',
                'color': 'white'
            },
            {
                'if': {
                    'filter_query': '{bias_rating_label}="Biased"'
                },
                'backgroundColor': '#eb8483',
                'color': '#2E2C2B'
            }
        ],
        style_cell={'textAlign': 'left', 'padding': '5px', 'font-family': 'sans-serif', 'whiteSpace': 'normal', 'height': 'auto', 'minWidth': '180px', 'maxWidth': '180px', 'width': '180px'},
        style_cell_conditional=[
            {
                'if': {
                    # The title column's id is 'title_label'; 'title' matched nothing
                    'column_id': ['topic', 'title_label']
                },
                'width': '600px'
            }
        ]
    )

    return [title], table, {'fontSize': 14, 'display': 'block'}, {'fontSize': 14, 'display': 'block', 'margin-left': '10px'}, csv_string

# Run the Dash app
if __name__ == '__main__':
    # `app.run` is the supported entry point; `app.run_server` was removed in
    # Dash 3 and is only used as a fallback on old releases without `app.run`.
    run_app = getattr(app, 'run', None) or app.run_server
    run_app(debug=True)

## Chart 4