# Deployment

## Extract: Load locally for testing

In [1]:
from glob import glob
# glob() returns matches in arbitrary, OS-dependent order; sort so the listing
# is the same on every machine and every run.
sorted(glob('../data/0_external/*csv'))

['../data/0_external\\data_science_210220_1241.csv',
 '../data/0_external\\sustainable_finance_210213_1046.csv',
 '../data/0_external\\sustainable_finance_210214_1435.csv',
 '../data/0_external\\sustainable_finance_210215_2133.csv',
 '../data/0_external\\sustainable_finance_210218_2021.csv',
 '../data/0_external\\sustainable_finance_210220_1029.csv',
 '../data/0_external\\sustainable_finance_210223_2048.csv',
 '../data/0_external\\sustainable_finance_210227_2137.csv',
 '../data/0_external\\sustainable_finance_210228_0929.csv',
 '../data/0_external\\sustainable_finance_210302_2147.csv',
 '../data/0_external\\sustainable_finance_210306_1030.csv',
 '../data/0_external\\sustainable_finance_210310_2022.csv',
 '../data/0_external\\sustainable_finance_210314_2142.csv',
 '../data/0_external\\sustainable_finance_210318_2048.csv',
 '../data/0_external\\sustainable_finance_210319_0813.csv']

## Transform

In [2]:
# %load ../pipelines/esg_trending_topics/transform.py

import pandas as pd
import numpy as np
from datetime import datetime

# ~------------------ RESPONSE DATA ------------------~
def process_response(response, kw, ranking, geo):
    """Return the table for one keyword/ranking pair, tagged with query metadata.

    Parameters
    ----------
    response : mapping
        Nested mapping indexed as ``response[kw][ranking]``. Each leaf is
        expected to be a DataFrame (presumably with ``query`` and ``value``
        columns -- confirm against the extract step).
    kw : hashable
        Keyword to look up; also written to the ``keyword`` column.
    ranking : hashable
        Ranking to look up; also written to the ``ranking`` column.
    geo : object
        Value written to the ``geo`` column of every row.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the looked-up rows plus the columns ``keyword``,
        ``ranking``, ``geo`` and ``query_timestamp`` (the time of this call).
        If the lookup fails or the leaf is not a DataFrame (e.g. ``None``),
        an empty frame with the six expected columns is returned instead.
    """
    try:
        table = response[kw][ranking]
        # assign() builds a new frame, so the table stored inside `response`
        # is not modified as a side effect.
        return table.assign(
            keyword=kw,
            ranking=ranking,
            geo=geo,
            query_timestamp=datetime.now(),
        )
    except Exception:
        # Deliberate best-effort: a missing or unusable result becomes an empty
        # frame. `Exception` (not a bare except) keeps Ctrl-C / SystemExit working.
        print(f"Append empty dataframe for {ranking}: {kw}")
        return pd.DataFrame(columns=['query', 'value', 'keyword', 'ranking', 'geo', 'query_timestamp'])

def create_response_df(response, geo='global'):
    """Stack the tables of every keyword/ranking pair into one DataFrame.

    Parameters
    ----------
    response : dict
        Non-empty mapping ``keyword -> {ranking -> table}``. The set of
        rankings is read from the first keyword's entry and applied to all
        keywords.
    geo : object, default 'global'
        Passed through to ``process_response`` for every pair.

    Returns
    -------
    pandas.DataFrame
        Concatenation of ``process_response`` results, iterating rankings in
        the outer loop and keywords in the inner loop.

    Raises
    ------
    AssertionError
        If ``response`` is not a dict or is an empty dict.
    """
    # An empty dict used to slip through and fail below with an IndexError
    # that did not match the "Empty response" message.
    assert isinstance(response, dict) and response, "Empty response, caught in transform.py. Try again."

    keywords = list(response)
    rankings = list(response[keywords[0]])

    frames = [
        process_response(response, kw=kw, ranking=r, geo=geo)
        for r in rankings
        for kw in keywords
    ]

    return pd.concat(frames)



# ~------------------ PLOT DATA ------------------~
def add_features(df):
    """Add per-ranking totals, min-max normalised values and a display date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``query``, ``value``, ``keyword``,
        ``ranking``, ``query_timestamp`` and ``geo``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is left untouched) with these columns added:

        * ``value_total`` -- sum of ``value`` within each ranking.
        * ``value_normalized`` -- ``(value - min) / (max - min)`` within each
          ranking, as float. When a ranking has a single distinct value the
          range is zero; those rows get ``0.0`` instead of NaN.
        * ``value_normalized_total`` -- sum of ``value_normalized`` within
          each ranking.
        * ``date`` -- ``query_timestamp`` formatted as ``"%d. %B %Y"``.

    Raises
    ------
    AssertionError
        If any required column is missing.
    """
    required = {"query", "value", "keyword", "ranking", "query_timestamp", "geo"}
    assert required.issubset(df.columns), "Add features failed. \
    Missing one of [query, value, keyword, ranking, query_timestamp, geo]"

    out = df.copy()  # do not add columns to the caller's frame

    by_ranking = out.groupby('ranking')['value']
    out['value_total'] = by_ranking.transform('sum')

    group_min = by_ranking.transform('min')
    span = (by_ranking.transform('max') - group_min).astype(float)
    offset = (out['value'] - group_min).astype(float)
    # A zero span means every value equals the group minimum, so the offset is
    # 0 as well; dividing by 1 yields 0.0 rather than 0/0 = NaN.
    out['value_normalized'] = offset / span.where(span != 0, 1.0)

    out['value_normalized_total'] = out.groupby('ranking')['value_normalized'].transform('sum')
    out['date'] = pd.to_datetime(out['query_timestamp']).dt.strftime("%d. %B %Y")

    return out

def select_topn(df, top_n):
    """Keep the ``top_n`` rows with the largest ``value`` within each ranking.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ranking`` and ``value``.
    top_n : int
        Maximum number of rows kept per ranking.

    Returns
    -------
    pandas.DataFrame
        Rows of a re-indexed (0..n-1) copy of ``df`` with ``value`` coerced to
        numeric (unparseable entries become NaN), ordered by ranking group and
        by descending ``value`` within each group. ``df`` itself is not modified.

    Raises
    ------
    AssertionError
        If there is no column named exactly ``ranking``.
    """
    # Exact membership: a substring test would also accept e.g. 'ranking_label'
    # and then fail later inside groupby with a less helpful error.
    assert "ranking" in df.columns, "select_topn failed. Missing 'ranking' column."

    ranked = df.reset_index(drop=True)
    ranked['value'] = pd.to_numeric(ranked['value'], errors='coerce')  # avoid object dtype

    largest = ranked.groupby("ranking")['value'].nlargest(top_n)
    # The row labels are the innermost index level. Reading that level directly
    # (instead of dropping level 0) also works on pandas versions where
    # nlargest returns the result without the group-key level.
    topn_idx = largest.index.get_level_values(-1)

    return ranked.loc[topn_idx, :]

def sanitize_labels(df):
    """Add display labels: line-broken query text and a heading per ranking.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``query``, ``ranking`` and ``date``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is left untouched) with two columns added:

        * ``labels`` -- ``query`` with every space replaced by ``<br>``.
        * ``ranking_label`` -- ``ranking`` with ``'top'`` and ``'rising'``
          replaced by a heading that embeds the ``date`` of the first row;
          other ranking values are kept as they are.

        An empty input yields an empty frame with both columns present.
    """
    out = df.copy()  # do not add columns to the caller's frame

    # Literal (non-regex) replacement, vectorised over the column.
    out['labels'] = out['query'].str.replace(' ', '<br>', regex=False)

    # With no rows there is no date to show (and no row to label); avoid the
    # IndexError that indexing the first element would raise.
    updated = out['date'].iloc[0] if len(out) else ''
    out['ranking_label'] = out['ranking'].replace({'top': f'Evergreens - updated {updated}',
                                                   'rising': f'Trending - updated {updated}'})

    return out

def plot_data(df, top_n=35):
    """Prepare the plotting frames for the "rising" and "top" rankings.

    The input is passed through ``select_topn`` (with ``top_n``),
    ``add_features`` and ``sanitize_labels`` in that order, then split by the
    ``ranking`` column.

    Returns a tuple ``(rising_rows, top_rows)``.
    """
    shortlisted = select_topn(df, top_n)
    enriched = add_features(shortlisted)
    labelled = sanitize_labels(enriched)

    # one frame per ranking, "rising" first
    rising_rows = labelled.query('ranking == "rising"')
    top_rows = labelled.query('ranking == "top"')
    return rising_rows, top_rows


In [13]:
# %load ../pipelines/esg_trending_topics/deploy.py
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import chart_studio.plotly as py

def create_plot(dfr, dft):
    """Build a two-row figure with one treemap per ranking.

    Parameters
    ----------
    dfr : pandas.DataFrame
        Rows for the trending ("rising") treemap, drawn in the first row.
    dft : pandas.DataFrame
        Rows for the top-keywords treemap, drawn in the second row.

    Both frames must contain the columns ``labels``, ``ranking_label`` and
    ``value_normalized``; they supply the treemap's labels, parents and
    values respectively.

    Returns
    -------
    The figure created by ``make_subplots`` with both treemap traces added and
    styled.

    Raises
    ------
    AssertionError
        If either frame lacks one of the required columns.
    """
    # Validate BOTH frames for every required column, by exact name. (The
    # previous hand-written checks tested dfr twice for 'value_normalized'
    # and never tested dft.)
    required_columns = ("labels", "ranking_label", "value_normalized")
    for frame_name, frame in (("dfr", dfr), ("dft", dft)):
        for column in required_columns:
            assert column in frame.columns, f"{frame_name} does not contain '{column}' columns"

    fig = make_subplots(2, 1, specs=[[{"type": "domain"}], [ {"type": "domain"}]])

    # Row 1: rising, row 2: top.
    for row, frame in enumerate((dfr, dft), start=1):
        fig.add_trace(go.Treemap(
            labels = frame['labels'], #_href
            parents = frame['ranking_label'],
            values = frame['value_normalized'],
        ), row, 1)

    # Small margins and fully transparent plot/paper backgrounds.
    fig.update_layout(margin=dict(t=10, b=10, r=10, l=10),
                      plot_bgcolor= "rgba(0, 0, 0, 0)",
                      paper_bgcolor= "rgba(0, 0, 0, 0)",
                     )

    # NOTE(review): the font size is given twice (20 inside `textfont`, 24 via
    # `textfont_size`); kept as-is -- confirm which one is intended.
    fig.update_traces(
        opacity=1,
        textposition='middle center',
        textfont={'family':"Arial", 'size': 20},
        hoverinfo= "label", # "skip",
        tiling = {'squarifyratio': 1, 'pad': 0},
        textfont_size=24,
        marker={
            'depthfade': True,
            'cauto': True,
        }
    )

    return fig

def deploy_plot(figure, filename):
    """Announce the upload on stdout, then hand the figure to ``py.plot``.

    ``py`` is ``chart_studio.plotly``; ``filename`` is forwarded as the
    ``filename`` keyword. Nothing is returned.
    """
    announcement = f"Upload figure {filename} to plotly"
    print(announcement)

    py.plot(figure, filename=filename)




# Testing

In [6]:
from glob import glob
# glob() returns matches in arbitrary, OS-dependent order. Sort them so that
# positional access such as files[-1] picks the same file on every machine.
files = sorted(glob('../data/0_external/*csv'))

In [8]:
# Take the last entry of `files`, report which path it is, and read it into a
# DataFrame for the test run below.
load_file = files[-1]
print(f"Load {load_file}")
df = pd.read_csv(load_file)

Load ../data/0_external\sustainable_finance_210319_0813.csv


In [14]:
# End-to-end check on the loaded file: split into rising/top frames, build the
# treemap figure, then upload it.
# NOTE: deploy_plot calls chart_studio's py.plot, so every run of this cell
# performs an upload under the given filename.
dfr, dft = plot_data(df)
fig = create_plot(dfr, dft)
deploy_plot(fig, filename='sustainable_finance')

Upload figure sustainable_finance to plotly
