Merge pull request #65 from nestauk/61_volume_pipeline

61 volume pipeline
nestauk · Mar 17, 2023 · f3c19c5 · f3c19c5
2 parents f7e0c09 + 624bb8b
commit f3c19c5
Show file tree

Hide file tree

Showing 13 changed files with 535 additions and 56 deletions.
diff --git a/dap_aria_mapping/__init__.py b/dap_aria_mapping/__init__.py
@@ -20,6 +20,9 @@ def get_yaml_config(file_path: Path) -> Optional[dict]:
 # Define project base directory
 PROJECT_DIR = Path(__file__).resolve().parents[1]
 
+# Define directory for images in app
+IMAGE_DIR = f"{PROJECT_DIR}/dap_aria_mapping/analysis/app/images"
+
 # Define log output locations
 info_out = str(PROJECT_DIR / "info.log")
 error_out = str(PROJECT_DIR / "errors.log")

diff --git a/dap_aria_mapping/analysis/app/Home.py b/dap_aria_mapping/analysis/app/Home.py
@@ -1,52 +1,79 @@
 import streamlit as st
+from st_click_detector import click_detector
+from streamlit.components.v1 import html
 from PIL import Image
-import altair as alt
 from nesta_ds_utils.viz.altair import formatting
 from dap_aria_mapping import PROJECT_DIR
+from dap_aria_mapping.utils.app_utils import img_to_bytes, nav_page_from_image, create_hover_class
+
 formatting.setup_theme()
 
 PAGE_TITLE = "Innovation Explorer"
 
 IMAGE_DIR = f"{PROJECT_DIR}/dap_aria_mapping/analysis/app/images"
 
-
-#icon to be used as the favicon on the browser tab
+# icon to be used as the favicon on the browser tab
 nesta_fav = Image.open(f"{IMAGE_DIR}/favicon.ico")
 
 # sets page configuration with favicon and title
-st.set_page_config(
-    page_title=PAGE_TITLE, 
-    layout="wide", 
-    page_icon=nesta_fav
-)
+st.set_page_config(page_title=PAGE_TITLE, layout="wide", page_icon=nesta_fav)
 
 st.title("Welcome to the Innovation Explorer!")
 
 home_tab, data_tab, methods_tab = st.tabs(["Home", "About the Datasets", "Methodology"])
 
 with home_tab:
-    hs, cm = st.columns(2)
-    with hs:
-        st.image(Image.open(f"{IMAGE_DIR}/hs_homepage.png"))
 
-    with cm:
-        st.image(Image.open(f"{IMAGE_DIR}/cm_homepage.png"))
+    hs_img, cm_img = (
+        img_to_bytes(f"{IMAGE_DIR}/hs_homepage.png"),
+        img_to_bytes(f"{IMAGE_DIR}/cm_homepage.png"),
+    )
+
+    classes_images = {
+        "img-acu-1": {
+            "png": "https://s2.gifyu.com/images/hs_homepage.png",
+            "gif": "https://s2.gifyu.com/images/hs_homepage.gif",
+        },
+        "img-acu-2": {
+            "png": "https://s10.gifyu.com/images/cm_homepage.png",
+            "gif": "https://s10.gifyu.com/images/cm_homepage.gif",
+        },
+    }
+
+    for key, value in classes_images.items():
+        create_hover_class(key, value["png"], value["gif"])
+
+    content = """
+        <div style="display: flex; justify-content: center; margin: 0 auto; padding: 10px 0;">
+        <a href='#' id='img-1'><img width='90%' class='{acu1}' src='data:image/png;base64,{hs_img}'></a>
+        <a href='#' id='img-2'><img width='90%' class='{acu2}' src='data:image/png;base64,{cm_img}'></a>
+        </div>
+    """.format(
+        hs_img=hs_img, cm_img=cm_img, acu1="img-acu-1", acu2="img-acu-2"
+    )
+
+    clicked = click_detector(content)
+
+    if clicked == "img-1":
+        nav_page_from_image(page="Horizon_Scanner", timeout=5)
+    elif clicked == "img-2":
+        nav_page_from_image(page="Change_Makers", timeout=5)
 
-
 
 with data_tab:
-    st.markdown("In this app we leverage open source data provided by [Google Patents](https://console.cloud.google.com/marketplace/product/google_patents_public_datasets/google-patents-public-data?pli=1) and [Openalex](https://docs.openalex.org/) to assess the landscape of innovation in the UK")
+    st.markdown(
+        "In this app we leverage open source data provided by [Google Patents](https://console.cloud.google.com/marketplace/product/google_patents_public_datasets/google-patents-public-data?pli=1) and [Openalex](https://docs.openalex.org/) to assess the landscape of innovation in the UK"
+    )
     st.markdown("ADD MORE DATA DOCUMENTATION")
 
 with methods_tab:
     st.markdown("ADD INFORMATION ABOUT OUR METHODOLOGY")
 
-#adds the nesta x aria logo at the bottom of each tab, 3 lines below the contents
+# adds the nesta x aria logo at the bottom of each tab, 3 lines below the contents
 st.markdown("")
 st.markdown("")
 st.markdown("")
 
-white_space, logo, white_space = st.columns([1.5,1,1.5])
+white_space, logo, white_space = st.columns([1.5, 1, 1.5])
 with logo:
     st.image(Image.open(f"{IMAGE_DIR}/igl_nesta_aria_logo.png"))
-
diff --git a/dap_aria_mapping/analysis/app/images/cm_homepage.gif b/dap_aria_mapping/analysis/app/images/cm_homepage.gif
diff --git a/dap_aria_mapping/analysis/app/images/hs_homepage.gif b/dap_aria_mapping/analysis/app/images/hs_homepage.gif
diff --git a/dap_aria_mapping/analysis/app/pages/1_Horizon_Scanner.py b/dap_aria_mapping/analysis/app/pages/1_Horizon_Scanner.py
@@ -2,14 +2,18 @@
 from PIL import Image
 import altair as alt
 from nesta_ds_utils.viz.altair import formatting
-from dap_aria_mapping import PROJECT_DIR
+from dap_aria_mapping import PROJECT_DIR, IMAGE_DIR
+from dap_aria_mapping.getters.app_tables.horizon_scanner import volume_per_year
+from dap_aria_mapping.utils.app_utils import convert_to_pandas
+from dap_aria_mapping.getters.taxonomies import get_topic_names
+import polars as pl
+import pandas as pd
+from typing import List, Tuple
+
 formatting.setup_theme()
 
 PAGE_TITLE = "Horizon Scanner"
 
-IMAGE_DIR = f"{PROJECT_DIR}/dap_aria_mapping/analysis/app/images"
-
-
 #icon to be used as the favicon on the browser tab
 icon = Image.open(f"{IMAGE_DIR}/hs_icon.ico")
 
@@ -20,6 +24,120 @@
     page_icon=icon
 )
 
+@st.cache_data(show_spinner = "Loading data")
+def load_overview_data() -> Tuple[pl.DataFrame, pl.DataFrame, List[str]]:
+    """loads in the volume per year chart and does initial formatting that is not impacted by filters
+    Results are cached so the data is not reloaded each time a filter is changed.
+
+    Returns:
+        pl.DataFrame: total patents/publications per domain/area/topic per year, with names
+        pl.DataFrame: same as above, but patent/publication counts are melted to long form
+        List[str]: unique domain names in dataset, with "All" inserted as the first option
+    """
+    volume_data = volume_per_year()
+
+    # generate a list of the unique domain names to use as the filter options
+    unique_domains = volume_data["domain_name"].unique().to_list()
+    unique_domains.insert(0,"All")
+
+    #reformat the patent/publication counts to long form for the alignment chart
+    alignment_data = volume_data.melt(
+        id_vars = ["year", "topic", "topic_name","area", "area_name","domain", "domain_name"],
+        value_vars = ["publication_count", "patent_count"])
+    alignment_data.columns = ["year", "topic", "topic_name", "area", "area_name", "domain", "domain_name","doc_type", "count"]
+
+    return volume_data, alignment_data, unique_domains
+
+@st.cache_data(show_spinner = "Filtering by domain")
+def filter_by_domain(domain: str, _volume_data: pl.DataFrame, _alignment_data: pl.DataFrame) -> Tuple[pl.DataFrame, pl.DataFrame, List[str]]:
+    """filters volume data, alignment data, and filter options based on a Domain selection
+
+    Args:
+        domain (str): domain selected by the filter
+        _volume_data (pl.DataFrame): volume data for emergence chart
+        _alignment_data (pl.DataFrame): alignment data for alignment chart
+
+    Returns:
+        Tuple[pl.DataFrame, pl.DataFrame, List[str]]: updated dataframes filtered by a domain, and a list of unique areas to populate area filter
+    """
+    volume_data = _volume_data.filter(pl.col("domain_name")==domain)
+    alignment_data = _alignment_data.filter(pl.col("domain_name")==domain)
+    unique_areas = volume_data["area_name"].unique().to_list()
+    return volume_data, alignment_data, unique_areas
+
+@st.cache_data(show_spinner = "Filtering by area")
+def filter_by_area(area:str, _volume_data: pl.DataFrame, _alignment_data: pl.DataFrame) -> Tuple[pl.DataFrame, pl.DataFrame, List[str]]:
+    """filters volume data, alignment data, and filter options based on an area selection
+
+    Args:
+        area (str): area selected by the filter
+        _volume_data (pl.DataFrame): volume data for emergence chart
+        _alignment_data (pl.DataFrame): alignment data for alignment chart
+
+    Returns:
+        Tuple[pl.DataFrame, pl.DataFrame, List[str]]: updated dataframes filtered by an area, and a list of unique topics to populate topic filter
+    """
+    volume_data = _volume_data.filter(pl.col("area_name")==area)
+    alignment_data = _alignment_data.filter(pl.col("area_name")==area)
+    unique_topics = volume_data["topic_name"].unique().to_list()
+    return volume_data, alignment_data, unique_topics
+
+def group_emergence_by_level(_volume_data: pl.DataFrame, level: str, y_col: str) -> pl.DataFrame:
+    """groups the data for the emergence chart by the level specified by the filters
+
+    Args:
+        _volume_data (pl.DataFrame): data for backend of emergence chart
+        level (str): level to view, specified by domain/area filters
+        y_col (str): name of the count column to sum ("publication_count", "patent_count" or "total_docs"), specified by filter
+
+    Returns:
+        pl.DataFrame: grouped emergence data for chart
+    """
+    q = (_volume_data.lazy().with_columns(
+        pl.col(level).cast(str)
+        ).groupby(
+            [level, "{}_name".format(level),"year"]
+            ).agg(
+                [pl.sum(y_col)]
+                ).filter(pl.any(pl.col("year").is_not_null())))
+    return q.collect()
+
+def group_alignment_by_level(_alignment_data: pl.DataFrame, level: str) -> pl.DataFrame:
+    """groups the data for the alignment chart by the level specified by the filters.
+    Also calculates the fraction of total documents per type to visualise in the chart.
+
+    Args:
+        _alignment_data (pl.DataFrame): data for backend of alignment chart
+        level (str): level to view, specified by domain/area filters
+
+    Returns:
+        pl.DataFrame: grouped alignment data for chart
+    """
+    total_pubs = _alignment_data.filter(pl.col("doc_type")=="publication_count").select(pl.sum("count"))
+    total_patents = _alignment_data.filter(pl.col("doc_type")=="patent_count").select(pl.sum("count"))
+    q = (_alignment_data.lazy().with_columns(
+        pl.col(level).cast(str)
+        ).groupby(["doc_type", level, "{}_name".format(level)]
+        ).agg(
+            [pl.sum("count").alias("total")]
+        ).with_columns(
+            pl.when(pl.col("doc_type") == "publication_count")
+            .then(pl.col("total")/total_pubs)
+            .when(pl.col("doc_type") == "patent_count")
+            .then(pl.col("total")/total_patents)
+            .alias("doc_fraction")
+        ).with_columns(
+            pl.when(pl.col("doc_type") == "publication_count")
+            .then("Publications")
+            .when(pl.col("doc_type") == "patent_count")
+            .then("Patents")
+            .alias("doc_name_clean")
+        )
+        .with_columns(
+            (pl.col("doc_fraction")*100).alias("doc_percentage"))
+        )
+    return q.collect()
+
 header1, header2 = st.columns([1,10])
 with header1:
     st.image(icon)
@@ -28,36 +146,88 @@
 
 st.markdown(f'<h1 style="color:#0000FF;font-size:16px;">{"<em>Explore patterns and trends in research domains across the UK<em>"}</h1>', unsafe_allow_html=True)
 
-area_drop, discipline_drop, topic_drop = st.columns(3)
-
-with area_drop:
-    area = st.selectbox(label = "Select an Area", options = ["All", "Area 1", "Area 2"])
-    discipline = "All"
+#load in volume data 
+volume_data, alignment_data, unique_domains = load_overview_data()
+
+with st.sidebar:
+    # filter for domains comes from unique domain names
+    domain = st.selectbox(label = "Select a Domain", options = unique_domains)
+    area = "All"
     topic = "All"
+    level_considered = "domain"
+    # if a domain is selected in the filter, then filter the data
+    if domain != "All":
+        volume_data, alignment_data, unique_areas = filter_by_domain(domain, volume_data, alignment_data)
+        unique_areas.insert(0, "All")
+        #if a domain is selected, the plots that are being visualised are by area (i.e. level 2)
+        level_considered = "area"
+
+        #if a domain is selected, allow user to filter by area
+        area = st.selectbox(label = "Select an Area", options = unique_areas)
+        if area != "All":
+            #if an area is selected, filter data to the area and present at topic level
+            volume_data, alignment_data, unique_topics  = filter_by_area(area, volume_data, alignment_data)
+            level_considered = "topic"
 
-with discipline_drop:
-    if area != "All":
-        #In reality, the options for discipline would come from df.loc[df["Level 1"] == area]["Level 2"].unique()
-        discipline = st.selectbox(label = "Select a Discipline", options = ["All", "Discipline 1", "Discipline 2"])
 
-with topic_drop:
-    if discipline != "All":
-        #In reality, the options for discipline would come from df.loc[df["Level 2"] == discipline]["Level 3"].unique()
-        topic = st.selectbox(label = "Select a Topic", options = ["All", "Topic 1", "Topic 2"])
+overview_tab, disruption_tab, novelty_tab, overlaps_tab = st.tabs(["Overview", "Disruption", "Novelty","Overlaps"])
 
+with overview_tab:
 
-total_to_display = st.slider(label = "Show me most productive:" , min_value = 0, max_value = 50)
+    st.subheader("Growth Over Time")
+    st.markdown("View trends in volume of content over time to detect emerging or stagnant areas of innovation")
+    show_only = st.selectbox(label = "Show Emergence In:", options = ["All Documents", "Publications", "Patents"])
+    if show_only == "Publications":
+        y_col = "publication_count"
+    elif show_only == "Patents":
+        y_col = "patent_count"
+    else:
+        y_col = "total_docs"
 
-overview_tab, disruption_tab, novelty_tab, overlaps_tab = st.tabs(["Overview", "Disruption", "Novelty","Overlaps"])
+    emergence_data = convert_to_pandas(group_emergence_by_level(volume_data, level_considered, y_col))
 
-with overview_tab:
-    volume, alignment = st.columns(2)
-    with volume:
-        st.subheader("Trends in Emergence")
-        st.markdown("This would show trends in growth over time for areas/domains/topics, allowing users to analyse patterns recognizing that certain areas produce more/less content than others")
-    with alignment:
-        st.subheader("Trends in Alignment")
-        st.markdown("This could illustrate if research is becoming more/less aligned with industry in certain areas")
+    volume_chart = alt.Chart(emergence_data).mark_line(point=True).encode(
+        alt.X("year:N"),
+        alt.Y("{}:Q".format(y_col), title = "Total Documents"),
+        color = alt.Color("{}_name:N".format(level_considered), 
+        legend = alt.Legend(labelFontSize = 10, title = None, labelLimit = 0, symbolSize = 20)
+        ),
+        tooltip=[
+            alt.Tooltip("year:N", title = "Year"),
+            alt.Tooltip("{}:Q".format(y_col),title = "Total Documents"),
+            alt.Tooltip("{}_name:N".format(level_considered), title = "{}".format(level_considered))]
+
+    ).interactive().properties(width=1100, height = 500)
+    st.altair_chart(volume_chart)
+
+    st.subheader("Alignment in Research and Industry")
+    st.markdown("Areas with high publication count and low patent count indicates there is significantly more activity in academia than industry on this topic (or vice versa).")
+    filtered_alignment_data = convert_to_pandas(group_alignment_by_level(alignment_data, level_considered))
+    alignment_chart = alt.Chart(filtered_alignment_data).transform_filter(
+        alt.datum.doc_fraction > 0  
+        ).mark_point(size = 60).encode(
+        alt.X("doc_fraction:Q", 
+            title = "Percent of Documents of the Given Type", 
+            scale=alt.Scale(type="log"), 
+            axis = alt.Axis(tickSize=0, format = "%", grid = False)),
+        alt.Y("{}_name:N".format(level_considered),
+            axis = alt.Axis(labelLimit = 0, title = None, grid = True) 
+            ),
+        tooltip=[
+            alt.Tooltip("doc_name_clean:N", title = "Document Type"),
+            alt.Tooltip("doc_percentage:Q", format = ".2f", title = "Percent of Docs (%)"),
+            alt.Tooltip("{}_name:N".format(level_considered), title = "{}".format(level_considered))],
+        color = alt.Color("doc_name_clean:N", legend=alt.Legend(
+            direction='horizontal',
+            legendX=10,
+            legendY=-80,
+            orient = 'none',
+            titleAnchor='middle',
+            title = None))
+        ).interactive().properties(width = 1100)
+
+
+    st.altair_chart(alignment_chart)
 
 with disruption_tab:
     disruption_trends, disruption_drilldown = st.columns(2)
@@ -93,4 +263,4 @@
 
 white_space, logo, white_space = st.columns([1.5,1,1.5])
 with logo:
-    st.image(Image.open(f"{IMAGE_DIR}/igl_nesta_aria_logo.png"))
+    st.image(Image.open(f"{IMAGE_DIR}/igl_nesta_aria_logo.png"))
diff --git a/dap_aria_mapping/analysis/app/pages/2_Change_Makers.py b/dap_aria_mapping/analysis/app/pages/2_Change_Makers.py
@@ -2,13 +2,11 @@
 from PIL import Image
 import altair as alt
 from nesta_ds_utils.viz.altair import formatting
-from dap_aria_mapping import PROJECT_DIR
+from dap_aria_mapping import PROJECT_DIR, IMAGE_DIR
 formatting.setup_theme()
 
 PAGE_TITLE = "Change Makers"
 
-IMAGE_DIR = f"{PROJECT_DIR}/dap_aria_mapping/analysis/app/images"
-
 
 #icon to be used as the favicon on the browser tab
 icon = Image.open(f"{IMAGE_DIR}/cm_icon.ico")
@@ -68,4 +66,4 @@
 
 white_space, logo, white_space = st.columns([1.5,1,1.5])
 with logo:
-    st.image(Image.open(f"{IMAGE_DIR}/igl_nesta_aria_logo.png"))
+    st.image(Image.open(f"{IMAGE_DIR}/igl_nesta_aria_logo.png"))