Skip to content

Commit

Permalink
Merge pull request #65 from nestauk/61_volume_pipeline
Browse files Browse the repository at this point in the history
61 volume pipeline
  • Loading branch information
emily-bicks authored Mar 17, 2023
2 parents f7e0c09 + 624bb8b commit f3c19c5
Show file tree
Hide file tree
Showing 13 changed files with 535 additions and 56 deletions.
3 changes: 3 additions & 0 deletions dap_aria_mapping/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ def get_yaml_config(file_path: Path) -> Optional[dict]:
# Define project base directory
PROJECT_DIR = Path(__file__).resolve().parents[1]

# Define directory for images in app
IMAGE_DIR = f"{PROJECT_DIR}/dap_aria_mapping/analysis/app/images"

# Define log output locations
info_out = str(PROJECT_DIR / "info.log")
error_out = str(PROJECT_DIR / "errors.log")
Expand Down
63 changes: 45 additions & 18 deletions dap_aria_mapping/analysis/app/Home.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,79 @@
import streamlit as st
from st_click_detector import click_detector
from streamlit.components.v1 import html
from PIL import Image
import altair as alt
from nesta_ds_utils.viz.altair import formatting
from dap_aria_mapping import PROJECT_DIR
from dap_aria_mapping.utils.app_utils import img_to_bytes, nav_page_from_image, create_hover_class

formatting.setup_theme()

PAGE_TITLE = "Innovation Explorer"

IMAGE_DIR = f"{PROJECT_DIR}/dap_aria_mapping/analysis/app/images"


#icon to be used as the favicon on the browser tab
# icon to be used as the favicon on the browser tab
nesta_fav = Image.open(f"{IMAGE_DIR}/favicon.ico")

# sets page configuration with favicon and title
st.set_page_config(
page_title=PAGE_TITLE,
layout="wide",
page_icon=nesta_fav
)
st.set_page_config(page_title=PAGE_TITLE, layout="wide", page_icon=nesta_fav)

st.title("Welcome to the Innovation Explorer!")

home_tab, data_tab, methods_tab = st.tabs(["Home", "About the Datasets", "Methodology"])

with home_tab:
hs, cm = st.columns(2)
with hs:
st.image(Image.open(f"{IMAGE_DIR}/hs_homepage.png"))

with cm:
st.image(Image.open(f"{IMAGE_DIR}/cm_homepage.png"))
hs_img, cm_img = (
img_to_bytes(f"{IMAGE_DIR}/hs_homepage.png"),
img_to_bytes(f"{IMAGE_DIR}/cm_homepage.png"),
)

classes_images = {
"img-acu-1": {
"png": "https://s2.gifyu.com/images/hs_homepage.png",
"gif": "https://s2.gifyu.com/images/hs_homepage.gif",
},
"img-acu-2": {
"png": "https://s10.gifyu.com/images/cm_homepage.png",
"gif": "https://s10.gifyu.com/images/cm_homepage.gif",
},
}

for key, value in classes_images.items():
create_hover_class(key, value["png"], value["gif"])

content = """
<div style="display: flex; justify-content: center; margin: 0 auto; padding: 10px 0;">
<a href='#' id='img-1'><img width='90%' class='{acu1}' src='data:image/png;base64,{hs_img}'></a>
<a href='#' id='img-2'><img width='90%' class='{acu2}' src='data:image/png;base64,{cm_img}'></a>
</div>
""".format(
hs_img=hs_img, cm_img=cm_img, acu1="img-acu-1", acu2="img-acu-2"
)

clicked = click_detector(content)

if clicked == "img-1":
nav_page_from_image(page="Horizon_Scanner", timeout=5)
elif clicked == "img-2":
nav_page_from_image(page="Change_Makers", timeout=5)



with data_tab:
st.markdown("In this app we leverage open source data provided by [Google Patents](https://console.cloud.google.com/marketplace/product/google_patents_public_datasets/google-patents-public-data?pli=1) and [Openalex](https://docs.openalex.org/) to assess the landscape of innovation in the UK")
st.markdown(
"In this app we leverage open source data provided by [Google Patents](https://console.cloud.google.com/marketplace/product/google_patents_public_datasets/google-patents-public-data?pli=1) and [Openalex](https://docs.openalex.org/) to assess the landscape of innovation in the UK"
)
st.markdown("ADD MORE DATA DOCUMENTATION")

with methods_tab:
st.markdown("ADD INFORMATION ABOUT OUR METHODOLOGY")

#adds the nesta x aria logo at the bottom of each tab, 3 lines below the contents
# adds the nesta x aria logo at the bottom of each tab, 3 lines below the contents
st.markdown("")
st.markdown("")
st.markdown("")

white_space, logo, white_space = st.columns([1.5,1,1.5])
white_space, logo, white_space = st.columns([1.5, 1, 1.5])
with logo:
st.image(Image.open(f"{IMAGE_DIR}/igl_nesta_aria_logo.png"))

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
226 changes: 198 additions & 28 deletions dap_aria_mapping/analysis/app/pages/1_Horizon_Scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,18 @@
from PIL import Image
import altair as alt
from nesta_ds_utils.viz.altair import formatting
from dap_aria_mapping import PROJECT_DIR
from dap_aria_mapping import PROJECT_DIR, IMAGE_DIR
from dap_aria_mapping.getters.app_tables.horizon_scanner import volume_per_year
from dap_aria_mapping.utils.app_utils import convert_to_pandas
from dap_aria_mapping.getters.taxonomies import get_topic_names
import polars as pl
import pandas as pd
from typing import List, Tuple

formatting.setup_theme()

PAGE_TITLE = "Horizon Scanner"

IMAGE_DIR = f"{PROJECT_DIR}/dap_aria_mapping/analysis/app/images"


#icon to be used as the favicon on the browser tab
icon = Image.open(f"{IMAGE_DIR}/hs_icon.ico")

Expand All @@ -20,6 +24,120 @@
page_icon=icon
)

@st.cache_data(show_spinner="Loading data")
def load_overview_data() -> Tuple[pl.DataFrame, pl.DataFrame, List[str]]:
    """Load the volume-per-year table and derive the filter-independent inputs.

    The result is cached by Streamlit so the table is not reloaded every time
    a filter widget triggers a rerun of the page.

    Returns:
        pl.DataFrame: the table returned by ``volume_per_year()``; this
            function reads its ``year``, ``topic``/``area``/``domain`` (and
            matching ``*_name``), ``publication_count`` and ``patent_count``
            columns.
        pl.DataFrame: the same table melted to long form, with the two count
            columns folded into ``doc_type`` (the original column name) and
            ``count`` (its value).
        List[str]: ``"All"`` followed by the unique domain names in sorted
            order, used as the options of the domain filter.
    """
    volume_data = volume_per_year()

    # Series.unique() gives no ordering guarantee; sort so the dropdown
    # options are stable and alphabetical across reruns and cache refreshes.
    unique_domains = ["All"] + volume_data["domain_name"].unique().sort().to_list()

    # Reshape the patent/publication counts to long form for the alignment
    # chart. The output columns are named explicitly rather than by
    # overwriting `.columns` positionally, so the names cannot drift out of
    # step with the id_vars list.
    alignment_data = volume_data.melt(
        id_vars=[
            "year",
            "topic",
            "topic_name",
            "area",
            "area_name",
            "domain",
            "domain_name",
        ],
        value_vars=["publication_count", "patent_count"],
        variable_name="doc_type",
        value_name="count",
    )

    return volume_data, alignment_data, unique_domains

@st.cache_data(show_spinner="Filtering by domain")
def filter_by_domain(
    domain: str, _volume_data: pl.DataFrame, _alignment_data: pl.DataFrame
) -> Tuple[pl.DataFrame, pl.DataFrame, List[str]]:
    """Restrict both chart tables to one domain and list the areas within it.

    Args:
        domain (str): domain name selected in the domain filter; matched
            against the ``domain_name`` column.
        _volume_data (pl.DataFrame): volume data for the emergence chart.
        _alignment_data (pl.DataFrame): long-form data for the alignment chart.

    Returns:
        Tuple[pl.DataFrame, pl.DataFrame, List[str]]: both dataframes filtered
            to the domain, and the sorted unique area names found in the
            filtered volume data (used to populate the area filter).
    """
    in_domain = pl.col("domain_name") == domain
    volume_data = _volume_data.filter(in_domain)
    alignment_data = _alignment_data.filter(in_domain)

    # Series.unique() gives no ordering guarantee; sort so the area dropdown
    # is stable and alphabetical.
    unique_areas = volume_data["area_name"].unique().sort().to_list()
    return volume_data, alignment_data, unique_areas

@st.cache_data(show_spinner="Filtering by area")
def filter_by_area(
    area: str, _volume_data: pl.DataFrame, _alignment_data: pl.DataFrame
) -> Tuple[pl.DataFrame, pl.DataFrame, List[str]]:
    """Restrict both chart tables to one area and list the topics within it.

    Args:
        area (str): area name selected in the area filter; matched against
            the ``area_name`` column.
        _volume_data (pl.DataFrame): volume data for the emergence chart.
        _alignment_data (pl.DataFrame): long-form data for the alignment chart.

    Returns:
        Tuple[pl.DataFrame, pl.DataFrame, List[str]]: both dataframes filtered
            to the area, and the sorted unique topic names found in the
            filtered volume data (used to populate a topic filter).
    """
    # NOTE(review): st.cache_data does not hash underscore-prefixed arguments,
    # so the cache key here is the area name alone. Presumably area names are
    # unique across domains -- verify, otherwise a cached result for the same
    # area name under a different domain could be returned.
    in_area = pl.col("area_name") == area
    volume_data = _volume_data.filter(in_area)
    alignment_data = _alignment_data.filter(in_area)

    # Series.unique() gives no ordering guarantee; sort so the topic options
    # are stable and alphabetical.
    unique_topics = volume_data["topic_name"].unique().sort().to_list()
    return volume_data, alignment_data, unique_topics

def group_emergence_by_level(
    _volume_data: pl.DataFrame, level: str, y_col: str
) -> pl.DataFrame:
    """Aggregate the emergence-chart data to the taxonomy level being viewed.

    Args:
        _volume_data (pl.DataFrame): data behind the emergence chart.
        level (str): column prefix of the level to group by; both ``level``
            and ``"<level>_name"`` must be columns of ``_volume_data``.
        y_col (str): name of the count column to sum within each group.

    Returns:
        pl.DataFrame: one row per (level, level name, year) holding the sum
            of ``y_col``; groups whose ``year`` is null are dropped.
    """
    grouped = (
        _volume_data.lazy()
        # Cast the level identifier to string before it is used as a group key.
        .with_columns(pl.col(level).cast(str))
        .groupby([level, f"{level}_name", "year"])
        .agg([pl.sum(y_col)])
        # Row-wise predicate: keep only groups with a known year. Wrapping a
        # single expression in pl.any() added nothing and depends on how a
        # given polars release interprets pl.any(expr).
        .filter(pl.col("year").is_not_null())
    )
    return grouped.collect()

def group_alignment_by_level(_alignment_data: pl.DataFrame, level: str) -> pl.DataFrame:
    """Aggregate the alignment-chart data to the taxonomy level being viewed.

    For every (document type, level) group this sums ``count`` and expresses
    it as a share of all documents of that type in ``_alignment_data``.

    Args:
        _alignment_data (pl.DataFrame): long-form data behind the alignment
            chart, with ``doc_type`` and ``count`` columns.
        level (str): column prefix of the level to group by; both ``level``
            and ``"<level>_name"`` must be columns of ``_alignment_data``.

    Returns:
        pl.DataFrame: one row per (doc_type, level, level name) with columns
            ``total`` (summed count), ``doc_fraction`` (total divided by the
            overall total for that document type), ``doc_name_clean``
            ("Publications" / "Patents") and ``doc_percentage``
            (``doc_fraction * 100``). Rows whose ``doc_type`` is neither
            ``publication_count`` nor ``patent_count`` get nulls in the
            derived columns.
    """
    is_publication = pl.col("doc_type") == "publication_count"
    is_patent = pl.col("doc_type") == "patent_count"

    # Overall totals per document type, pulled out as plain scalars so the
    # divisions below do not depend on a 1x1 DataFrame being coerced into a
    # literal.
    total_pubs = _alignment_data.filter(is_publication)["count"].sum()
    total_patents = _alignment_data.filter(is_patent)["count"].sum()

    grouped = (
        _alignment_data.lazy()
        # Cast the level identifier to string before it is used as a group key.
        .with_columns(pl.col(level).cast(str))
        .groupby(["doc_type", level, f"{level}_name"])
        .agg([pl.sum("count").alias("total")])
        .with_columns(
            pl.when(is_publication)
            .then(pl.col("total") / total_pubs)
            .when(is_patent)
            .then(pl.col("total") / total_patents)
            .alias("doc_fraction")
        )
        .with_columns(
            # pl.lit makes it explicit that these are string values, not
            # column names.
            pl.when(is_publication)
            .then(pl.lit("Publications"))
            .when(is_patent)
            .then(pl.lit("Patents"))
            .alias("doc_name_clean")
        )
        .with_columns((pl.col("doc_fraction") * 100).alias("doc_percentage"))
    )
    return grouped.collect()

header1, header2 = st.columns([1,10])
with header1:
st.image(icon)
Expand All @@ -28,36 +146,88 @@

st.markdown(f'<h1 style="color:#0000FF;font-size:16px;">{"<em>Explore patterns and trends in research domains across the UK<em>"}</h1>', unsafe_allow_html=True)

area_drop, discipline_drop, topic_drop = st.columns(3)

with area_drop:
area = st.selectbox(label = "Select an Area", options = ["All", "Area 1", "Area 2"])
discipline = "All"
#load in volume data
volume_data, alignment_data, unique_domains = load_overview_data()

with st.sidebar:
# filter for domains comes from unique domain names
domain = st.selectbox(label = "Select a Domain", options = unique_domains)
area = "All"
topic = "All"
level_considered = "domain"
# if a domain is selected in the filter, then filter the data
if domain != "All":
volume_data, alignment_data, unique_areas = filter_by_domain(domain, volume_data, alignment_data)
unique_areas.insert(0, "All")
#if a domain is selected, the plots that are being visualised are by area (i.e. level 2)
level_considered = "area"

#if a domain is selected, allow user to filter by area
area = st.selectbox(label = "Select an Area", options = unique_areas)
if area != "All":
#if an area is selected, filter data to the area and present at topic level
volume_data, alignment_data, unique_topics = filter_by_area(area, volume_data, alignment_data)
level_considered = "topic"

with discipline_drop:
if area != "All":
#In reality, the options for discipline would come from df.loc[df["Level 1"] == area]["Level 2"].unique()
discipline = st.selectbox(label = "Select a Discipline", options = ["All", "Discipline 1", "Discipline 2"])

with topic_drop:
if discipline != "All":
#In reality, the options for discipline would come from df.loc[df["Level 2"] == discipline]["Level 3"].unique()
topic = st.selectbox(label = "Select a Topic", options = ["All", "Topic 1", "Topic 2"])
overview_tab, disruption_tab, novelty_tab, overlaps_tab = st.tabs(["Overview", "Disruption", "Novelty","Overlaps"])

with overview_tab:

total_to_display = st.slider(label = "Show me most productive:" , min_value = 0, max_value = 50)
st.subheader("Growth Over Time")
st.markdown("View trends in volume of content over time to detect emerging or stagnant areas of innovation")
show_only = st.selectbox(label = "Show Emergence In:", options = ["All Documents", "Publications", "Patents"])
if show_only == "Publications":
y_col = "publication_count"
elif show_only == "Patents":
y_col = "patent_count"
else:
y_col = "total_docs"

overview_tab, disruption_tab, novelty_tab, overlaps_tab = st.tabs(["Overview", "Disruption", "Novelty","Overlaps"])
emergence_data = convert_to_pandas(group_emergence_by_level(volume_data, level_considered, y_col))

with overview_tab:
volume, alignment = st.columns(2)
with volume:
st.subheader("Trends in Emergence")
st.markdown("This would show trends in growth over time for areas/domains/topics, allowing users to analyse patterns recognizing that certain areas produce more/less content than others")
with alignment:
st.subheader("Trends in Alignment")
st.markdown("This could illustrate if research is becoming more/less aligned with industry in certain areas")
volume_chart = alt.Chart(emergence_data).mark_line(point=True).encode(
alt.X("year:N"),
alt.Y("{}:Q".format(y_col), title = "Total Documents"),
color = alt.Color("{}_name:N".format(level_considered),
legend = alt.Legend(labelFontSize = 10, title = None, labelLimit = 0, symbolSize = 20)
),
tooltip=[
alt.Tooltip("year:N", title = "Year"),
alt.Tooltip("{}:Q".format(y_col),title = "Total Documents"),
alt.Tooltip("{}_name:N".format(level_considered), title = "{}".format(level_considered))]

).interactive().properties(width=1100, height = 500)
st.altair_chart(volume_chart)

st.subheader("Alignment in Research and Industry")
st.markdown("Areas with high publication count and low patent count indicates there is significantly more activity in academia than industry on this topic (or vice versa).")
filtered_alignment_data = convert_to_pandas(group_alignment_by_level(alignment_data, level_considered))
alignment_chart = alt.Chart(filtered_alignment_data).transform_filter(
alt.datum.doc_fraction > 0
).mark_point(size = 60).encode(
alt.X("doc_fraction:Q",
title = "Percent of Documents of the Given Type",
scale=alt.Scale(type="log"),
axis = alt.Axis(tickSize=0, format = "%", grid = False)),
alt.Y("{}_name:N".format(level_considered),
axis = alt.Axis(labelLimit = 0, title = None, grid = True)
),
tooltip=[
alt.Tooltip("doc_name_clean:N", title = "Document Type"),
alt.Tooltip("doc_percentage:Q", format = ".2f", title = "Percent of Docs (%)"),
alt.Tooltip("{}_name:N".format(level_considered), title = "{}".format(level_considered))],
color = alt.Color("doc_name_clean:N", legend=alt.Legend(
direction='horizontal',
legendX=10,
legendY=-80,
orient = 'none',
titleAnchor='middle',
title = None))
).interactive().properties(width = 1100)


st.altair_chart(alignment_chart)

with disruption_tab:
disruption_trends, disruption_drilldown = st.columns(2)
Expand Down Expand Up @@ -93,4 +263,4 @@

white_space, logo, white_space = st.columns([1.5,1,1.5])
with logo:
st.image(Image.open(f"{IMAGE_DIR}/igl_nesta_aria_logo.png"))
st.image(Image.open(f"{IMAGE_DIR}/igl_nesta_aria_logo.png"))
6 changes: 2 additions & 4 deletions dap_aria_mapping/analysis/app/pages/2_Change_Makers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,11 @@
from PIL import Image
import altair as alt
from nesta_ds_utils.viz.altair import formatting
from dap_aria_mapping import PROJECT_DIR
from dap_aria_mapping import PROJECT_DIR, IMAGE_DIR
formatting.setup_theme()

PAGE_TITLE = "Change Makers"

IMAGE_DIR = f"{PROJECT_DIR}/dap_aria_mapping/analysis/app/images"


#icon to be used as the favicon on the browser tab
icon = Image.open(f"{IMAGE_DIR}/cm_icon.ico")
Expand Down Expand Up @@ -68,4 +66,4 @@

white_space, logo, white_space = st.columns([1.5,1,1.5])
with logo:
st.image(Image.open(f"{IMAGE_DIR}/igl_nesta_aria_logo.png"))
st.image(Image.open(f"{IMAGE_DIR}/igl_nesta_aria_logo.png"))
Loading

0 comments on commit f3c19c5

Please sign in to comment.