# The adaptable script to filter swinno db for bioeconomy innovations

1. gets the innovation based on query
2. exports to excel file for tagging
3. exports to txt file for copying images

In [1]:
import pandas as pd
from pathlib import Path

from src.swinno_helpers import connect_swinno_db
from src.utils import get_project_root

ROOT = get_project_root()
swinno = connect_swinno_db()

In [5]:
bioeconomy_excl_tagged = pd.read_sql(
    """
select
  distinct(i.sinno_id),
  i.innovation_name_in_swedish AS name,
  i.description_in_swedish AS description,
  i.additional_information_if_origin_new_scientific_discovery || i.additional_information_if_origin_new_technologies_or_materials || i.additional_info_if_origin_official_regulation_legislation_and_standards || i.additional_information_if_origin_solution_for_a_problem || i.additional_information_if_origin_performance || i.additional_information_if_origin_other AS info,
  i.year_of_commercialization AS year,
  us.use_sector
from
  innovation i
  join use_sectors us on i.sinno_id = us.sinno_id
where
  (
    (
    us.use_sector like '02%'
    or us.use_sector like '20%'
    or us.use_sector like '21%'
    or us.use_sector like '22%'
    or us.use_sector like '36%'
    or product_code like '02%'
    or product_code like '20%'
    or product_code like '21%'
    or product_code like '22%'
    or product_code like '36%'
   )
  or (
     description LIKE '%virke%'
  OR description LIKE '%cellulos%'
  OR description LIKE '%lignin%'
  OR description LIKE '%spån%'
  OR description LIKE '%bark%'
  OR description LIKE '%levulinsyra%'
  OR description LIKE '%furfural%'
  OR description LIKE '%svarttjära%'
  OR description LIKE '%svartlut%'
  OR description LIKE '%växtbas%'
  OR description LIKE '%ved%'
  OR description LIKE '%trä%'
  OR description LIKE '%skog%'
  OR description LIKE '%biobränsle%'
  OR description LIKE '%biologisk%'
  OR description LIKE '%nedbrytbar%'
  OR description LIKE '%papper%'
  OR description LIKE '%pappret%'
  OR description LIKE '%karton%'
  OR description LIKE '%tencel%'
   )
  )
  and i.sinno_id not in (
    select
      distinct(bv.sinno_id)
    from
      bioeconomy_visions_articles bv
  ); -- excludes those already tagged
""",
    swinno,
)

In [6]:
print("unique ids:", len(bioeconomy_excl_tagged["sinno_id"].unique()))
print("all ids:", len(bioeconomy_excl_tagged["sinno_id"]))

unique ids: 865
all ids: 1105


The fact that some innovations are doubled is not too bad at this stage, as the duplicates can be safely removed.
It is a bit annoying, so:
TODO: change query to return unique values only

In [7]:
filtered_bioeco = bioeconomy_excl_tagged[
    ~bioeconomy_excl_tagged.duplicated(subset=["sinno_id"])
]

In [8]:
len(filtered_bioeco)

865

In [10]:
tagging_columns = ["bioeconomy_vision", "innovation_type", "article_checked", "notes"]

for col in tagging_columns:
    filtered_bioeco[col] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_bioeco[col] = None


In [11]:
filtered_bioeco.head()

Unnamed: 0,sinno_id,name,description,info,year,use_sectors,bioeconomy_vision,innovation_type,article_checked,notes
0,5871001,,Kompakt krympfilmstunnel med infravärmare. Inf...,InfravärmeteknikLitenhet; automatisering; arbe...,1972,22222,,,,
1,5872001,Multiline',Automatisk maskin med vilken man kan gravera s...,,1972,22222,,,,
2,5873001,Solna 225,Tvåfärgspress med UV-tork.,,1973,22222,,,,
3,5874001,"BOOK-O""-MATIC",Tryckpress för framställning av färdig bok dir...,,1972,22222,,,,
4,5875001,Nitarn,"Häftningsmaskin (""häftapparat"") med rotationsh...",Produktivitet (anpassad för höga hastigheter);...,1973,222,,,,


In [13]:
filtered_bioeco.to_excel(
    Path(ROOT, "data/raw-data", "innovations_to_check.xlsx"), index=False
)

In [7]:
id_to_source_path = Path(ROOT, "notes", "230411_bioeconomy_excl_tagged.txt")
if id_to_source_path.exists():
    id_to_source_path.unlink()
with open(id_to_source_path, "a") as f:
    for id in list(filtered_bioeco["sinno_id"]):
        f.write(str(id) + "\n")