In [1]:
import pandas as pd
from pathlib import Path
from swinno_bioeconomy_directionality.utils import get_project_root
import swinno_bioeconomy_directionality.categorization_helpers as categorization_helpers
import swinno_bioeconomy_directionality.swinno_helpers as swinno_helpers
from swinno_bioeconomy_directionality.config import RAW_DATA_DIR, PROCESSED_DATA_DIR

engine = swinno_helpers.connect_swinno_db()

[32m2024-09-13 08:25:10.155[0m | [1mINFO    [0m | [36mswinno_bioeconomy_directionality.config[0m:[36m<module>[0m:[36m7[0m - [1mPROJ_ROOT path is: /Users/research/projects/swinno_bioeconomy_directionality[0m


In [3]:
retagged_bv = pd.read_csv(
    RAW_DATA_DIR / "final_classifications/retagged_bioeconomy_visions.csv"
)
retagged_ei = pd.read_csv(
    RAW_DATA_DIR / "final_classifications/retagged_eco_innovations.csv"
)
retagged_notes = pd.read_csv(RAW_DATA_DIR / "final_classifications/retagged_notes.csv")

In [10]:
dfs = [
    pd.read_csv(
        p,
        usecols=[
            "sinno_id",
            "bioeconomy_vision",
            "innovation_type",
            "article_checked",
            "notes",
        ],
    )
    for p in Path(RAW_DATA_DIR, "final_classifications").glob(
        "*_innovations-to-check.csv"
    )
]

In [11]:
len(dfs)

3

In [12]:
combined = pd.concat(dfs)

In [13]:
combined_checked = combined.loc[combined["article_checked"] == 1].copy()

In [14]:
duplicates = combined_checked.loc[combined_checked.duplicated(subset="sinno_id")]
duplicates.shape

(1, 5)

In [15]:
uniques = combined_checked.loc[~combined_checked.duplicated(subset="sinno_id")]

In [16]:
uniques.loc[~uniques["sinno_id"].isin(retagged_bv["sinno_id"])]

Unnamed: 0,sinno_id,bioeconomy_vision,innovation_type,article_checked,notes
0,6000001,9,9,1.0,
1,6039001,9,9,1.0,
2,6341001,2,9,1.0,
3,6562001,9,602,1.0,
4,6740001,9,9,1.0,
...,...,...,...,...,...
525,10681001,9,230;206;211,1.0,not forest
526,10682001,9,999,1.0,not forest
527,10701001,9,601;161,1.0,not forest
528,10703001,9,999,1.0,not forest


In [17]:
len(retagged_bv["sinno_id"].unique())

46

# Remove the uncertain innovations

In [18]:
certains = uniques.loc[~uniques["sinno_id"].isin(retagged_bv["sinno_id"])]

## Eco Innovations

In [37]:
innovation_types = certains.loc[:, ["sinno_id", "innovation_type"]]
cleaned_innovation_types = categorization_helpers.clean_codes(
    innovation_types, code_digits=3, column="innovation_type"
)

split_innovation_types = swinno_helpers.split_cols(
    cleaned_innovation_types, col_to_split="innovation_type", sep=","
)

melted_innovation_types = swinno_helpers.melt_table(
    split_innovation_types,
    id_vars="sinno_id",
    col_start="innovation",
    value_name="innovation_type",
)
melted_innovation_types.dropna()

melted_innovation_types["innovation_type"] = melted_innovation_types[
    "innovation_type"
].astype(int)

In [38]:
certain_eco_innovation_types = pd.concat([melted_innovation_types, retagged_ei], axis=0)

In [39]:
certain_eco_innovation_types.to_sql(
    name="eco_innovations", con=engine, if_exists="replace", index=False
)

1486

## Bioeconomy Visions 

In [35]:
certains

Unnamed: 0,sinno_id,bioeconomy_vision,innovation_type,article_checked,notes
0,6000001,9,9,1.0,
1,6039001,9,9,1.0,
2,6341001,2,9,1.0,
3,6562001,9,602,1.0,
4,6740001,9,9,1.0,
...,...,...,...,...,...
525,10681001,9,230;206;211,1.0,not forest
526,10682001,9,999,1.0,not forest
527,10701001,9,601;161,1.0,not forest
528,10703001,9,999,1.0,not forest


In [40]:
visions = certains.loc[:, ["sinno_id", "bioeconomy_vision"]]
cleaned_visions = categorization_helpers.clean_codes(
    visions, code_digits=1, column="bioeconomy_vision"
)

split_visions = swinno_helpers.split_cols(
    cleaned_visions, col_to_split="bioeconomy_vision", sep=","
)

melted_visions = swinno_helpers.melt_table(
    split_visions, id_vars="sinno_id", col_start="bio", value_name="bioeconomy_vision"
)
melted_visions.dropna()

melted_visions["bioeconomy_vision"] = melted_visions["bioeconomy_vision"].astype(int)

certain_visions = pd.concat([melted_visions, retagged_bv], axis=0)

certain_visions.to_sql(
    name="bioeconomy_visions", con=engine, if_exists="replace", index=False
)

959

##  Notes

In [22]:
notes = certains.loc[:, ["sinno_id", "notes"]]
cleaned_innovation_types = categorization_helpers.clean_codes(
    innovation_types, code_digits=3, column="innovation_type"
)

certain_eco_innovation_types = pd.concat([notes, retagged_notes], axis=0)

notes = notes.dropna()
notes.to_sql(name="categorization_notes", con=engine, index=False, if_exists="replace")

425