In [1]:
import pandas as pd
from pathlib import Path
from src.utils import get_project_root
from src.categorization_helpers import *
from src.swinno_helpers import *

# Project root as reported by src.utils.get_project_root; the spreadsheet and
# CSV paths used in later cells are built from it.
ROOT = get_project_root()

# Database handle passed as `con=` to the DataFrame.to_sql calls at the end of
# the notebook.
# NOTE(review): connect_swinno_db arrives through one of the star imports
# above -- presumably src.swinno_helpers; confirm and import it explicitly.
engine = connect_swinno_db()

In [2]:
# Both review spreadsheets are read with the same column subset, so the
# selection is declared once and shared by the two reads.
checked_columns = [
    "sinno_id",
    "bioeconomy_vision",
    "innovation_type",
    "article_checked",
    "notes",
]

# Read the two review rounds in order; unpacking keeps each round available
# under its own name.
first_round, second_round = (
    pd.read_excel(Path(ROOT, workbook), usecols=checked_columns)
    for workbook in (
        "data/raw-data/2304011-checked_innovations.xlsx",
        "data/raw-data/20230622_innovations-to-check.xlsx",
    )
)

# Stack the rounds row-wise (each round keeps its original row index).
article_check = pd.concat((first_round, second_round))

article_check.shape

(953, 5)

In [3]:
# Each annotation column is paired with the innovation id and handled as its
# own two-column table.
innovation_types = article_check.loc[:, ["sinno_id", "innovation_type"]]
cleaned_innovation_types = clean_codes(
    innovation_types,
    column="innovation_type",
    code_digits=3,
)

visions = article_check.loc[:, ["sinno_id", "bioeconomy_vision"]]
cleaned_visions = clean_codes(
    visions,
    column="bioeconomy_vision",
    code_digits=1,
)

notes = article_check.loc[:, ["sinno_id", "notes"]]

# Attach a label to every table; the duplicate check in the next cell reads
# it back through `.name` to build its output file name.
for frame, label in (
    (cleaned_innovation_types, "innovation_types"),
    (cleaned_visions, "visions"),
    (notes, "notes"),
):
    frame.name = label

In [4]:
# Run the duplicate check on each labelled table, handing the helper a
# per-table CSV path under data/modified-data.
for df in (cleaned_innovation_types, notes, cleaned_visions):
    csv_path = Path(ROOT).joinpath(
        "data", "modified-data", f"{df.name}-duplicates.csv"
    )
    check_duplicates(df, output_name=df.name, output_path=csv_path)
    # Visual separator between the per-table reports.
    print("-----")

No duplicates found.
-----
No duplicates found.
-----
No duplicates found.
-----


In [5]:
# Split the comma-separated vision codes into separate columns, reshape them
# to long form, and discard rows with missing values in one pass.
split_visions = split_cols(
    cleaned_visions,
    col_to_split="bioeconomy_vision",
    sep=",",
)
melted_visions = melt_table(
    split_visions,
    id_vars="sinno_id",
    col_start="bio",
    value_name="bioeconomy_vision",
).dropna()

melted_visions

Unnamed: 0,sinno_id,bioeconomy_vision
0,5880001,9
1,5881001,2
2,5883001,2
3,5894001,9
4,5897001,9
...,...,...
785,13013001,0
786,13014001,9
787,13016001,0
788,13016001,9


In [6]:

# Persist the long-form vision codes; any existing table is replaced.
melted_visions.to_sql(
    name="bioeconomy_visions", con=engine, if_exists="replace", index=False
)

# Innovation types go through the same split -> melt -> dropna steps as the
# visions before being written.
split_innovation_types = split_cols(
    cleaned_innovation_types, col_to_split="innovation_type", sep=","
)
melted_innovation_types = melt_table(
    split_innovation_types,
    id_vars="sinno_id",
    col_start="innovation",
    value_name="innovation_type",
)
# DataFrame.dropna returns a new frame, so the result must be assigned.
# Previously the call's result was discarded and rows with a missing
# innovation_type were written to the eco_innovations table.
melted_innovation_types = melted_innovation_types.dropna()
melted_innovation_types.to_sql(
    name="eco_innovations", con=engine, if_exists="replace", index=False
)

# Only rows with no missing values are kept before the notes are stored.
notes = notes.dropna()
notes.to_sql(name="categorization_notes", con=engine, index=False, if_exists="replace")

283